[llvm-branch-commits] [llvm] [AMDGPU][PromoteAlloca] Set !amdgpu.non.volatile if promotion fails (PR #179415)

Thu Feb 5 04:54:54 PST 2026

https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/179415

>From df38810d00003769f3b2257fd2cfe5fe5f4edd60 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Mon, 2 Feb 2026 14:10:51 +0100
Subject: [PATCH 1/3] [AMDGPU][PromoteAlloca] Set !amdgpu.non.volatile if
 promotion fails

I thought about doing this in a separate pass, but this pass already has all the necessary analysis for this to be a trivial addition.
We can simply set `!amdgpu.non.volatile` if all other attempts to promote the alloca failed.
---
 .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 31 ++++++++++++-
 .../CodeGen/AMDGPU/promote-alloca-memset.ll   |  4 +-
 .../promote-alloca-non-volatile-accesses.ll   | 45 +++++++++++++++++++
 .../AMDGPU/promote-alloca-vgpr-ratio.ll       | 41 +++++++++--------
 4 files changed, 99 insertions(+), 22 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/promote-alloca-non-volatile-accesses.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index ed676c3fde2f8..7da612d8aa5d9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -120,6 +120,11 @@ struct AllocaAnalysis {
   } LDS;
 
   explicit AllocaAnalysis(AllocaInst *Alloca) : Alloca(Alloca) {}
+
+  void eraseAlloca() {
+    Alloca->eraseFromParent();
+    Alloca = nullptr; // Null signals to later checks that the alloca is gone.
+  }
 };
 
 // Shared implementation which can do both promotion to vector and to LDS.
@@ -152,6 +157,10 @@ class AMDGPUPromoteAllocaImpl {
   bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
                                        Instruction *UseInst, int OpIdx0,
                                        int OpIdx1) const;
+  /// Set the amdgpu.non.volatile metadata on all load/store users of \p AA.
+  /// This assumes the pointer of the alloca never escapes, and thus the memory
+  /// is thread-local.
+  void setNonVolatileMetadata(AllocaAnalysis &AA);
 
   /// Check whether we have enough local memory for promotion.
   bool hasSufficientLocalMem(const Function &F);
@@ -443,6 +452,15 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
     if (AA.LDS.Enable &&
         tryPromoteAllocaToLDS(AA, SufficientLDS, DeferredIntrs))
       Changed = true;
+
+    // If we were unable to remove this alloca, mark all accesses to it as
+    // non-volatile instead. This pass rejects all allocas whose pointer
+    // escapes, so the memory of the alloca is known to never be written to
+    // outside this thread.
+    if (AA.Alloca) {
+      setNonVolatileMetadata(AA);
+      Changed = true;
+    }
   }
   finishDeferredAllocaToLDSPromotion(DeferredIntrs);
 
@@ -1160,7 +1178,7 @@ void AMDGPUPromoteAllocaImpl::promoteAllocaToVector(AllocaAnalysis &AA) {
 
   // Alloca should now be dead too.
   assert(AA.Alloca->use_empty());
-  AA.Alloca->eraseFromParent();
+  AA.eraseAlloca();
 }
 
 std::pair<Value *, Value *>
@@ -1432,6 +1450,15 @@ void AMDGPUPromoteAllocaImpl::analyzePromoteToLDS(AllocaAnalysis &AA) const {
   AA.LDS.Enable = true;
 }
 
+void AMDGPUPromoteAllocaImpl::setNonVolatileMetadata(AllocaAnalysis &AA) {
+  for (Use *U : AA.Uses) {
+    // dyn_cast yields null for non-instruction users; isa<> asserts on null.
+    auto *I = dyn_cast<Instruction>(U->getUser());
+    if (isa_and_present<LoadInst, StoreInst>(I))
+      I->setMetadata("amdgpu.non.volatile", MDNode::get(I->getContext(), {}));
+  }
+}
+
 bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
 
   FunctionType *FTy = F.getFunctionType();
@@ -1629,7 +1656,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(
   Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
   AA.Alloca->mutateType(Offset->getType());
   AA.Alloca->replaceAllUsesWith(Offset);
-  AA.Alloca->eraseFromParent();
+  AA.eraseAlloca();
 
   PointerType *NewPtrTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);
 
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
index 81d6dba494cfc..60526551b22fc 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
@@ -44,7 +44,7 @@ define amdgpu_kernel void @memset_volatile_nopromote(i64 %val) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5)
 ; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) [[STACK]], i8 0, i64 32, i1 true)
-; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 8
+; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 8, !amdgpu.non.volatile [[META0:![0-9]+]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -59,7 +59,7 @@ define amdgpu_kernel void @memset_badsize_nopromote(i64 %val) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5)
 ; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) [[STACK]], i8 0, i64 31, i1 true)
-; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 8
+; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 8, !amdgpu.non.volatile [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-non-volatile-accesses.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-non-volatile-accesses.ll
new file mode 100644
index 0000000000000..b912e2199bac1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-non-volatile-accesses.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s
+
+; Verify that the !amdgpu.non.volatile metadata is set if promoting an alloca fails.
+
+define amdgpu_kernel void @test(i64 %val, i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @test(
+; CHECK-SAME: i64 [[VAL:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT:    [[STACK_1:%.*]] = getelementptr inbounds i64, ptr addrspace(5) [[STACK]], i32 1
+; CHECK-NEXT:    store i64 43, ptr addrspace(5) [[STACK]], align 8, !amdgpu.non.volatile [[META0:![0-9]+]]
+; CHECK-NEXT:    br i1 [[COND]], label %[[LOOP:.*]], label %[[END:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[PSTACK:%.*]] = phi ptr addrspace(5) [ [[STACK]], %[[ENTRY]] ], [ [[STACK_1]], %[[LOOP]] ]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i64, ptr addrspace(5) [[PSTACK]], align 8, !amdgpu.non.volatile [[META0]]
+; CHECK-NEXT:    store i64 32, ptr addrspace(5) [[STACK_1]], align 8, !amdgpu.non.volatile [[META0]]
+; CHECK-NEXT:    [[LOOP_CC:%.*]] = icmp ne i64 [[LOAD]], 32
+; CHECK-NEXT:    br i1 [[LOOP_CC]], label %[[LOOP]], label %[[END]]
+; CHECK:       [[END]]:
+; CHECK-NEXT:    [[RELOAD:%.*]] = load i64, ptr addrspace(5) [[STACK]], align 8, !amdgpu.non.volatile [[META0]]
+; CHECK-NEXT:    [[RELOAD_1:%.*]] = load i64, ptr addrspace(5) [[STACK_1]], align 8, !amdgpu.non.volatile [[META0]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  %stack = alloca [4 x i64], align 4, addrspace(5)
+  %stack.1 = getelementptr inbounds i64, ptr addrspace(5) %stack, i32 1
+  store i64 43, ptr addrspace(5) %stack
+  br i1 %cond, label %loop, label %end
+
+loop:
+  %pstack = phi ptr addrspace(5) [%stack, %entry], [%stack.1, %loop]
+  %load = load i64, ptr addrspace(5) %pstack
+  store i64 32, ptr addrspace(5) %stack.1
+  %loop.cc = icmp ne i64 %load, 32
+  br i1 %loop.cc, label %loop, label %end
+
+end:
+  %reload = load i64, ptr addrspace(5) %stack
+  %reload.1 = load i64, ptr addrspace(5) %stack.1
+  ret void
+}
+;.
+; CHECK: [[META0]] = !{}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vgpr-ratio.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vgpr-ratio.ll
index 4bcc46861d66b..74c4704d716b1 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vgpr-ratio.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vgpr-ratio.ll
@@ -16,10 +16,10 @@ define amdgpu_kernel void @i32_24_elements(ptr %out) #0 {
 ; DEFAULT-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 96, i1 false)
 ; DEFAULT-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
 ; DEFAULT-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 20
-; DEFAULT-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
-; DEFAULT-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; DEFAULT-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4, !amdgpu.non.volatile [[META0:![0-9]+]]
+; DEFAULT-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4, !amdgpu.non.volatile [[META0]]
 ; DEFAULT-NEXT:    [[GEP:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
-; DEFAULT-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; DEFAULT-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4, !amdgpu.non.volatile [[META0]]
 ; DEFAULT-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
 ; DEFAULT-NEXT:    ret void
 ;
@@ -48,10 +48,10 @@ define amdgpu_kernel void @i32_24_elements(ptr %out) #0 {
 ; RATIO8-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 96, i1 false)
 ; RATIO8-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
 ; RATIO8-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 20
-; RATIO8-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
-; RATIO8-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; RATIO8-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4, !amdgpu.non.volatile [[META0:![0-9]+]]
+; RATIO8-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4, !amdgpu.non.volatile [[META0]]
 ; RATIO8-NEXT:    [[GEP:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
-; RATIO8-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; RATIO8-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4, !amdgpu.non.volatile [[META0]]
 ; RATIO8-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
 ; RATIO8-NEXT:    ret void
 ;
@@ -112,10 +112,10 @@ define amdgpu_kernel void @i32_24_elements_attrib(ptr %out) #1 {
 ; RATIO8-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 96, i1 false)
 ; RATIO8-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
 ; RATIO8-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 20
-; RATIO8-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
-; RATIO8-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; RATIO8-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4, !amdgpu.non.volatile [[META0]]
+; RATIO8-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4, !amdgpu.non.volatile [[META0]]
 ; RATIO8-NEXT:    [[GEP:%.*]] = getelementptr inbounds [24 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
-; RATIO8-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; RATIO8-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4, !amdgpu.non.volatile [[META0]]
 ; RATIO8-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
 ; RATIO8-NEXT:    ret void
 ;
@@ -176,10 +176,10 @@ define amdgpu_kernel void @i32_16_elements(ptr %out) #0 {
 ; RATIO8-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 64, i1 false)
 ; RATIO8-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
 ; RATIO8-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 15
-; RATIO8-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
-; RATIO8-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; RATIO8-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4, !amdgpu.non.volatile [[META0]]
+; RATIO8-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4, !amdgpu.non.volatile [[META0]]
 ; RATIO8-NEXT:    [[GEP:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
-; RATIO8-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; RATIO8-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4, !amdgpu.non.volatile [[META0]]
 ; RATIO8-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
 ; RATIO8-NEXT:    ret void
 ;
@@ -214,10 +214,10 @@ define amdgpu_kernel void @i32_16_elements_attrib(ptr %out) #2 {
 ; DEFAULT-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 64, i1 false)
 ; DEFAULT-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
 ; DEFAULT-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 15
-; DEFAULT-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
-; DEFAULT-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; DEFAULT-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4, !amdgpu.non.volatile [[META0]]
+; DEFAULT-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4, !amdgpu.non.volatile [[META0]]
 ; DEFAULT-NEXT:    [[GEP:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
-; DEFAULT-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; DEFAULT-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4, !amdgpu.non.volatile [[META0]]
 ; DEFAULT-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
 ; DEFAULT-NEXT:    ret void
 ;
@@ -246,10 +246,10 @@ define amdgpu_kernel void @i32_16_elements_attrib(ptr %out) #2 {
 ; RATIO8-NEXT:    call void @llvm.memset.p5.i32(ptr addrspace(5) [[ALLOCA]], i8 0, i32 64, i1 false)
 ; RATIO8-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0
 ; RATIO8-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 15
-; RATIO8-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4
-; RATIO8-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4
+; RATIO8-NEXT:    store i32 42, ptr addrspace(5) [[GEP_0]], align 4, !amdgpu.non.volatile [[META0]]
+; RATIO8-NEXT:    store i32 43, ptr addrspace(5) [[GEP_1]], align 4, !amdgpu.non.volatile [[META0]]
 ; RATIO8-NEXT:    [[GEP:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[SEL2]]
-; RATIO8-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4
+; RATIO8-NEXT:    [[LOAD:%.*]] = load i32, ptr addrspace(5) [[GEP]], align 4, !amdgpu.non.volatile [[META0]]
 ; RATIO8-NEXT:    store i32 [[LOAD]], ptr [[OUT]], align 4
 ; RATIO8-NEXT:    ret void
 ;
@@ -278,5 +278,10 @@ declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture writeonly, i8, i32,
 attributes #0 = { nounwind "amdgpu-promote-alloca-to-vector-max-regs"="24" "amdgpu-waves-per-eu"="4,4" }
 attributes #1 = { nounwind "amdgpu-promote-alloca-to-vector-max-regs"="24" "amdgpu-waves-per-eu"="4,4" "amdgpu-promote-alloca-to-vector-vgpr-ratio"="2" }
 attributes #2 = { nounwind "amdgpu-promote-alloca-to-vector-max-regs"="24" "amdgpu-waves-per-eu"="4,4" "amdgpu-promote-alloca-to-vector-vgpr-ratio"="8" }
+;.
+; DEFAULT: [[META0]] = !{}
+;.
+; RATIO8: [[META0]] = !{}
+;.
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; BASE: {{.*}}

>From 28c76cf412be76fc383bc4eed36c76f6aba1ed1d Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Tue, 3 Feb 2026 14:17:52 +0100
Subject: [PATCH 2/3] Pull metadata impl at the top of the patch stack

---
 llvm/docs/AMDGPUUsage.rst                     |  23 ++
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   2 +
 .../AMDGPU/memory-legalizer-non-volatile.ll   | 218 ++++++++++++++++++
 3 files changed, 243 insertions(+)

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index cd5410a31b98f..d3717dcc00908 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1878,6 +1878,29 @@ and
 
   !0 = !{}
 
+.. _amdgpu_non_volatile:
+
+'``amdgpu.non.volatile``' Metadata
+----------------------------------
+
+Explicitly marks memory accesses (load, store, rmw) to locations that are never written to by other threads during the execution of the shader.
+
+This metadata is a performance optimization and can be dropped if necessary.
+Using this metadata on a memory access to a location that is written to by other threads is undefined behavior.
+
+Sets ``NV=1`` on the instruction if the target supports it.
+
+.. code-block:: llvm
+
+  %val = load i32, ptr %in, align 4, !amdgpu.non.volatile !{}
+
+
+.. note::
+
+  This metadata is used to request ``NV=1`` on an operation, but the compiler may also set ``NV=1``
+  on memory accesses that do not have the metadata when it is safe to do so. For example, it may
+  set it on accesses to constant memory, when loading from or storing to scratch memory used for
+  spills, etc.
 
 LLVM IR Attributes
 ==================
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index cdf6fb97d0b3b..d706fb0484856 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -19400,6 +19400,8 @@ SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
     Flags |= MONoClobber;
   if (I.getMetadata("amdgpu.last.use"))
     Flags |= MOLastUse;
+  if (I.getMetadata("amdgpu.non.volatile"))
+    Flags |= MONonVolatile;
   return Flags;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-non-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-non-volatile.ll
index ab12e3c19992d..5376bb2abe72f 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-non-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-non-volatile.ll
@@ -58,6 +58,35 @@ entry:
   ret i32 %val
 }
 
+define void @md_nv__flat_i32_nonatomic(ptr addrspace(0) %in, ptr addrspace(0) %out) {
+; GFX12-CU-LABEL: md_nv__flat_i32_nonatomic:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_expcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    flat_store_b32 v[2:3], v0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: md_nv__flat_i32_nonatomic:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    flat_load_b32 v0, v[0:1] nv
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v[2:3], v0 nv
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %val = load i32, ptr addrspace(0) %in, !amdgpu.non.volatile !0
+  store i32 %val, ptr addrspace(0) %out, !amdgpu.non.volatile !0
+  ret void
+}
+
 define void @global_i32_nonatomic(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; GFX12-CU-LABEL: global_i32_nonatomic:
 ; GFX12-CU:       ; %bb.0: ; %entry
@@ -109,6 +138,63 @@ entry:
   ret i32 %val
 }
 
+define void @md_nv__global_i32_nonatomic(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+; GFX12-CU-LABEL: md_nv__global_i32_nonatomic:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_expcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    global_store_b32 v[2:3], v0, off
+; GFX12-CU-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: md_nv__global_i32_nonatomic:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_b32 v0, v[0:1], off nv
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_store_b32 v[2:3], v0, off nv
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %val = load i32, ptr addrspace(1) %in, !amdgpu.non.volatile !0
+  store i32 %val, ptr addrspace(1) %out, !amdgpu.non.volatile !0
+  ret void
+}
+
+; DS does not have nv.
+define void @lds_i32_nonatomic(ptr addrspace(3) %in, ptr addrspace(3) %out) {
+; GFX12-CU-LABEL: lds_i32_nonatomic:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_expcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    ds_load_b32 v0, v0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    ds_store_b32 v1, v0
+; GFX12-CU-NEXT:    s_wait_dscnt 0x0
+; GFX12-CU-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: lds_i32_nonatomic:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    ds_load_b32 v0, v0
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    ds_store_b32 v1, v0
+; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %val = load i32, ptr addrspace(3) %in, !amdgpu.non.volatile !0
+  store i32 %val, ptr addrspace(3) %out, !amdgpu.non.volatile !0
+  ret void
+}
+
 define i32 @scalar_i32_nonatomic(ptr addrspace(4) inreg %in) {
 ; GFX12-CU-LABEL: scalar_i32_nonatomic:
 ; GFX12-CU:       ; %bb.0: ; %entry
@@ -161,6 +247,32 @@ entry:
   ret i32 %val
 }
 
+define i32 @md_nv__scalar_i32_nonatomic(ptr addrspace(4) inreg %in) {
+; GFX12-CU-LABEL: md_nv__scalar_i32_nonatomic:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_expcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: md_nv__scalar_i32_nonatomic:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_load_b32 s0, s[0:1], 0x0 nv
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %val = load i32, ptr addrspace(4) %in, !amdgpu.non.volatile !0
+  ret i32 %val
+}
+
 define void @scratch_i32_nonatomic(ptr addrspace(5) %in, ptr addrspace(5) %out) {
 ; GFX12-CU-LABEL: scratch_i32_nonatomic:
 ; GFX12-CU:       ; %bb.0: ; %entry
@@ -212,6 +324,33 @@ entry:
   ret i32 %val
 }
 
+define void @md_nv__scratch_i32_nonatomic(ptr addrspace(5) %in, ptr addrspace(5) %out) {
+; GFX12-CU-LABEL: md_nv__scratch_i32_nonatomic:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_expcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    scratch_load_b32 v0, v0, off
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    scratch_store_b32 v1, v0, off
+; GFX12-CU-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: md_nv__scratch_i32_nonatomic:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    scratch_load_b32 v0, v0, off nv
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    scratch_store_b32 v1, v0, off nv
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %val = load i32, ptr addrspace(5) %in, !amdgpu.non.volatile !0
+  store i32 %val, ptr addrspace(5) %out, !amdgpu.non.volatile !0
+  ret void
+}
+
 define i32 @scalar32_i32_nonatomic(ptr addrspace(6) inreg %in) {
 ; GFX12-CU-LABEL: scalar32_i32_nonatomic:
 ; GFX12-CU:       ; %bb.0: ; %entry
@@ -268,6 +407,34 @@ entry:
   ret i32 %val
 }
 
+define i32 @md_nv__scalar32_i32_nonatomic(ptr addrspace(6) inreg %in) {
+; GFX12-CU-LABEL: md_nv__scalar32_i32_nonatomic:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_expcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    s_mov_b32 s1, 0
+; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_mov_b32_e32 v0, s0
+; GFX12-CU-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: md_nv__scalar32_i32_nonatomic:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_mov_b32 s1, 0
+; GFX1250-NEXT:    s_load_b32 s0, s[0:1], 0x0 nv
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %val = load i32, ptr addrspace(6) %in, !amdgpu.non.volatile !0
+  ret i32 %val
+}
+
 define void @buffer_i32_nonatomic(ptr addrspace(7) inreg %in, ptr addrspace(7) inreg %out) {
 ; GFX12-CU-DAGISEL-LABEL: buffer_i32_nonatomic:
 ; GFX12-CU-DAGISEL:       ; %bb.0: ; %entry
@@ -362,4 +529,55 @@ entry:
   ret i32 %val
 }
 
+define void @md_nv__buffer_i32_nonatomic(ptr addrspace(7) inreg %in, ptr addrspace(7) inreg %out) {
+; GFX12-CU-LABEL: md_nv__buffer_i32_nonatomic:
+; GFX12-CU:       ; %bb.0: ; %entry
+; GFX12-CU-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-CU-NEXT:    s_wait_expcnt 0x0
+; GFX12-CU-NEXT:    s_wait_samplecnt 0x0
+; GFX12-CU-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-CU-NEXT:    s_wait_kmcnt 0x0
+; GFX12-CU-NEXT:    v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s21
+; GFX12-CU-NEXT:    s_mov_b32 s7, s20
+; GFX12-CU-NEXT:    s_mov_b32 s6, s19
+; GFX12-CU-NEXT:    s_mov_b32 s5, s18
+; GFX12-CU-NEXT:    buffer_load_b32 v0, v0, s[0:3], null offen
+; GFX12-CU-NEXT:    s_mov_b32 s4, s17
+; GFX12-CU-NEXT:    s_wait_loadcnt 0x0
+; GFX12-CU-NEXT:    buffer_store_b32 v0, v1, s[4:7], null offen
+; GFX12-CU-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-DAGISEL-LABEL: md_nv__buffer_i32_nonatomic:
+; GFX1250-DAGISEL:       ; %bb.0: ; %entry
+; GFX1250-DAGISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-DAGISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-DAGISEL-NEXT:    v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s21
+; GFX1250-DAGISEL-NEXT:    s_mov_b32 s7, s20
+; GFX1250-DAGISEL-NEXT:    s_mov_b32 s6, s19
+; GFX1250-DAGISEL-NEXT:    s_mov_b32 s5, s18
+; GFX1250-DAGISEL-NEXT:    buffer_load_b32 v0, v0, s[0:3], null offen nv
+; GFX1250-DAGISEL-NEXT:    s_mov_b32 s4, s17
+; GFX1250-DAGISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-DAGISEL-NEXT:    buffer_store_b32 v0, v1, s[4:7], null offen nv
+; GFX1250-DAGISEL-NEXT:    s_set_pc_i64 s[30:31]
+;
+; GFX1250-GISEL-LABEL: md_nv__buffer_i32_nonatomic:
+; GFX1250-GISEL:       ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s21
+; GFX1250-GISEL-NEXT:    s_mov_b32 s4, s17
+; GFX1250-GISEL-NEXT:    s_mov_b32 s5, s18
+; GFX1250-GISEL-NEXT:    s_mov_b32 s6, s19
+; GFX1250-GISEL-NEXT:    buffer_load_b32 v0, v0, s[0:3], null offen nv
+; GFX1250-GISEL-NEXT:    s_mov_b32 s7, s20
+; GFX1250-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT:    buffer_store_b32 v0, v1, s[4:7], null offen nv
+; GFX1250-GISEL-NEXT:    s_set_pc_i64 s[30:31]
+entry:
+  %val = load i32, ptr addrspace(7) %in, !amdgpu.non.volatile !0
+  store i32 %val, ptr addrspace(7) %out, !amdgpu.non.volatile !0
+  ret void
+}
+
 !0 = !{}

>From 33cb8640ef8b784840160e02b48cba51704b731e Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Tue, 3 Feb 2026 14:35:43 +0100
Subject: [PATCH 3/3] Rename to MOThreadPrivate

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d706fb0484856..d46f911e824cd 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -19401,7 +19401,7 @@ SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
   if (I.getMetadata("amdgpu.last.use"))
     Flags |= MOLastUse;
   if (I.getMetadata("amdgpu.non.volatile"))
-    Flags |= MONonVolatile;
+    Flags |= MOThreadPrivate;
   return Flags;
 }