[llvm] [AMDGPU] Fix LDS address correction in promoteConstantOffsetToImm for async stores (PR #180220)
Alexander Weinrauch via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 10 04:23:57 PST 2026
https://github.com/AlexAUT updated https://github.com/llvm/llvm-project/pull/180220
>From cd1cb0cf8c645c534a75e79ea1a2efee311a9a7d Mon Sep 17 00:00:00 2001
From: Alexander Weinrauch <alexander.weinrauch at amd.com>
Date: Fri, 6 Feb 2026 15:23:14 +0000
Subject: [PATCH 1/4] Support async_store_from_lds
---
.../Target/AMDGPU/SILoadStoreOptimizer.cpp | 24 ++++++----
.../promote-constOffset-to-imm-gfx12.ll | 46 +++++++++++++++++++
.../promote-constOffset-to-imm-gfx12.mir | 33 +++++++++++++
3 files changed, 95 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 594c35a1dee3b..2d7870878aa11 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -2369,24 +2369,32 @@ void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base
Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}
-// Maintain the correct LDS address for async loads.
-// It becomes incorrect when promoteConstantOffsetToImm
-// adds an offset only meant for the src operand.
+// Maintain the correct LDS address for async loads and stores.
+// It becomes incorrect when promoteConstantOffsetToImm adds an offset only
+// meant for the global address operand. For async loads the LDS address is in
+// vdst. For async stores, the LDS address is in vdata.
void SILoadStoreOptimizer::updateAsyncLDSAddress(MachineInstr &MI,
int32_t OffsetDiff) const {
if (!TII->usesASYNC_CNT(MI) || OffsetDiff == 0)
return;
- Register OldVDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
- Register NewVDst = MRI->createVirtualRegister(MRI->getRegClass(OldVDst));
+ MachineOperand *LDSAddr =
+ TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+ if (!LDSAddr)
+ LDSAddr = TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
+ if (!LDSAddr)
+ return;
+
+ Register OldReg = LDSAddr->getReg();
+ Register NewReg = MRI->createVirtualRegister(MRI->getRegClass(OldReg));
MachineBasicBlock &MBB = *MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), NewVDst)
- .addReg(OldVDst)
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), NewReg)
+ .addReg(OldReg)
.addImm(-OffsetDiff)
.addImm(0);
- MI.getOperand(0).setReg(NewVDst);
+ LDSAddr->setReg(NewReg);
}
bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.ll
index f557318cffb1f..318b5b5c6190b 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.ll
@@ -95,3 +95,49 @@ entry:
ret void
}
+
+; Test async_store_from_lds
+define amdgpu_kernel void @promote_async_store_offset_negative(ptr addrspace(1) %dst) {
+; GFX1250-LABEL: promote_async_store_offset_negative:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, 0x100, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_store_async_from_lds_b128 v0, v1, s[0:1]
+; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[0:1], v[0:1]
+; GFX1250-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff00
+; GFX1250-NEXT: v_add_nc_u32_e64 v0, 0xfffffe00, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[0:1], v[2:3]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_async_from_lds_b128 v[2:3], v0, off offset:512
+; GFX1250-NEXT: global_store_async_from_lds_b128 v[2:3], v1, off
+; GFX1250-NEXT: s_endpgm
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.offset = shl i32 %tid, 0
+ %lds.gep = getelementptr i8, ptr addrspace(3) @lds, i32 0
+
+ ; First store at base + 256
+ %offset0 = add i32 256, %gep.offset
+ %zext0 = zext i32 %offset0 to i64
+ %gep0 = getelementptr i8, ptr addrspace(1) %dst, i64 %zext0
+ call void @llvm.amdgcn.global.store.async.from.lds.b128(ptr addrspace(1) %gep0, ptr addrspace(3) %lds.gep, i32 0, i32 0)
+
+ ; Second store at base + 512 (+512 from 0)
+ %offset1 = add i32 512, %gep.offset
+ %zext1 = zext i32 %offset1 to i64
+ %gep1 = getelementptr i8, ptr addrspace(1) %dst, i64 %zext1
+ call void @llvm.amdgcn.global.store.async.from.lds.b128(ptr addrspace(1) %gep1, ptr addrspace(3) %lds.gep, i32 0, i32 0)
+
+ ; Final store at base + 0
+ %offset2 = add i32 0, %gep.offset
+ %zext2 = zext i32 %offset2 to i64
+ %gep2 = getelementptr i8, ptr addrspace(1) %dst, i64 %zext2
+ call void @llvm.amdgcn.global.store.async.from.lds.b128(ptr addrspace(1) %gep2, ptr addrspace(3) %lds.gep, i32 0, i32 0)
+
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.mir b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.mir
index 8d8c791fbd9ae..2baf0f1362819 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.mir
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.mir
@@ -109,3 +109,36 @@ body: |
GLOBAL_LOAD_ASYNC_TO_LDS_B128 %6, killed %15, 0, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt :: (load store (s128), align 1, addrspace 3)
S_ENDPGM 0
...
+
+---
+name: promote_async_store_offset
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+ frameOffsetReg: '$sgpr33'
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ liveins: $vgpr0, $sgpr0_sgpr1, $ttmp7
+ ; GFX1250-LABEL: name: promote_async_store_offset
+ ; GFX1250: liveins: $ttmp7, $vgpr0, $sgpr0_sgpr1
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: renamable $vgpr1 = V_LSHLREV_B32_e32 8, $vgpr0, implicit $exec
+ ; GFX1250-NEXT: renamable $vgpr2, renamable $vcc_lo = V_ADD_CO_U32_e64 $vgpr0, 512, 0, implicit $exec
+ ; GFX1250-NEXT: renamable $vgpr3, dead $sgpr_null = V_ADDC_U32_e64 0, killed $vgpr0, killed $vcc_lo, 0, implicit $exec
+ ; GFX1250-NEXT: renamable $vgpr1 = disjoint V_OR_B32_e32 0, killed $vgpr1, implicit $exec
+ ; GFX1250-NEXT: renamable $vgpr0 = V_ADD_U32_e32 256, $vgpr1, implicit $exec
+ ; GFX1250-NEXT: GLOBAL_STORE_ASYNC_FROM_LDS_B128 $vgpr2_vgpr3, killed $vgpr0, -256, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt :: (load store (s128), align 1, addrspace 3)
+ ; GFX1250-NEXT: GLOBAL_STORE_ASYNC_FROM_LDS_B128 killed $vgpr2_vgpr3, killed $vgpr1, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt :: (load store (s128), align 1, addrspace 3)
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = V_LSHLREV_B32_e64 8, %0, implicit $exec
+ %2:vgpr_32 = disjoint V_OR_B32_e64 %1, 0, implicit $exec
+ %3:vgpr_32 = disjoint V_OR_B32_e64 %1, 0, implicit $exec
+ %4:vgpr_32, %5:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %0, 256, 0, implicit $exec
+ %6:vgpr_32, %7:sreg_32_xm0_xexec = V_ADDC_U32_e64 %0, 0, killed %5, 0, implicit $exec
+ %8:vreg_64_align2 = REG_SEQUENCE %4, %subreg.sub0, %6, %subreg.sub1
+ GLOBAL_STORE_ASYNC_FROM_LDS_B128 killed %8, killed %2, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt :: (load store (s128), align 1, addrspace 3)
+ %9:vgpr_32, %10:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %0, 512, 0, implicit $exec
+ %11:vgpr_32, %12:sreg_32_xm0_xexec = V_ADDC_U32_e64 %0, 0, killed %10, 0, implicit $exec
+ %13:vreg_64_align2 = REG_SEQUENCE %9, %subreg.sub0, %11, %subreg.sub1
+ GLOBAL_STORE_ASYNC_FROM_LDS_B128 killed %13, killed %3, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt :: (load store (s128), align 1, addrspace 3)
+...
>From e91f6adce7d00ced54f92815a307b11876c0c006 Mon Sep 17 00:00:00 2001
From: Alexander Weinrauch <alexander.weinrauch at amd.com>
Date: Fri, 6 Feb 2026 16:16:26 +0000
Subject: [PATCH 2/4] Fix format
---
llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 2d7870878aa11..50a5bab1f13fe 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -2378,8 +2378,7 @@ void SILoadStoreOptimizer::updateAsyncLDSAddress(MachineInstr &MI,
if (!TII->usesASYNC_CNT(MI) || OffsetDiff == 0)
return;
- MachineOperand *LDSAddr =
- TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+ MachineOperand *LDSAddr = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
if (!LDSAddr)
LDSAddr = TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
if (!LDSAddr)
>From 6882d9bf41063561435e1e53fed5fa86ab7f160a Mon Sep 17 00:00:00 2001
From: Alexander Weinrauch <alexander.weinrauch at amd.com>
Date: Fri, 6 Feb 2026 17:23:56 +0000
Subject: [PATCH 3/4] Add comments to tests
---
llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.ll | 4 +++-
llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.mir | 3 +++
2 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.ll
index 318b5b5c6190b..b5b0164969212 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.ll
@@ -96,7 +96,9 @@ entry:
ret void
}
-; Test async_store_from_lds
+; Same as promote_async_load_offset_negative above, but for async stores. The
+; LDS address is in vdata instead of vdst, so this tests that
+; updateAsyncLDSAddress corrects the right operand.
define amdgpu_kernel void @promote_async_store_offset_negative(ptr addrspace(1) %dst) {
; GFX1250-LABEL: promote_async_store_offset_negative:
; GFX1250: ; %bb.0: ; %entry
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.mir b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.mir
index 2baf0f1362819..dab161ba080e5 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.mir
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm-gfx12.mir
@@ -110,6 +110,9 @@ body: |
S_ENDPGM 0
...
+# Same as promote_async_load_offset above, but for async stores. The LDS address
+# is in vdata instead of vdst, so this tests that updateAsyncLDSAddress corrects
+# the right operand.
---
name: promote_async_store_offset
machineFunctionInfo:
>From 996405287d670a33debe68071e263046abc23ece Mon Sep 17 00:00:00 2001
From: Alexander Weinrauch <alexander.weinrauch at amd.com>
Date: Tue, 10 Feb 2026 12:23:17 +0000
Subject: [PATCH 4/4] Assert LDS addr
---
llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 50a5bab1f13fe..0141c365789ca 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -2381,8 +2381,7 @@ void SILoadStoreOptimizer::updateAsyncLDSAddress(MachineInstr &MI,
MachineOperand *LDSAddr = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
if (!LDSAddr)
LDSAddr = TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
- if (!LDSAddr)
- return;
+ assert(LDSAddr);
Register OldReg = LDSAddr->getReg();
Register NewReg = MRI->createVirtualRegister(MRI->getRegClass(OldReg));
More information about the llvm-commits mailing list