[llvm-branch-commits] [llvm] [AMDGPU] Multi dword spilling for unaligned tuples (PR #183701)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Feb 27 00:51:06 PST 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Aaditya (easyonaadit)
<details>
<summary>Changes</summary>
While spilling unaligned tuples, rather than breaking the
spill into 32-bit accesses, spill the first register as a single
32-bit spill, and spill the remainder of the tuple as an aligned tuple.
Some additional bookkeeping is required in the spilling
loop to manage the state.
References: https://github.com/llvm/llvm-project/pull/177317
---
Full diff: https://github.com/llvm/llvm-project/pull/183701.diff
2 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp (+44-10)
- (modified) llvm/test/CodeGen/AMDGPU/vgpr-spill.mir (+8-26)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 29d6937b1f5ff..752abe9ad9a4e 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1537,9 +1537,13 @@ void SIRegisterInfo::buildSpillLoadStore(
const unsigned RegWidth = AMDGPU::getRegBitWidth(*RC) / 8;
// On targets with register tuple alignment requirements,
- // for unaligned tuples, break the spill into 32-bit pieces.
- // TODO: Optimize misaligned spills by using larger aligned chunks instead of
- // 32-bit splits.
+ // for unaligned tuples, spill the first sub-reg as a 32-bit spill,
+ // and spill the rest as a regular aligned tuple.
+ // eg: SPILL_V224 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ // will be split as:
+ // SPILL_SCRATCH_DWORD $vgpr1
+ // SPILL_SCRATCH_DWORDx4 $vgpr2_vgpr3_vgpr4_vgpr5
+ // SPILL_SCRATCH_DWORDx2 $vgpr6_vgpr7
bool IsRegMisaligned = false;
if (!IsBlock && RegWidth > 4) {
unsigned SpillOpcode =
@@ -1560,14 +1564,21 @@ void SIRegisterInfo::buildSpillLoadStore(
// Always use 4 byte operations for AGPRs because we need to scavenge
// a temporary VGPR.
// If we're using a block operation, the element should be the whole block.
- // For misaligned registers, use 4-byte elements to avoid alignment errors.
- unsigned EltSize = IsBlock ? RegWidth
- : (IsFlat && !IsAGPR && !IsRegMisaligned)
- ? std::min(RegWidth, 16u)
- : 4u;
+ unsigned EltSize = IsBlock ? RegWidth
+ : (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u)
+ : 4u;
unsigned NumSubRegs = RegWidth / EltSize;
unsigned Size = NumSubRegs * EltSize;
unsigned RemSize = RegWidth - Size;
+ // For unaligned tuples, the first sub-reg is split as a single 32-bit spill,
+ // and will count as an additional reg, so the last chunk will have one less
+ // register. In some cases, the last chunk could be completely eliminated,
+ // eg: SPILL_V160 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 will be split as:
+ // SPILL_SCRATCH_DWORD $vgpr1
+ // SPILL_SCRATCH_DWORDx4 $vgpr2_vgpr3_vgpr4_vgpr5
+ unsigned LastChunk = ((RemSize / 4) + 3) % 4;
+ if (IsRegMisaligned && LastChunk)
+ NumSubRegs += 1;
unsigned NumRemSubRegs = RemSize ? 1 : 0;
int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
int64_t MaterializedOffset = Offset;
@@ -1731,10 +1742,31 @@ void SIRegisterInfo::buildSpillLoadStore(
for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e;
++i, RegOffset += EltSize) {
- if (i == NumSubRegs) {
- EltSize = RemSize;
+ unsigned SavedEltSize = EltSize;
+ if (i == 0 && IsRegMisaligned) {
+ // For misaligned register tuples, spill only the first sub-reg in the
+ // first iteration.
+ EltSize = 4u;
LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
}
+ if (i == 1 && IsRegMisaligned) {
+ // The first sub-reg was split in the previous iteration.
+ RegOffset = 4u;
+ if (RegWidth <= 16)
+ EltSize = RegWidth - 4u;
+ LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
+ }
+ if (IsRegMisaligned) {
+ if (i == (e - 1)) {
+ EltSize = LastChunk * 4;
+ LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
+ }
+ } else {
+ if (i == NumSubRegs) {
+ EltSize = RemSize;
+ LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
+ }
+ }
Desc = &TII->get(LoadStoreOp);
if (!IsFlat && UseVGPROffset) {
@@ -1968,6 +2000,8 @@ void SIRegisterInfo::buildSpillLoadStore(
// scavenged.
if (!IsStore && TII->isBlockLoadStore(LoadStoreOp))
addImplicitUsesForBlockCSRLoad(MIB, ValueReg);
+ if (i == 0 && IsRegMisaligned)
+ EltSize = SavedEltSize;
}
if (ScratchOffsetRegDelta != 0) {
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/vgpr-spill.mir
index 82111165c0127..038c0ca05fd50 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill.mir
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill.mir
@@ -290,9 +290,7 @@ body: |
; GFX942: liveins: $vgpr1_vgpr2_vgpr3_vgpr4
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
- ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 4, addrspace 5)
- ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 8, addrspace 5)
- ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr4, $sgpr32, 12, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s32) into %stack.0 + 12, addrspace 5)
+ ; GFX942-NEXT: SCRATCH_STORE_DWORDX3_SADDR killed $vgpr2_vgpr3_vgpr4, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s96) into %stack.0 + 4, align 4, addrspace 5)
;
; GFX1200-LABEL: name: spill_v128_kill_unaligned
; GFX1200: liveins: $vgpr1_vgpr2_vgpr3_vgpr4
@@ -303,9 +301,7 @@ body: |
; GFX1250: liveins: $vgpr1_vgpr2_vgpr3_vgpr4
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
- ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 4, addrspace 5)
- ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 8, addrspace 5)
- ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr4, $sgpr32, 12, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s32) into %stack.0 + 12, addrspace 5)
+ ; GFX1250-NEXT: SCRATCH_STORE_DWORDX3_SADDR killed $vgpr2_vgpr3_vgpr4, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s96) into %stack.0 + 4, align 4, addrspace 5)
SI_SPILL_V128_SAVE killed $vgpr1_vgpr2_vgpr3_vgpr4, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, addrspace 5)
...
@@ -334,9 +330,7 @@ body: |
; GFX942: liveins: $vgpr1_vgpr2_vgpr3_vgpr4
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
- ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 4, addrspace 5)
- ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr3, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 8, addrspace 5)
- ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr4, $sgpr32, 12, 0, implicit $exec, implicit $flat_scr, implicit $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s32) into %stack.0 + 12, addrspace 5)
+ ; GFX942-NEXT: SCRATCH_STORE_DWORDX3_SADDR $vgpr2_vgpr3_vgpr4, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s96) into %stack.0 + 4, align 4, addrspace 5)
;
; GFX1200-LABEL: name: spill_v128_unaligned
; GFX1200: liveins: $vgpr1_vgpr2_vgpr3_vgpr4
@@ -347,9 +341,7 @@ body: |
; GFX1250: liveins: $vgpr1_vgpr2_vgpr3_vgpr4
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
- ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 4, addrspace 5)
- ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr3, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 8, addrspace 5)
- ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr4, $sgpr32, 12, 0, implicit $exec, implicit $flat_scr, implicit $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s32) into %stack.0 + 12, addrspace 5)
+ ; GFX1250-NEXT: SCRATCH_STORE_DWORDX3_SADDR $vgpr2_vgpr3_vgpr4, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s96) into %stack.0 + 4, align 4, addrspace 5)
SI_SPILL_V128_SAVE $vgpr1_vgpr2_vgpr3_vgpr4, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, addrspace 5)
...
@@ -421,13 +413,8 @@ body: |
; GFX942: liveins: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8
; GFX942-NEXT: {{ $}}
; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8, implicit $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
- ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 4, addrspace 5)
- ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr3, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 8, addrspace 5)
- ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr4, $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 12, addrspace 5)
- ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr5, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 16, addrspace 5)
- ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr6, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 20, addrspace 5)
- ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr7, $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 24, addrspace 5)
- ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr8, $sgpr32, 28, 0, implicit $exec, implicit $flat_scr, implicit $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 :: ("amdgpu-thread-private" store (s32) into %stack.0 + 28, addrspace 5)
+ ; GFX942-NEXT: SCRATCH_STORE_DWORDX4_SADDR $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s128) into %stack.0 + 4, align 4, addrspace 5)
+ ; GFX942-NEXT: SCRATCH_STORE_DWORDX3_SADDR $vgpr6_vgpr7_vgpr8, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr, implicit $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 :: ("amdgpu-thread-private" store (s96) into %stack.0 + 20, align 4, addrspace 5)
;
; GFX1200-LABEL: name: spill_v256_unaligned
; GFX1200: liveins: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8
@@ -439,12 +426,7 @@ body: |
; GFX1250: liveins: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8
; GFX1250-NEXT: {{ $}}
; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8, implicit $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
- ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 4, addrspace 5)
- ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr3, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 8, addrspace 5)
- ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr4, $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 12, addrspace 5)
- ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr5, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 16, addrspace 5)
- ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr6, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 20, addrspace 5)
- ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr7, $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 24, addrspace 5)
- ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr8, $sgpr32, 28, 0, implicit $exec, implicit $flat_scr, implicit $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 :: ("amdgpu-thread-private" store (s32) into %stack.0 + 28, addrspace 5)
+ ; GFX1250-NEXT: SCRATCH_STORE_DWORDX4_SADDR $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s128) into %stack.0 + 4, align 4, addrspace 5)
+ ; GFX1250-NEXT: SCRATCH_STORE_DWORDX3_SADDR $vgpr6_vgpr7_vgpr8, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr, implicit $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 :: ("amdgpu-thread-private" store (s96) into %stack.0 + 20, align 4, addrspace 5)
SI_SPILL_V256_SAVE $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, addrspace 5)
...
``````````
</details>
https://github.com/llvm/llvm-project/pull/183701
More information about the llvm-branch-commits
mailing list