[llvm] [AMDGPU] Multi dword spilling for unaligned tuples (PR #183701)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Mar 8 21:09:05 PDT 2026
https://github.com/easyonaadit updated https://github.com/llvm/llvm-project/pull/183701
>From c6b378a9a4a2d47faacdcf765c574b5331607e73 Mon Sep 17 00:00:00 2001
From: Aaditya <Aaditya.AlokDeshpande at amd.com>
Date: Thu, 22 Jan 2026 10:43:49 +0530
Subject: [PATCH] [AMDGPU] Multi dword spilling for unaligned tuples
While spilling unaligned tuples, rather than breaking the
spill into 32-bit accesses, spill the first register as a
single 32-bit spill, and spill the remainder of the tuple
as an aligned tuple.
Some additional bookkeeping is required in the spilling
loop to manage the state.
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 2 +-
llvm/lib/Target/AMDGPU/SIInstructions.td | 78 +++---
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 70 ++++-
llvm/test/CodeGen/AMDGPU/vgpr-spill.mir | 319 +++++++++++++++++++---
4 files changed, 386 insertions(+), 83 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 996c92a8b5078..4f53a5383d776 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -5302,7 +5302,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
// FIXME: We do not verify inline asm operands, but custom inline asm
// verification is broken anyway
if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO &&
- Opcode != AMDGPU::V_MOV_B64_PSEUDO) {
+ Opcode != AMDGPU::V_MOV_B64_PSEUDO && !isSpill(MI)) {
const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
if (const TargetRegisterClass *SubRC =
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index cde352313f86a..f351a91520e07 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1231,19 +1231,19 @@ multiclass SI_SPILL_VGPR <SIRegisterClassLike vgpr_class,
defm SI_SPILL_V16 : SI_SPILL_VGPR <VGPR_16>;
defm SI_SPILL_V32 : SI_SPILL_VGPR <VGPR_32>;
-defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64_AlignTarget>;
-defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96_AlignTarget>;
-defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128_AlignTarget>;
-defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160_AlignTarget>;
-defm SI_SPILL_V192 : SI_SPILL_VGPR <VReg_192_AlignTarget>;
-defm SI_SPILL_V224 : SI_SPILL_VGPR <VReg_224_AlignTarget>;
-defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256_AlignTarget>;
-defm SI_SPILL_V288 : SI_SPILL_VGPR <VReg_288_AlignTarget>;
-defm SI_SPILL_V320 : SI_SPILL_VGPR <VReg_320_AlignTarget>;
-defm SI_SPILL_V352 : SI_SPILL_VGPR <VReg_352_AlignTarget>;
-defm SI_SPILL_V384 : SI_SPILL_VGPR <VReg_384_AlignTarget>;
-defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512_AlignTarget>;
-defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024_AlignTarget>;
+defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>;
+defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>;
+defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
+defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160>;
+defm SI_SPILL_V192 : SI_SPILL_VGPR <VReg_192>;
+defm SI_SPILL_V224 : SI_SPILL_VGPR <VReg_224>;
+defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
+defm SI_SPILL_V288 : SI_SPILL_VGPR <VReg_288>;
+defm SI_SPILL_V320 : SI_SPILL_VGPR <VReg_320>;
+defm SI_SPILL_V352 : SI_SPILL_VGPR <VReg_352>;
+defm SI_SPILL_V384 : SI_SPILL_VGPR <VReg_384>;
+defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
+defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>;
let Defs = [M0] in {
// Spills a block of 32 VGPRs. M0 will contain a mask describing which
@@ -1252,34 +1252,34 @@ let Defs = [M0] in {
}
defm SI_SPILL_A32 : SI_SPILL_VGPR <AGPR_32, 1>;
-defm SI_SPILL_A64 : SI_SPILL_VGPR <AReg_64_AlignTarget, 1>;
-defm SI_SPILL_A96 : SI_SPILL_VGPR <AReg_96_AlignTarget, 1>;
-defm SI_SPILL_A128 : SI_SPILL_VGPR <AReg_128_AlignTarget, 1>;
-defm SI_SPILL_A160 : SI_SPILL_VGPR <AReg_160_AlignTarget, 1>;
-defm SI_SPILL_A192 : SI_SPILL_VGPR <AReg_192_AlignTarget, 1>;
-defm SI_SPILL_A224 : SI_SPILL_VGPR <AReg_224_AlignTarget, 1>;
-defm SI_SPILL_A256 : SI_SPILL_VGPR <AReg_256_AlignTarget, 1>;
-defm SI_SPILL_A288 : SI_SPILL_VGPR <AReg_288_AlignTarget, 1>;
-defm SI_SPILL_A320 : SI_SPILL_VGPR <AReg_320_AlignTarget, 1>;
-defm SI_SPILL_A352 : SI_SPILL_VGPR <AReg_352_AlignTarget, 1>;
-defm SI_SPILL_A384 : SI_SPILL_VGPR <AReg_384_AlignTarget, 1>;
-defm SI_SPILL_A512 : SI_SPILL_VGPR <AReg_512_AlignTarget, 1>;
-defm SI_SPILL_A1024 : SI_SPILL_VGPR <AReg_1024_AlignTarget, 1>;
+defm SI_SPILL_A64 : SI_SPILL_VGPR <AReg_64, 1>;
+defm SI_SPILL_A96 : SI_SPILL_VGPR <AReg_96, 1>;
+defm SI_SPILL_A128 : SI_SPILL_VGPR <AReg_128, 1>;
+defm SI_SPILL_A160 : SI_SPILL_VGPR <AReg_160, 1>;
+defm SI_SPILL_A192 : SI_SPILL_VGPR <AReg_192, 1>;
+defm SI_SPILL_A224 : SI_SPILL_VGPR <AReg_224, 1>;
+defm SI_SPILL_A256 : SI_SPILL_VGPR <AReg_256, 1>;
+defm SI_SPILL_A288 : SI_SPILL_VGPR <AReg_288, 1>;
+defm SI_SPILL_A320 : SI_SPILL_VGPR <AReg_320, 1>;
+defm SI_SPILL_A352 : SI_SPILL_VGPR <AReg_352, 1>;
+defm SI_SPILL_A384 : SI_SPILL_VGPR <AReg_384, 1>;
+defm SI_SPILL_A512 : SI_SPILL_VGPR <AReg_512, 1>;
+defm SI_SPILL_A1024 : SI_SPILL_VGPR <AReg_1024, 1>;
defm SI_SPILL_AV32 : SI_SPILL_VGPR <AV_32, 1>;
-defm SI_SPILL_AV64 : SI_SPILL_VGPR <AV_64_AlignTarget, 1>;
-defm SI_SPILL_AV96 : SI_SPILL_VGPR <AV_96_AlignTarget, 1>;
-defm SI_SPILL_AV128 : SI_SPILL_VGPR <AV_128_AlignTarget, 1>;
-defm SI_SPILL_AV160 : SI_SPILL_VGPR <AV_160_AlignTarget, 1>;
-defm SI_SPILL_AV192 : SI_SPILL_VGPR <AV_192_AlignTarget, 1>;
-defm SI_SPILL_AV224 : SI_SPILL_VGPR <AV_224_AlignTarget, 1>;
-defm SI_SPILL_AV256 : SI_SPILL_VGPR <AV_256_AlignTarget, 1>;
-defm SI_SPILL_AV288 : SI_SPILL_VGPR <AV_288_AlignTarget, 1>;
-defm SI_SPILL_AV320 : SI_SPILL_VGPR <AV_320_AlignTarget, 1>;
-defm SI_SPILL_AV352 : SI_SPILL_VGPR <AV_352_AlignTarget, 1>;
-defm SI_SPILL_AV384 : SI_SPILL_VGPR <AV_384_AlignTarget, 1>;
-defm SI_SPILL_AV512 : SI_SPILL_VGPR <AV_512_AlignTarget, 1>;
-defm SI_SPILL_AV1024 : SI_SPILL_VGPR <AV_1024_AlignTarget, 1>;
+defm SI_SPILL_AV64 : SI_SPILL_VGPR <AV_64, 1>;
+defm SI_SPILL_AV96 : SI_SPILL_VGPR <AV_96, 1>;
+defm SI_SPILL_AV128 : SI_SPILL_VGPR <AV_128, 1>;
+defm SI_SPILL_AV160 : SI_SPILL_VGPR <AV_160, 1>;
+defm SI_SPILL_AV192 : SI_SPILL_VGPR <AV_192, 1>;
+defm SI_SPILL_AV224 : SI_SPILL_VGPR <AV_224, 1>;
+defm SI_SPILL_AV256 : SI_SPILL_VGPR <AV_256, 1>;
+defm SI_SPILL_AV288 : SI_SPILL_VGPR <AV_288, 1>;
+defm SI_SPILL_AV320 : SI_SPILL_VGPR <AV_320, 1>;
+defm SI_SPILL_AV352 : SI_SPILL_VGPR <AV_352, 1>;
+defm SI_SPILL_AV384 : SI_SPILL_VGPR <AV_384, 1>;
+defm SI_SPILL_AV512 : SI_SPILL_VGPR <AV_512, 1>;
+defm SI_SPILL_AV1024 : SI_SPILL_VGPR <AV_1024, 1>;
let isConvergent = 1 in {
defm SI_SPILL_WWM_V32 : SI_SPILL_VGPR <VGPR_32>;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index ee461575b509f..e9d12b116f511 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1536,15 +1536,52 @@ void SIRegisterInfo::buildSpillLoadStore(
const bool IsAGPR = !ST.hasGFX90AInsts() && isAGPRClass(RC);
const unsigned RegWidth = AMDGPU::getRegBitWidth(*RC) / 8;
+ // On targets with register tuple alignment requirements,
+ // for unaligned tuples, spill the first sub-reg as a 32-bit spill,
+ // and spill the rest as a regular aligned tuple.
+ // eg: SPILL_V224 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ // will be split as:
+ // SPILL_SCRATCH_DWORD $vgpr1
+ // SPILL_SCRATCH_DWORDx4 $vgpr2_vgpr3_vgpr4_vgpr5
+ // SPILL_SCRATCH_DWORDx2 $vgpr6_vgpr7
+ bool IsRegMisaligned = false;
+ if (!IsBlock && RegWidth > 4) {
+ unsigned SpillOpcode =
+ getFlatScratchSpillOpcode(TII, LoadStoreOp, std::min(RegWidth, 16u));
+ int VDataIdx =
+ IsStore ? AMDGPU::getNamedOperandIdx(SpillOpcode, AMDGPU::OpName::vdata)
+ : 0; // Restore Ops have data reg as the first (output) operand.
+ const TargetRegisterClass *ExpectedRC =
+ TII->getRegClass(TII->get(SpillOpcode), VDataIdx);
+ unsigned NumRegs = std::min(RegWidth / 4, 4u);
+ unsigned SubIdx = getSubRegFromChannel(0, NumRegs);
+ const TargetRegisterClass *MatchRC = findCommonRegClass(
+ RC, getRegSizeInBits(*ExpectedRC) == getRegSizeInBits(*RC) ? 0 : SubIdx,
+ ExpectedRC, 0);
+ if (MatchRC && !MatchRC->contains(ValueReg))
+ IsRegMisaligned = true;
+ }
// Always use 4 byte operations for AGPRs because we need to scavenge
// a temporary VGPR.
// If we're using a block operation, the element should be the whole block.
- unsigned EltSize = IsBlock ? RegWidth
- : (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u)
- : 4u;
+ // clang-format off
+ unsigned EltSize = IsBlock ? RegWidth
+ : (IsFlat && !IsAGPR)
+ ? std::min(RegWidth, 16u)
+ : 4u;
+ // clang-format on
unsigned NumSubRegs = RegWidth / EltSize;
unsigned Size = NumSubRegs * EltSize;
unsigned RemSize = RegWidth - Size;
+ // For unaligned tuples, the first sub-reg is split off as a single 32-bit
+ // spill, and will count as an additional reg, so the last chunk will have
+ // one less register. In some cases, the last chunk could be completely
+ // eliminated,
+ // eg: SPILL_V160 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 will be split as:
+ // SPILL_SCRATCH_DWORD $vgpr1
+ // SPILL_SCRATCH_DWORDx4 $vgpr2_vgpr3_vgpr4_vgpr5
+ unsigned LastChunk = ((RemSize / 4) + 3) % 4;
+ if (IsRegMisaligned && LastChunk)
+ NumSubRegs += 1;
unsigned NumRemSubRegs = RemSize ? 1 : 0;
int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
int64_t MaterializedOffset = Offset;
@@ -1708,10 +1745,31 @@ void SIRegisterInfo::buildSpillLoadStore(
for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e;
++i, RegOffset += EltSize) {
- if (i == NumSubRegs) {
- EltSize = RemSize;
+ unsigned SavedEltSize = EltSize;
+ if (i == 0 && IsRegMisaligned) {
+ // For misaligned register tuples, spill only the first sub-reg in the
+ // first iteration.
+ EltSize = 4u;
+ LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
+ }
+ if (i == 1 && IsRegMisaligned) {
+ // The first sub-reg was split in the previous iteration.
+ RegOffset = 4u;
+ if (RegWidth <= 16)
+ EltSize = RegWidth - 4u;
LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
}
+ if (IsRegMisaligned) {
+ if (i == (e - 1)) {
+ EltSize = LastChunk * 4;
+ LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
+ }
+ } else {
+ if (i == NumSubRegs) {
+ EltSize = RemSize;
+ LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
+ }
+ }
Desc = &TII->get(LoadStoreOp);
if (!IsFlat && UseVGPROffset) {
@@ -1945,6 +2003,8 @@ void SIRegisterInfo::buildSpillLoadStore(
// scavenged.
if (!IsStore && TII->isBlockLoadStore(LoadStoreOp))
addImplicitUsesForBlockCSRLoad(MIB, ValueReg);
+ if (i == 0 && IsRegMisaligned)
+ EltSize = SavedEltSize;
}
if (ScratchOffsetRegDelta != 0) {
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/vgpr-spill.mir
index 93fc2857d5092..038c0ca05fd50 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill.mir
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill.mir
@@ -1,5 +1,8 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=CHECK %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=CHECK,GFX900 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=CHECK,GFX942 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=CHECK,GFX12,GFX1200 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=CHECK,GFX12,GFX1250 %s
---
name: spill_v32
@@ -14,11 +17,23 @@ body: |
bb.0:
liveins: $vgpr0
- ; CHECK-LABEL: name: spill_v32
- ; CHECK: liveins: $vgpr0
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
- ; CHECK-NEXT: S_NOP 0, implicit $vgpr0
+ ; GFX900-LABEL: name: spill_v32
+ ; GFX900: liveins: $vgpr0
+ ; GFX900-NEXT: {{ $}}
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
+ ; GFX900-NEXT: S_NOP 0, implicit $vgpr0
+ ;
+ ; GFX942-LABEL: name: spill_v32
+ ; GFX942: liveins: $vgpr0
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
+ ; GFX942-NEXT: S_NOP 0, implicit $vgpr0
+ ;
+ ; GFX12-LABEL: name: spill_v32
+ ; GFX12: liveins: $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
+ ; GFX12-NEXT: S_NOP 0, implicit $vgpr0
SI_SPILL_V32_SAVE $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
S_NOP 0, implicit $vgpr0
...
@@ -36,10 +51,20 @@ body: |
bb.0:
liveins: $vgpr0
- ; CHECK-LABEL: name: spill_v32_kill
- ; CHECK: liveins: $vgpr0
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
+ ; GFX900-LABEL: name: spill_v32_kill
+ ; GFX900: liveins: $vgpr0
+ ; GFX900-NEXT: {{ $}}
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
+ ;
+ ; GFX942-LABEL: name: spill_v32_kill
+ ; GFX942: liveins: $vgpr0
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
+ ;
+ ; GFX12-LABEL: name: spill_v32_kill
+ ; GFX12: liveins: $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
...
@@ -56,12 +81,24 @@ body: |
bb.0:
liveins: $vgpr0_vgpr1
- ; CHECK-LABEL: name: spill_v64
- ; CHECK: liveins: $vgpr0_vgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
- ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec, implicit $vgpr0_vgpr1 :: ("amdgpu-thread-private" store (s32) into %stack.0 + 4, addrspace 5)
- ; CHECK-NEXT: S_NOP 0, implicit $vgpr0_vgpr1
+ ; GFX900-LABEL: name: spill_v64
+ ; GFX900: liveins: $vgpr0_vgpr1
+ ; GFX900-NEXT: {{ $}}
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec, implicit $vgpr0_vgpr1 :: ("amdgpu-thread-private" store (s32) into %stack.0 + 4, addrspace 5)
+ ; GFX900-NEXT: S_NOP 0, implicit $vgpr0_vgpr1
+ ;
+ ; GFX942-LABEL: name: spill_v64
+ ; GFX942: liveins: $vgpr0_vgpr1
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: SCRATCH_STORE_DWORDX2_SADDR $vgpr0_vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s64) into %stack.0, align 4, addrspace 5)
+ ; GFX942-NEXT: S_NOP 0, implicit $vgpr0_vgpr1
+ ;
+ ; GFX12-LABEL: name: spill_v64
+ ; GFX12: liveins: $vgpr0_vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: SCRATCH_STORE_DWORDX2_SADDR $vgpr0_vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s64) into %stack.0, align 4, addrspace 5)
+ ; GFX12-NEXT: S_NOP 0, implicit $vgpr0_vgpr1
SI_SPILL_V64_SAVE $vgpr0_vgpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5)
S_NOP 0, implicit $vgpr0_vgpr1
...
@@ -79,11 +116,21 @@ body: |
bb.0:
liveins: $vgpr0_vgpr1
- ; CHECK-LABEL: name: spill_v64_kill
- ; CHECK: liveins: $vgpr0_vgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
- ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1 :: ("amdgpu-thread-private" store (s32) into %stack.0 + 4, addrspace 5)
+ ; GFX900-LABEL: name: spill_v64_kill
+ ; GFX900: liveins: $vgpr0_vgpr1
+ ; GFX900-NEXT: {{ $}}
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1 :: ("amdgpu-thread-private" store (s32) into %stack.0 + 4, addrspace 5)
+ ;
+ ; GFX942-LABEL: name: spill_v64_kill
+ ; GFX942: liveins: $vgpr0_vgpr1
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr0_vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s64) into %stack.0, align 4, addrspace 5)
+ ;
+ ; GFX12-LABEL: name: spill_v64_kill
+ ; GFX12: liveins: $vgpr0_vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr0_vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s64) into %stack.0, align 4, addrspace 5)
SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5)
...
@@ -102,11 +149,21 @@ body: |
bb.0:
liveins: $vgpr0
- ; CHECK-LABEL: name: spill_v64_undef_sub1_killed
- ; CHECK: liveins: $vgpr0
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
- ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1 :: ("amdgpu-thread-private" store (s32) into %stack.0 + 4, addrspace 5)
+ ; GFX900-LABEL: name: spill_v64_undef_sub1_killed
+ ; GFX900: liveins: $vgpr0
+ ; GFX900-NEXT: {{ $}}
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1 :: ("amdgpu-thread-private" store (s32) into %stack.0 + 4, addrspace 5)
+ ;
+ ; GFX942-LABEL: name: spill_v64_undef_sub1_killed
+ ; GFX942: liveins: $vgpr0
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr0_vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s64) into %stack.0, align 4, addrspace 5)
+ ;
+ ; GFX12-LABEL: name: spill_v64_undef_sub1_killed
+ ; GFX12: liveins: $vgpr0
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr0_vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s64) into %stack.0, align 4, addrspace 5)
SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5)
...
@@ -123,11 +180,21 @@ body: |
bb.0:
liveins: $vgpr1
- ; CHECK-LABEL: name: spill_v64_undef_sub0_killed
- ; CHECK: liveins: $vgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
- ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1 :: ("amdgpu-thread-private" store (s32) into %stack.0 + 4, addrspace 5)
+ ; GFX900-LABEL: name: spill_v64_undef_sub0_killed
+ ; GFX900: liveins: $vgpr1
+ ; GFX900-NEXT: {{ $}}
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1 :: ("amdgpu-thread-private" store (s32) into %stack.0 + 4, addrspace 5)
+ ;
+ ; GFX942-LABEL: name: spill_v64_undef_sub0_killed
+ ; GFX942: liveins: $vgpr1
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr0_vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s64) into %stack.0, align 4, addrspace 5)
+ ;
+ ; GFX12-LABEL: name: spill_v64_undef_sub0_killed
+ ; GFX12: liveins: $vgpr1
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: SCRATCH_STORE_DWORDX2_SADDR killed $vgpr0_vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s64) into %stack.0, align 4, addrspace 5)
SI_SPILL_V64_SAVE killed $vgpr0_vgpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5)
...
@@ -144,13 +211,23 @@ body: |
bb.0:
liveins: $vgpr0_vgpr1_vgpr2_vgpr3
- ; CHECK-LABEL: name: spill_v128_kill
- ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
- ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0 + 4, addrspace 5)
- ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0 + 8, addrspace 5)
- ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: ("amdgpu-thread-private" store (s32) into %stack.0 + 12, addrspace 5)
+ ; GFX900-LABEL: name: spill_v128_kill
+ ; GFX900: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+ ; GFX900-NEXT: {{ $}}
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0 + 4, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0 + 8, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: ("amdgpu-thread-private" store (s32) into %stack.0 + 12, addrspace 5)
+ ;
+ ; GFX942-LABEL: name: spill_v128_kill
+ ; GFX942: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s128) into %stack.0, align 4, addrspace 5)
+ ;
+ ; GFX12-LABEL: name: spill_v128_kill
+ ; GFX12: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s128) into %stack.0, align 4, addrspace 5)
SI_SPILL_V128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, addrspace 5)
...
@@ -187,3 +264,169 @@ body: |
SI_SPILL_V64_SAVE undef $vgpr0_vgpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, addrspace 5)
S_NOP 0, implicit undef $vgpr0_vgpr1
...
+
+---
+name: spill_v128_kill_unaligned
+tracksRegLiveness: true
+stack:
+ - { id: 0, type: spill-slot, size: 16, alignment: 4 }
+machineFunctionInfo:
+ scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+ stackPtrOffsetReg: '$sgpr32'
+ frameOffsetReg: '$sgpr33'
+body: |
+ bb.0:
+ liveins: $vgpr1_vgpr2_vgpr3_vgpr4
+
+ ; GFX900-LABEL: name: spill_v128_kill_unaligned
+ ; GFX900: liveins: $vgpr1_vgpr2_vgpr3_vgpr4
+ ; GFX900-NEXT: {{ $}}
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0 + 4, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0 + 8, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec, implicit killed $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s32) into %stack.0 + 12, addrspace 5)
+ ;
+ ; GFX942-LABEL: name: spill_v128_kill_unaligned
+ ; GFX942: liveins: $vgpr1_vgpr2_vgpr3_vgpr4
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
+ ; GFX942-NEXT: SCRATCH_STORE_DWORDX3_SADDR killed $vgpr2_vgpr3_vgpr4, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s96) into %stack.0 + 4, align 4, addrspace 5)
+ ;
+ ; GFX1200-LABEL: name: spill_v128_kill_unaligned
+ ; GFX1200: liveins: $vgpr1_vgpr2_vgpr3_vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: SCRATCH_STORE_DWORDX4_SADDR killed $vgpr1_vgpr2_vgpr3_vgpr4, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s128) into %stack.0, align 4, addrspace 5)
+ ;
+ ; GFX1250-LABEL: name: spill_v128_kill_unaligned
+ ; GFX1250: liveins: $vgpr1_vgpr2_vgpr3_vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
+ ; GFX1250-NEXT: SCRATCH_STORE_DWORDX3_SADDR killed $vgpr2_vgpr3_vgpr4, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s96) into %stack.0 + 4, align 4, addrspace 5)
+ SI_SPILL_V128_SAVE killed $vgpr1_vgpr2_vgpr3_vgpr4, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, addrspace 5)
+...
+
+---
+name: spill_v128_unaligned
+tracksRegLiveness: true
+stack:
+ - { id: 0, type: spill-slot, size: 16, alignment: 4 }
+machineFunctionInfo:
+ scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+ stackPtrOffsetReg: '$sgpr32'
+ frameOffsetReg: '$sgpr33'
+body: |
+ bb.0:
+ liveins: $vgpr1_vgpr2_vgpr3_vgpr4
+
+ ; GFX900-LABEL: name: spill_v128_unaligned
+ ; GFX900: liveins: $vgpr1_vgpr2_vgpr3_vgpr4
+ ; GFX900-NEXT: {{ $}}
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0 + 4, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0 + 8, addrspace 5)
+ ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec, implicit $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s32) into %stack.0 + 12, addrspace 5)
+ ;
+ ; GFX942-LABEL: name: spill_v128_unaligned
+ ; GFX942: liveins: $vgpr1_vgpr2_vgpr3_vgpr4
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
+ ; GFX942-NEXT: SCRATCH_STORE_DWORDX3_SADDR $vgpr2_vgpr3_vgpr4, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s96) into %stack.0 + 4, align 4, addrspace 5)
+ ;
+ ; GFX1200-LABEL: name: spill_v128_unaligned
+ ; GFX1200: liveins: $vgpr1_vgpr2_vgpr3_vgpr4
+ ; GFX1200-NEXT: {{ $}}
+ ; GFX1200-NEXT: SCRATCH_STORE_DWORDX4_SADDR $vgpr1_vgpr2_vgpr3_vgpr4, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s128) into %stack.0, align 4, addrspace 5)
+ ;
+ ; GFX1250-LABEL: name: spill_v128_unaligned
+ ; GFX1250: liveins: $vgpr1_vgpr2_vgpr3_vgpr4
+ ; GFX1250-NEXT: {{ $}}
+ ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
+ ; GFX1250-NEXT: SCRATCH_STORE_DWORDX3_SADDR $vgpr2_vgpr3_vgpr4, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s96) into %stack.0 + 4, align 4, addrspace 5)
+ SI_SPILL_V128_SAVE $vgpr1_vgpr2_vgpr3_vgpr4, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, addrspace 5)
+...
+
+---
+# Stack slot must hold the full 256-bit (32-byte) tuple; the spill's base
+# memory operand is likewise s256 so it covers every split sub-store
+# (offsets 0..28 in the generated checks).
+name: spill_v256_aligned
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: spill-slot, size: 32, alignment: 4 }
+machineFunctionInfo:
+  scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+  stackPtrOffsetReg: '$sgpr32'
+  frameOffsetReg: '$sgpr33'
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+
+    ; GFX900-LABEL: name: spill_v256_aligned
+    ; GFX900: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX900-NEXT: {{  $}}
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0 + 4, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0 + 8, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0 + 12, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0 + 16, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0 + 20, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0 + 24, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: ("amdgpu-thread-private" store (s32) into %stack.0 + 28, addrspace 5)
+    ;
+    ; GFX942-LABEL: name: spill_v256_aligned
+    ; GFX942: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX942-NEXT: {{  $}}
+    ; GFX942-NEXT: SCRATCH_STORE_DWORDX4_SADDR $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: ("amdgpu-thread-private" store (s128) into %stack.0, align 4, addrspace 5)
+    ; GFX942-NEXT: SCRATCH_STORE_DWORDX4_SADDR $vgpr4_vgpr5_vgpr6_vgpr7, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: ("amdgpu-thread-private" store (s128) into %stack.0 + 16, align 4, addrspace 5)
+    ;
+    ; GFX12-LABEL: name: spill_v256_aligned
+    ; GFX12: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: SCRATCH_STORE_DWORDX4_SADDR $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: ("amdgpu-thread-private" store (s128) into %stack.0, align 4, addrspace 5)
+    ; GFX12-NEXT: SCRATCH_STORE_DWORDX4_SADDR $vgpr4_vgpr5_vgpr6_vgpr7, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 :: ("amdgpu-thread-private" store (s128) into %stack.0 + 16, align 4, addrspace 5)
+    SI_SPILL_V256_SAVE $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, %stack.0, $sgpr32, 0, implicit $exec :: (store (s256) into %stack.0, addrspace 5)
+...
+
+---
+# Unaligned 256-bit tuple: slot size is the full 32 bytes and the spill's
+# base memory operand is s256; the split sub-stores (s32 + s128 + s96 on
+# aligned-VGPR targets) are carved out of this base MMO.
+name: spill_v256_unaligned
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: spill-slot, size: 32, alignment: 4 }
+machineFunctionInfo:
+  scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+  stackPtrOffsetReg: '$sgpr32'
+  frameOffsetReg: '$sgpr33'
+body: |
+  bb.0:
+    liveins: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8
+
+    ; GFX900-LABEL: name: spill_v256_unaligned
+    ; GFX900: liveins: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8
+    ; GFX900-NEXT: {{  $}}
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8, implicit $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0 + 4, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0 + 8, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0 + 12, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0 + 16, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0 + 20, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: ("amdgpu-thread-private" store (s32) into %stack.0 + 24, addrspace 5)
+    ; GFX900-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec, implicit $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 :: ("amdgpu-thread-private" store (s32) into %stack.0 + 28, addrspace 5)
+    ;
+    ; GFX942-LABEL: name: spill_v256_unaligned
+    ; GFX942: liveins: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8
+    ; GFX942-NEXT: {{  $}}
+    ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8, implicit $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
+    ; GFX942-NEXT: SCRATCH_STORE_DWORDX4_SADDR $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s128) into %stack.0 + 4, align 4, addrspace 5)
+    ; GFX942-NEXT: SCRATCH_STORE_DWORDX3_SADDR $vgpr6_vgpr7_vgpr8, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr, implicit $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 :: ("amdgpu-thread-private" store (s96) into %stack.0 + 20, align 4, addrspace 5)
+    ;
+    ; GFX1200-LABEL: name: spill_v256_unaligned
+    ; GFX1200: liveins: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8
+    ; GFX1200-NEXT: {{  $}}
+    ; GFX1200-NEXT: SCRATCH_STORE_DWORDX4_SADDR $vgpr1_vgpr2_vgpr3_vgpr4, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8, implicit $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 :: ("amdgpu-thread-private" store (s128) into %stack.0, align 4, addrspace 5)
+    ; GFX1200-NEXT: SCRATCH_STORE_DWORDX4_SADDR $vgpr5_vgpr6_vgpr7_vgpr8, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr, implicit $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 :: ("amdgpu-thread-private" store (s128) into %stack.0 + 16, align 4, addrspace 5)
+    ;
+    ; GFX1250-LABEL: name: spill_v256_unaligned
+    ; GFX1250: liveins: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8
+    ; GFX1250-NEXT: {{  $}}
+    ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8, implicit $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5)
+    ; GFX1250-NEXT: SCRATCH_STORE_DWORDX4_SADDR $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s128) into %stack.0 + 4, align 4, addrspace 5)
+    ; GFX1250-NEXT: SCRATCH_STORE_DWORDX3_SADDR $vgpr6_vgpr7_vgpr8, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr, implicit $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 :: ("amdgpu-thread-private" store (s96) into %stack.0 + 20, align 4, addrspace 5)
+    SI_SPILL_V256_SAVE $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8, %stack.0, $sgpr32, 0, implicit $exec :: (store (s256) into %stack.0, addrspace 5)
+...
More information about the llvm-commits mailing list