[llvm] [AMDGPU][True16][CodeGen] 16bit spill support in true16 mode (PR #128060)
Brox Chen via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 25 14:08:20 PST 2025
https://github.com/broxigarchen updated https://github.com/llvm/llvm-project/pull/128060
>From 38a4f38ca8eaf91a53e39d4187f04a96ac5f8538 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Thu, 20 Feb 2025 12:28:32 -0500
Subject: [PATCH 1/2] spill 16 with scratch load/store
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 4 +
llvm/lib/Target/AMDGPU/SIInstructions.td | 1 +
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 23 +-
.../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 2 +
llvm/test/CodeGen/AMDGPU/spill_kill_v16.mir | 140 +++++++
llvm/test/CodeGen/AMDGPU/spillv16.ll | 391 ++++++++++++++++++
llvm/test/CodeGen/AMDGPU/spillv16.mir | 58 +++
7 files changed, 615 insertions(+), 4 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/spill_kill_v16.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/spillv16.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/spillv16.mir
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 2cf6de73fa90c..7ecb089373692 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1580,6 +1580,8 @@ static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
switch (Size) {
+ case 2:
+ return AMDGPU::SI_SPILL_V16_SAVE;
case 4:
return AMDGPU::SI_SPILL_V32_SAVE;
case 8:
@@ -1807,6 +1809,8 @@ static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
switch (Size) {
+ case 2:
+ return AMDGPU::SI_SPILL_V16_RESTORE;
case 4:
return AMDGPU::SI_SPILL_V32_RESTORE;
case 8:
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index cca49ee80a60e..4ec13807dc4d8 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1002,6 +1002,7 @@ multiclass SI_SPILL_VGPR <RegisterClass vgpr_class, bit UsesTmp = 0> {
} // End UseNamedOperandTable = 1, Spill = 1, VALU = 1, SchedRW = [WriteVMEM]
}
+defm SI_SPILL_V16 : SI_SPILL_VGPR <VGPR_16>;
defm SI_SPILL_V32 : SI_SPILL_VGPR <VGPR_32>;
defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>;
defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 924aa45559366..f3e1f183e8836 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1280,6 +1280,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
case AMDGPU::SI_SPILL_WWM_AV32_SAVE:
case AMDGPU::SI_SPILL_WWM_AV32_RESTORE:
+ case AMDGPU::SI_SPILL_V16_SAVE:
+ case AMDGPU::SI_SPILL_V16_RESTORE:
return 1;
default: llvm_unreachable("Invalid spill opcode");
}
@@ -2347,6 +2349,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_V96_SAVE:
case AMDGPU::SI_SPILL_V64_SAVE:
case AMDGPU::SI_SPILL_V32_SAVE:
+ case AMDGPU::SI_SPILL_V16_SAVE:
case AMDGPU::SI_SPILL_A1024_SAVE:
case AMDGPU::SI_SPILL_A512_SAVE:
case AMDGPU::SI_SPILL_A384_SAVE:
@@ -2387,8 +2390,14 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
MFI->getStackPtrOffsetReg());
- unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
- : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
+ unsigned Opc;
+ if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_SAVE) {
+ Opc = AMDGPU::SCRATCH_STORE_SHORT_SADDR_t16;
+ } else {
+ Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
+ : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
+ }
+
auto *MBB = MI->getParent();
bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
if (IsWWMRegSpill) {
@@ -2406,6 +2415,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
MI->eraseFromParent();
return true;
}
+ case AMDGPU::SI_SPILL_V16_RESTORE:
case AMDGPU::SI_SPILL_V32_RESTORE:
case AMDGPU::SI_SPILL_V64_RESTORE:
case AMDGPU::SI_SPILL_V96_RESTORE:
@@ -2455,8 +2465,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
MFI->getStackPtrOffsetReg());
- unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
- : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
+ unsigned Opc;
+ if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_RESTORE) {
+ Opc = AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16;
+ } else {
+ Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
+ : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
+ }
auto *MBB = MI->getParent();
bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
if (IsWWMRegSpill) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index c521d0dd3ad2d..6a92e54b69edc 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -2483,6 +2483,8 @@ bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo) {
// (move from MC* level to Target* level). Return size in bits.
unsigned getRegBitWidth(unsigned RCID) {
switch (RCID) {
+ case AMDGPU::VGPR_16RegClassID:
+ case AMDGPU::VGPR_16_Lo128RegClassID:
case AMDGPU::SGPR_LO16RegClassID:
case AMDGPU::AGPR_LO16RegClassID:
return 16;
diff --git a/llvm/test/CodeGen/AMDGPU/spill_kill_v16.mir b/llvm/test/CodeGen/AMDGPU/spill_kill_v16.mir
new file mode 100644
index 0000000000000..0c694d9f49e18
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/spill_kill_v16.mir
@@ -0,0 +1,140 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -verify-machineinstrs -mcpu=gfx1100 -mattr=+real-true16 -run-pass=prologepilog -o - %s | FileCheck -check-prefix=EXPANDED %s
+
+---
+name: spill_restore_vgpr16
+tracksRegLiveness: true
+stack:
+ - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4 }
+ - { id: 1, name: '', type: spill-slot, offset: 4, size: 4, alignment: 4 }
+machineFunctionInfo:
+ scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+ stackPtrOffsetReg: $sgpr32
+ hasSpilledVGPRs: true
+body: |
+ ; EXPANDED-LABEL: name: spill_restore_vgpr16
+ ; EXPANDED: bb.0:
+ ; EXPANDED-NEXT: successors: %bb.1(0x80000000)
+ ; EXPANDED-NEXT: {{ $}}
+ ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $vgpr0_lo16, implicit-def renamable $vgpr0_hi16
+ ; EXPANDED-NEXT: SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_hi16, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.1, align 4, addrspace 5)
+ ; EXPANDED-NEXT: S_NOP 0, implicit renamable $vgpr0_lo16
+ ; EXPANDED-NEXT: SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_lo16, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.0, align 4, addrspace 5)
+ ; EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ ; EXPANDED-NEXT: {{ $}}
+ ; EXPANDED-NEXT: bb.1:
+ ; EXPANDED-NEXT: successors: %bb.2(0x80000000)
+ ; EXPANDED-NEXT: {{ $}}
+ ; EXPANDED-NEXT: S_NOP 1
+ ; EXPANDED-NEXT: {{ $}}
+ ; EXPANDED-NEXT: bb.2:
+ ; EXPANDED-NEXT: $vgpr0_lo16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.0, align 4, addrspace 5)
+ ; EXPANDED-NEXT: $vgpr0_hi16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.1, align 4, addrspace 5)
+ ; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $vgpr0_lo16, implicit killed renamable $vgpr0_hi16
+ bb.0:
+ successors: %bb.1(0x80000000)
+ S_NOP 0, implicit-def renamable $vgpr0_lo16, implicit-def renamable $vgpr0_hi16
+ SI_SPILL_V16_SAVE killed $vgpr0_hi16, %stack.1, $sgpr32, 0, implicit $exec :: (store (s16) into %stack.1, addrspace 5)
+ S_NOP 0, implicit renamable $vgpr0_lo16
+ SI_SPILL_V16_SAVE killed $vgpr0_lo16, %stack.0, $sgpr32, 0, implicit $exec :: (store (s16) into %stack.0, addrspace 5)
+ S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ bb.1:
+ successors: %bb.2(0x80000000)
+ S_NOP 1
+ bb.2:
+ $vgpr0_lo16 = SI_SPILL_V16_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s16) from %stack.0, addrspace 5)
+ $vgpr0_hi16 = SI_SPILL_V16_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s16) from %stack.1, addrspace 5)
+ S_NOP 0, implicit killed renamable $vgpr0_lo16, implicit killed renamable $vgpr0_hi16
+...
+
+---
+name: spill_restore_vgpr16_middle_of_block
+tracksRegLiveness: true
+stack:
+ - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4 }
+ - { id: 1, name: '', type: spill-slot, offset: 4, size: 4, alignment: 4 }
+machineFunctionInfo:
+ scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+ stackPtrOffsetReg: $sgpr32
+ hasSpilledVGPRs: true
+body: |
+ ; EXPANDED-LABEL: name: spill_restore_vgpr16_middle_of_block
+ ; EXPANDED: bb.0:
+ ; EXPANDED-NEXT: successors: %bb.1(0x80000000)
+ ; EXPANDED-NEXT: {{ $}}
+ ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $vgpr0_lo16, implicit-def renamable $vgpr0_hi16
+ ; EXPANDED-NEXT: SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_hi16, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.1, align 4, addrspace 5)
+ ; EXPANDED-NEXT: S_NOP 0, implicit renamable $vgpr0_lo16
+ ; EXPANDED-NEXT: SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_lo16, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.0, align 4, addrspace 5)
+ ; EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ ; EXPANDED-NEXT: {{ $}}
+ ; EXPANDED-NEXT: bb.1:
+ ; EXPANDED-NEXT: successors: %bb.2(0x80000000)
+ ; EXPANDED-NEXT: {{ $}}
+ ; EXPANDED-NEXT: S_NOP 1
+ ; EXPANDED-NEXT: {{ $}}
+ ; EXPANDED-NEXT: bb.2:
+ ; EXPANDED-NEXT: S_NOP 1
+ ; EXPANDED-NEXT: $vgpr0_lo16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.0, align 4, addrspace 5)
+ ; EXPANDED-NEXT: $vgpr0_hi16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.1, align 4, addrspace 5)
+ ; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $vgpr0_lo16, implicit killed renamable $vgpr0_hi16
+ bb.0:
+ successors: %bb.1(0x80000000)
+ S_NOP 0, implicit-def renamable $vgpr0_lo16, implicit-def renamable $vgpr0_hi16
+ SI_SPILL_V16_SAVE killed $vgpr0_hi16, %stack.1, $sgpr32, 0, implicit $exec :: (store (s16) into %stack.1, addrspace 5)
+ S_NOP 0, implicit renamable $vgpr0_lo16
+ SI_SPILL_V16_SAVE killed $vgpr0_lo16, %stack.0, $sgpr32, 0, implicit $exec :: (store (s16) into %stack.0, addrspace 5)
+ S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ bb.1:
+ successors: %bb.2(0x80000000)
+ S_NOP 1
+ bb.2:
+ S_NOP 1
+ $vgpr0_lo16 = SI_SPILL_V16_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s16) from %stack.0, addrspace 5)
+ $vgpr0_hi16 = SI_SPILL_V16_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s16) from %stack.1, addrspace 5)
+ S_NOP 0, implicit killed renamable $vgpr0_lo16, implicit killed renamable $vgpr0_hi16
+...
+
+---
+name: spill_restore_vgpr16_end_of_block
+tracksRegLiveness: true
+stack:
+ - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4 }
+ - { id: 1, name: '', type: spill-slot, offset: 4, size: 4, alignment: 4 }
+machineFunctionInfo:
+ scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+ stackPtrOffsetReg: $sgpr32
+ hasSpilledVGPRs: true
+body: |
+ ; EXPANDED-LABEL: name: spill_restore_vgpr16_end_of_block
+ ; EXPANDED: bb.0:
+ ; EXPANDED-NEXT: successors: %bb.1(0x80000000)
+ ; EXPANDED-NEXT: {{ $}}
+ ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $vgpr0_lo16, implicit-def renamable $vgpr0_hi16
+ ; EXPANDED-NEXT: SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_hi16, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.1, align 4, addrspace 5)
+ ; EXPANDED-NEXT: S_NOP 0, implicit renamable $vgpr0_lo16
+ ; EXPANDED-NEXT: SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_lo16, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.0, align 4, addrspace 5)
+ ; EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ ; EXPANDED-NEXT: {{ $}}
+ ; EXPANDED-NEXT: bb.1:
+ ; EXPANDED-NEXT: successors: %bb.2(0x80000000)
+ ; EXPANDED-NEXT: {{ $}}
+ ; EXPANDED-NEXT: S_NOP 1
+ ; EXPANDED-NEXT: {{ $}}
+ ; EXPANDED-NEXT: bb.2:
+ ; EXPANDED-NEXT: $vgpr0_lo16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.0, align 4, addrspace 5)
+ ; EXPANDED-NEXT: $vgpr0_hi16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.1, align 4, addrspace 5)
+ bb.0:
+ successors: %bb.1(0x80000000)
+ S_NOP 0, implicit-def renamable $vgpr0_lo16, implicit-def renamable $vgpr0_hi16
+ SI_SPILL_V16_SAVE killed $vgpr0_hi16, %stack.1, $sgpr32, 0, implicit $exec :: (store (s16) into %stack.1, addrspace 5)
+ S_NOP 0, implicit renamable $vgpr0_lo16
+ SI_SPILL_V16_SAVE killed $vgpr0_lo16, %stack.0, $sgpr32, 0, implicit $exec :: (store (s16) into %stack.0, addrspace 5)
+ S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ bb.1:
+ successors: %bb.2(0x80000000)
+ S_NOP 1
+ bb.2:
+ $vgpr0_lo16 = SI_SPILL_V16_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s16) from %stack.0, addrspace 5)
+ $vgpr0_hi16 = SI_SPILL_V16_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s16) from %stack.1, addrspace 5)
+...
diff --git a/llvm/test/CodeGen/AMDGPU/spillv16.ll b/llvm/test/CodeGen/AMDGPU/spillv16.ll
new file mode 100644
index 0000000000000..0e45df223465d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/spillv16.ll
@@ -0,0 +1,391 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GCN,GCN-TRUE16
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GCN,GCN-FAKE16
+
+define void @spill_i16_alu() {
+; GCN-TRUE16-LABEL: spill_i16_alu:
+; GCN-TRUE16: ; %bb.0: ; %entry
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill
+; GCN-TRUE16-NEXT: ;;#ASMSTART
+; GCN-TRUE16-NEXT: ;;#ASMEND
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 ; 2-byte Folded Reload
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 dlc
+; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: spill_i16_alu:
+; GCN-FAKE16: ; %bb.0: ; %entry
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: v_add_nc_u16 v0, 0x7b, v0
+; GCN-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill
+; GCN-FAKE16-NEXT: ;;#ASMSTART
+; GCN-FAKE16-NEXT: ;;#ASMEND
+; GCN-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:4 ; 4-byte Folded Reload
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc
+; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %alloca = alloca i16, i32 1, align 4, addrspace(5)
+
+ %aptr = getelementptr i16, ptr addrspace(5) %alloca, i32 0
+ %a = load volatile i16, ptr addrspace(5) %aptr
+ %add = add i16 %a, 123
+
+ ; Force %a to spill.
+ call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
+
+ %outptr = getelementptr i16, ptr addrspace(5) %alloca, i32 0
+ store volatile i16 %add, ptr addrspace(5) %outptr
+
+ ret void
+}
+
+define void @spill_i16_alu_two_vals() {
+; GCN-TRUE16-LABEL: spill_i16_alu_two_vals:
+; GCN-TRUE16: ; %bb.0: ; %entry
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:6 ; 2-byte Folded Spill
+; GCN-TRUE16-NEXT: ;;#ASMSTART
+; GCN-TRUE16-NEXT: ;;#ASMEND
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:4 glc dlc
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_load_d16_hi_b16 v0, off, s32 offset:6 ; 2-byte Folded Reload
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l
+; GCN-TRUE16-NEXT: scratch_store_d16_hi_b16 off, v0, s32 dlc
+; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:4 dlc
+; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: spill_i16_alu_two_vals:
+; GCN-FAKE16: ; %bb.0: ; %entry
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: v_add_nc_u16 v0, 0x7b, v0
+; GCN-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
+; GCN-FAKE16-NEXT: ;;#ASMSTART
+; GCN-FAKE16-NEXT: ;;#ASMEND
+; GCN-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:4 glc dlc
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_load_b32 v1, off, s32 offset:8 ; 4-byte Folded Reload
+; GCN-FAKE16-NEXT: v_add_nc_u16 v0, 0x7b, v0
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b16 off, v1, s32 dlc
+; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 offset:4 dlc
+; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %alloca = alloca i16, i32 1, align 4, addrspace(5)
+ %alloca2 = alloca i16, i32 1, align 4, addrspace(5)
+
+ %aptr = getelementptr i16, ptr addrspace(5) %alloca, i32 0
+ %a = load volatile i16, ptr addrspace(5) %aptr
+ %add = add i16 %a, 123
+
+ ; Force %a to spill.
+ call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
+
+ %bptr = getelementptr i16, ptr addrspace(5) %alloca2, i32 0
+ %b = load volatile i16, ptr addrspace(5) %bptr
+ %badd = add i16 %b, 123
+ %outptr = getelementptr i16, ptr addrspace(5) %alloca, i32 0
+ store volatile i16 %add, ptr addrspace(5) %outptr
+ %outptr2 = getelementptr i16, ptr addrspace(5) %alloca2, i32 0
+ store volatile i16 %badd, ptr addrspace(5) %outptr2
+
+ ret void
+}
+
+; Tests after this do not actually test 16 bit spills because there is no use of VGPR_16. They could demonstrate 16 bit spills if we update the instructions to use VGPR_16 instead of VGPR_32
+
+define void @spill_i16() {
+; GCN-TRUE16-LABEL: spill_i16:
+; GCN-TRUE16: ; %bb.0: ; %entry
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill
+; GCN-TRUE16-NEXT: ;;#ASMSTART
+; GCN-TRUE16-NEXT: ;;#ASMEND
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 ; 2-byte Folded Reload
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 dlc
+; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: spill_i16:
+; GCN-FAKE16: ; %bb.0: ; %entry
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill
+; GCN-FAKE16-NEXT: ;;#ASMSTART
+; GCN-FAKE16-NEXT: ;;#ASMEND
+; GCN-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:4 ; 4-byte Folded Reload
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc
+; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %alloca = alloca i16, i32 1, align 4, addrspace(5)
+
+ %aptr = getelementptr i16, ptr addrspace(5) %alloca, i32 0
+ %a = load volatile i16, ptr addrspace(5) %aptr
+
+ ; Force %a to spill.
+ call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
+
+ %outptr = getelementptr i16, ptr addrspace(5) %alloca, i32 0
+ store volatile i16 %a, ptr addrspace(5) %outptr
+
+ ret void
+}
+
+define void @spill_half() {
+; GCN-TRUE16-LABEL: spill_half:
+; GCN-TRUE16: ; %bb.0: ; %entry
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill
+; GCN-TRUE16-NEXT: ;;#ASMSTART
+; GCN-TRUE16-NEXT: ;;#ASMEND
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 ; 2-byte Folded Reload
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 dlc
+; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: spill_half:
+; GCN-FAKE16: ; %bb.0: ; %entry
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill
+; GCN-FAKE16-NEXT: ;;#ASMSTART
+; GCN-FAKE16-NEXT: ;;#ASMEND
+; GCN-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:4 ; 4-byte Folded Reload
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc
+; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %alloca = alloca half, i32 1, align 4, addrspace(5)
+
+ %aptr = getelementptr half, ptr addrspace(5) %alloca, i32 0
+ %a = load volatile half, ptr addrspace(5) %aptr
+
+ ; Force %a to spill.
+ call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
+
+ %outptr = getelementptr half, ptr addrspace(5) %alloca, i32 0
+ store volatile half %a, ptr addrspace(5) %outptr
+
+ ret void
+}
+
+define void @spill_i16_from_v2i16() {
+; GCN-TRUE16-LABEL: spill_i16_from_v2i16:
+; GCN-TRUE16: ; %bb.0: ; %entry
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 glc dlc
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill
+; GCN-TRUE16-NEXT: ;;#ASMSTART
+; GCN-TRUE16-NEXT: ;;#ASMEND
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:8 ; 2-byte Folded Reload
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 dlc
+; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: spill_i16_from_v2i16:
+; GCN-FAKE16: ; %bb.0: ; %entry
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:2 glc dlc
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
+; GCN-FAKE16-NEXT: ;;#ASMSTART
+; GCN-FAKE16-NEXT: ;;#ASMEND
+; GCN-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 ; 4-byte Folded Reload
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 dlc
+; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %alloca = alloca <2 x i16>, i32 2, align 1, addrspace(5)
+
+ %aptr = getelementptr i16, ptr addrspace(5) %alloca, i32 1
+ %a = load volatile i16, ptr addrspace(5) %aptr
+
+ ; Force %a to spill.
+ call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
+
+ %outptr = getelementptr i16, ptr addrspace(5) %alloca, i32 1
+ store volatile i16 %a, ptr addrspace(5) %outptr
+
+ ret void
+}
+
+define void @spill_2xi16_from_v2i16() {
+; GCN-TRUE16-LABEL: spill_2xi16_from_v2i16:
+; GCN-TRUE16: ; %bb.0: ; %entry
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 glc dlc
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:10 ; 2-byte Folded Spill
+; GCN-TRUE16-NEXT: ;;#ASMSTART
+; GCN-TRUE16-NEXT: ;;#ASMEND
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:8 ; 2-byte Folded Reload
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 dlc
+; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:10 ; 2-byte Folded Reload
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 dlc
+; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: spill_2xi16_from_v2i16:
+; GCN-FAKE16: ; %bb.0: ; %entry
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:2 glc dlc
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
+; GCN-FAKE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:12 ; 4-byte Folded Spill
+; GCN-FAKE16-NEXT: ;;#ASMSTART
+; GCN-FAKE16-NEXT: ;;#ASMEND
+; GCN-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 ; 4-byte Folded Reload
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 dlc
+; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:12 ; 4-byte Folded Reload
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc
+; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %alloca = alloca <2 x i16>, i32 2, align 1, addrspace(5)
+
+ %aptr = getelementptr i16, ptr addrspace(5) %alloca, i32 1
+ %a = load volatile i16, ptr addrspace(5) %aptr
+ %bptr = getelementptr i16, ptr addrspace(5) %alloca, i32 0
+ %b = load volatile i16, ptr addrspace(5) %bptr
+
+ ; Force %a to spill.
+ call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
+
+ %outptr = getelementptr i16, ptr addrspace(5) %alloca, i32 1
+ store volatile i16 %a, ptr addrspace(5) %outptr
+ %boutptr = getelementptr i16, ptr addrspace(5) %alloca, i32 0
+ store volatile i16 %b, ptr addrspace(5) %boutptr
+
+ ret void
+}
+
+define void @spill_2xi16_from_v2i16_one_free_reg() {
+; GCN-TRUE16-LABEL: spill_2xi16_from_v2i16_one_free_reg:
+; GCN-TRUE16: ; %bb.0: ; %entry
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 glc dlc
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:10 ; 2-byte Folded Spill
+; GCN-TRUE16-NEXT: ;;#ASMSTART
+; GCN-TRUE16-NEXT: ;;#ASMEND
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:8 ; 2-byte Folded Reload
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 dlc
+; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:10 ; 2-byte Folded Reload
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 dlc
+; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: spill_2xi16_from_v2i16_one_free_reg:
+; GCN-FAKE16: ; %bb.0: ; %entry
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT: scratch_load_u16 v7, off, s32 offset:2 glc dlc
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
+; GCN-FAKE16-NEXT: ;;#ASMSTART
+; GCN-FAKE16-NEXT: ;;#ASMEND
+; GCN-FAKE16-NEXT: scratch_store_b16 off, v7, s32 offset:2 dlc
+; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 ; 4-byte Folded Reload
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc
+; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %alloca = alloca <2 x i16>, i32 2, align 1, addrspace(5)
+
+ %aptr = getelementptr i16, ptr addrspace(5) %alloca, i32 1
+ %a = load volatile i16, ptr addrspace(5) %aptr
+ %bptr = getelementptr i16, ptr addrspace(5) %alloca, i32 0
+ %b = load volatile i16, ptr addrspace(5) %bptr
+
+ ; Force %a to spill.
+ ; Would not need to spill if the short scratch instructions used vgpr_16
+ call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6}" ()
+
+ %outptr = getelementptr i16, ptr addrspace(5) %alloca, i32 1
+ store volatile i16 %a, ptr addrspace(5) %outptr
+ %boutptr = getelementptr i16, ptr addrspace(5) %alloca, i32 0
+ store volatile i16 %b, ptr addrspace(5) %boutptr
+
+ ret void
+}
+
+define void @spill_v2i16() {
+; GCN-LABEL: spill_v2i16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: scratch_load_b32 v0, off, s32 offset:4 glc dlc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: scratch_load_b32 v0, off, s32 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc
+; GCN-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %alloca = alloca <2 x i16>, i32 2, align 1, addrspace(5)
+
+ %aptr = getelementptr <2 x i16>, ptr addrspace(5) %alloca, i32 1
+ %a = load volatile <2 x i16>, ptr addrspace(5) %aptr
+
+ ; Force %a to spill.
+ call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
+
+ %outptr = getelementptr <2 x i16>, ptr addrspace(5) %alloca, i32 1
+ store volatile <2 x i16> %a, ptr addrspace(5) %outptr
+
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/spillv16.mir b/llvm/test/CodeGen/AMDGPU/spillv16.mir
new file mode 100644
index 0000000000000..05569bf394c43
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/spillv16.mir
@@ -0,0 +1,58 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -verify-machineinstrs -mcpu=gfx1100 -mattr=+real-true16 -run-pass=regallocfast -o - %s | FileCheck -check-prefix=SPILLED %s
+# RUN: llc -march=amdgcn -verify-machineinstrs -mcpu=gfx1100 -mattr=+real-true16 -run-pass=regallocfast,prologepilog -o - %s | FileCheck -check-prefix=EXPANDED %s
+
+---
+name: spill_restore_vgpr16
+tracksRegLiveness: true
+machineFunctionInfo:
+ scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+ stackPtrOffsetReg: $sgpr32
+body: |
+ ; SPILLED-LABEL: name: spill_restore_vgpr16
+ ; SPILLED: bb.0:
+ ; SPILLED-NEXT: successors: %bb.1(0x80000000)
+ ; SPILLED-NEXT: {{ $}}
+ ; SPILLED-NEXT: S_NOP 0, implicit-def renamable $vgpr0_lo16, implicit-def renamable $vgpr0_hi16
+ ; SPILLED-NEXT: SI_SPILL_V16_SAVE killed $vgpr0_hi16, %stack.1, $sgpr32, 0, implicit $exec :: (store (s16) into %stack.1, addrspace 5)
+ ; SPILLED-NEXT: SI_SPILL_V16_SAVE killed $vgpr0_lo16, %stack.0, $sgpr32, 0, implicit $exec :: (store (s16) into %stack.0, addrspace 5)
+ ; SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ ; SPILLED-NEXT: {{ $}}
+ ; SPILLED-NEXT: bb.1:
+ ; SPILLED-NEXT: successors: %bb.2(0x80000000)
+ ; SPILLED-NEXT: {{ $}}
+ ; SPILLED-NEXT: S_NOP 1
+ ; SPILLED-NEXT: {{ $}}
+ ; SPILLED-NEXT: bb.2:
+ ; SPILLED-NEXT: $vgpr0_lo16 = SI_SPILL_V16_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s16) from %stack.0, addrspace 5)
+ ; SPILLED-NEXT: $vgpr0_hi16 = SI_SPILL_V16_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s16) from %stack.1, addrspace 5)
+ ; SPILLED-NEXT: S_NOP 0, implicit killed renamable $vgpr0_lo16, implicit killed renamable $vgpr0_hi16
+ ;
+ ; EXPANDED-LABEL: name: spill_restore_vgpr16
+ ; EXPANDED: bb.0:
+ ; EXPANDED-NEXT: successors: %bb.1(0x80000000)
+ ; EXPANDED-NEXT: {{ $}}
+ ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $vgpr0_lo16, implicit-def renamable $vgpr0_hi16
+ ; EXPANDED-NEXT: SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_hi16, $sgpr32, 2, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.1, addrspace 5)
+ ; EXPANDED-NEXT: SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_lo16, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.0, addrspace 5)
+ ; EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ ; EXPANDED-NEXT: {{ $}}
+ ; EXPANDED-NEXT: bb.1:
+ ; EXPANDED-NEXT: successors: %bb.2(0x80000000)
+ ; EXPANDED-NEXT: {{ $}}
+ ; EXPANDED-NEXT: S_NOP 1
+ ; EXPANDED-NEXT: {{ $}}
+ ; EXPANDED-NEXT: bb.2:
+ ; EXPANDED-NEXT: $vgpr0_lo16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.0, addrspace 5)
+ ; EXPANDED-NEXT: $vgpr0_hi16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 2, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.1, addrspace 5)
+ ; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $vgpr0_lo16, implicit killed renamable $vgpr0_hi16
+ bb.0:
+ S_NOP 0, implicit-def %0:vgpr_16, implicit-def %1:vgpr_16
+ S_CBRANCH_SCC1 implicit undef $scc, %bb.1
+
+ bb.1:
+ S_NOP 1
+
+ bb.2:
+ S_NOP 0, implicit %0, implicit %1
+...
>From 26b979bf2a4c9884ff07dd12618cb765b2abd866 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Tue, 25 Feb 2025 17:07:43 -0500
Subject: [PATCH 2/2] added an assert for scratch flat
---
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 2 ++
1 file changed, 2 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index f3e1f183e8836..ba8e7e8fc9e29 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2392,6 +2392,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
unsigned Opc;
if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_SAVE) {
+ assert(ST.enableFlatScratch() && "Flat Scratch is not enabled!");
Opc = AMDGPU::SCRATCH_STORE_SHORT_SADDR_t16;
} else {
Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
@@ -2467,6 +2468,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
unsigned Opc;
if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_RESTORE) {
+ assert(ST.enableFlatScratch() && "Flat Scratch is not enabled!");
Opc = AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16;
} else {
Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
More information about the llvm-commits
mailing list