[llvm] [AMDGPU][True16][CodeGen] 16bit spill support in true16 mode (PR #128060)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 24 13:30:14 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Brox Chen (broxigarchen)
<details>
<summary>Changes</summary>
Enables 16-bit values to be spilled to scratch.
Note: the memory instructions used are defined as reading and writing VGPR_32 registers, but they do not clobber the unspecified 16 bits of those registers, so spills and reloads of the lo and hi halves of the registers work correctly.
---
Patch is 25.82 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/128060.diff
6 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+4)
- (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+1)
- (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp (+19-4)
- (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp (+2)
- (added) llvm/test/CodeGen/AMDGPU/spillv16.ll (+391)
- (added) llvm/test/CodeGen/AMDGPU/spillv16.mir (+58)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 2cf6de73fa90c..7ecb089373692 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1580,6 +1580,8 @@ static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
switch (Size) {
+ case 2:
+ return AMDGPU::SI_SPILL_V16_SAVE;
case 4:
return AMDGPU::SI_SPILL_V32_SAVE;
case 8:
@@ -1807,6 +1809,8 @@ static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
switch (Size) {
+ case 2:
+ return AMDGPU::SI_SPILL_V16_RESTORE;
case 4:
return AMDGPU::SI_SPILL_V32_RESTORE;
case 8:
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index cca49ee80a60e..4ec13807dc4d8 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1002,6 +1002,7 @@ multiclass SI_SPILL_VGPR <RegisterClass vgpr_class, bit UsesTmp = 0> {
} // End UseNamedOperandTable = 1, Spill = 1, VALU = 1, SchedRW = [WriteVMEM]
}
+defm SI_SPILL_V16 : SI_SPILL_VGPR <VGPR_16>;
defm SI_SPILL_V32 : SI_SPILL_VGPR <VGPR_32>;
defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>;
defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 924aa45559366..f3e1f183e8836 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1280,6 +1280,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
case AMDGPU::SI_SPILL_WWM_AV32_SAVE:
case AMDGPU::SI_SPILL_WWM_AV32_RESTORE:
+ case AMDGPU::SI_SPILL_V16_SAVE:
+ case AMDGPU::SI_SPILL_V16_RESTORE:
return 1;
default: llvm_unreachable("Invalid spill opcode");
}
@@ -2347,6 +2349,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_V96_SAVE:
case AMDGPU::SI_SPILL_V64_SAVE:
case AMDGPU::SI_SPILL_V32_SAVE:
+ case AMDGPU::SI_SPILL_V16_SAVE:
case AMDGPU::SI_SPILL_A1024_SAVE:
case AMDGPU::SI_SPILL_A512_SAVE:
case AMDGPU::SI_SPILL_A384_SAVE:
@@ -2387,8 +2390,14 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
MFI->getStackPtrOffsetReg());
- unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
- : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
+ unsigned Opc;
+ if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_SAVE) {
+ Opc = AMDGPU::SCRATCH_STORE_SHORT_SADDR_t16;
+ } else {
+ Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
+ : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
+ }
+
auto *MBB = MI->getParent();
bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
if (IsWWMRegSpill) {
@@ -2406,6 +2415,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
MI->eraseFromParent();
return true;
}
+ case AMDGPU::SI_SPILL_V16_RESTORE:
case AMDGPU::SI_SPILL_V32_RESTORE:
case AMDGPU::SI_SPILL_V64_RESTORE:
case AMDGPU::SI_SPILL_V96_RESTORE:
@@ -2455,8 +2465,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
MFI->getStackPtrOffsetReg());
- unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
- : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
+ unsigned Opc;
+ if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_RESTORE) {
+ Opc = AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16;
+ } else {
+ Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
+ : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
+ }
auto *MBB = MI->getParent();
bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
if (IsWWMRegSpill) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index c521d0dd3ad2d..6a92e54b69edc 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -2483,6 +2483,8 @@ bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo) {
// (move from MC* level to Target* level). Return size in bits.
unsigned getRegBitWidth(unsigned RCID) {
switch (RCID) {
+ case AMDGPU::VGPR_16RegClassID:
+ case AMDGPU::VGPR_16_Lo128RegClassID:
case AMDGPU::SGPR_LO16RegClassID:
case AMDGPU::AGPR_LO16RegClassID:
return 16;
diff --git a/llvm/test/CodeGen/AMDGPU/spillv16.ll b/llvm/test/CodeGen/AMDGPU/spillv16.ll
new file mode 100644
index 0000000000000..0e45df223465d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/spillv16.ll
@@ -0,0 +1,391 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GCN,GCN-TRUE16
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GCN,GCN-FAKE16
+
+define void @spill_i16_alu() {
+; GCN-TRUE16-LABEL: spill_i16_alu:
+; GCN-TRUE16: ; %bb.0: ; %entry
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill
+; GCN-TRUE16-NEXT: ;;#ASMSTART
+; GCN-TRUE16-NEXT: ;;#ASMEND
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 ; 2-byte Folded Reload
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 dlc
+; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: spill_i16_alu:
+; GCN-FAKE16: ; %bb.0: ; %entry
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: v_add_nc_u16 v0, 0x7b, v0
+; GCN-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill
+; GCN-FAKE16-NEXT: ;;#ASMSTART
+; GCN-FAKE16-NEXT: ;;#ASMEND
+; GCN-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:4 ; 4-byte Folded Reload
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc
+; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %alloca = alloca i16, i32 1, align 4, addrspace(5)
+
+ %aptr = getelementptr i16, ptr addrspace(5) %alloca, i32 0
+ %a = load volatile i16, ptr addrspace(5) %aptr
+ %add = add i16 %a, 123
+
+ ; Force %a to spill.
+ call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
+
+ %outptr = getelementptr i16, ptr addrspace(5) %alloca, i32 0
+ store volatile i16 %add, ptr addrspace(5) %outptr
+
+ ret void
+}
+
+define void @spill_i16_alu_two_vals() {
+; GCN-TRUE16-LABEL: spill_i16_alu_two_vals:
+; GCN-TRUE16: ; %bb.0: ; %entry
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:6 ; 2-byte Folded Spill
+; GCN-TRUE16-NEXT: ;;#ASMSTART
+; GCN-TRUE16-NEXT: ;;#ASMEND
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:4 glc dlc
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_load_d16_hi_b16 v0, off, s32 offset:6 ; 2-byte Folded Reload
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l
+; GCN-TRUE16-NEXT: scratch_store_d16_hi_b16 off, v0, s32 dlc
+; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:4 dlc
+; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: spill_i16_alu_two_vals:
+; GCN-FAKE16: ; %bb.0: ; %entry
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: v_add_nc_u16 v0, 0x7b, v0
+; GCN-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
+; GCN-FAKE16-NEXT: ;;#ASMSTART
+; GCN-FAKE16-NEXT: ;;#ASMEND
+; GCN-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:4 glc dlc
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_load_b32 v1, off, s32 offset:8 ; 4-byte Folded Reload
+; GCN-FAKE16-NEXT: v_add_nc_u16 v0, 0x7b, v0
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b16 off, v1, s32 dlc
+; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 offset:4 dlc
+; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %alloca = alloca i16, i32 1, align 4, addrspace(5)
+ %alloca2 = alloca i16, i32 1, align 4, addrspace(5)
+
+ %aptr = getelementptr i16, ptr addrspace(5) %alloca, i32 0
+ %a = load volatile i16, ptr addrspace(5) %aptr
+ %add = add i16 %a, 123
+
+ ; Force %a to spill.
+ call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
+
+ %bptr = getelementptr i16, ptr addrspace(5) %alloca2, i32 0
+ %b = load volatile i16, ptr addrspace(5) %bptr
+ %badd = add i16 %b, 123
+ %outptr = getelementptr i16, ptr addrspace(5) %alloca, i32 0
+ store volatile i16 %add, ptr addrspace(5) %outptr
+ %outptr2 = getelementptr i16, ptr addrspace(5) %alloca2, i32 0
+ store volatile i16 %badd, ptr addrspace(5) %outptr2
+
+ ret void
+}
+
+; Tests after this do not actually test 16 bit spills because there is no use of VGPR_16. They could demonstrate 16 bit spills if we update the instructions to use VGPR_16 instead of VGPR_32
+
+define void @spill_i16() {
+; GCN-TRUE16-LABEL: spill_i16:
+; GCN-TRUE16: ; %bb.0: ; %entry
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill
+; GCN-TRUE16-NEXT: ;;#ASMSTART
+; GCN-TRUE16-NEXT: ;;#ASMEND
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 ; 2-byte Folded Reload
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 dlc
+; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: spill_i16:
+; GCN-FAKE16: ; %bb.0: ; %entry
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill
+; GCN-FAKE16-NEXT: ;;#ASMSTART
+; GCN-FAKE16-NEXT: ;;#ASMEND
+; GCN-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:4 ; 4-byte Folded Reload
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc
+; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %alloca = alloca i16, i32 1, align 4, addrspace(5)
+
+ %aptr = getelementptr i16, ptr addrspace(5) %alloca, i32 0
+ %a = load volatile i16, ptr addrspace(5) %aptr
+
+ ; Force %a to spill.
+ call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
+
+ %outptr = getelementptr i16, ptr addrspace(5) %alloca, i32 0
+ store volatile i16 %a, ptr addrspace(5) %outptr
+
+ ret void
+}
+
+define void @spill_half() {
+; GCN-TRUE16-LABEL: spill_half:
+; GCN-TRUE16: ; %bb.0: ; %entry
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill
+; GCN-TRUE16-NEXT: ;;#ASMSTART
+; GCN-TRUE16-NEXT: ;;#ASMEND
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 ; 2-byte Folded Reload
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 dlc
+; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: spill_half:
+; GCN-FAKE16: ; %bb.0: ; %entry
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill
+; GCN-FAKE16-NEXT: ;;#ASMSTART
+; GCN-FAKE16-NEXT: ;;#ASMEND
+; GCN-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:4 ; 4-byte Folded Reload
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc
+; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %alloca = alloca half, i32 1, align 4, addrspace(5)
+
+ %aptr = getelementptr half, ptr addrspace(5) %alloca, i32 0
+ %a = load volatile half, ptr addrspace(5) %aptr
+
+ ; Force %a to spill.
+ call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
+
+ %outptr = getelementptr half, ptr addrspace(5) %alloca, i32 0
+ store volatile half %a, ptr addrspace(5) %outptr
+
+ ret void
+}
+
+define void @spill_i16_from_v2i16() {
+; GCN-TRUE16-LABEL: spill_i16_from_v2i16:
+; GCN-TRUE16: ; %bb.0: ; %entry
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 glc dlc
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill
+; GCN-TRUE16-NEXT: ;;#ASMSTART
+; GCN-TRUE16-NEXT: ;;#ASMEND
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:8 ; 2-byte Folded Reload
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 dlc
+; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: spill_i16_from_v2i16:
+; GCN-FAKE16: ; %bb.0: ; %entry
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:2 glc dlc
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
+; GCN-FAKE16-NEXT: ;;#ASMSTART
+; GCN-FAKE16-NEXT: ;;#ASMEND
+; GCN-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 ; 4-byte Folded Reload
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 dlc
+; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %alloca = alloca <2 x i16>, i32 2, align 1, addrspace(5)
+
+ %aptr = getelementptr i16, ptr addrspace(5) %alloca, i32 1
+ %a = load volatile i16, ptr addrspace(5) %aptr
+
+ ; Force %a to spill.
+ call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
+
+ %outptr = getelementptr i16, ptr addrspace(5) %alloca, i32 1
+ store volatile i16 %a, ptr addrspace(5) %outptr
+
+ ret void
+}
+
+define void @spill_2xi16_from_v2i16() {
+; GCN-TRUE16-LABEL: spill_2xi16_from_v2i16:
+; GCN-TRUE16: ; %bb.0: ; %entry
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 glc dlc
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:10 ; 2-byte Folded Spill
+; GCN-TRUE16-NEXT: ;;#ASMSTART
+; GCN-TRUE16-NEXT: ;;#ASMEND
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:8 ; 2-byte Folded Reload
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 dlc
+; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:10 ; 2-byte Folded Reload
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 dlc
+; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: spill_2xi16_from_v2i16:
+; GCN-FAKE16: ; %bb.0: ; %entry
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:2 glc dlc
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
+; GCN-FAKE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:12 ; 4-byte Folded Spill
+; GCN-FAKE16-NEXT: ;;#ASMSTART
+; GCN-FAKE16-NEXT: ;;#ASMEND
+; GCN-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 ; 4-byte Folded Reload
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 dlc
+; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:12 ; 4-byte Folded Reload
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc
+; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %alloca = alloca <2 x i16>, i32 2, align 1, addrspace(5)
+
+ %aptr = getelementptr i16, ptr addrspace(5) %alloca, i32 1
+ %a = load volatile i16, ptr addrspace(5) %aptr
+ %bptr = getelementptr i16, ptr addrspace(5) %alloca, i32 0
+ %b = load volatile i16, ptr addrspace(5) %bptr
+
+ ; Force %a to spill.
+ call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
+
+ %outptr = getelementptr i16, ptr addrspace(5) %alloca, i32 1
+ store volatile i16 %a, ptr addrspace(5) %outptr
+ %boutptr = getelementptr i16, ptr addrspace(5) %alloca, i32 0
+ store volatile i16 %b, ptr addrspace(5) %boutptr
+
+ ret void
+}
+
+define void @spill_2xi16_from_v2i16_one_free_reg() {
+; GCN-TRUE16-LABEL: spill_2xi16_from_v2i16_one_free_reg:
+; GCN-TRUE16: ; %bb.0: ; %entry
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:2 glc dlc
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:8 ; 2-byte Folded Spill
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 glc dlc
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:10 ; 2-byte Folded Spill
+; GCN-TRUE16-NEXT: ;;#ASMSTART
+; GCN-TRUE16-NEXT: ;;#ASMEND
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:8 ; 2-byte Folded Reload
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 dlc
+; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-TRUE16-NEXT: scratch_load_d16_b16 v0, off, s32 offset:10 ; 2-byte Folded Reload
+; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GCN-TRUE16-NEXT: scratch_store_b16 off, v0, s32 dlc
+; GCN-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GCN-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-FAKE16-LABEL: spill_2xi16_from_v2i16_one_free_reg:
+; GCN-FAKE16: ; %bb.0: ; %entry
+; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-FAKE16-NEXT: scratch_load_u16 v7, off, s32 offset:2 glc...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/128060
More information about the llvm-commits
mailing list