[llvm] [AMDGPU] Support true16 spill restore with sram-ecc (PR #165320)
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 27 14:31:05 PDT 2025
https://github.com/rampitec created https://github.com/llvm/llvm-project/pull/165320
None
>From 211f4abedda46107492f79683992e6ca42aecf74 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Mon, 27 Oct 2025 14:30:10 -0700
Subject: [PATCH] [AMDGPU] Support true16 spill restore with sram-ecc
---
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 23 +-
llvm/test/CodeGen/AMDGPU/spill_kill_v16.mir | 67 ++++++
llvm/test/CodeGen/AMDGPU/spillv16.ll | 235 ++++++++++++++++++++
llvm/test/CodeGen/AMDGPU/spillv16.mir | 22 ++
4 files changed, 346 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index d80a6f339c8f6..2cb7d8ff1a532 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1823,6 +1823,16 @@ void SIRegisterInfo::buildSpillLoadStore(
}
}
+ Register FinalValueReg = ValueReg;
+ if (LoadStoreOp == AMDGPU::SCRATCH_LOAD_UBYTE_SADDR) {
+ // If we are loading a 16-bit value with SRAMECC enabled we need a temp
+ // 32-bit VGPR to load and extract 16-bits into the final register.
+ ValueReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
+ false, 0, false);
+ SubReg = ValueReg;
+ IsKill = false;
+ }
+
MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset);
MachineMemOperand *NewMMO =
MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize,
@@ -1863,6 +1873,15 @@ void SIRegisterInfo::buildSpillLoadStore(
MIB.addImm(0); // swz
MIB.addMemOperand(NewMMO);
+ if (FinalValueReg != ValueReg) {
+ // Extract 16-bit from the loaded 32-bit value.
+ ValueReg = getSubReg(ValueReg, AMDGPU::lo16);
+ MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B16_t16_e32))
+ .addReg(FinalValueReg, getDefRegState(true))
+ .addReg(ValueReg, getKillRegState(true));
+ ValueReg = FinalValueReg;
+ }
+
if (!IsAGPR && NeedSuperRegDef)
MIB.addReg(ValueReg, RegState::ImplicitDefine);
@@ -2505,7 +2524,9 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
unsigned Opc;
if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_RESTORE) {
assert(ST.enableFlatScratch() && "Flat Scratch is not enabled!");
- Opc = AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16;
+ Opc = ST.d16PreservesUnusedBits()
+ ? AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16
+ : AMDGPU::SCRATCH_LOAD_UBYTE_SADDR;
} else {
Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE
? AMDGPU::SCRATCH_LOAD_BLOCK_SADDR
diff --git a/llvm/test/CodeGen/AMDGPU/spill_kill_v16.mir b/llvm/test/CodeGen/AMDGPU/spill_kill_v16.mir
index 0c694d9f49e18..e88e5748be6c4 100644
--- a/llvm/test/CodeGen/AMDGPU/spill_kill_v16.mir
+++ b/llvm/test/CodeGen/AMDGPU/spill_kill_v16.mir
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -march=amdgcn -verify-machineinstrs -mcpu=gfx1100 -mattr=+real-true16 -run-pass=prologepilog -o - %s | FileCheck -check-prefix=EXPANDED %s
+# RUN: llc -march=amdgcn -verify-machineinstrs -mcpu=gfx1250 -mattr=+real-true16 -run-pass=prologepilog -o - %s | FileCheck -check-prefix=SRAMECC-EXPANDED %s
---
name: spill_restore_vgpr16
@@ -31,6 +32,28 @@ body: |
; EXPANDED-NEXT: $vgpr0_lo16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.0, align 4, addrspace 5)
; EXPANDED-NEXT: $vgpr0_hi16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.1, align 4, addrspace 5)
; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $vgpr0_lo16, implicit killed renamable $vgpr0_hi16
+ ;
+ ; SRAMECC-EXPANDED-LABEL: name: spill_restore_vgpr16
+ ; SRAMECC-EXPANDED: bb.0:
+ ; SRAMECC-EXPANDED-NEXT: successors: %bb.1(0x80000000)
+ ; SRAMECC-EXPANDED-NEXT: {{ $}}
+ ; SRAMECC-EXPANDED-NEXT: S_NOP 0, implicit-def renamable $vgpr0_lo16, implicit-def renamable $vgpr0_hi16
+ ; SRAMECC-EXPANDED-NEXT: SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_hi16, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.1, align 4, addrspace 5)
+ ; SRAMECC-EXPANDED-NEXT: S_NOP 0, implicit renamable $vgpr0_lo16
+ ; SRAMECC-EXPANDED-NEXT: SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_lo16, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.0, align 4, addrspace 5)
+ ; SRAMECC-EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ ; SRAMECC-EXPANDED-NEXT: {{ $}}
+ ; SRAMECC-EXPANDED-NEXT: bb.1:
+ ; SRAMECC-EXPANDED-NEXT: successors: %bb.2(0x80000000)
+ ; SRAMECC-EXPANDED-NEXT: {{ $}}
+ ; SRAMECC-EXPANDED-NEXT: S_NOP 1
+ ; SRAMECC-EXPANDED-NEXT: {{ $}}
+ ; SRAMECC-EXPANDED-NEXT: bb.2:
+ ; SRAMECC-EXPANDED-NEXT: $vgpr1 = SCRATCH_LOAD_UBYTE_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.0, align 4, addrspace 5)
+ ; SRAMECC-EXPANDED-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e32 killed $vgpr1_lo16, implicit $exec
+ ; SRAMECC-EXPANDED-NEXT: $vgpr1 = SCRATCH_LOAD_UBYTE_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.1, align 4, addrspace 5)
+ ; SRAMECC-EXPANDED-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e32 killed $vgpr1_lo16, implicit $exec
+ ; SRAMECC-EXPANDED-NEXT: S_NOP 0, implicit killed renamable $vgpr0_lo16, implicit killed renamable $vgpr0_hi16
bb.0:
successors: %bb.1(0x80000000)
S_NOP 0, implicit-def renamable $vgpr0_lo16, implicit-def renamable $vgpr0_hi16
@@ -78,6 +101,29 @@ body: |
; EXPANDED-NEXT: $vgpr0_lo16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.0, align 4, addrspace 5)
; EXPANDED-NEXT: $vgpr0_hi16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.1, align 4, addrspace 5)
; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $vgpr0_lo16, implicit killed renamable $vgpr0_hi16
+ ;
+ ; SRAMECC-EXPANDED-LABEL: name: spill_restore_vgpr16_middle_of_block
+ ; SRAMECC-EXPANDED: bb.0:
+ ; SRAMECC-EXPANDED-NEXT: successors: %bb.1(0x80000000)
+ ; SRAMECC-EXPANDED-NEXT: {{ $}}
+ ; SRAMECC-EXPANDED-NEXT: S_NOP 0, implicit-def renamable $vgpr0_lo16, implicit-def renamable $vgpr0_hi16
+ ; SRAMECC-EXPANDED-NEXT: SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_hi16, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.1, align 4, addrspace 5)
+ ; SRAMECC-EXPANDED-NEXT: S_NOP 0, implicit renamable $vgpr0_lo16
+ ; SRAMECC-EXPANDED-NEXT: SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_lo16, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.0, align 4, addrspace 5)
+ ; SRAMECC-EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ ; SRAMECC-EXPANDED-NEXT: {{ $}}
+ ; SRAMECC-EXPANDED-NEXT: bb.1:
+ ; SRAMECC-EXPANDED-NEXT: successors: %bb.2(0x80000000)
+ ; SRAMECC-EXPANDED-NEXT: {{ $}}
+ ; SRAMECC-EXPANDED-NEXT: S_NOP 1
+ ; SRAMECC-EXPANDED-NEXT: {{ $}}
+ ; SRAMECC-EXPANDED-NEXT: bb.2:
+ ; SRAMECC-EXPANDED-NEXT: S_NOP 1
+ ; SRAMECC-EXPANDED-NEXT: $vgpr1 = SCRATCH_LOAD_UBYTE_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.0, align 4, addrspace 5)
+ ; SRAMECC-EXPANDED-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e32 killed $vgpr1_lo16, implicit $exec
+ ; SRAMECC-EXPANDED-NEXT: $vgpr1 = SCRATCH_LOAD_UBYTE_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.1, align 4, addrspace 5)
+ ; SRAMECC-EXPANDED-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e32 killed $vgpr1_lo16, implicit $exec
+ ; SRAMECC-EXPANDED-NEXT: S_NOP 0, implicit killed renamable $vgpr0_lo16, implicit killed renamable $vgpr0_hi16
bb.0:
successors: %bb.1(0x80000000)
S_NOP 0, implicit-def renamable $vgpr0_lo16, implicit-def renamable $vgpr0_hi16
@@ -124,6 +170,27 @@ body: |
; EXPANDED-NEXT: bb.2:
; EXPANDED-NEXT: $vgpr0_lo16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.0, align 4, addrspace 5)
; EXPANDED-NEXT: $vgpr0_hi16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.1, align 4, addrspace 5)
+ ;
+ ; SRAMECC-EXPANDED-LABEL: name: spill_restore_vgpr16_end_of_block
+ ; SRAMECC-EXPANDED: bb.0:
+ ; SRAMECC-EXPANDED-NEXT: successors: %bb.1(0x80000000)
+ ; SRAMECC-EXPANDED-NEXT: {{ $}}
+ ; SRAMECC-EXPANDED-NEXT: S_NOP 0, implicit-def renamable $vgpr0_lo16, implicit-def renamable $vgpr0_hi16
+ ; SRAMECC-EXPANDED-NEXT: SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_hi16, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.1, align 4, addrspace 5)
+ ; SRAMECC-EXPANDED-NEXT: S_NOP 0, implicit renamable $vgpr0_lo16
+ ; SRAMECC-EXPANDED-NEXT: SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_lo16, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.0, align 4, addrspace 5)
+ ; SRAMECC-EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ ; SRAMECC-EXPANDED-NEXT: {{ $}}
+ ; SRAMECC-EXPANDED-NEXT: bb.1:
+ ; SRAMECC-EXPANDED-NEXT: successors: %bb.2(0x80000000)
+ ; SRAMECC-EXPANDED-NEXT: {{ $}}
+ ; SRAMECC-EXPANDED-NEXT: S_NOP 1
+ ; SRAMECC-EXPANDED-NEXT: {{ $}}
+ ; SRAMECC-EXPANDED-NEXT: bb.2:
+ ; SRAMECC-EXPANDED-NEXT: $vgpr1 = SCRATCH_LOAD_UBYTE_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.0, align 4, addrspace 5)
+ ; SRAMECC-EXPANDED-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e32 killed $vgpr1_lo16, implicit $exec
+ ; SRAMECC-EXPANDED-NEXT: $vgpr1 = SCRATCH_LOAD_UBYTE_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.1, align 4, addrspace 5)
+ ; SRAMECC-EXPANDED-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e32 killed $vgpr1_lo16, implicit $exec
bb.0:
successors: %bb.1(0x80000000)
S_NOP 0, implicit-def renamable $vgpr0_lo16, implicit-def renamable $vgpr0_hi16
diff --git a/llvm/test/CodeGen/AMDGPU/spillv16.ll b/llvm/test/CodeGen/AMDGPU/spillv16.ll
index 0e45df223465d..21059f7ef4a9d 100644
--- a/llvm/test/CodeGen/AMDGPU/spillv16.ll
+++ b/llvm/test/CodeGen/AMDGPU/spillv16.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GCN,GCN-TRUE16
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GCN,GCN-FAKE16
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX1250,GFX1250-TRUE16
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GFX1250,GFX1250-FAKE16
define void @spill_i16_alu() {
; GCN-TRUE16-LABEL: spill_i16_alu:
@@ -32,6 +34,41 @@ define void @spill_i16_alu() {
; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc
; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-TRUE16-LABEL: spill_i16_alu:
+; GFX1250-TRUE16: ; %bb.0: ; %entry
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l
+; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 ; 2-byte Folded Spill
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: ;;#ASMSTART
+; GFX1250-TRUE16-NEXT: ;;#ASMEND
+; GFX1250-TRUE16-NEXT: scratch_load_u8 v1, off, s32 offset:2 th:TH_LOAD_LU ; 2-byte Folded Reload
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
+; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: spill_i16_alu:
+; GFX1250-FAKE16: ; %bb.0: ; %entry
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: v_add_nc_u16 v0, 0x7b, v0
+; GFX1250-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: ;;#ASMSTART
+; GFX1250-FAKE16-NEXT: ;;#ASMEND
+; GFX1250-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
entry:
%alloca = alloca i16, i32 1, align 4, addrspace(5)
@@ -88,6 +125,51 @@ define void @spill_i16_alu_two_vals() {
; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 offset:4 dlc
; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-TRUE16-LABEL: spill_i16_alu_two_vals:
+; GFX1250-TRUE16: ; %bb.0: ; %entry
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l
+; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:6 ; 2-byte Folded Spill
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: ;;#ASMSTART
+; GFX1250-TRUE16-NEXT: ;;#ASMEND
+; GFX1250-TRUE16-NEXT: scratch_load_u16 v0, off, s32 offset:4 scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: scratch_load_u8 v1, off, s32 offset:6 th:TH_LOAD_LU ; 2-byte Folded Reload
+; GFX1250-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x7b, v0.l
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
+; GFX1250-TRUE16-NEXT: scratch_store_d16_hi_b16 off, v0, s32 scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:4 scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: spill_i16_alu_two_vals:
+; GFX1250-FAKE16: ; %bb.0: ; %entry
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: v_add_nc_u16 v0, 0x7b, v0
+; GFX1250-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: ;;#ASMSTART
+; GFX1250-FAKE16-NEXT: ;;#ASMEND
+; GFX1250-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:4 scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: scratch_load_b32 v1, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload
+; GFX1250-FAKE16-NEXT: v_add_nc_u16 v0, 0x7b, v0
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: scratch_store_b16 off, v1, s32 scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-FAKE16-NEXT: scratch_store_b16 off, v0, s32 offset:4 scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
entry:
%alloca = alloca i16, i32 1, align 4, addrspace(5)
%alloca2 = alloca i16, i32 1, align 4, addrspace(5)
@@ -140,6 +222,22 @@ define void @spill_i16() {
; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc
; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: spill_i16:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: ;;#ASMSTART
+; GFX1250-NEXT: ;;#ASMEND
+; GFX1250-NEXT: scratch_load_b32 v0, off, s32 offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
entry:
%alloca = alloca i16, i32 1, align 4, addrspace(5)
@@ -183,6 +281,22 @@ define void @spill_half() {
; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc
; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: spill_half:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s32 offset:4 ; 4-byte Folded Spill
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: ;;#ASMSTART
+; GFX1250-NEXT: ;;#ASMEND
+; GFX1250-NEXT: scratch_load_b32 v0, off, s32 offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
entry:
%alloca = alloca half, i32 1, align 4, addrspace(5)
@@ -226,6 +340,22 @@ define void @spill_i16_from_v2i16() {
; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 dlc
; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: spill_i16_from_v2i16:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: scratch_load_u16 v0, off, s32 offset:2 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: ;;#ASMSTART
+; GFX1250-NEXT: ;;#ASMEND
+; GFX1250-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
entry:
%alloca = alloca <2 x i16>, i32 2, align 1, addrspace(5)
@@ -283,6 +413,54 @@ define void @spill_2xi16_from_v2i16() {
; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc
; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-TRUE16-LABEL: spill_2xi16_from_v2i16:
+; GFX1250-TRUE16: ; %bb.0: ; %entry
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: scratch_load_u16 v0, off, s32 offset:2 scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: s_clause 0x1
+; GFX1250-TRUE16-NEXT: scratch_store_b32 off, v0, s32 offset:12
+; GFX1250-TRUE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: ;;#ASMSTART
+; GFX1250-TRUE16-NEXT: ;;#ASMEND
+; GFX1250-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: spill_2xi16_from_v2i16:
+; GFX1250-FAKE16: ; %bb.0: ; %entry
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: scratch_load_u16 v0, off, s32 offset:2 scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: s_clause 0x1
+; GFX1250-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:8
+; GFX1250-FAKE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:12 ; 4-byte Folded Spill
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: ;;#ASMSTART
+; GFX1250-FAKE16-NEXT: ;;#ASMEND
+; GFX1250-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
entry:
%alloca = alloca <2 x i16>, i32 2, align 1, addrspace(5)
@@ -341,6 +519,47 @@ define void @spill_2xi16_from_v2i16_one_free_reg() {
; GCN-FAKE16-NEXT: scratch_store_b16 off, v0, s32 dlc
; GCN-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-TRUE16-LABEL: spill_2xi16_from_v2i16_one_free_reg:
+; GFX1250-TRUE16: ; %bb.0: ; %entry
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: scratch_load_u16 v7, off, s32 offset:2 scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: ;;#ASMSTART
+; GFX1250-TRUE16-NEXT: ;;#ASMEND
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.l
+; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 offset:2 scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-TRUE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: spill_2xi16_from_v2i16_one_free_reg:
+; GFX1250-FAKE16: ; %bb.0: ; %entry
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: scratch_load_u16 v7, off, s32 offset:2 scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: ;;#ASMSTART
+; GFX1250-FAKE16-NEXT: ;;#ASMEND
+; GFX1250-FAKE16-NEXT: scratch_store_b16 off, v7, s32 offset:2 scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-FAKE16-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
entry:
%alloca = alloca <2 x i16>, i32 2, align 1, addrspace(5)
@@ -375,6 +594,22 @@ define void @spill_v2i16() {
; GCN-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc
; GCN-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: spill_v2i16:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: scratch_load_b32 v0, off, s32 offset:4 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s32 offset:8 ; 4-byte Folded Spill
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: ;;#ASMSTART
+; GFX1250-NEXT: ;;#ASMEND
+; GFX1250-NEXT: scratch_load_b32 v0, off, s32 offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: scratch_store_b32 off, v0, s32 offset:4 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
entry:
%alloca = alloca <2 x i16>, i32 2, align 1, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/spillv16.mir b/llvm/test/CodeGen/AMDGPU/spillv16.mir
index 05569bf394c43..14b73af0dcedb 100644
--- a/llvm/test/CodeGen/AMDGPU/spillv16.mir
+++ b/llvm/test/CodeGen/AMDGPU/spillv16.mir
@@ -1,6 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -march=amdgcn -verify-machineinstrs -mcpu=gfx1100 -mattr=+real-true16 -run-pass=regallocfast -o - %s | FileCheck -check-prefix=SPILLED %s
# RUN: llc -march=amdgcn -verify-machineinstrs -mcpu=gfx1100 -mattr=+real-true16 -run-pass=regallocfast,prologepilog -o - %s | FileCheck -check-prefix=EXPANDED %s
+# RUN: llc -march=amdgcn -verify-machineinstrs -mcpu=gfx1250 -mattr=+real-true16 -run-pass=regallocfast,prologepilog -o - %s | FileCheck -check-prefix=SRAMECC-EXPANDED %s
---
name: spill_restore_vgpr16
@@ -46,6 +47,27 @@ body: |
; EXPANDED-NEXT: $vgpr0_lo16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.0, addrspace 5)
; EXPANDED-NEXT: $vgpr0_hi16 = SCRATCH_LOAD_SHORT_D16_SADDR_t16 $sgpr32, 2, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.1, addrspace 5)
; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $vgpr0_lo16, implicit killed renamable $vgpr0_hi16
+ ;
+ ; SRAMECC-EXPANDED-LABEL: name: spill_restore_vgpr16
+ ; SRAMECC-EXPANDED: bb.0:
+ ; SRAMECC-EXPANDED-NEXT: successors: %bb.1(0x80000000)
+ ; SRAMECC-EXPANDED-NEXT: {{ $}}
+ ; SRAMECC-EXPANDED-NEXT: S_NOP 0, implicit-def renamable $vgpr0_lo16, implicit-def renamable $vgpr0_hi16
+ ; SRAMECC-EXPANDED-NEXT: SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_hi16, $sgpr32, 2, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.1, addrspace 5)
+ ; SRAMECC-EXPANDED-NEXT: SCRATCH_STORE_SHORT_SADDR_t16 killed $vgpr0_lo16, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into %stack.0, addrspace 5)
+ ; SRAMECC-EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc
+ ; SRAMECC-EXPANDED-NEXT: {{ $}}
+ ; SRAMECC-EXPANDED-NEXT: bb.1:
+ ; SRAMECC-EXPANDED-NEXT: successors: %bb.2(0x80000000)
+ ; SRAMECC-EXPANDED-NEXT: {{ $}}
+ ; SRAMECC-EXPANDED-NEXT: S_NOP 1
+ ; SRAMECC-EXPANDED-NEXT: {{ $}}
+ ; SRAMECC-EXPANDED-NEXT: bb.2:
+ ; SRAMECC-EXPANDED-NEXT: $vgpr1 = SCRATCH_LOAD_UBYTE_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.0, addrspace 5)
+ ; SRAMECC-EXPANDED-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e32 killed $vgpr1_lo16, implicit $exec
+ ; SRAMECC-EXPANDED-NEXT: $vgpr1 = SCRATCH_LOAD_UBYTE_SADDR $sgpr32, 2, 0, implicit $exec, implicit $flat_scr :: (load (s16) from %stack.1, addrspace 5)
+ ; SRAMECC-EXPANDED-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e32 killed $vgpr1_lo16, implicit $exec
+ ; SRAMECC-EXPANDED-NEXT: S_NOP 0, implicit killed renamable $vgpr0_lo16, implicit killed renamable $vgpr0_hi16
bb.0:
S_NOP 0, implicit-def %0:vgpr_16, implicit-def %1:vgpr_16
S_CBRANCH_SCC1 implicit undef $scc, %bb.1
More information about the llvm-commits
mailing list