[llvm] [AMDGPU] Do not fold an immediate into instructions with frame indexes (PR #151263)
Changpeng Fang via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 5 11:21:41 PDT 2025
https://github.com/changpeng updated https://github.com/llvm/llvm-project/pull/151263
From 162fd6bb67429283991ecad6ed6e0f223c32246b Mon Sep 17 00:00:00 2001
From: Changpeng Fang <changpeng.fang at amd.com>
Date: Tue, 29 Jul 2025 17:10:09 -0700
Subject: [PATCH 1/3] [AMDGPU] Do not fold an immediate into instructions with
frame indexes
Do not fold an immediate into an instruction that already has a frame
index operand, because the frame index may itself be resolved to
another immediate.
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 9 +--
.../CodeGen/AMDGPU/GlobalISel/flat-scratch.ll | 60 ++++++++++++++-----
llvm/test/CodeGen/AMDGPU/flat-scratch.ll | 31 +++++++---
.../AMDGPU/fold-operands-frame-index.mir | 3 +-
.../CodeGen/AMDGPU/fold-sgpr-multi-imm.mir | 3 +-
.../CodeGen/AMDGPU/frame-index-elimination.ll | 3 +-
.../issue130120-eliminate-frame-index.ll | 26 +++++---
.../local-stack-alloc-block-sp-reference.ll | 27 ++++++---
8 files changed, 112 insertions(+), 50 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index c2da937552240..49b1683bf1abe 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6120,10 +6120,11 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
!Op.isIdenticalTo(*MO))
return false;
- // Do not fold a frame index into an instruction that already has a frame
- // index. The frame index handling code doesn't handle fixing up operand
- // constraints if there are multiple indexes.
- if (Op.isFI() && MO->isFI())
+ // Do not fold a frame index or an immediate into an instruction that
+ // already has a frame index. The frame index handling code doesn't handle
+ // fixing up operand constraints if there are multiple indexes, and a
+ // frame index could possibly turn out to be another immediate.
+ if (Op.isFI() && (MO->isFI() || MO->isImm()))
return false;
}
} else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
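The practical effect of this check shows up in the fold-sgpr-multi-imm.mir
expectations updated below: once an S_ADD_I32 carries a frame index, the
immediate is no longer folded in and instead stays materialized in a
register. A minimal before/after sketch in MIR, condensed from the test
diff in this patch:

  ; before: the immediate 70 was folded next to the frame index
  %1:sreg_32 = S_ADD_I32 %stack.0, 70, implicit-def $scc
  ; after: the immediate is kept in a register and the fold is refused
  %0:sreg_32 = S_MOV_B32 70
  %1:sreg_32 = S_ADD_I32 %stack.0, %0, implicit-def $scc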
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index a066b15f84d6b..e6a8baceee020 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -1917,8 +1917,9 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_movk_i32 s0, 0x3e80
; GFX9-NEXT: v_mov_b32_e32 v0, 15
-; GFX9-NEXT: s_movk_i32 s0, 0x3e84
+; GFX9-NEXT: s_add_i32 s0, s0, 4
; GFX9-NEXT: scratch_store_dword off, v0, s0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
@@ -1933,7 +1934,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; GFX10-NEXT: v_mov_b32_e32 v0, 13
; GFX10-NEXT: v_mov_b32_e32 v1, 15
-; GFX10-NEXT: s_movk_i32 s0, 0x3e84
+; GFX10-NEXT: s_movk_i32 s0, 0x3e80
+; GFX10-NEXT: s_add_i32 s0, s0, 4
; GFX10-NEXT: scratch_store_dword off, v0, off offset:4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_store_dword off, v1, s0
@@ -1945,10 +1947,11 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX942-LABEL: store_load_large_imm_offset_kernel:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: v_mov_b32_e32 v0, 13
+; GFX942-NEXT: s_movk_i32 s0, 0x3e80
; GFX942-NEXT: scratch_store_dword off, v0, off offset:4 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v0, 15
-; GFX942-NEXT: s_movk_i32 s0, 0x3e84
+; GFX942-NEXT: s_add_i32 s0, s0, 4
; GFX942-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
@@ -1958,7 +1961,9 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX11-LABEL: store_load_large_imm_offset_kernel:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; GFX11-NEXT: s_movk_i32 s0, 0x3e84
+; GFX11-NEXT: s_movk_i32 s0, 0x3e80
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_i32 s0, s0, 4
; GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc
@@ -1986,8 +1991,9 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; UNALIGNED_GFX9-NEXT: s_mov_b32 s0, 0
; UNALIGNED_GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4
; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT: s_movk_i32 s0, 0x3e80
; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v0, 15
-; UNALIGNED_GFX9-NEXT: s_movk_i32 s0, 0x3e84
+; UNALIGNED_GFX9-NEXT: s_add_i32 s0, s0, 4
; UNALIGNED_GFX9-NEXT: scratch_store_dword off, v0, s0
; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0)
; UNALIGNED_GFX9-NEXT: scratch_load_dword v0, off, s0 glc
@@ -2002,7 +2008,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v0, 13
; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, 15
-; UNALIGNED_GFX10-NEXT: s_movk_i32 s0, 0x3e84
+; UNALIGNED_GFX10-NEXT: s_movk_i32 s0, 0x3e80
+; UNALIGNED_GFX10-NEXT: s_add_i32 s0, s0, 4
; UNALIGNED_GFX10-NEXT: scratch_store_dword off, v0, off offset:4
; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; UNALIGNED_GFX10-NEXT: scratch_store_dword off, v1, s0
@@ -2014,10 +2021,11 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; UNALIGNED_GFX942-LABEL: store_load_large_imm_offset_kernel:
; UNALIGNED_GFX942: ; %bb.0: ; %bb
; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 13
+; UNALIGNED_GFX942-NEXT: s_movk_i32 s0, 0x3e80
; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, off offset:4 sc0 sc1
; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0)
; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 15
-; UNALIGNED_GFX942-NEXT: s_movk_i32 s0, 0x3e84
+; UNALIGNED_GFX942-NEXT: s_add_i32 s0, s0, 4
; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0)
; UNALIGNED_GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
@@ -2027,7 +2035,9 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; UNALIGNED_GFX11-LABEL: store_load_large_imm_offset_kernel:
; UNALIGNED_GFX11: ; %bb.0: ; %bb
; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; UNALIGNED_GFX11-NEXT: s_movk_i32 s0, 0x3e84
+; UNALIGNED_GFX11-NEXT: s_movk_i32 s0, 0x3e80
+; UNALIGNED_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; UNALIGNED_GFX11-NEXT: s_add_i32 s0, s0, 4
; UNALIGNED_GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 dlc
; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; UNALIGNED_GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc
@@ -2061,11 +2071,13 @@ define void @store_load_large_imm_offset_foo() {
; GFX9-LABEL: store_load_large_imm_offset_foo:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_movk_i32 s0, 0x3e80
; GFX9-NEXT: v_mov_b32_e32 v0, 13
+; GFX9-NEXT: s_add_i32 s1, s32, s0
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 15
-; GFX9-NEXT: s_add_i32 s0, s32, 0x3e84
+; GFX9-NEXT: s_add_i32 s0, s1, 4
; GFX9-NEXT: scratch_store_dword off, v0, s0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
@@ -2076,8 +2088,10 @@ define void @store_load_large_imm_offset_foo() {
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 13
+; GFX10-NEXT: s_movk_i32 s0, 0x3e80
; GFX10-NEXT: v_mov_b32_e32 v1, 15
-; GFX10-NEXT: s_add_i32 s0, s32, 0x3e84
+; GFX10-NEXT: s_add_i32 s1, s32, s0
+; GFX10-NEXT: s_add_i32 s0, s1, 4
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_store_dword off, v1, s0
@@ -2089,11 +2103,13 @@ define void @store_load_large_imm_offset_foo() {
; GFX942-LABEL: store_load_large_imm_offset_foo:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: s_movk_i32 s0, 0x3e80
; GFX942-NEXT: v_mov_b32_e32 v0, 13
+; GFX942-NEXT: s_add_i32 s1, s32, s0
; GFX942-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v0, 15
-; GFX942-NEXT: s_add_i32 s0, s32, 0x3e84
+; GFX942-NEXT: s_add_i32 s0, s1, 4
; GFX942-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
@@ -2104,7 +2120,10 @@ define void @store_load_large_imm_offset_foo() {
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; GFX11-NEXT: s_add_i32 s0, s32, 0x3e84
+; GFX11-NEXT: s_movk_i32 s0, 0x3e80
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_i32 s1, s32, s0
+; GFX11-NEXT: s_add_i32 s0, s1, 4
; GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc
@@ -2133,11 +2152,13 @@ define void @store_load_large_imm_offset_foo() {
; UNALIGNED_GFX9-LABEL: store_load_large_imm_offset_foo:
; UNALIGNED_GFX9: ; %bb.0: ; %bb
; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNALIGNED_GFX9-NEXT: s_movk_i32 s0, 0x3e80
; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v0, 13
+; UNALIGNED_GFX9-NEXT: s_add_i32 s1, s32, s0
; UNALIGNED_GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4
; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0)
; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v0, 15
-; UNALIGNED_GFX9-NEXT: s_add_i32 s0, s32, 0x3e84
+; UNALIGNED_GFX9-NEXT: s_add_i32 s0, s1, 4
; UNALIGNED_GFX9-NEXT: scratch_store_dword off, v0, s0
; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0)
; UNALIGNED_GFX9-NEXT: scratch_load_dword v0, off, s0 glc
@@ -2148,8 +2169,10 @@ define void @store_load_large_imm_offset_foo() {
; UNALIGNED_GFX10: ; %bb.0: ; %bb
; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v0, 13
+; UNALIGNED_GFX10-NEXT: s_movk_i32 s0, 0x3e80
; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, 15
-; UNALIGNED_GFX10-NEXT: s_add_i32 s0, s32, 0x3e84
+; UNALIGNED_GFX10-NEXT: s_add_i32 s1, s32, s0
+; UNALIGNED_GFX10-NEXT: s_add_i32 s0, s1, 4
; UNALIGNED_GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4
; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; UNALIGNED_GFX10-NEXT: scratch_store_dword off, v1, s0
@@ -2161,11 +2184,13 @@ define void @store_load_large_imm_offset_foo() {
; UNALIGNED_GFX942-LABEL: store_load_large_imm_offset_foo:
; UNALIGNED_GFX942: ; %bb.0: ; %bb
; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNALIGNED_GFX942-NEXT: s_movk_i32 s0, 0x3e80
; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 13
+; UNALIGNED_GFX942-NEXT: s_add_i32 s1, s32, s0
; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1
; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0)
; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 15
-; UNALIGNED_GFX942-NEXT: s_add_i32 s0, s32, 0x3e84
+; UNALIGNED_GFX942-NEXT: s_add_i32 s0, s1, 4
; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0)
; UNALIGNED_GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
@@ -2176,7 +2201,10 @@ define void @store_load_large_imm_offset_foo() {
; UNALIGNED_GFX11: ; %bb.0: ; %bb
; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; UNALIGNED_GFX11-NEXT: s_add_i32 s0, s32, 0x3e84
+; UNALIGNED_GFX11-NEXT: s_movk_i32 s0, 0x3e80
+; UNALIGNED_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; UNALIGNED_GFX11-NEXT: s_add_i32 s1, s32, s0
+; UNALIGNED_GFX11-NEXT: s_add_i32 s0, s1, 4
; UNALIGNED_GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc
; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; UNALIGNED_GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index b25d9b245f5f6..fc8883924dfbc 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -3621,7 +3621,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_movk_i32 s0, 0x3004
+; GFX9-NEXT: s_movk_i32 s0, 0x3000
+; GFX9-NEXT: s_add_i32 s0, s0, 4
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -3637,7 +3638,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; GFX10-NEXT: v_mov_b32_e32 v0, 13
; GFX10-NEXT: v_mov_b32_e32 v1, 15
-; GFX10-NEXT: s_movk_i32 s0, 0x3804
+; GFX10-NEXT: s_movk_i32 s0, 0x3800
+; GFX10-NEXT: s_add_i32 s0, s0, 4
; GFX10-NEXT: scratch_store_dword off, v0, off offset:4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664
@@ -3682,7 +3684,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:4
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3004
+; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000
+; GFX9-PAL-NEXT: s_add_i32 s0, s0, 4
; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
@@ -3716,8 +3719,9 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 13
; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 15
+; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x3800
; GFX1010-PAL-NEXT: s_mov_b32 s1, 0
-; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x3804
+; GFX1010-PAL-NEXT: s_add_i32 s0, s0, 4
; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s1 offset:4
; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664
@@ -3739,7 +3743,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 13
; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 15
-; GFX1030-PAL-NEXT: s_movk_i32 s0, 0x3804
+; GFX1030-PAL-NEXT: s_movk_i32 s0, 0x3800
+; GFX1030-PAL-NEXT: s_add_i32 s0, s0, 4
; GFX1030-PAL-NEXT: scratch_store_dword off, v0, off offset:4
; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664
@@ -3785,10 +3790,12 @@ define void @store_load_large_imm_offset_foo() {
; GFX9-LABEL: store_load_large_imm_offset_foo:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_movk_i32 s0, 0x3000
; GFX9-NEXT: v_mov_b32_e32 v0, 13
+; GFX9-NEXT: s_add_i32 s1, s32, s0
; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_add_i32 s0, s32, 0x3004
+; GFX9-NEXT: s_add_i32 s0, s1, 4
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -3800,8 +3807,10 @@ define void @store_load_large_imm_offset_foo() {
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 13
+; GFX10-NEXT: s_movk_i32 s0, 0x3800
; GFX10-NEXT: v_mov_b32_e32 v1, 15
-; GFX10-NEXT: s_add_i32 s0, s32, 0x3804
+; GFX10-NEXT: s_add_i32 s1, s32, s0
+; GFX10-NEXT: s_add_i32 s0, s1, 4
; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664
@@ -3843,10 +3852,12 @@ define void @store_load_large_imm_offset_foo() {
; GFX9-PAL-LABEL: store_load_large_imm_offset_foo:
; GFX9-PAL: ; %bb.0: ; %bb
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000
; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13
+; GFX9-PAL-NEXT: s_add_i32 s1, s32, s0
; GFX9-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT: s_add_i32 s0, s32, 0x3004
+; GFX9-PAL-NEXT: s_add_i32 s0, s1, 4
; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
@@ -3872,8 +3883,10 @@ define void @store_load_large_imm_offset_foo() {
; GFX10-PAL: ; %bb.0: ; %bb
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 13
+; GFX10-PAL-NEXT: s_movk_i32 s0, 0x3800
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15
-; GFX10-PAL-NEXT: s_add_i32 s0, s32, 0x3804
+; GFX10-PAL-NEXT: s_add_i32 s1, s32, s0
+; GFX10-PAL-NEXT: s_add_i32 s0, s1, 4
; GFX10-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664
diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir
index 7fad2f466bc9f..a88b1ecc40cc9 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir
@@ -75,7 +75,8 @@ stack:
body: |
bb.0:
; CHECK-LABEL: name: fold_frame_index__s_add_i32__fi_materializedconst_0
- ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, 256, implicit-def $scc
+ ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 256
+ ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[S_MOV_B32_]], implicit-def $scc
; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_I32_]]
; CHECK-NEXT: SI_RETURN implicit $sgpr4
%0:sreg_32 = S_MOV_B32 %stack.0
diff --git a/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir b/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir
index cc4314263bcba..2f2d727ee2c59 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir
@@ -46,7 +46,8 @@ body: |
%2:sreg_32 = S_LSHL2_ADD_U32 %0, %1, implicit-def $scc
...
# GCN-LABEL: name: test_frameindex{{$}}
-# GCN: %1:sreg_32 = S_ADD_I32 %stack.0, 70
+# GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 70
+# GCN-NEXT: %1:sreg_32 = S_ADD_I32 %stack.0, [[S_MOV_B32_]]
---
name: test_frameindex
tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
index 15cda622b902d..f2fe61f5376e4 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -360,7 +360,8 @@ entry:
; s_add_i32.
; GCN-LABEL: {{^}}fi_sop2_s_add_u32_literal_error:
-; GCN: s_add_u32 [[ADD_LO:s[0-9]+]], 0, 0x2010
+; GCN: s_movk_i32 [[S_MOVK_I32_:s[0-9]+]], 0x1000
+; GCN: s_add_u32 [[ADD_LO:s[0-9]+]], 0x1010, [[S_MOVK_I32_]]
; GCN: s_addc_u32 [[ADD_HI:s[0-9]+]], s{{[0-9]+}}, 0
define amdgpu_kernel void @fi_sop2_s_add_u32_literal_error() #0 {
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll b/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll
index 1c298014e33e7..300124848c1aa 100644
--- a/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll
+++ b/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll
@@ -6,16 +6,24 @@ define amdgpu_gfx [13 x i32] @issue130120() {
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-NEXT: s_add_i32 s0, s32, 0xf0
-; CHECK-NEXT: s_add_i32 s1, s32, 0xf4
-; CHECK-NEXT: s_add_i32 s2, s32, 0xf8
-; CHECK-NEXT: s_add_i32 s3, s32, 0xfc
+; CHECK-NEXT: s_movk_i32 s1, 0xf4
+; CHECK-NEXT: s_movk_i32 s2, 0xf8
+; CHECK-NEXT: s_movk_i32 s3, 0xfc
+; CHECK-NEXT: s_movk_i32 s34, 0x100
; CHECK-NEXT: v_mov_b32_e32 v1, v0
-; CHECK-NEXT: s_add_i32 s34, s32, 0x100
-; CHECK-NEXT: s_add_i32 s35, s32, 0x104
-; CHECK-NEXT: s_add_i32 s36, s32, 0x108
-; CHECK-NEXT: s_add_i32 s37, s32, 0x110
-; CHECK-NEXT: s_add_i32 s38, s32, 0x120
+; CHECK-NEXT: s_movk_i32 s35, 0x104
+; CHECK-NEXT: s_movk_i32 s36, 0x108
+; CHECK-NEXT: s_movk_i32 s37, 0x110
+; CHECK-NEXT: s_movk_i32 s38, 0x120
+; CHECK-NEXT: s_add_i32 s0, s32, 0xf0
+; CHECK-NEXT: s_add_i32 s1, s32, s1
+; CHECK-NEXT: s_add_i32 s2, s32, s2
+; CHECK-NEXT: s_add_i32 s3, s32, s3
+; CHECK-NEXT: s_add_i32 s34, s32, s34
+; CHECK-NEXT: s_add_i32 s35, s32, s35
+; CHECK-NEXT: s_add_i32 s36, s32, s36
+; CHECK-NEXT: s_add_i32 s37, s32, s37
+; CHECK-NEXT: s_add_i32 s38, s32, s38
; CHECK-NEXT: s_or_b32 s39, s32, 4
; CHECK-NEXT: s_or_b32 s40, s32, 8
; CHECK-NEXT: s_or_b32 s41, s32, 12
diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
index a3ebaec4811a9..5f0ca7bc42ae0 100644
--- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
@@ -74,7 +74,8 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(ptr addrspace(1) %out) {
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_cbranch_scc1 .LBB0_1
; FLATSCR-NEXT: ; %bb.2: ; %split
-; FLATSCR-NEXT: s_movk_i32 s0, 0x5000
+; FLATSCR-NEXT: s_movk_i32 s0, 0x2000
+; FLATSCR-NEXT: s_addk_i32 s0, 0x3000
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:208 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_movk_i32 s0, 0x3000
@@ -175,7 +176,9 @@ define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) {
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_cbranch_scc1 .LBB1_1
; FLATSCR-NEXT: ; %bb.2: ; %split
-; FLATSCR-NEXT: s_add_i32 s0, s33, 0x5000
+; FLATSCR-NEXT: s_movk_i32 s0, 0x2000
+; FLATSCR-NEXT: s_add_i32 s1, s33, s0
+; FLATSCR-NEXT: s_add_i32 s0, s1, 0x3000
; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:208 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_add_i32 s0, s33, 0x3000
@@ -223,30 +226,35 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_cbranch_scc1 .LBB2_1
; MUBUF-NEXT: ; %bb.2: ; %split
+; MUBUF-NEXT: s_movk_i32 s5, 0x12d4
; MUBUF-NEXT: v_mov_b32_e32 v1, 0x4000
-; MUBUF-NEXT: v_or_b32_e32 v0, 0x12d4, v1
+; MUBUF-NEXT: v_or_b32_e32 v0, s5, v1
+; MUBUF-NEXT: s_movk_i32 s5, 0x12d0
; MUBUF-NEXT: v_mov_b32_e32 v1, 0x4000
; MUBUF-NEXT: s_movk_i32 s4, 0x4000
; MUBUF-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_or_b32_e32 v0, 0x12d0, v1
+; MUBUF-NEXT: v_or_b32_e32 v0, s5, v1
+; MUBUF-NEXT: s_movk_i32 s5, 0x12c4
; MUBUF-NEXT: v_mov_b32_e32 v1, 0x4000
; MUBUF-NEXT: s_or_b32 s4, s4, 0x12c0
; MUBUF-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_or_b32_e32 v0, 0x12c4, v1
-; MUBUF-NEXT: v_mov_b32_e32 v3, 0x4000
+; MUBUF-NEXT: v_or_b32_e32 v0, s5, v1
; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_mov_b32_e32 v0, s4
-; MUBUF-NEXT: v_or_b32_e32 v2, 0x12cc, v3
+; MUBUF-NEXT: s_movk_i32 s4, 0x12cc
+; MUBUF-NEXT: v_mov_b32_e32 v3, 0x4000
+; MUBUF-NEXT: v_or_b32_e32 v2, s4, v3
+; MUBUF-NEXT: s_movk_i32 s4, 0x12c8
; MUBUF-NEXT: v_mov_b32_e32 v6, 0x4000
; MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_mov_b32_e32 v7, 0x4000
; MUBUF-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_or_b32_e32 v2, 0x12c8, v6
+; MUBUF-NEXT: v_or_b32_e32 v2, s4, v6
; MUBUF-NEXT: v_mov_b32_e32 v8, 0x4000
; MUBUF-NEXT: v_mov_b32_e32 v9, 0x4000
; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen glc
@@ -298,7 +306,8 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_cbranch_scc1 .LBB2_1
; FLATSCR-NEXT: ; %bb.2: ; %split
-; FLATSCR-NEXT: s_movk_i32 s0, 0x3000
+; FLATSCR-NEXT: s_movk_i32 s0, 0x1000
+; FLATSCR-NEXT: s_addk_i32 s0, 0x2000
; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s0 offset:720 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 offset:704 glc
From 067d960fc1ccfec765e6f08be54af774dcf71b71 Mon Sep 17 00:00:00 2001
From: Changpeng Fang <changpeng.fang at amd.com>
Date: Wed, 30 Jul 2025 12:19:43 -0700
Subject: [PATCH 2/3] [AMDGPU] Add the new test that fails otherwise
---
.../AMDGPU/no-folding-imm-to-inst-with-fi.ll | 114 ++++++++++++++++++
1 file changed, 114 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll
diff --git a/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll b/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll
new file mode 100644
index 0000000000000..c868b2f97be8e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll
@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s
+
+; Without the fix, llc asserts on this input.
+
+define protected amdgpu_kernel void @no_folding_imm_to_inst_with_fi(<16 x i64> %arg) {
+; CHECK-LABEL: no_folding_imm_to_inst_with_fi:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: s_load_b512 s[16:31], s[4:5], 0x64
+; CHECK-NEXT: s_load_b512 s[0:15], s[4:5], 0x24
+; CHECK-NEXT: s_mov_b64 s[34:35], src_private_base
+; CHECK-NEXT: s_movk_i32 s33, 0x70
+; CHECK-NEXT: s_movk_i32 s34, 0x60
+; CHECK-NEXT: v_or_b32_e64 v0, 0, 16
+; CHECK-NEXT: s_or_b32 s36, 0x80, s33
+; CHECK-NEXT: s_mov_b32 s37, s35
+; CHECK-NEXT: v_dual_mov_b32 v1, s35 :: v_dual_mov_b32 v20, s36
+; CHECK-NEXT: s_or_b32 s38, 0x80, s34
+; CHECK-NEXT: s_mov_b32 s39, s35
+; CHECK-NEXT: v_dual_mov_b32 v12, 0 :: v_dual_mov_b32 v13, s35
+; CHECK-NEXT: v_mov_b32_e32 v21, s37
+; CHECK-NEXT: s_wait_kmcnt 0x0
+; CHECK-NEXT: flat_store_b128 v[0:1], v[0:3] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: s_movk_i32 s34, 0x80
+; CHECK-NEXT: v_mov_b32_e32 v22, s38
+; CHECK-NEXT: s_wait_alu 0xfffe
+; CHECK-NEXT: v_dual_mov_b32 v34, s34 :: v_dual_mov_b32 v23, s39
+; CHECK-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v7, s27
+; CHECK-NEXT: s_movk_i32 s20, 0x50
+; CHECK-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s22
+; CHECK-NEXT: s_mov_b32 s21, s35
+; CHECK-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v1, s29
+; CHECK-NEXT: v_dual_mov_b32 v2, s30 :: v_dual_mov_b32 v3, s31
+; CHECK-NEXT: v_mov_b32_e32 v4, s24
+; CHECK-NEXT: s_wait_alu 0xfffe
+; CHECK-NEXT: s_or_b32 s20, 0x80, s20
+; CHECK-NEXT: v_dual_mov_b32 v5, s25 :: v_dual_mov_b32 v6, s26
+; CHECK-NEXT: v_mov_b32_e32 v25, s21
+; CHECK-NEXT: s_wait_alu 0xfffe
+; CHECK-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v24, s20
+; CHECK-NEXT: flat_store_b128 v[12:13], v[0:3] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: flat_store_b128 v[20:21], v[0:3] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: flat_store_b128 v[22:23], v[4:7] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: flat_store_b128 v[24:25], v[8:11] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: v_dual_mov_b32 v1, s17 :: v_dual_mov_b32 v2, s18
+; CHECK-NEXT: s_mov_b32 s17, s35
+; CHECK-NEXT: v_dual_mov_b32 v5, s13 :: v_dual_mov_b32 v6, s14
+; CHECK-NEXT: s_mov_b32 s13, s35
+; CHECK-NEXT: v_dual_mov_b32 v9, s9 :: v_dual_mov_b32 v10, s10
+; CHECK-NEXT: s_mov_b32 s9, s35
+; CHECK-NEXT: v_mov_b32_e32 v0, s16
+; CHECK-NEXT: s_or_b32 s16, 0x80, 64
+; CHECK-NEXT: v_dual_mov_b32 v13, s5 :: v_dual_mov_b32 v14, s6
+; CHECK-NEXT: s_mov_b32 s5, s35
+; CHECK-NEXT: s_wait_alu 0xfffe
+; CHECK-NEXT: v_dual_mov_b32 v27, s17 :: v_dual_mov_b32 v4, s12
+; CHECK-NEXT: s_or_b32 s12, 0x80, 48
+; CHECK-NEXT: v_dual_mov_b32 v29, s13 :: v_dual_mov_b32 v8, s8
+; CHECK-NEXT: s_or_b32 s8, 0x80, 32
+; CHECK-NEXT: v_dual_mov_b32 v31, s9 :: v_dual_mov_b32 v12, s4
+; CHECK-NEXT: s_or_b32 s4, 0x80, 16
+; CHECK-NEXT: v_dual_mov_b32 v3, s19 :: v_dual_mov_b32 v26, s16
+; CHECK-NEXT: v_dual_mov_b32 v33, s5 :: v_dual_mov_b32 v16, s0
+; CHECK-NEXT: v_mov_b32_e32 v19, s3
+; CHECK-NEXT: s_wait_alu 0xfffe
+; CHECK-NEXT: v_dual_mov_b32 v7, s15 :: v_dual_mov_b32 v28, s12
+; CHECK-NEXT: v_dual_mov_b32 v11, s11 :: v_dual_mov_b32 v30, s8
+; CHECK-NEXT: v_dual_mov_b32 v15, s7 :: v_dual_mov_b32 v32, s4
+; CHECK-NEXT: v_mov_b32_e32 v35, s35
+; CHECK-NEXT: v_dual_mov_b32 v17, s1 :: v_dual_mov_b32 v18, s2
+; CHECK-NEXT: flat_store_b128 v[26:27], v[0:3] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: flat_store_b128 v[28:29], v[4:7] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: flat_store_b128 v[30:31], v[8:11] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: flat_store_b128 v[32:33], v[12:15] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: flat_store_b128 v[34:35], v[16:19] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: flat_load_b128 v[0:3], v[22:23] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: flat_load_b128 v[0:3], v[26:27] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: flat_load_b128 v[0:3], v[24:25] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: flat_load_b128 v[0:3], v[30:31] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: flat_load_b128 v[0:3], v[28:29] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: flat_load_b128 v[0:3], v[34:35] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: flat_load_b128 v[0:3], v[32:33] scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: s_endpgm
+bb:
+ %alloca = alloca <4 x i64>, align 32, addrspace(5)
+ %alloca1 = alloca <16 x i64>, align 128, addrspace(5)
+ %addrspacecast = addrspacecast ptr addrspace(5) %alloca to ptr
+ store volatile <4 x i64> poison, ptr %addrspacecast, align 32
+ %addrspacecast2 = addrspacecast ptr addrspace(5) %alloca1 to ptr
+ store volatile <16 x i64> %arg, ptr %addrspacecast2, align 128
+ %addrspacecast3 = addrspacecast ptr addrspace(5) %alloca1 to ptr
+ %load = load volatile <16 x i64>, ptr %addrspacecast3, align 128
+ ret void
+}
From 392591b69d4b5aa41c17791f4a7464786ba17ec9 Mon Sep 17 00:00:00 2001
From: Changpeng Fang <changpeng.fang at amd.com>
Date: Tue, 5 Aug 2025 11:16:13 -0700
Subject: [PATCH 3/3] [AMDGPU] Update the patch based on review comments
1) Extend the restriction to all non-register (!isReg()) operands: the
same issue probably exists for other exotic operand types.
2) Update the newly added lit test:
-- remove unnecessary addrspacecasts
-- store a value instead of poison
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 10 +-
.../AMDGPU/no-folding-imm-to-inst-with-fi.ll | 112 +++++++++---------
2 files changed, 58 insertions(+), 64 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 49b1683bf1abe..ff9f40bac02ea 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6120,11 +6120,11 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
!Op.isIdenticalTo(*MO))
return false;
- // Do not fold a frame index or an immediate into an instruction that
- // already has a frame index. The frame index handling code doesn't handle
- // fixing up operand constraints if there are multiple indexes, and a
- // frame index could possibly turn out to be another immediate.
- if (Op.isFI() && (MO->isFI() || MO->isImm()))
+ // Do not fold a non-inlineable, non-register operand into an
+ // instruction that already has a frame index. The frame index handling
+ // code does not cope with a frame index coexisting with another
+ // non-register operand, unless that operand is an inlineable immediate.
+ if (Op.isFI())
return false;
}
} else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() &&
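With the check widened to every non-register operand, the same guard also
covers the original multi-frame-index case: a second frame index is just
another non-register operand that must not be folded next to %stack.0. A
hypothetical MIR sketch (illustration only, not taken from the test suite)
of a fold that isOperandLegal now refuses:

  ; %stack.1 is a non-register operand; folding it into the S_ADD_I32
  ; that already carries %stack.0 is rejected
  %0:sreg_32 = S_MOV_B32 %stack.1
  %1:sreg_32 = S_ADD_I32 %stack.0, %0, implicit-def $scc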
diff --git a/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll b/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll
index c868b2f97be8e..6d0aa1e784530 100644
--- a/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll
@@ -1,79 +1,75 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s
-; Without the fix, llc asserts on this input.
-
-define protected amdgpu_kernel void @no_folding_imm_to_inst_with_fi(<16 x i64> %arg) {
+define protected amdgpu_kernel void @no_folding_imm_to_inst_with_fi(<4 x i64> %val4, <16 x i64> %val16) {
; CHECK-LABEL: no_folding_imm_to_inst_with_fi:
; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_clause 0x1
-; CHECK-NEXT: s_load_b512 s[16:31], s[4:5], 0x64
-; CHECK-NEXT: s_load_b512 s[0:15], s[4:5], 0x24
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: s_load_b256 s[36:43], s[4:5], 0x24
+; CHECK-NEXT: s_load_b512 s[16:31], s[4:5], 0xe4
+; CHECK-NEXT: s_load_b512 s[0:15], s[4:5], 0xa4
; CHECK-NEXT: s_mov_b64 s[34:35], src_private_base
; CHECK-NEXT: s_movk_i32 s33, 0x70
; CHECK-NEXT: s_movk_i32 s34, 0x60
-; CHECK-NEXT: v_or_b32_e64 v0, 0, 16
-; CHECK-NEXT: s_or_b32 s36, 0x80, s33
-; CHECK-NEXT: s_mov_b32 s37, s35
-; CHECK-NEXT: v_dual_mov_b32 v1, s35 :: v_dual_mov_b32 v20, s36
-; CHECK-NEXT: s_or_b32 s38, 0x80, s34
-; CHECK-NEXT: s_mov_b32 s39, s35
-; CHECK-NEXT: v_dual_mov_b32 v12, 0 :: v_dual_mov_b32 v13, s35
-; CHECK-NEXT: v_mov_b32_e32 v21, s37
+; CHECK-NEXT: s_or_b32 s44, 0x80, s33
+; CHECK-NEXT: s_mov_b32 s45, s35
+; CHECK-NEXT: s_or_b32 s46, 0x80, s34
+; CHECK-NEXT: s_mov_b32 s47, s35
+; CHECK-NEXT: v_dual_mov_b32 v20, s44 :: v_dual_mov_b32 v21, s45
+; CHECK-NEXT: v_dual_mov_b32 v22, s46 :: v_dual_mov_b32 v23, s47
+; CHECK-NEXT: s_movk_i32 s34, 0x80
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT: v_dual_mov_b32 v34, s34 :: v_dual_mov_b32 v35, s35
; CHECK-NEXT: s_wait_kmcnt 0x0
-; CHECK-NEXT: flat_store_b128 v[0:1], v[0:3] scope:SCOPE_SYS
+; CHECK-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v1, s41
+; CHECK-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43
+; CHECK-NEXT: v_dual_mov_b32 v4, s36 :: v_dual_mov_b32 v5, s37
+; CHECK-NEXT: v_dual_mov_b32 v6, s38 :: v_dual_mov_b32 v7, s39
+; CHECK-NEXT: scratch_store_b128 off, v[0:3], off offset:16 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
-; CHECK-NEXT: s_movk_i32 s34, 0x80
-; CHECK-NEXT: v_mov_b32_e32 v22, s38
-; CHECK-NEXT: s_wait_alu 0xfffe
-; CHECK-NEXT: v_dual_mov_b32 v34, s34 :: v_dual_mov_b32 v23, s39
-; CHECK-NEXT: v_dual_mov_b32 v8, s20 :: v_dual_mov_b32 v7, s27
+; CHECK-NEXT: v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v1, s21
; CHECK-NEXT: s_movk_i32 s20, 0x50
-; CHECK-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s22
-; CHECK-NEXT: s_mov_b32 s21, s35
-; CHECK-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v1, s29
-; CHECK-NEXT: v_dual_mov_b32 v2, s30 :: v_dual_mov_b32 v3, s31
-; CHECK-NEXT: v_mov_b32_e32 v4, s24
+; CHECK-NEXT: v_dual_mov_b32 v8, s28 :: v_dual_mov_b32 v9, s29
+; CHECK-NEXT: v_dual_mov_b32 v10, s30 :: v_dual_mov_b32 v11, s31
; CHECK-NEXT: s_wait_alu 0xfffe
; CHECK-NEXT: s_or_b32 s20, 0x80, s20
-; CHECK-NEXT: v_dual_mov_b32 v5, s25 :: v_dual_mov_b32 v6, s26
-; CHECK-NEXT: v_mov_b32_e32 v25, s21
+; CHECK-NEXT: s_mov_b32 s21, s35
+; CHECK-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
+; CHECK-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
+; CHECK-NEXT: v_dual_mov_b32 v2, s22 :: v_dual_mov_b32 v3, s23
; CHECK-NEXT: s_wait_alu 0xfffe
-; CHECK-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v24, s20
-; CHECK-NEXT: flat_store_b128 v[12:13], v[0:3] scope:SCOPE_SYS
+; CHECK-NEXT: v_dual_mov_b32 v25, s21 :: v_dual_mov_b32 v24, s20
+; CHECK-NEXT: scratch_store_b128 off, v[4:7], off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
-; CHECK-NEXT: flat_store_b128 v[20:21], v[0:3] scope:SCOPE_SYS
+; CHECK-NEXT: flat_store_b128 v[20:21], v[8:11] scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
-; CHECK-NEXT: flat_store_b128 v[22:23], v[4:7] scope:SCOPE_SYS
+; CHECK-NEXT: flat_store_b128 v[22:23], v[12:15] scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
-; CHECK-NEXT: flat_store_b128 v[24:25], v[8:11] scope:SCOPE_SYS
+; CHECK-NEXT: flat_store_b128 v[24:25], v[0:3] scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
-; CHECK-NEXT: v_dual_mov_b32 v1, s17 :: v_dual_mov_b32 v2, s18
-; CHECK-NEXT: s_mov_b32 s17, s35
-; CHECK-NEXT: v_dual_mov_b32 v5, s13 :: v_dual_mov_b32 v6, s14
-; CHECK-NEXT: s_mov_b32 s13, s35
-; CHECK-NEXT: v_dual_mov_b32 v9, s9 :: v_dual_mov_b32 v10, s10
-; CHECK-NEXT: s_mov_b32 s9, s35
-; CHECK-NEXT: v_mov_b32_e32 v0, s16
+; CHECK-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s17
; CHECK-NEXT: s_or_b32 s16, 0x80, 64
-; CHECK-NEXT: v_dual_mov_b32 v13, s5 :: v_dual_mov_b32 v14, s6
-; CHECK-NEXT: s_mov_b32 s5, s35
-; CHECK-NEXT: s_wait_alu 0xfffe
-; CHECK-NEXT: v_dual_mov_b32 v27, s17 :: v_dual_mov_b32 v4, s12
+; CHECK-NEXT: s_mov_b32 s17, s35
+; CHECK-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13
; CHECK-NEXT: s_or_b32 s12, 0x80, 48
-; CHECK-NEXT: v_dual_mov_b32 v29, s13 :: v_dual_mov_b32 v8, s8
+; CHECK-NEXT: s_mov_b32 s13, s35
+; CHECK-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
; CHECK-NEXT: s_or_b32 s8, 0x80, 32
-; CHECK-NEXT: v_dual_mov_b32 v31, s9 :: v_dual_mov_b32 v12, s4
+; CHECK-NEXT: s_mov_b32 s9, s35
+; CHECK-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v13, s5
; CHECK-NEXT: s_or_b32 s4, 0x80, 16
-; CHECK-NEXT: v_dual_mov_b32 v3, s19 :: v_dual_mov_b32 v26, s16
-; CHECK-NEXT: v_dual_mov_b32 v33, s5 :: v_dual_mov_b32 v16, s0
-; CHECK-NEXT: v_mov_b32_e32 v19, s3
+; CHECK-NEXT: s_mov_b32 s5, s35
+; CHECK-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v3, s19
; CHECK-NEXT: s_wait_alu 0xfffe
-; CHECK-NEXT: v_dual_mov_b32 v7, s15 :: v_dual_mov_b32 v28, s12
-; CHECK-NEXT: v_dual_mov_b32 v11, s11 :: v_dual_mov_b32 v30, s8
-; CHECK-NEXT: v_dual_mov_b32 v15, s7 :: v_dual_mov_b32 v32, s4
-; CHECK-NEXT: v_mov_b32_e32 v35, s35
-; CHECK-NEXT: v_dual_mov_b32 v17, s1 :: v_dual_mov_b32 v18, s2
+; CHECK-NEXT: v_dual_mov_b32 v27, s17 :: v_dual_mov_b32 v26, s16
+; CHECK-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15
+; CHECK-NEXT: v_dual_mov_b32 v29, s13 :: v_dual_mov_b32 v28, s12
+; CHECK-NEXT: v_dual_mov_b32 v31, s9 :: v_dual_mov_b32 v30, s8
+; CHECK-NEXT: v_dual_mov_b32 v33, s5 :: v_dual_mov_b32 v32, s4
+; CHECK-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
+; CHECK-NEXT: v_dual_mov_b32 v14, s6 :: v_dual_mov_b32 v15, s7
+; CHECK-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1
+; CHECK-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3
; CHECK-NEXT: flat_store_b128 v[26:27], v[0:3] scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
; CHECK-NEXT: flat_store_b128 v[28:29], v[4:7] scope:SCOPE_SYS
@@ -104,11 +100,9 @@ define protected amdgpu_kernel void @no_folding_imm_to_inst_with_fi(<16 x i64> %
bb:
%alloca = alloca <4 x i64>, align 32, addrspace(5)
%alloca1 = alloca <16 x i64>, align 128, addrspace(5)
- %addrspacecast = addrspacecast ptr addrspace(5) %alloca to ptr
- store volatile <4 x i64> poison, ptr %addrspacecast, align 32
- %addrspacecast2 = addrspacecast ptr addrspace(5) %alloca1 to ptr
- store volatile <16 x i64> %arg, ptr %addrspacecast2, align 128
- %addrspacecast3 = addrspacecast ptr addrspace(5) %alloca1 to ptr
- %load = load volatile <16 x i64>, ptr %addrspacecast3, align 128
+ store volatile <4 x i64> %val4, ptr addrspace(5) %alloca
+ %ascast = addrspacecast ptr addrspace(5) %alloca1 to ptr
+ store volatile <16 x i64> %val16, ptr %ascast
+ %load = load volatile <16 x i64>, ptr %ascast
ret void
}