[llvm-branch-commits] [llvm] AMDGPU: Handle folding frame indexes into add with immediate (PR #110738)
Matt Arsenault via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Tue Oct 1 22:25:00 PDT 2024
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/110738
From 295561a4936e932aa41fe7d80ec2aafb94f49a8e Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Tue, 1 Oct 2024 23:53:51 +0400
Subject: [PATCH 1/2] AMDGPU: Handle folding frame indexes into add with
immediate
Frame index materialization can fold the constant offset into
adds with immediate operands. The mubuf expansion is more complicated
because we also have to insert the wave size shift, so restrict this to
a single use for now. This is preparation to avoid regressions in a
future patch.
This also misses some cases due to visitation order: it depends on
the immediate having already been folded into the instruction.
---
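For illustration (not part of the patch itself), the basic scalar case
from fold-operands-frame-index.mir before and after this change:

  ; Before: the frame index is materialized separately.
  %0:sreg_32 = S_MOV_B32 %stack.0
  %1:sreg_32 = S_ADD_I32 %0, 128, implicit-def $scc

  ; After: the frame index folds directly into the add.
  %1:sreg_32 = S_ADD_I32 %stack.0, 128, implicit-def $scc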
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 14 ++++++++++++++
llvm/test/CodeGen/AMDGPU/flat-scratch.ll | 6 ++----
.../AMDGPU/fold-operands-frame-index.gfx10.mir | 3 +--
.../AMDGPU/fold-operands-frame-index.mir | 18 ++++++------------
.../materialize-frame-index-sgpr.gfx10.ll | 6 +++---
.../AMDGPU/materialize-frame-index-sgpr.ll | 12 ++++++------
6 files changed, 32 insertions(+), 27 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 1e2c77b08b9a63..fea84247f5ad88 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -194,6 +194,20 @@ bool SIFoldOperandsImpl::frameIndexMayFold(
return false;
const unsigned Opc = UseMI.getOpcode();
+ switch (Opc) {
+ case AMDGPU::S_ADD_I32:
+ case AMDGPU::V_ADD_U32_e32:
+ case AMDGPU::V_ADD_CO_U32_e32:
+ // TODO: Handle e64 variants
+ // TODO: Possibly relax hasOneUse. It matters more for mubuf, since we have
+ // to insert the wave size shift at every point we use the index.
+ // TODO: Fix depending on visit order to fold immediates into the operand
+ return UseMI.getOperand(OpNo == 1 ? 2 : 1).isImm() &&
+ MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
+ default:
+ break;
+ }
+
if (TII->isMUBUF(UseMI))
return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
if (!TII->isFLATScratch(UseMI))
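To spell out the operand arithmetic in the hunk above: for S_ADD_I32,
V_ADD_U32_e32, and V_ADD_CO_U32_e32, operand 0 is the def and the two
sources are operands 1 and 2, so `OpNo == 1 ? 2 : 1` selects the source
opposite the frame index. A minimal standalone sketch of that check
(otherAddSrcIsImm is a hypothetical helper name, not from the patch):

  #include "llvm/CodeGen/MachineInstr.h"

  // The fold is only attempted when the source opposite the frame
  // index is already an immediate.
  static bool otherAddSrcIsImm(const llvm::MachineInstr &UseMI,
                               unsigned OpNo) {
    unsigned OtherSrcIdx = (OpNo == 1) ? 2 : 1;
    return UseMI.getOperand(OtherSrcIdx).isImm();
  }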
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index ef9590b3fd33fa..af0b6360527016 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -4705,8 +4705,7 @@ define amdgpu_ps void @large_offset() {
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: s_movk_i32 s0, 0x810
-; GFX10-NEXT: s_addk_i32 s0, 0x3c0
+; GFX10-NEXT: s_movk_i32 s0, 0xbd0
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, v0
@@ -4823,8 +4822,7 @@ define amdgpu_ps void @large_offset() {
; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-PAL-NEXT: s_movk_i32 s0, 0x810
-; GFX10-PAL-NEXT: s_addk_i32 s0, 0x3c0
+; GFX10-PAL-NEXT: s_movk_i32 s0, 0xbd0
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, v0
; GFX10-PAL-NEXT: v_mov_b32_e32 v2, v0
; GFX10-PAL-NEXT: v_mov_b32_e32 v3, v0
diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.gfx10.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.gfx10.mir
index 0d6511cbfceb21..b46c672ea8edd2 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.gfx10.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.gfx10.mir
@@ -13,8 +13,7 @@ stack:
body: |
bb.0:
; CHECK-LABEL: name: fold_frame_index__v_add_u32_e32__const_v_fi
- ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
- ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 128, [[V_MOV_B32_e32_]], implicit $exec
+ ; CHECK: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 128, %stack.0, implicit $exec
; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_U32_e32_]]
; CHECK-NEXT: SI_RETURN implicit $vgpr0
%0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir
index aa91a4f9f988fc..97ba179afbbe03 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir
@@ -14,8 +14,7 @@ stack:
body: |
bb.0:
; CHECK-LABEL: name: fold_frame_index__s_add_i32__fi_const
- ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.0
- ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_MOV_B32_]], 128, implicit-def $scc
+ ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, 128, implicit-def $scc
; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_I32_]]
; CHECK-NEXT: SI_RETURN implicit $sgpr4
%0:sreg_32 = S_MOV_B32 %stack.0
@@ -35,8 +34,7 @@ stack:
body: |
bb.0:
; CHECK-LABEL: name: fold_frame_index__s_add_i32__const_fi
- ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.0
- ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 128, [[S_MOV_B32_]], implicit-def $scc
+ ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 128, %stack.0, implicit-def $scc
; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_I32_]]
; CHECK-NEXT: SI_RETURN implicit $sgpr4
%0:sreg_32 = S_MOV_B32 %stack.0
@@ -56,8 +54,7 @@ stack:
body: |
bb.0:
; CHECK-LABEL: name: fold_frame_index__s_add_i32__materializedconst_fi
- ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.0
- ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, [[S_MOV_B32_]], implicit-def $scc
+ ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc
; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_I32_]]
; CHECK-NEXT: SI_RETURN implicit $sgpr4
%0:sreg_32 = S_MOV_B32 256
@@ -101,8 +98,7 @@ stack:
body: |
bb.0:
; CHECK-LABEL: name: fold_frame_index__s_add_i32__fi_materializedconst_1
- ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.0
- ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, [[S_MOV_B32_]], implicit-def $scc
+ ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc
; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_I32_]]
; CHECK-NEXT: SI_RETURN implicit $sgpr4
%0:sreg_32 = S_MOV_B32 256
@@ -173,8 +169,7 @@ stack:
body: |
bb.0:
; CHECK-LABEL: name: fold_frame_index__v_add_u32_e32__const_v_fi
- ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
- ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 128, [[V_MOV_B32_e32_]], implicit $exec
+ ; CHECK: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 128, %stack.0, implicit $exec
; CHECK-NEXT: $sgpr4 = COPY [[V_ADD_U32_e32_]]
; CHECK-NEXT: SI_RETURN implicit $sgpr4
%0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
@@ -278,8 +273,7 @@ stack:
body: |
bb.0:
; CHECK-LABEL: name: fold_frame_index__v_add_co_u32_e32__const_v_fi
- ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
- ; CHECK-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 128, [[V_MOV_B32_e32_]], implicit-def $vcc, implicit $exec
+ ; CHECK: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 128, %stack.0, implicit-def $vcc, implicit $exec
; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e32_]]
; CHECK-NEXT: SI_RETURN implicit $vgpr0
%0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll
index 8a789a4c6cda9b..7708c0e4767cf2 100644
--- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll
+++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll
@@ -1550,10 +1550,10 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset(
; GFX8-NEXT: s_add_i32 s6, s32, 0x201000
; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32
+; GFX8-NEXT: v_lshrrev_b32_e64 v1, 6, s32
; GFX8-NEXT: s_movk_i32 vcc_lo, 0x4040
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, vcc_lo, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x3ec, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, vcc_lo, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x3ec, v1
; GFX8-NEXT: v_writelane_b32 v2, s59, 0
; GFX8-NEXT: v_lshrrev_b32_e64 v1, 6, s32
; GFX8-NEXT: v_readfirstlane_b32 s59, v0
diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
index e9cd94620a6b9a..54b3f97d456820 100644
--- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
@@ -1582,12 +1582,12 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX7-NEXT: buffer_store_dword v15, v16, s[0:3], s32 offen offset:60 ; 4-byte Folded Spill
; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32
; GFX7-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
-; GFX7-NEXT: v_lshr_b32_e64 v0, s32, 6
+; GFX7-NEXT: v_lshr_b32_e64 v1, s32, 6
; GFX7-NEXT: v_writelane_b32 v22, vcc_lo, 0
; GFX7-NEXT: v_writelane_b32 v22, vcc_hi, 1
; GFX7-NEXT: s_movk_i32 vcc_lo, 0x4040
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, vcc_lo, v0
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x200, v0
+; GFX7-NEXT: v_add_i32_e32 v1, vcc, vcc_lo, v1
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x200, v1
; GFX7-NEXT: v_writelane_b32 v23, s59, 27
; GFX7-NEXT: v_readfirstlane_b32 s59, v0
; GFX7-NEXT: s_and_b64 vcc, 0, exec
@@ -1723,12 +1723,12 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], s32 offen offset:60 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32
; GFX8-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
-; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32
+; GFX8-NEXT: v_lshrrev_b32_e64 v1, 6, s32
; GFX8-NEXT: v_writelane_b32 v22, vcc_lo, 0
; GFX8-NEXT: v_writelane_b32 v22, vcc_hi, 1
; GFX8-NEXT: s_movk_i32 vcc_lo, 0x4040
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, vcc_lo, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x200, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, vcc_lo, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x200, v1
; GFX8-NEXT: v_writelane_b32 v23, s59, 27
; GFX8-NEXT: v_readfirstlane_b32 s59, v0
; GFX8-NEXT: s_and_b64 vcc, 0, exec
From 2e3e3c16833fecab21e9dbc8ad88cf089139d484 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Wed, 2 Oct 2024 09:02:49 +0400
Subject: [PATCH 2/2] Handle vop3 cases
---
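For illustration (not part of the patch itself), the e64 case this adds,
as exercised by fold-operands-frame-index.mir:

  ; Before: the frame index is materialized separately.
  %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
  %1:vgpr_32, %2:sreg_64 = V_ADD_CO_U32_e64 %0, 64, 0, implicit $exec

  ; After: the frame index folds into the e64 add.
  %1:vgpr_32, %2:sreg_64 = V_ADD_CO_U32_e64 %stack.0, 64, 0, implicit $exec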
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 5 +-
.../CodeGen/AMDGPU/fold-fi-operand-shrink.mir | 3 +-
.../fold-operands-frame-index.gfx10.mir | 6 +--
.../AMDGPU/fold-operands-frame-index.mir | 19 ++-----
.../CodeGen/AMDGPU/frame-index-elimination.ll | 12 ++---
.../materialize-frame-index-sgpr.gfx10.ll | 52 +++++++++----------
.../AMDGPU/materialize-frame-index-sgpr.ll | 36 ++++++-------
7 files changed, 59 insertions(+), 74 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index fea84247f5ad88..c912a580854c1c 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -198,12 +198,15 @@ bool SIFoldOperandsImpl::frameIndexMayFold(
case AMDGPU::S_ADD_I32:
case AMDGPU::V_ADD_U32_e32:
case AMDGPU::V_ADD_CO_U32_e32:
- // TODO: Handle e64 variants
// TODO: Possibly relax hasOneUse. It matters more for mubuf, since we have
// to insert the wave size shift at every point we use the index.
// TODO: Fix depending on visit order to fold immediates into the operand
return UseMI.getOperand(OpNo == 1 ? 2 : 1).isImm() &&
MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
+ case AMDGPU::V_ADD_U32_e64:
+ case AMDGPU::V_ADD_CO_U32_e64:
+ return UseMI.getOperand(OpNo == 2 ? 3 : 2).isImm() &&
+ MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
default:
break;
}
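The shifted indices relative to the e32 case follow from the extra
explicit def: V_ADD_CO_U32_e64's operand order is (vdst, carry-out sdst,
src0, src1), so the sources sit at indices 2 and 3. A minimal sketch
under that assumption (otherAddSrcIsImmE64 is a hypothetical helper
name, not from the patch):

  #include "llvm/CodeGen/MachineInstr.h"

  // Sources of the e64 form sit one slot later than in the e32 form,
  // so the opposite-source selection becomes `OpNo == 2 ? 3 : 2`.
  static bool otherAddSrcIsImmE64(const llvm::MachineInstr &UseMI,
                                  unsigned OpNo) {
    unsigned OtherSrcIdx = (OpNo == 2) ? 3 : 2;
    return UseMI.getOperand(OtherSrcIdx).isImm();
  }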
diff --git a/llvm/test/CodeGen/AMDGPU/fold-fi-operand-shrink.mir b/llvm/test/CodeGen/AMDGPU/fold-fi-operand-shrink.mir
index 2b5ec86244ec2a..8626ac0f23ec79 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-fi-operand-shrink.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-fi-operand-shrink.mir
@@ -183,8 +183,7 @@ body: |
bb.0:
; GCN-LABEL: name: shrink_vgpr_imm_vgpr_fi_v_add_i32_e64_no_carry_out_use
- ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
- ; GCN-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 16, [[V_MOV_B32_e32_]], 0, implicit $exec
+ ; GCN: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec
; GCN-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_]]
%0:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
%1:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.gfx10.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.gfx10.mir
index b46c672ea8edd2..2959f55c78f7ec 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.gfx10.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.gfx10.mir
@@ -33,8 +33,7 @@ stack:
body: |
bb.0:
; CHECK-LABEL: name: fold_frame_index__v_add_co_u32_e64__v_fi_const
- ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
- ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32 = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], 128, 0, implicit $exec
+ ; CHECK: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32 = V_ADD_CO_U32_e64 %stack.0, 128, 0, implicit $exec
; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e64_]]
; CHECK-NEXT: SI_RETURN implicit $vgpr0
%0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
@@ -98,8 +97,7 @@ stack:
body: |
bb.0:
; CHECK-LABEL: name: fold_frame_index__v_add_co_u32_e64___fi_const_v
- ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
- ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32 = V_ADD_CO_U32_e64 128, [[V_MOV_B32_e32_]], 0, implicit $exec
+ ; CHECK: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32 = V_ADD_CO_U32_e64 128, %stack.0, 0, implicit $exec
; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e64_]]
; CHECK-NEXT: SI_RETURN implicit $vgpr0
%0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir
index 97ba179afbbe03..e58783e2dd840e 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir
@@ -292,21 +292,10 @@ stack:
- { id: 0, size: 16384, alignment: 4, local-offset: 0 }
body: |
bb.0:
- ; GFX9-LABEL: name: fold_frame_index__v_add_co_u32_e64__v_fi_imm
- ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
- ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], 64, 0, implicit $exec
- ; GFX9-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e64_]]
- ; GFX9-NEXT: SI_RETURN implicit $vgpr0
- ;
- ; GFX10-LABEL: name: fold_frame_index__v_add_co_u32_e64__v_fi_imm
- ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 %stack.0, 64, 0, implicit $exec
- ; GFX10-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e64_]]
- ; GFX10-NEXT: SI_RETURN implicit $vgpr0
- ;
- ; GFX12-LABEL: name: fold_frame_index__v_add_co_u32_e64__v_fi_imm
- ; GFX12: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 %stack.0, 64, 0, implicit $exec
- ; GFX12-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e64_]]
- ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ ; CHECK-LABEL: name: fold_frame_index__v_add_co_u32_e64__v_fi_const
+ ; CHECK: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 %stack.0, 64, 0, implicit $exec
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e64_]]
+ ; CHECK-NEXT: SI_RETURN implicit $vgpr0
%0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
%1:vgpr_32, %2:sreg_64 = V_ADD_CO_U32_e64 %0, 64, 0, implicit $exec
$vgpr0 = COPY %1
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
index 4215ae43345fde..e3cd8028422ddb 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -64,8 +64,8 @@ define void @func_mov_fi_i32_offset() #0 {
; GFX9-MUBUF: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32
; GFX9-MUBUF-NEXT: v_add_u32_e32 v0, 4, [[SCALED]]
-; GFX9-FLATSCR: v_mov_b32_e32 [[ADD:v[0-9]+]], s32
-; GFX9-FLATSCR-NEXT: v_add_u32_e32 v0, 4, [[ADD]]
+; FIXME: Should commute and shrink
+; GFX9-FLATSCR: v_add_u32_e64 v0, 4, s32
; GCN-NOT: v_mov
; GCN: ds_write_b32 v0, v0
@@ -164,12 +164,12 @@ define void @void_func_byval_struct_i8_i32_ptr_value(ptr addrspace(5) byval({ i8
; GFX9-FLATSCR: scratch_load_dword v{{[0-9]+}}, off, s32 offset:4 glc{{$}}
; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], s32, 6
-; CI: v_add_i32_e32 [[GEP:v[0-9]+]], vcc, 4, [[SHIFT]]
+; CI: v_add_i32_e64 [[GEP:v[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 4, [[SHIFT]]
-; GFX9-MUBUF: v_lshrrev_b32_e64 [[SP:v[0-9]+]], 6, s32
-; GFX9-FLATSCR: v_mov_b32_e32 [[SP:v[0-9]+]], s32
+; GFX9-MUBUF: v_lshrrev_b32_e64 [[SP:v[0-9]+]], 6, s32
+; GFX9-MUBUF: v_add_u32_e32 [[GEP:v[0-9]+]], 4, [[SP]]
-; GFX9: v_add_u32_e32 [[GEP:v[0-9]+]], 4, [[SP]]
+; GFX9-FLATSCR: v_add_u32_e64 [[GEP:v[0-9]+]], 4, s32
; GCN: ds_write_b32 v{{[0-9]+}}, [[GEP]]
define void @void_func_byval_struct_i8_i32_ptr_nonentry_block(ptr addrspace(5) byval({ i8, i32 }) %arg0, i32 %arg2) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll
index 7708c0e4767cf2..a10167e558d77d 100644
--- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll
+++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll
@@ -1426,17 +1426,17 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset(
; GFX10_1-NEXT: buffer_store_dword v2, off, s[0:3], s5 ; 4-byte Folded Spill
; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32
+; GFX10_1-NEXT: v_lshrrev_b32_e64 v3, 5, s32
; GFX10_1-NEXT: v_writelane_b32 v2, s59, 0
-; GFX10_1-NEXT: v_lshrrev_b32_e64 v1, 5, s32
+; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32
; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo
-; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 0x4040, v0
-; GFX10_1-NEXT: v_add_nc_u32_e32 v1, 64, v1
+; GFX10_1-NEXT: v_add_nc_u32_e32 v3, 0x4040, v3
+; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0
; GFX10_1-NEXT: ;;#ASMSTART
-; GFX10_1-NEXT: ; use alloca0 v1
+; GFX10_1-NEXT: ; use alloca0 v0
; GFX10_1-NEXT: ;;#ASMEND
-; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 0x3ec, v0
-; GFX10_1-NEXT: v_readfirstlane_b32 s59, v0
+; GFX10_1-NEXT: v_add_nc_u32_e32 v1, 0x3ec, v3
+; GFX10_1-NEXT: v_readfirstlane_b32 s59, v1
; GFX10_1-NEXT: ;;#ASMSTART
; GFX10_1-NEXT: ; use s59, scc
; GFX10_1-NEXT: ;;#ASMEND
@@ -1456,17 +1456,17 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset(
; GFX10_3-NEXT: s_add_i32 s5, s32, 0x100800
; GFX10_3-NEXT: buffer_store_dword v2, off, s[0:3], s5 ; 4-byte Folded Spill
; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32
+; GFX10_3-NEXT: v_lshrrev_b32_e64 v3, 5, s32
; GFX10_3-NEXT: v_writelane_b32 v2, s59, 0
-; GFX10_3-NEXT: v_lshrrev_b32_e64 v1, 5, s32
+; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32
; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo
-; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 0x4040, v0
-; GFX10_3-NEXT: v_add_nc_u32_e32 v1, 64, v1
+; GFX10_3-NEXT: v_add_nc_u32_e32 v3, 0x4040, v3
+; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0
; GFX10_3-NEXT: ;;#ASMSTART
-; GFX10_3-NEXT: ; use alloca0 v1
+; GFX10_3-NEXT: ; use alloca0 v0
; GFX10_3-NEXT: ;;#ASMEND
-; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 0x3ec, v0
-; GFX10_3-NEXT: v_readfirstlane_b32 s59, v0
+; GFX10_3-NEXT: v_add_nc_u32_e32 v1, 0x3ec, v3
+; GFX10_3-NEXT: v_readfirstlane_b32 s59, v1
; GFX10_3-NEXT: ;;#ASMSTART
; GFX10_3-NEXT: ; use s59, scc
; GFX10_3-NEXT: ;;#ASMEND
@@ -1485,19 +1485,17 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset(
; GFX11-NEXT: s_add_i32 s1, s32, 0x8040
; GFX11-NEXT: scratch_store_b32 off, v2, s1 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: s_add_i32 s0, s32, 0x4040
+; GFX11-NEXT: s_add_i32 s0, s32, 64
; GFX11-NEXT: v_writelane_b32 v2, s59, 0
; GFX11-NEXT: v_mov_b32_e32 v0, s0
-; GFX11-NEXT: s_add_i32 s0, s32, 64
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v1, s0
-; GFX11-NEXT: s_and_b32 s0, 0, exec_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x3ec, v0
+; GFX11-NEXT: s_add_i32 s0, s32, 0x4040
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use alloca0 v1
+; GFX11-NEXT: ; use alloca0 v0
; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: v_add_nc_u32_e64 v1, 0x3ec, s0
+; GFX11-NEXT: s_and_b32 s0, 0, exec_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readfirstlane_b32 s59, v0
+; GFX11-NEXT: v_readfirstlane_b32 s59, v1
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use s59, scc
; GFX11-NEXT: ;;#ASMEND
@@ -1523,14 +1521,14 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset(
; GFX12-NEXT: s_add_co_i32 s0, s32, 0x4000
; GFX12-NEXT: v_writelane_b32 v2, s59, 0
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s32
+; GFX12-NEXT: v_add_nc_u32_e64 v1, 0x3ec, s0
+; GFX12-NEXT: v_mov_b32_e32 v0, s32
; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use alloca0 v1
+; GFX12-NEXT: ; use alloca0 v0
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_nc_u32_e32 v0, 0x3ec, v0
-; GFX12-NEXT: v_readfirstlane_b32 s59, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_readfirstlane_b32 s59, v1
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use s59, scc
; GFX12-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
index 54b3f97d456820..bccf218660d19b 100644
--- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
@@ -1983,17 +1983,17 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
; GFX10_1-NEXT: v_writelane_b32 v23, s30, 0
-; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32
; GFX10_1-NEXT: v_lshrrev_b32_e64 v1, 5, s32
+; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32
; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo
; GFX10_1-NEXT: v_writelane_b32 v23, s31, 1
-; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 0x4040, v0
-; GFX10_1-NEXT: v_add_nc_u32_e32 v1, 64, v1
+; GFX10_1-NEXT: v_add_nc_u32_e32 v1, 0x4040, v1
+; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0
; GFX10_1-NEXT: ;;#ASMSTART
-; GFX10_1-NEXT: ; use alloca0 v1
+; GFX10_1-NEXT: ; use alloca0 v0
; GFX10_1-NEXT: ;;#ASMEND
; GFX10_1-NEXT: v_writelane_b32 v23, s33, 2
-; GFX10_1-NEXT: v_add_nc_u32_e32 v22, 0x200, v0
+; GFX10_1-NEXT: v_add_nc_u32_e32 v22, 0x200, v1
; GFX10_1-NEXT: v_writelane_b32 v23, s34, 3
; GFX10_1-NEXT: v_writelane_b32 v23, s35, 4
; GFX10_1-NEXT: v_writelane_b32 v23, s36, 5
@@ -2070,17 +2070,17 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX10_3-NEXT: buffer_store_dword v23, off, s[0:3], s5 ; 4-byte Folded Spill
; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
; GFX10_3-NEXT: v_writelane_b32 v23, s30, 0
-; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32
; GFX10_3-NEXT: v_lshrrev_b32_e64 v1, 5, s32
+; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32
; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo
; GFX10_3-NEXT: v_writelane_b32 v23, s31, 1
-; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 0x4040, v0
-; GFX10_3-NEXT: v_add_nc_u32_e32 v1, 64, v1
+; GFX10_3-NEXT: v_add_nc_u32_e32 v1, 0x4040, v1
+; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0
; GFX10_3-NEXT: ;;#ASMSTART
-; GFX10_3-NEXT: ; use alloca0 v1
+; GFX10_3-NEXT: ; use alloca0 v0
; GFX10_3-NEXT: ;;#ASMEND
; GFX10_3-NEXT: v_writelane_b32 v23, s33, 2
-; GFX10_3-NEXT: v_add_nc_u32_e32 v22, 0x200, v0
+; GFX10_3-NEXT: v_add_nc_u32_e32 v22, 0x200, v1
; GFX10_3-NEXT: v_writelane_b32 v23, s34, 3
; GFX10_3-NEXT: v_writelane_b32 v23, s35, 4
; GFX10_3-NEXT: v_writelane_b32 v23, s36, 5
@@ -2156,16 +2156,15 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX11-NEXT: scratch_store_b32 off, v23, s1 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: v_writelane_b32 v23, s30, 0
-; GFX11-NEXT: s_add_i32 s0, s32, 0x4040
+; GFX11-NEXT: s_add_i32 s0, s32, 64
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v0, s0
-; GFX11-NEXT: s_add_i32 s0, s32, 64
+; GFX11-NEXT: s_add_i32 s0, s32, 0x4040
; GFX11-NEXT: v_writelane_b32 v23, s31, 1
-; GFX11-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-NEXT: v_add_nc_u32_e64 v22, 0x200, s0
; GFX11-NEXT: s_and_b32 s0, 0, exec_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x200, v0
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use alloca0 v1
+; GFX11-NEXT: ; use alloca0 v0
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: v_writelane_b32 v23, s33, 2
; GFX11-NEXT: v_writelane_b32 v23, s34, 3
@@ -2249,15 +2248,14 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: v_writelane_b32 v23, s30, 0
; GFX12-NEXT: s_add_co_i32 s0, s32, 0x4000
+; GFX12-NEXT: v_mov_b32_e32 v0, s32
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_dual_mov_b32 v1, s32 :: v_dual_mov_b32 v0, s0
+; GFX12-NEXT: v_add_nc_u32_e64 v22, 0x200, s0
; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
; GFX12-NEXT: v_writelane_b32 v23, s31, 1
; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use alloca0 v1
+; GFX12-NEXT: ; use alloca0 v0
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_add_nc_u32_e32 v22, 0x200, v0
; GFX12-NEXT: v_writelane_b32 v23, s33, 2
; GFX12-NEXT: v_writelane_b32 v23, s34, 3
; GFX12-NEXT: v_writelane_b32 v23, s35, 4