[llvm] ef91cd3 - AMDGPU: Handle folding frame indexes into add with immediate (#110738)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Oct 19 12:33:07 PDT 2024
Author: Matt Arsenault
Date: 2024-10-19T12:33:03-07:00
New Revision: ef91cd3f018411e0ba7989003d7617041e35f650
URL: https://github.com/llvm/llvm-project/commit/ef91cd3f018411e0ba7989003d7617041e35f650
DIFF: https://github.com/llvm/llvm-project/commit/ef91cd3f018411e0ba7989003d7617041e35f650.diff
LOG: AMDGPU: Handle folding frame indexes into add with immediate (#110738)
Added:
Modified:
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
llvm/test/CodeGen/AMDGPU/flat-scratch.ll
llvm/test/CodeGen/AMDGPU/fold-fi-operand-shrink.mir
llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.gfx10.mir
llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir
llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll
llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 1e2c77b08b9a63..c912a580854c1c 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -194,6 +194,23 @@ bool SIFoldOperandsImpl::frameIndexMayFold(
return false;
const unsigned Opc = UseMI.getOpcode();
+ switch (Opc) {
+ case AMDGPU::S_ADD_I32:
+ case AMDGPU::V_ADD_U32_e32:
+ case AMDGPU::V_ADD_CO_U32_e32:
+ // TODO: Possibly relax hasOneUse. It matters more for mubuf, since we have
+ // to insert the wave size shift at every point we use the index.
+ // TODO: Fix depending on visit order to fold immediates into the operand
+ return UseMI.getOperand(OpNo == 1 ? 2 : 1).isImm() &&
+ MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
+ case AMDGPU::V_ADD_U32_e64:
+ case AMDGPU::V_ADD_CO_U32_e64:
+ return UseMI.getOperand(OpNo == 2 ? 3 : 2).isImm() &&
+ MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
+ default:
+ break;
+ }
+
if (TII->isMUBUF(UseMI))
return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
if (!TII->isFLATScratch(UseMI))
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index 3e4b43d9cfcd34..c5d4ef23070eb5 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -4705,8 +4705,7 @@ define amdgpu_ps void @large_offset() {
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: s_movk_i32 s0, 0x810
-; GFX10-NEXT: s_addk_i32 s0, 0x3c0
+; GFX10-NEXT: s_movk_i32 s0, 0xbd0
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, v0
@@ -4823,8 +4822,7 @@ define amdgpu_ps void @large_offset() {
; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-PAL-NEXT: s_movk_i32 s0, 0x810
-; GFX10-PAL-NEXT: s_addk_i32 s0, 0x3c0
+; GFX10-PAL-NEXT: s_movk_i32 s0, 0xbd0
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, v0
; GFX10-PAL-NEXT: v_mov_b32_e32 v2, v0
; GFX10-PAL-NEXT: v_mov_b32_e32 v3, v0
diff --git a/llvm/test/CodeGen/AMDGPU/fold-fi-operand-shrink.mir b/llvm/test/CodeGen/AMDGPU/fold-fi-operand-shrink.mir
index 2b5ec86244ec2a..8626ac0f23ec79 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-fi-operand-shrink.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-fi-operand-shrink.mir
@@ -183,8 +183,7 @@ body: |
bb.0:
; GCN-LABEL: name: shrink_vgpr_imm_vgpr_fi_v_add_i32_e64_no_carry_out_use
- ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
- ; GCN-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 16, [[V_MOV_B32_e32_]], 0, implicit $exec
+ ; GCN: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec
; GCN-NEXT: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_]]
%0:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
%1:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.gfx10.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.gfx10.mir
index 0d6511cbfceb21..d10dec6ca8289f 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.gfx10.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.gfx10.mir
@@ -13,8 +13,7 @@ stack:
body: |
bb.0:
; CHECK-LABEL: name: fold_frame_index__v_add_u32_e32__const_v_fi
- ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
- ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 128, [[V_MOV_B32_e32_]], implicit $exec
+ ; CHECK: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 128, %stack.0, implicit $exec
; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_U32_e32_]]
; CHECK-NEXT: SI_RETURN implicit $vgpr0
%0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
@@ -34,8 +33,7 @@ stack:
body: |
bb.0:
; CHECK-LABEL: name: fold_frame_index__v_add_co_u32_e64__v_fi_const
- ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
- ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32 = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], 128, 0, implicit $exec
+ ; CHECK: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32 = V_ADD_CO_U32_e64 %stack.0, 128, 0, implicit $exec
; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e64_]]
; CHECK-NEXT: SI_RETURN implicit $vgpr0
%0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
@@ -57,8 +55,7 @@ stack:
body: |
bb.0:
; CHECK-LABEL: name: fold_frame_index__v_add_u32_e64__const_v_fi
- ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
- ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 128, [[V_MOV_B32_e32_]], 0, implicit $exec
+ ; CHECK: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 128, %stack.0, 0, implicit $exec
; CHECK-NEXT: $sgpr4 = COPY [[V_ADD_U32_e64_]]
; CHECK-NEXT: SI_RETURN implicit $sgpr4
%0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
@@ -78,8 +75,7 @@ stack:
body: |
bb.0:
; CHECK-LABEL: name: fold_frame_index__v_add_u32_e64___v_fi_const
- ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
- ; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], 128, 0, implicit $exec
+ ; CHECK: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 128, 0, implicit $exec
; CHECK-NEXT: $sgpr4 = COPY [[V_ADD_U32_e64_]]
; CHECK-NEXT: SI_RETURN implicit $sgpr4
%0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
@@ -99,8 +95,7 @@ stack:
body: |
bb.0:
; CHECK-LABEL: name: fold_frame_index__v_add_co_u32_e64___fi_const_v
- ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
- ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32 = V_ADD_CO_U32_e64 128, [[V_MOV_B32_e32_]], 0, implicit $exec
+ ; CHECK: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32 = V_ADD_CO_U32_e64 128, %stack.0, 0, implicit $exec
; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e64_]]
; CHECK-NEXT: SI_RETURN implicit $vgpr0
%0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir
index aa91a4f9f988fc..280126a0d7cd22 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir
@@ -14,8 +14,7 @@ stack:
body: |
bb.0:
; CHECK-LABEL: name: fold_frame_index__s_add_i32__fi_const
- ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.0
- ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_MOV_B32_]], 128, implicit-def $scc
+ ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, 128, implicit-def $scc
; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_I32_]]
; CHECK-NEXT: SI_RETURN implicit $sgpr4
%0:sreg_32 = S_MOV_B32 %stack.0
@@ -35,8 +34,7 @@ stack:
body: |
bb.0:
; CHECK-LABEL: name: fold_frame_index__s_add_i32__const_fi
- ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.0
- ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 128, [[S_MOV_B32_]], implicit-def $scc
+ ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 128, %stack.0, implicit-def $scc
; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_I32_]]
; CHECK-NEXT: SI_RETURN implicit $sgpr4
%0:sreg_32 = S_MOV_B32 %stack.0
@@ -56,8 +54,7 @@ stack:
body: |
bb.0:
; CHECK-LABEL: name: fold_frame_index__s_add_i32__materializedconst_fi
- ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.0
- ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, [[S_MOV_B32_]], implicit-def $scc
+ ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc
; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_I32_]]
; CHECK-NEXT: SI_RETURN implicit $sgpr4
%0:sreg_32 = S_MOV_B32 256
@@ -101,8 +98,7 @@ stack:
body: |
bb.0:
; CHECK-LABEL: name: fold_frame_index__s_add_i32__fi_materializedconst_1
- ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.0
- ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, [[S_MOV_B32_]], implicit-def $scc
+ ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc
; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_I32_]]
; CHECK-NEXT: SI_RETURN implicit $sgpr4
%0:sreg_32 = S_MOV_B32 256
@@ -173,8 +169,7 @@ stack:
body: |
bb.0:
; CHECK-LABEL: name: fold_frame_index__v_add_u32_e32__const_v_fi
- ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
- ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 128, [[V_MOV_B32_e32_]], implicit $exec
+ ; CHECK: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 128, %stack.0, implicit $exec
; CHECK-NEXT: $sgpr4 = COPY [[V_ADD_U32_e32_]]
; CHECK-NEXT: SI_RETURN implicit $sgpr4
%0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
@@ -215,21 +210,10 @@ stack:
- { id: 0, size: 16384, alignment: 4, local-offset: 0 }
body: |
bb.0:
- ; GFX9-LABEL: name: fold_frame_index__v_add_u32_e64__imm_v_fi
- ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
- ; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 64, [[V_MOV_B32_e32_]], 0, implicit $exec
- ; GFX9-NEXT: $sgpr4 = COPY [[V_ADD_U32_e64_]]
- ; GFX9-NEXT: SI_RETURN implicit $sgpr4
- ;
- ; GFX10-LABEL: name: fold_frame_index__v_add_u32_e64__imm_v_fi
- ; GFX10: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 64, %stack.0, 0, implicit $exec
- ; GFX10-NEXT: $sgpr4 = COPY [[V_ADD_U32_e64_]]
- ; GFX10-NEXT: SI_RETURN implicit $sgpr4
- ;
- ; GFX12-LABEL: name: fold_frame_index__v_add_u32_e64__imm_v_fi
- ; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 64, %stack.0, 0, implicit $exec
- ; GFX12-NEXT: $sgpr4 = COPY [[V_ADD_U32_e64_]]
- ; GFX12-NEXT: SI_RETURN implicit $sgpr4
+ ; CHECK-LABEL: name: fold_frame_index__v_add_u32_e64__imm_v_fi
+ ; CHECK: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 64, %stack.0, 0, implicit $exec
+ ; CHECK-NEXT: $sgpr4 = COPY [[V_ADD_U32_e64_]]
+ ; CHECK-NEXT: SI_RETURN implicit $sgpr4
%0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
%1:vgpr_32 = V_ADD_U32_e64 64, %0, 0, implicit $exec
$sgpr4 = COPY %1
@@ -246,21 +230,10 @@ stack:
- { id: 0, size: 16384, alignment: 4, local-offset: 0 }
body: |
bb.0:
- ; GFX9-LABEL: name: fold_frame_index__v_add_u32_e64___v_fi_imm
- ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
- ; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], 64, 0, implicit $exec
- ; GFX9-NEXT: $sgpr4 = COPY [[V_ADD_U32_e64_]]
- ; GFX9-NEXT: SI_RETURN implicit $sgpr4
- ;
- ; GFX10-LABEL: name: fold_frame_index__v_add_u32_e64___v_fi_imm
- ; GFX10: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 64, 0, implicit $exec
- ; GFX10-NEXT: $sgpr4 = COPY [[V_ADD_U32_e64_]]
- ; GFX10-NEXT: SI_RETURN implicit $sgpr4
- ;
- ; GFX12-LABEL: name: fold_frame_index__v_add_u32_e64___v_fi_imm
- ; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 64, 0, implicit $exec
- ; GFX12-NEXT: $sgpr4 = COPY [[V_ADD_U32_e64_]]
- ; GFX12-NEXT: SI_RETURN implicit $sgpr4
+ ; CHECK-LABEL: name: fold_frame_index__v_add_u32_e64___v_fi_imm
+ ; CHECK: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 64, 0, implicit $exec
+ ; CHECK-NEXT: $sgpr4 = COPY [[V_ADD_U32_e64_]]
+ ; CHECK-NEXT: SI_RETURN implicit $sgpr4
%0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
%1:vgpr_32 = V_ADD_U32_e64 %0, 64, 0, implicit $exec
$sgpr4 = COPY %1
@@ -278,8 +251,7 @@ stack:
body: |
bb.0:
; CHECK-LABEL: name: fold_frame_index__v_add_co_u32_e32__const_v_fi
- ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
- ; CHECK-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 128, [[V_MOV_B32_e32_]], implicit-def $vcc, implicit $exec
+ ; CHECK: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 128, %stack.0, implicit-def $vcc, implicit $exec
; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e32_]]
; CHECK-NEXT: SI_RETURN implicit $vgpr0
%0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
@@ -298,21 +270,10 @@ stack:
- { id: 0, size: 16384, alignment: 4, local-offset: 0 }
body: |
bb.0:
- ; GFX9-LABEL: name: fold_frame_index__v_add_co_u32_e64__v_fi_imm
- ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
- ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], 64, 0, implicit $exec
- ; GFX9-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e64_]]
- ; GFX9-NEXT: SI_RETURN implicit $vgpr0
- ;
- ; GFX10-LABEL: name: fold_frame_index__v_add_co_u32_e64__v_fi_imm
- ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 %stack.0, 64, 0, implicit $exec
- ; GFX10-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e64_]]
- ; GFX10-NEXT: SI_RETURN implicit $vgpr0
- ;
- ; GFX12-LABEL: name: fold_frame_index__v_add_co_u32_e64__v_fi_imm
- ; GFX12: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 %stack.0, 64, 0, implicit $exec
- ; GFX12-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e64_]]
- ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ ; CHECK-LABEL: name: fold_frame_index__v_add_co_u32_e64__v_fi_imm
+ ; CHECK: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 %stack.0, 64, 0, implicit $exec
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e64_]]
+ ; CHECK-NEXT: SI_RETURN implicit $vgpr0
%0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
%1:vgpr_32, %2:sreg_64 = V_ADD_CO_U32_e64 %0, 64, 0, implicit $exec
$vgpr0 = COPY %1
@@ -329,21 +290,10 @@ stack:
- { id: 0, size: 16384, alignment: 4, local-offset: 0 }
body: |
bb.0:
- ; GFX9-LABEL: name: fold_frame_index__v_add_co_u32_e64__imm_v_fi
- ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
- ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 64, [[V_MOV_B32_e32_]], 0, implicit $exec
- ; GFX9-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e64_]]
- ; GFX9-NEXT: SI_RETURN implicit $vgpr0
- ;
- ; GFX10-LABEL: name: fold_frame_index__v_add_co_u32_e64__imm_v_fi
- ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 64, %stack.0, 0, implicit $exec
- ; GFX10-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e64_]]
- ; GFX10-NEXT: SI_RETURN implicit $vgpr0
- ;
- ; GFX12-LABEL: name: fold_frame_index__v_add_co_u32_e64__imm_v_fi
- ; GFX12: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 64, %stack.0, 0, implicit $exec
- ; GFX12-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e64_]]
- ; GFX12-NEXT: SI_RETURN implicit $vgpr0
+ ; CHECK-LABEL: name: fold_frame_index__v_add_co_u32_e64__imm_v_fi
+ ; CHECK: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 64, %stack.0, 0, implicit $exec
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_CO_U32_e64_]]
+ ; CHECK-NEXT: SI_RETURN implicit $vgpr0
%0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
%1:vgpr_32, %2:sreg_64 = V_ADD_CO_U32_e64 64, %0, 0, implicit $exec
$vgpr0 = COPY %1
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
index 4215ae43345fde..e3cd8028422ddb 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -64,8 +64,8 @@ define void @func_mov_fi_i32_offset() #0 {
; GFX9-MUBUF: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32
; GFX9-MUBUF-NEXT: v_add_u32_e32 v0, 4, [[SCALED]]
-; GFX9-FLATSCR: v_mov_b32_e32 [[ADD:v[0-9]+]], s32
-; GFX9-FLATSCR-NEXT: v_add_u32_e32 v0, 4, [[ADD]]
+; FIXME: Should commute and shrink
+; GFX9-FLATSCR: v_add_u32_e64 v0, 4, s32
; GCN-NOT: v_mov
; GCN: ds_write_b32 v0, v0
@@ -164,12 +164,12 @@ define void @void_func_byval_struct_i8_i32_ptr_value(ptr addrspace(5) byval({ i8
; GFX9-FLATSCR: scratch_load_dword v{{[0-9]+}}, off, s32 offset:4 glc{{$}}
; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], s32, 6
-; CI: v_add_i32_e32 [[GEP:v[0-9]+]], vcc, 4, [[SHIFT]]
+; CI: v_add_i32_e64 [[GEP:v[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 4, [[SHIFT]]
-; GFX9-MUBUF: v_lshrrev_b32_e64 [[SP:v[0-9]+]], 6, s32
-; GFX9-FLATSCR: v_mov_b32_e32 [[SP:v[0-9]+]], s32
+; GFX9-MUBUF: v_lshrrev_b32_e64 [[SP:v[0-9]+]], 6, s32
+; GFX9-MUBUF: v_add_u32_e32 [[GEP:v[0-9]+]], 4, [[SP]]
-; GFX9: v_add_u32_e32 [[GEP:v[0-9]+]], 4, [[SP]]
+; GFX9-FLATSCR: v_add_u32_e64 [[GEP:v[0-9]+]], 4, s32
; GCN: ds_write_b32 v{{[0-9]+}}, [[GEP]]
define void @void_func_byval_struct_i8_i32_ptr_nonentry_block(ptr addrspace(5) byval({ i8, i32 }) %arg0, i32 %arg2) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll
index e86ef52e413b69..302b140e32f3aa 100644
--- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll
+++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll
@@ -1426,17 +1426,16 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset(
; GFX10_1-NEXT: buffer_store_dword v2, off, s[0:3], s5 ; 4-byte Folded Spill
; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32
+; GFX10_1-NEXT: v_lshrrev_b32_e64 v3, 5, s32
; GFX10_1-NEXT: v_writelane_b32 v2, s59, 0
-; GFX10_1-NEXT: v_lshrrev_b32_e64 v1, 5, s32
+; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32
; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo
-; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 0x4040, v0
-; GFX10_1-NEXT: v_add_nc_u32_e32 v1, 64, v1
+; GFX10_1-NEXT: v_add_nc_u32_e32 v1, 0x442c, v3
+; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0
; GFX10_1-NEXT: ;;#ASMSTART
-; GFX10_1-NEXT: ; use alloca0 v1
+; GFX10_1-NEXT: ; use alloca0 v0
; GFX10_1-NEXT: ;;#ASMEND
-; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 0x3ec, v0
-; GFX10_1-NEXT: v_readfirstlane_b32 s59, v0
+; GFX10_1-NEXT: v_readfirstlane_b32 s59, v1
; GFX10_1-NEXT: ;;#ASMSTART
; GFX10_1-NEXT: ; use s59, scc
; GFX10_1-NEXT: ;;#ASMEND
@@ -1456,17 +1455,16 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset(
; GFX10_3-NEXT: s_add_i32 s5, s32, 0x100800
; GFX10_3-NEXT: buffer_store_dword v2, off, s[0:3], s5 ; 4-byte Folded Spill
; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
-; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32
+; GFX10_3-NEXT: v_lshrrev_b32_e64 v3, 5, s32
; GFX10_3-NEXT: v_writelane_b32 v2, s59, 0
-; GFX10_3-NEXT: v_lshrrev_b32_e64 v1, 5, s32
+; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32
; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo
-; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 0x4040, v0
-; GFX10_3-NEXT: v_add_nc_u32_e32 v1, 64, v1
+; GFX10_3-NEXT: v_add_nc_u32_e32 v1, 0x442c, v3
+; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0
; GFX10_3-NEXT: ;;#ASMSTART
-; GFX10_3-NEXT: ; use alloca0 v1
+; GFX10_3-NEXT: ; use alloca0 v0
; GFX10_3-NEXT: ;;#ASMEND
-; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 0x3ec, v0
-; GFX10_3-NEXT: v_readfirstlane_b32 s59, v0
+; GFX10_3-NEXT: v_readfirstlane_b32 s59, v1
; GFX10_3-NEXT: ;;#ASMSTART
; GFX10_3-NEXT: ; use s59, scc
; GFX10_3-NEXT: ;;#ASMEND
@@ -1485,19 +1483,16 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset(
; GFX11-NEXT: s_add_i32 s1, s32, 0x8040
; GFX11-NEXT: scratch_store_b32 off, v2, s1 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: s_add_i32 s0, s32, 0x4040
-; GFX11-NEXT: v_writelane_b32 v2, s59, 0
-; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: s_add_i32 s0, s32, 64
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-NEXT: v_writelane_b32 v2, s59, 0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s32
; GFX11-NEXT: s_and_b32 s0, 0, exec_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x3ec, v0
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use alloca0 v1
+; GFX11-NEXT: ; use alloca0 v0
; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readfirstlane_b32 s59, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x442c, v3
+; GFX11-NEXT: v_readfirstlane_b32 s59, v1
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use s59, scc
; GFX11-NEXT: ;;#ASMEND
@@ -1520,17 +1515,15 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset(
; GFX12-NEXT: scratch_store_b32 off, v2, s32 offset:32768 ; 4-byte Folded Spill
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
-; GFX12-NEXT: s_add_co_i32 s0, s32, 0x4000
+; GFX12-NEXT: v_dual_mov_b32 v0, s32 :: v_dual_mov_b32 v3, s32
; GFX12-NEXT: v_writelane_b32 v2, s59, 0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s32
; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use alloca0 v1
+; GFX12-NEXT: ; use alloca0 v0
; GFX12-NEXT: ;;#ASMEND
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_nc_u32_e32 v0, 0x3ec, v0
-; GFX12-NEXT: v_readfirstlane_b32 s59, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_nc_u32_e32 v1, 0x43ec, v3
+; GFX12-NEXT: v_readfirstlane_b32 s59, v1
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use s59, scc
; GFX12-NEXT: ;;#ASMEND
@@ -1550,10 +1543,8 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset(
; GFX8-NEXT: s_add_i32 s6, s32, 0x201000
; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32
-; GFX8-NEXT: s_movk_i32 vcc_lo, 0x4040
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, vcc_lo, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x3ec, v0
+; GFX8-NEXT: v_lshrrev_b32_e64 v1, 6, s32
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x442c, v1
; GFX8-NEXT: v_writelane_b32 v2, s59, 0
; GFX8-NEXT: v_lshrrev_b32_e64 v1, 6, s32
; GFX8-NEXT: v_readfirstlane_b32 s59, v0
diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
index e9cd94620a6b9a..308411fa225dae 100644
--- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
@@ -1582,12 +1582,10 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX7-NEXT: buffer_store_dword v15, v16, s[0:3], s32 offen offset:60 ; 4-byte Folded Spill
; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32
; GFX7-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
-; GFX7-NEXT: v_lshr_b32_e64 v0, s32, 6
+; GFX7-NEXT: v_lshr_b32_e64 v1, s32, 6
; GFX7-NEXT: v_writelane_b32 v22, vcc_lo, 0
; GFX7-NEXT: v_writelane_b32 v22, vcc_hi, 1
-; GFX7-NEXT: s_movk_i32 vcc_lo, 0x4040
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, vcc_lo, v0
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x200, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x4240, v1
; GFX7-NEXT: v_writelane_b32 v23, s59, 27
; GFX7-NEXT: v_readfirstlane_b32 s59, v0
; GFX7-NEXT: s_and_b64 vcc, 0, exec
@@ -1723,12 +1721,10 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], s32 offen offset:60 ; 4-byte Folded Spill
; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32
; GFX8-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
-; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32
+; GFX8-NEXT: v_lshrrev_b32_e64 v1, 6, s32
; GFX8-NEXT: v_writelane_b32 v22, vcc_lo, 0
; GFX8-NEXT: v_writelane_b32 v22, vcc_hi, 1
-; GFX8-NEXT: s_movk_i32 vcc_lo, 0x4040
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, vcc_lo, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x200, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x4240, v1
; GFX8-NEXT: v_writelane_b32 v23, s59, 27
; GFX8-NEXT: v_readfirstlane_b32 s59, v0
; GFX8-NEXT: s_and_b64 vcc, 0, exec
@@ -1983,17 +1979,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3
; GFX10_1-NEXT: s_mov_b32 exec_lo, s4
; GFX10_1-NEXT: v_writelane_b32 v23, s30, 0
-; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32
; GFX10_1-NEXT: v_lshrrev_b32_e64 v1, 5, s32
+; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32
; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo
; GFX10_1-NEXT: v_writelane_b32 v23, s31, 1
-; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 0x4040, v0
-; GFX10_1-NEXT: v_add_nc_u32_e32 v1, 64, v1
+; GFX10_1-NEXT: v_add_nc_u32_e32 v22, 0x4240, v1
+; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0
; GFX10_1-NEXT: ;;#ASMSTART
-; GFX10_1-NEXT: ; use alloca0 v1
+; GFX10_1-NEXT: ; use alloca0 v0
; GFX10_1-NEXT: ;;#ASMEND
; GFX10_1-NEXT: v_writelane_b32 v23, s33, 2
-; GFX10_1-NEXT: v_add_nc_u32_e32 v22, 0x200, v0
; GFX10_1-NEXT: v_writelane_b32 v23, s34, 3
; GFX10_1-NEXT: v_writelane_b32 v23, s35, 4
; GFX10_1-NEXT: v_writelane_b32 v23, s36, 5
@@ -2070,17 +2065,16 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX10_3-NEXT: buffer_store_dword v23, off, s[0:3], s5 ; 4-byte Folded Spill
; GFX10_3-NEXT: s_mov_b32 exec_lo, s4
; GFX10_3-NEXT: v_writelane_b32 v23, s30, 0
-; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32
; GFX10_3-NEXT: v_lshrrev_b32_e64 v1, 5, s32
+; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32
; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo
; GFX10_3-NEXT: v_writelane_b32 v23, s31, 1
-; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 0x4040, v0
-; GFX10_3-NEXT: v_add_nc_u32_e32 v1, 64, v1
+; GFX10_3-NEXT: v_add_nc_u32_e32 v22, 0x4240, v1
+; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0
; GFX10_3-NEXT: ;;#ASMSTART
-; GFX10_3-NEXT: ; use alloca0 v1
+; GFX10_3-NEXT: ; use alloca0 v0
; GFX10_3-NEXT: ;;#ASMEND
; GFX10_3-NEXT: v_writelane_b32 v23, s33, 2
-; GFX10_3-NEXT: v_add_nc_u32_e32 v22, 0x200, v0
; GFX10_3-NEXT: v_writelane_b32 v23, s34, 3
; GFX10_3-NEXT: v_writelane_b32 v23, s35, 4
; GFX10_3-NEXT: v_writelane_b32 v23, s36, 5
@@ -2156,17 +2150,15 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX11-NEXT: scratch_store_b32 off, v23, s1 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s0
; GFX11-NEXT: v_writelane_b32 v23, s30, 0
-; GFX11-NEXT: s_add_i32 s0, s32, 0x4040
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: s_add_i32 s0, s32, 64
-; GFX11-NEXT: v_writelane_b32 v23, s31, 1
-; GFX11-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_mov_b32 v1, s32 :: v_dual_mov_b32 v0, s0
; GFX11-NEXT: s_and_b32 s0, 0, exec_lo
-; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x200, v0
+; GFX11-NEXT: v_writelane_b32 v23, s31, 1
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use alloca0 v1
+; GFX11-NEXT: ; use alloca0 v0
; GFX11-NEXT: ;;#ASMEND
+; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x4240, v1
; GFX11-NEXT: v_writelane_b32 v23, s33, 2
; GFX11-NEXT: v_writelane_b32 v23, s34, 3
; GFX11-NEXT: v_writelane_b32 v23, s35, 4
@@ -2248,16 +2240,14 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_mov_b32 exec_lo, s0
; GFX12-NEXT: v_writelane_b32 v23, s30, 0
-; GFX12-NEXT: s_add_co_i32 s0, s32, 0x4000
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: v_dual_mov_b32 v1, s32 :: v_dual_mov_b32 v0, s0
+; GFX12-NEXT: v_dual_mov_b32 v0, s32 :: v_dual_mov_b32 v1, s32
; GFX12-NEXT: s_and_b32 s0, 0, exec_lo
-; GFX12-NEXT: v_writelane_b32 v23, s31, 1
; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use alloca0 v1
+; GFX12-NEXT: ; use alloca0 v0
; GFX12-NEXT: ;;#ASMEND
+; GFX12-NEXT: v_writelane_b32 v23, s31, 1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_add_nc_u32_e32 v22, 0x200, v0
+; GFX12-NEXT: v_add_nc_u32_e32 v22, 0x4200, v1
; GFX12-NEXT: v_writelane_b32 v23, s33, 2
; GFX12-NEXT: v_writelane_b32 v23, s34, 3
; GFX12-NEXT: v_writelane_b32 v23, s35, 4
More information about the llvm-commits mailing list