[llvm] 4b111dd - [AMDGPU] Fix VOPD dependency checks during combine
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Thu Jun 15 11:21:27 PDT 2023
Author: Stanislav Mekhanoshin
Date: 2023-06-15T11:16:33-07:00
New Revision: 4b111dd798cd93e8d2f1c855d577e35d6ac6cebe
URL: https://github.com/llvm/llvm-project/commit/4b111dd798cd93e8d2f1c855d577e35d6ac6cebe
DIFF: https://github.com/llvm/llvm-project/commit/4b111dd798cd93e8d2f1c855d577e35d6ac6cebe.diff
LOG: [AMDGPU] Fix VOPD dependency checks during combine
Check superreg/subreg defs of an instruction when checking for
dependencies. This may cause some regressions, but it is better to be
safe than sorry. The changed tests are affected because of the
implicit-defs of the superregs.
Differential Revision: https://reviews.llvm.org/D152943
Added:
Modified:
llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
llvm/test/CodeGen/AMDGPU/vopd-combine.mir
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
index 95ea42267ccf3..29c9b9ccf2761 100644
--- a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
@@ -63,7 +63,7 @@ bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
}() && "Expected FirstMI to precede SecondMI");
// Cannot pair dependent instructions
for (const auto &Use : SecondMI.uses())
- if (Use.isReg() && FirstMI.modifiesRegister(Use.getReg()))
+ if (Use.isReg() && FirstMI.modifiesRegister(Use.getReg(), TRI))
return false;
auto getVRegIdx = [&](unsigned OpcodeIdx, unsigned OperandIdx) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
index 2f5f09e4d74a1..57bd5b31891f1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
@@ -1293,19 +1293,18 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(ptr addrspace(4) inreg %ptr, i1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v2, 4, v2
; GFX11-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_lshlrev_b32_e64 v3, v2, 0xffff
; GFX11-NEXT: v_lshlrev_b32_e32 v2, v2, v0
-; GFX11-NEXT: v_not_b32_e32 v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_and_or_b32 v5, v5, v3, v2
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v4
+; GFX11-NEXT: v_not_b32_e32 v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_and_or_b32 v5, v5, v3, v2
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: v_mov_b32_e32 v3, 0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1428,8 +1427,8 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(ptr addrspace(1) %ptr, i16 inre
; GFX11-NEXT: v_and_or_b32 v4, v4, v3, v2
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: v_mov_b32_e32 v3, 0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -4511,21 +4510,21 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(ptr addrspace(1) %ptr, i16 %va
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b128 v[3:6], v[0:1], off
; GFX11-NEXT: global_load_b128 v[7:10], v[0:1], off offset:16
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v2
+; GFX11-NEXT: v_dual_mov_b32 v11, 16 :: v_dual_and_b32 v0, 0xffff, v2
; GFX11-NEXT: s_and_b32 s0, s2, 1
; GFX11-NEXT: s_lshr_b32 m0, s2, 1
; GFX11-NEXT: s_lshl_b32 s0, s0, 4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v11, 16 :: v_dual_lshlrev_b32 v0, s0, v0
-; GFX11-NEXT: s_lshl_b32 s0, 0xffff, s0
; GFX11-NEXT: v_mov_b32_e32 v12, 0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, s0, v0
+; GFX11-NEXT: s_lshl_b32 s0, 0xffff, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_not_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_movrels_b32_e32 v1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_and_or_b32 v2, v1, s0, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_movreld_b32_e32 v3, v2
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v[0:1], v[3:6], off
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
index 9988aaf2fbbd4..8ae7b83814c9d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
@@ -2177,19 +2177,18 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(ptr addrspace(4) inreg %ptr, i8
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v2, 3, v2
; GFX11-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_lshlrev_b32_e64 v3, v2, 0xff
; GFX11-NEXT: v_lshlrev_b32_e32 v2, v2, v0
-; GFX11-NEXT: v_not_b32_e32 v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_and_or_b32 v5, v5, v3, v2
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v4
+; GFX11-NEXT: v_not_b32_e32 v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_and_or_b32 v5, v5, v3, v2
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: v_mov_b32_e32 v3, 0
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2314,8 +2313,8 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(ptr addrspace(1) %ptr, i8 inreg
; GFX11-NEXT: v_and_or_b32 v4, v4, v3, v2
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: v_mov_b32_e32 v3, 0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll
index 8e02052d7a905..1dc1ec7d81ad2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll
@@ -98,19 +98,19 @@ define amdgpu_ps <4 x float> @load_2d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr
; GFX11-NEXT: s_mov_b32 s0, s2
; GFX11-NEXT: s_mov_b32 s1, s3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_mov_b32_e32 v8, v7
; GFX11-NEXT: v_mov_b32_e32 v9, v7
-; GFX11-NEXT: v_mov_b32_e32 v11, v7
; GFX11-NEXT: v_mov_b32_e32 v10, v7
-; GFX11-NEXT: v_mov_b32_e32 v8, v7
+; GFX11-NEXT: v_mov_b32_e32 v11, v7
; GFX11-NEXT: s_mov_b32 s2, s4
; GFX11-NEXT: s_mov_b32 s3, s5
; GFX11-NEXT: s_mov_b32 s4, s6
; GFX11-NEXT: s_mov_b32 s5, s7
; GFX11-NEXT: s_mov_b32 s6, s8
; GFX11-NEXT: s_mov_b32 s7, s9
-; GFX11-NEXT: v_mov_b32_e32 v0, v7
+; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX11-NEXT: v_dual_mov_b32 v1, v8 :: v_dual_mov_b32 v4, v11
+; GFX11-NEXT: v_mov_b32_e32 v4, v11
; GFX11-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v7, v4, s[10:11]
@@ -186,19 +186,19 @@ define amdgpu_ps <4 x float> @load_2d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
; GFX11-NEXT: s_mov_b32 s0, s2
; GFX11-NEXT: s_mov_b32 s1, s3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_mov_b32_e32 v8, v7
; GFX11-NEXT: v_mov_b32_e32 v9, v7
-; GFX11-NEXT: v_mov_b32_e32 v11, v7
; GFX11-NEXT: v_mov_b32_e32 v10, v7
-; GFX11-NEXT: v_mov_b32_e32 v8, v7
+; GFX11-NEXT: v_mov_b32_e32 v11, v7
; GFX11-NEXT: s_mov_b32 s2, s4
; GFX11-NEXT: s_mov_b32 s3, s5
; GFX11-NEXT: s_mov_b32 s4, s6
; GFX11-NEXT: s_mov_b32 s5, s7
; GFX11-NEXT: s_mov_b32 s6, s8
; GFX11-NEXT: s_mov_b32 s7, s9
-; GFX11-NEXT: v_mov_b32_e32 v0, v7
+; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX11-NEXT: v_dual_mov_b32 v1, v8 :: v_dual_mov_b32 v4, v11
+; GFX11-NEXT: v_mov_b32_e32 v4, v11
; GFX11-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe lwe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v7, v4, s[10:11]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll
index 6a7e847528898..7b4fbfc6fa80e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll
@@ -101,10 +101,10 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %r
; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v1
; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v8, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_mov_b32_e32 v10, v9
; GFX11-NEXT: v_mov_b32_e32 v11, v9
-; GFX11-NEXT: v_mov_b32_e32 v13, v9
; GFX11-NEXT: v_mov_b32_e32 v12, v9
-; GFX11-NEXT: v_mov_b32_e32 v10, v9
+; GFX11-NEXT: v_mov_b32_e32 v13, v9
; GFX11-NEXT: s_mov_b32 s0, s2
; GFX11-NEXT: s_mov_b32 s1, s3
; GFX11-NEXT: s_mov_b32 s2, s4
@@ -113,9 +113,9 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %r
; GFX11-NEXT: s_mov_b32 s5, s7
; GFX11-NEXT: s_mov_b32 s6, s8
; GFX11-NEXT: s_mov_b32 s7, s9
-; GFX11-NEXT: v_mov_b32_e32 v0, v9
+; GFX11-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v1, v10
; GFX11-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12
-; GFX11-NEXT: v_dual_mov_b32 v1, v10 :: v_dual_mov_b32 v4, v13
+; GFX11-NEXT: v_mov_b32_e32 v4, v13
; GFX11-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v9, v4, s[10:11]
@@ -194,10 +194,10 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inre
; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v1
; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v8, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_mov_b32_e32 v10, v9
; GFX11-NEXT: v_mov_b32_e32 v11, v9
-; GFX11-NEXT: v_mov_b32_e32 v13, v9
; GFX11-NEXT: v_mov_b32_e32 v12, v9
-; GFX11-NEXT: v_mov_b32_e32 v10, v9
+; GFX11-NEXT: v_mov_b32_e32 v13, v9
; GFX11-NEXT: s_mov_b32 s0, s2
; GFX11-NEXT: s_mov_b32 s1, s3
; GFX11-NEXT: s_mov_b32 s2, s4
@@ -206,9 +206,9 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inre
; GFX11-NEXT: s_mov_b32 s5, s7
; GFX11-NEXT: s_mov_b32 s6, s8
; GFX11-NEXT: s_mov_b32 s7, s9
-; GFX11-NEXT: v_mov_b32_e32 v0, v9
+; GFX11-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v1, v10
; GFX11-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12
-; GFX11-NEXT: v_dual_mov_b32 v1, v10 :: v_dual_mov_b32 v4, v13
+; GFX11-NEXT: v_mov_b32_e32 v4, v13
; GFX11-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe lwe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v9, v4, s[10:11]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll
index 8d532e4912017..f0467469b4e88 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll
@@ -106,17 +106,17 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr
; GFX11-NEXT: s_mov_b32 s1, s3
; GFX11-NEXT: s_mov_b32 s2, s4
; GFX11-NEXT: v_mov_b32_e32 v9, v7
-; GFX11-NEXT: v_mov_b32_e32 v11, v7
-; GFX11-NEXT: v_mov_b32_e32 v10, v7
; GFX11-NEXT: v_mov_b32_e32 v8, v7
+; GFX11-NEXT: v_mov_b32_e32 v10, v7
+; GFX11-NEXT: v_mov_b32_e32 v11, v7
; GFX11-NEXT: v_lshl_or_b32 v5, v1, 16, v0
; GFX11-NEXT: s_mov_b32 s3, s5
; GFX11-NEXT: s_mov_b32 s4, s6
; GFX11-NEXT: s_mov_b32 s5, s7
; GFX11-NEXT: s_mov_b32 s6, s8
; GFX11-NEXT: s_mov_b32 s7, s9
-; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v3, v10
-; GFX11-NEXT: v_dual_mov_b32 v1, v8 :: v_dual_mov_b32 v2, v9
+; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
; GFX11-NEXT: v_mov_b32_e32 v4, v11
; GFX11-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -197,17 +197,17 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
; GFX11-NEXT: s_mov_b32 s1, s3
; GFX11-NEXT: s_mov_b32 s2, s4
; GFX11-NEXT: v_mov_b32_e32 v9, v7
-; GFX11-NEXT: v_mov_b32_e32 v11, v7
-; GFX11-NEXT: v_mov_b32_e32 v10, v7
; GFX11-NEXT: v_mov_b32_e32 v8, v7
+; GFX11-NEXT: v_mov_b32_e32 v10, v7
+; GFX11-NEXT: v_mov_b32_e32 v11, v7
; GFX11-NEXT: v_lshl_or_b32 v5, v1, 16, v0
; GFX11-NEXT: s_mov_b32 s3, s5
; GFX11-NEXT: s_mov_b32 s4, s6
; GFX11-NEXT: s_mov_b32 s5, s7
; GFX11-NEXT: s_mov_b32 s6, s8
; GFX11-NEXT: s_mov_b32 s7, s9
-; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v3, v10
-; GFX11-NEXT: v_dual_mov_b32 v1, v8 :: v_dual_mov_b32 v2, v9
+; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
; GFX11-NEXT: v_mov_b32_e32 v4, v11
; GFX11-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe lwe
; GFX11-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll
index f22877e3c62e2..6f9ee990e9879 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll
@@ -100,19 +100,19 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, ptr
; GFX11-NEXT: s_mov_b32 s0, s2
; GFX11-NEXT: s_mov_b32 s1, s3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_mov_b32_e32 v9, v8
+; GFX11-NEXT: v_mov_b32_e32 v10, v8
; GFX11-NEXT: v_mov_b32_e32 v11, v8
; GFX11-NEXT: v_mov_b32_e32 v12, v8
-; GFX11-NEXT: v_mov_b32_e32 v10, v8
-; GFX11-NEXT: v_mov_b32_e32 v9, v8
; GFX11-NEXT: s_mov_b32 s2, s4
; GFX11-NEXT: s_mov_b32 s3, s5
; GFX11-NEXT: s_mov_b32 s4, s6
; GFX11-NEXT: s_mov_b32 s5, s7
; GFX11-NEXT: s_mov_b32 s6, s8
; GFX11-NEXT: s_mov_b32 s7, s9
-; GFX11-NEXT: v_mov_b32_e32 v0, v8
-; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9
-; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12
+; GFX11-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9
+; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11
+; GFX11-NEXT: v_mov_b32_e32 v4, v12
; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v8, v4, s[10:11]
@@ -190,19 +190,19 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
; GFX11-NEXT: s_mov_b32 s0, s2
; GFX11-NEXT: s_mov_b32 s1, s3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_mov_b32_e32 v9, v8
+; GFX11-NEXT: v_mov_b32_e32 v10, v8
; GFX11-NEXT: v_mov_b32_e32 v11, v8
; GFX11-NEXT: v_mov_b32_e32 v12, v8
-; GFX11-NEXT: v_mov_b32_e32 v10, v8
-; GFX11-NEXT: v_mov_b32_e32 v9, v8
; GFX11-NEXT: s_mov_b32 s2, s4
; GFX11-NEXT: s_mov_b32 s3, s5
; GFX11-NEXT: s_mov_b32 s4, s6
; GFX11-NEXT: s_mov_b32 s5, s7
; GFX11-NEXT: s_mov_b32 s6, s8
; GFX11-NEXT: s_mov_b32 s7, s9
-; GFX11-NEXT: v_mov_b32_e32 v0, v8
-; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9
-; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12
+; GFX11-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9
+; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11
+; GFX11-NEXT: v_mov_b32_e32 v4, v12
; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v8, v4, s[10:11]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
index 5ef8712b82c40..02027dd4bbc3f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
@@ -525,8 +525,8 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1)
; GFX11-NEXT: ; %bb.1: ; %else
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v2, v0, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_add_nc_u32 v3, v3, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
+; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
; GFX11-NEXT: ; %bb.2: ; %Flow
; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-NEXT: ; %bb.3: ; %if
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
index e4ac4ea4d7223..068f57939c6ca 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll
@@ -125,13 +125,13 @@ define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1
; GFX11-LABEL: load_1d_tfe:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, 0
+; GFX11-NEXT: v_mov_b32_e32 v7, v6
+; GFX11-NEXT: v_mov_b32_e32 v8, v6
; GFX11-NEXT: v_mov_b32_e32 v9, v6
; GFX11-NEXT: v_mov_b32_e32 v10, v6
-; GFX11-NEXT: v_mov_b32_e32 v8, v6
-; GFX11-NEXT: v_mov_b32_e32 v7, v6
-; GFX11-NEXT: v_mov_b32_e32 v0, v6
-; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v1, v7
-; GFX11-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v10
+; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7
+; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9
+; GFX11-NEXT: v_mov_b32_e32 v4, v10
; GFX11-NEXT: image_load v[0:4], v5, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v6, v4, s[8:9]
@@ -229,13 +229,13 @@ define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1
; GFX11-LABEL: load_1d_lwe:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, 0
+; GFX11-NEXT: v_mov_b32_e32 v7, v6
+; GFX11-NEXT: v_mov_b32_e32 v8, v6
; GFX11-NEXT: v_mov_b32_e32 v9, v6
; GFX11-NEXT: v_mov_b32_e32 v10, v6
-; GFX11-NEXT: v_mov_b32_e32 v8, v6
-; GFX11-NEXT: v_mov_b32_e32 v7, v6
-; GFX11-NEXT: v_mov_b32_e32 v0, v6
-; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v1, v7
-; GFX11-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v10
+; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7
+; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9
+; GFX11-NEXT: v_mov_b32_e32 v4, v10
; GFX11-NEXT: image_load v[0:4], v5, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm lwe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v6, v4, s[8:9]
@@ -372,14 +372,13 @@ define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1
; GFX11-LABEL: load_2d_tfe:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v6, v1
-; GFX11-NEXT: v_mov_b32_e32 v5, v0
+; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, v7
; GFX11-NEXT: v_mov_b32_e32 v9, v7
-; GFX11-NEXT: v_mov_b32_e32 v11, v7
; GFX11-NEXT: v_mov_b32_e32 v10, v7
-; GFX11-NEXT: v_mov_b32_e32 v8, v7
-; GFX11-NEXT: v_mov_b32_e32 v0, v7
+; GFX11-NEXT: v_mov_b32_e32 v11, v7
+; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX11-NEXT: v_dual_mov_b32 v1, v8 :: v_dual_mov_b32 v4, v11
+; GFX11-NEXT: v_mov_b32_e32 v4, v11
; GFX11-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v7, v4, s[8:9]
@@ -521,13 +520,13 @@ define amdgpu_ps <4 x float> @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, ptr addrspa
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, 0
; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1
+; GFX11-NEXT: v_mov_b32_e32 v9, v8
+; GFX11-NEXT: v_mov_b32_e32 v10, v8
; GFX11-NEXT: v_mov_b32_e32 v11, v8
; GFX11-NEXT: v_mov_b32_e32 v12, v8
-; GFX11-NEXT: v_mov_b32_e32 v10, v8
-; GFX11-NEXT: v_mov_b32_e32 v9, v8
-; GFX11-NEXT: v_mov_b32_e32 v0, v8
-; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9
-; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12
+; GFX11-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9
+; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11
+; GFX11-NEXT: v_mov_b32_e32 v4, v12
; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v8, v4, s[8:9]
@@ -669,13 +668,13 @@ define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, ptr addrspace
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, 0
; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1
+; GFX11-NEXT: v_mov_b32_e32 v9, v8
+; GFX11-NEXT: v_mov_b32_e32 v10, v8
; GFX11-NEXT: v_mov_b32_e32 v11, v8
; GFX11-NEXT: v_mov_b32_e32 v12, v8
-; GFX11-NEXT: v_mov_b32_e32 v10, v8
-; GFX11-NEXT: v_mov_b32_e32 v9, v8
-; GFX11-NEXT: v_mov_b32_e32 v0, v8
-; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9
-; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12
+; GFX11-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9
+; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11
+; GFX11-NEXT: v_mov_b32_e32 v4, v12
; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm lwe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v8, v4, s[8:9]
@@ -812,14 +811,13 @@ define amdgpu_ps <4 x float> @load_1darray_tfe(<8 x i32> inreg %rsrc, ptr addrsp
; GFX11-LABEL: load_1darray_tfe:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v6, v1
-; GFX11-NEXT: v_mov_b32_e32 v5, v0
+; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, v7
; GFX11-NEXT: v_mov_b32_e32 v9, v7
-; GFX11-NEXT: v_mov_b32_e32 v11, v7
; GFX11-NEXT: v_mov_b32_e32 v10, v7
-; GFX11-NEXT: v_mov_b32_e32 v8, v7
-; GFX11-NEXT: v_mov_b32_e32 v0, v7
+; GFX11-NEXT: v_mov_b32_e32 v11, v7
+; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX11-NEXT: v_dual_mov_b32 v1, v8 :: v_dual_mov_b32 v4, v11
+; GFX11-NEXT: v_mov_b32_e32 v4, v11
; GFX11-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v7, v4, s[8:9]
@@ -961,13 +959,13 @@ define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, ptr addrsp
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, 0
; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1
+; GFX11-NEXT: v_mov_b32_e32 v9, v8
+; GFX11-NEXT: v_mov_b32_e32 v10, v8
; GFX11-NEXT: v_mov_b32_e32 v11, v8
; GFX11-NEXT: v_mov_b32_e32 v12, v8
-; GFX11-NEXT: v_mov_b32_e32 v10, v8
-; GFX11-NEXT: v_mov_b32_e32 v9, v8
-; GFX11-NEXT: v_mov_b32_e32 v0, v8
-; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9
-; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12
+; GFX11-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9
+; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11
+; GFX11-NEXT: v_mov_b32_e32 v4, v12
; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm lwe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v8, v4, s[8:9]
@@ -1109,13 +1107,13 @@ define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, ptr addrsp
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, 0
; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1
+; GFX11-NEXT: v_mov_b32_e32 v9, v8
+; GFX11-NEXT: v_mov_b32_e32 v10, v8
; GFX11-NEXT: v_mov_b32_e32 v11, v8
; GFX11-NEXT: v_mov_b32_e32 v12, v8
-; GFX11-NEXT: v_mov_b32_e32 v10, v8
-; GFX11-NEXT: v_mov_b32_e32 v9, v8
-; GFX11-NEXT: v_mov_b32_e32 v0, v8
-; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9
-; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12
+; GFX11-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9
+; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11
+; GFX11-NEXT: v_mov_b32_e32 v4, v12
; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v8, v4, s[8:9]
@@ -1261,14 +1259,13 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr ad
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v8, v3
; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1
-; GFX11-NEXT: v_mov_b32_e32 v5, v0
+; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v10, v9
; GFX11-NEXT: v_mov_b32_e32 v11, v9
-; GFX11-NEXT: v_mov_b32_e32 v13, v9
; GFX11-NEXT: v_mov_b32_e32 v12, v9
-; GFX11-NEXT: v_mov_b32_e32 v10, v9
-; GFX11-NEXT: v_mov_b32_e32 v0, v9
+; GFX11-NEXT: v_mov_b32_e32 v13, v9
+; GFX11-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v1, v10
; GFX11-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12
-; GFX11-NEXT: v_dual_mov_b32 v1, v10 :: v_dual_mov_b32 v4, v13
+; GFX11-NEXT: v_mov_b32_e32 v4, v13
; GFX11-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v9, v4, s[8:9]
@@ -1405,14 +1402,13 @@ define amdgpu_ps <4 x float> @load_mip_1d_lwe(<8 x i32> inreg %rsrc, ptr addrspa
; GFX11-LABEL: load_mip_1d_lwe:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v6, v1
-; GFX11-NEXT: v_mov_b32_e32 v5, v0
+; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, v7
; GFX11-NEXT: v_mov_b32_e32 v9, v7
-; GFX11-NEXT: v_mov_b32_e32 v11, v7
; GFX11-NEXT: v_mov_b32_e32 v10, v7
-; GFX11-NEXT: v_mov_b32_e32 v8, v7
-; GFX11-NEXT: v_mov_b32_e32 v0, v7
+; GFX11-NEXT: v_mov_b32_e32 v11, v7
+; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX11-NEXT: v_dual_mov_b32 v1, v8 :: v_dual_mov_b32 v4, v11
+; GFX11-NEXT: v_mov_b32_e32 v4, v11
; GFX11-NEXT: image_load_mip v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm lwe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v7, v4, s[8:9]
@@ -1554,13 +1550,13 @@ define amdgpu_ps <4 x float> @load_mip_2d_tfe(<8 x i32> inreg %rsrc, ptr addrspa
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, 0
; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1
+; GFX11-NEXT: v_mov_b32_e32 v9, v8
+; GFX11-NEXT: v_mov_b32_e32 v10, v8
; GFX11-NEXT: v_mov_b32_e32 v11, v8
; GFX11-NEXT: v_mov_b32_e32 v12, v8
-; GFX11-NEXT: v_mov_b32_e32 v10, v8
-; GFX11-NEXT: v_mov_b32_e32 v9, v8
-; GFX11-NEXT: v_mov_b32_e32 v0, v8
-; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9
-; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12
+; GFX11-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9
+; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11
+; GFX11-NEXT: v_mov_b32_e32 v4, v12
; GFX11-NEXT: image_load_mip v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v8, v4, s[8:9]
@@ -1958,11 +1954,11 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask3(<8 x i32> inreg %rsrc, ptr a
; GFX11-LABEL: load_1d_tfe_V4_dmask3:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, 0
+; GFX11-NEXT: v_mov_b32_e32 v6, v5
; GFX11-NEXT: v_mov_b32_e32 v7, v5
; GFX11-NEXT: v_mov_b32_e32 v8, v5
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v3, v8
-; GFX11-NEXT: v_dual_mov_b32 v1, v6 :: v_dual_mov_b32 v2, v7
+; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
+; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
; GFX11-NEXT: image_load v[0:3], v4, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v5, v3, s[8:9]
@@ -2048,10 +2044,10 @@ define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask2(<8 x i32> inreg %rsrc, ptr a
; GFX11-LABEL: load_1d_tfe_V4_dmask2:
; GFX11: ; %bb.0: ; %main_body
; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, 0
-; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: v_mov_b32_e32 v5, v4
-; GFX11-NEXT: v_mov_b32_e32 v0, v4
-; GFX11-NEXT: v_dual_mov_b32 v2, v6 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT: v_mov_b32_e32 v6, v4
+; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT: v_mov_b32_e32 v2, v6
; GFX11-NEXT: image_load v[0:2], v3, s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_1D unorm tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v4, v2, s[8:9]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
index d665a3a17fe3c..2dd8bfb4263b7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll
@@ -105,13 +105,13 @@ define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX11-NEXT: s_mov_b32 s14, exec_lo
; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, 0
+; GFX11-NEXT: v_mov_b32_e32 v7, v6
+; GFX11-NEXT: v_mov_b32_e32 v8, v6
; GFX11-NEXT: v_mov_b32_e32 v9, v6
; GFX11-NEXT: v_mov_b32_e32 v10, v6
-; GFX11-NEXT: v_mov_b32_e32 v8, v6
-; GFX11-NEXT: v_mov_b32_e32 v7, v6
-; GFX11-NEXT: v_mov_b32_e32 v0, v6
-; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v1, v7
-; GFX11-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v10
+; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7
+; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9
+; GFX11-NEXT: v_mov_b32_e32 v4, v10
; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX11-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -568,13 +568,13 @@ define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX11-NEXT: s_mov_b32 s14, exec_lo
; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, 0
+; GFX11-NEXT: v_mov_b32_e32 v7, v6
+; GFX11-NEXT: v_mov_b32_e32 v8, v6
; GFX11-NEXT: v_mov_b32_e32 v9, v6
; GFX11-NEXT: v_mov_b32_e32 v10, v6
-; GFX11-NEXT: v_mov_b32_e32 v8, v6
-; GFX11-NEXT: v_mov_b32_e32 v7, v6
-; GFX11-NEXT: v_mov_b32_e32 v0, v6
-; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v1, v7
-; GFX11-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v10
+; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7
+; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v3, v9
+; GFX11-NEXT: v_mov_b32_e32 v4, v10
; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX11-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D lwe
; GFX11-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/vopd-combine.mir b/llvm/test/CodeGen/AMDGPU/vopd-combine.mir
index 0da0d9c23a891..2cbabe61d1fe7 100644
--- a/llvm/test/CodeGen/AMDGPU/vopd-combine.mir
+++ b/llvm/test/CodeGen/AMDGPU/vopd-combine.mir
@@ -19,6 +19,7 @@
define void @vopd_schedule_unconstrained_2() { ret void }
define void @vopd_mov_fixup() { ret void }
define void @vopd_mov_fixup_fail() { ret void }
+ define void @vopd_no_combine_dependent_subreg() { ret void }
...
---
@@ -541,3 +542,22 @@ body: |
$vgpr0 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, implicit $exec
$vgpr1 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds + 4, implicit $exec
...
+
+---
+name: vopd_no_combine_dependent_subreg
+tracksRegLiveness: true
+body: |
+ bb.0:
+
+ ; SCHED-LABEL: name: vopd_no_combine_dependent_subreg
+ ; SCHED: $vgpr0 = IMPLICIT_DEF
+ ; SCHED-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit-def $vgpr2_vgpr3, implicit $exec
+ ; SCHED-NEXT: $vgpr5 = V_ADD_F32_e32 killed $vgpr0, killed $vgpr3, implicit $mode, implicit $exec
+ ; PAIR-LABEL: name: vopd_no_combine_dependent_subreg
+ ; PAIR: $vgpr0 = IMPLICIT_DEF
+ ; PAIR-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit-def $vgpr2_vgpr3, implicit $exec
+ ; PAIR-NEXT: $vgpr5 = V_ADD_F32_e32 killed $vgpr0, killed $vgpr3, implicit $mode, implicit $exec
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr2 = V_MOV_B32_e32 0, implicit-def $vgpr2_vgpr3, implicit $exec
+ $vgpr5 = V_ADD_F32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec
+...
More information about the llvm-commits
mailing list