[llvm] [AMDGPU][True16][CodeGen] insert vgpr32 for SelectMadMixFma for f16/bf16 (PR #159648)
Brox Chen via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 22 11:20:17 PDT 2025
https://github.com/broxigarchen updated https://github.com/llvm/llvm-project/pull/159648
>From aaef22b942d71f449853d36550735c249065548b Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Mon, 22 Sep 2025 13:43:20 -0400
Subject: [PATCH] madmixfma mod
---
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 28 ++-
llvm/test/CodeGen/AMDGPU/fdiv.f16.ll | 36 ++-
.../AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll | 54 +++--
llvm/test/CodeGen/AMDGPU/frem.ll | 212 +++++++++---------
llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll | 75 +++++--
llvm/test/CodeGen/AMDGPU/mad-mix.ll | 5 +-
6 files changed, 220 insertions(+), 190 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index c2fca79979e1b..2158a4d0e2076 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -4078,18 +4078,26 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
// register.
Mods |= SISrcMods::OP_SEL_1;
- if (IsExtractHigh ||
- (Src.getValueSizeInBits() == 16 && isExtractHiElt(Src, Src))) {
- Mods |= SISrcMods::OP_SEL_0;
+ if (Src.getValueSizeInBits() == 16) {
+ if (isExtractHiElt(Src, Src)) {
+ Mods |= SISrcMods::OP_SEL_0;
- // TODO: Should we try to look for neg/abs here?
- }
+ // TODO: Should we try to look for neg/abs here?
+ return true;
+ }
+
+ if (Src.getOpcode() == ISD::TRUNCATE &&
+ Src.getOperand(0).getValueType() == MVT::i32) {
+ Src = Src.getOperand(0);
+ return true;
+ }
+
+ if (Subtarget->useRealTrue16Insts())
+ // In true16 mode, pack src to a 32bit
+ Src = createVOP3PSrc32FromLo16(Src, In, CurDAG, Subtarget);
+ } else if (IsExtractHigh)
+ Mods |= SISrcMods::OP_SEL_0;
- // Prevent unnecessary subreg COPY to VGPR_16
- if (Src.getOpcode() == ISD::TRUNCATE &&
- Src.getOperand(0).getValueType() == MVT::i32) {
- Src = Src.getOperand(0);
- }
return true;
}
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
index 210e09fd9169a..7f6a920d25016 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -137,33 +137,31 @@ define amdgpu_kernel void @v_fdiv_f16(
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v1, s[2:3] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] glc dlc
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v3, v1, s[4:5] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v0.l
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v3, v3
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v2.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v0, v0
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v4, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v4, v0
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v4, v2 op_sel_hi:[1,0,1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v4, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v4, v5, v0
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v4, v2 op_sel_hi:[1,0,1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mul_f32_e32 v3, v5, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v5, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff800000, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.h, v1.l, v0.l
-; GFX11-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_fdiv_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
index a859cc91b7fde..fe95d4561d0cd 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -1571,25 +1571,24 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind {
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0x46000000
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v1, v1
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.l
+; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v0, v0
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-TRUE16-NEXT: v_mul_f32_e32 v2, 0x46000000, v1
+; GFX11-TRUE16-NEXT: v_mul_f32_e32 v2, 0x46000000, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_fma_mix_f32 v4, -v3, v2, s0 op_sel_hi:[1,0,0]
-; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v2, v4, v1
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v3, -v1, v2, s0 op_sel_hi:[1,0,0]
+; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v2, v3, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_fma_mix_f32 v3, -v3, v2, s0 op_sel_hi:[1,0,0]
-; GFX11-TRUE16-NEXT: v_mul_f32_e32 v1, v3, v1
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v3, -v1, v2, s0 op_sel_hi:[1,0,0]
+; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v3, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff800000, v1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff800000, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1
-; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.h, v0.l, 0x7000
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v1.l, 0x7000
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
@@ -1739,25 +1738,24 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v0.l, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 2.0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.l, v0.l
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v1, v1
+; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v0, v0
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v1, v1
-; GFX11-TRUE16-NEXT: v_fma_mix_f32 v4, -v3, v2, s0 op_sel_hi:[1,0,0]
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v0, v0
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v3, -v1, v2, s0 op_sel_hi:[1,0,0]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v2, v4, v1
-; GFX11-TRUE16-NEXT: v_fma_mix_f32 v3, -v3, v2, s0 op_sel_hi:[1,0,0]
+; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v2, v3, v0
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v3, -v1, v2, s0 op_sel_hi:[1,0,0]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mul_f32_e32 v1, v3, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v3, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff800000, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.h, v0.l, 2.0
+; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v1.l, 2.0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index c4a38dcd7b5f3..78a961ea0da17 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -1433,37 +1433,35 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v3, v1, s[4:5] offset:8
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v4, v4
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4
-; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4
-; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
-; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.h, v1.l, v0.l
-; GFX11-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: fast_frem_f16:
@@ -1507,38 +1505,36 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX1150-TRUE16-NEXT: s_clause 0x1
; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-TRUE16-NEXT: s_clause 0x1
-; GFX1150-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX1150-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX1150-TRUE16-NEXT: global_load_d16_b16 v2, v1, s[2:3]
+; GFX1150-TRUE16-NEXT: global_load_d16_b16 v3, v1, s[4:5] offset:8
; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l
; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l
-; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v4, v4
-; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
-; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
-; GFX1150-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3
-; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
+; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h
-; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l
-; GFX1150-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v3.l
+; GFX1150-TRUE16-NEXT: global_store_b16 v1, v2, s[0:1]
; GFX1150-TRUE16-NEXT: s_endpgm
;
; GFX1150-FAKE16-LABEL: fast_frem_f16:
@@ -1583,38 +1579,36 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX1200-TRUE16-NEXT: s_clause 0x1
; GFX1200-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1200-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX1200-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1200-TRUE16-NEXT: s_clause 0x1
-; GFX1200-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX1200-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX1200-TRUE16-NEXT: global_load_d16_b16 v2, v1, s[2:3]
+; GFX1200-TRUE16-NEXT: global_load_d16_b16 v3, v1, s[4:5] offset:8
; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x1
-; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l
; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l
-; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v4, v4
-; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
-; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
-; GFX1200-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3
-; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
+; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h
-; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l
-; GFX1200-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v3.l
+; GFX1200-TRUE16-NEXT: global_store_b16 v1, v2, s[0:1]
; GFX1200-TRUE16-NEXT: s_endpgm
;
; GFX1200-FAKE16-LABEL: fast_frem_f16:
@@ -1840,37 +1834,35 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v3, v1, s[4:5] offset:8
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v4, v4
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4
-; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4
-; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
-; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.h, v1.l, v0.l
-; GFX11-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: unsafe_frem_f16:
@@ -1914,38 +1906,36 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX1150-TRUE16-NEXT: s_clause 0x1
; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-TRUE16-NEXT: s_clause 0x1
-; GFX1150-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX1150-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX1150-TRUE16-NEXT: global_load_d16_b16 v2, v1, s[2:3]
+; GFX1150-TRUE16-NEXT: global_load_d16_b16 v3, v1, s[4:5] offset:8
; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l
; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l
-; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v4, v4
-; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
-; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
-; GFX1150-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3
-; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
+; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h
-; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l
-; GFX1150-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v3.l
+; GFX1150-TRUE16-NEXT: global_store_b16 v1, v2, s[0:1]
; GFX1150-TRUE16-NEXT: s_endpgm
;
; GFX1150-FAKE16-LABEL: unsafe_frem_f16:
@@ -1990,38 +1980,36 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX1200-TRUE16-NEXT: s_clause 0x1
; GFX1200-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1200-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX1200-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1200-TRUE16-NEXT: s_clause 0x1
-; GFX1200-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX1200-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX1200-TRUE16-NEXT: global_load_d16_b16 v2, v1, s[2:3]
+; GFX1200-TRUE16-NEXT: global_load_d16_b16 v3, v1, s[4:5] offset:8
; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x1
-; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l
; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l
-; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v4, v4
-; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
-; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
-; GFX1200-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3
-; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
+; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h
-; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l
-; GFX1200-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v3.l
+; GFX1200-TRUE16-NEXT: global_store_b16 v1, v2, s[0:1]
; GFX1200-TRUE16-NEXT: s_endpgm
;
; GFX1200-FAKE16-LABEL: unsafe_frem_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
index 1ae3434db6da5..030b1b58f5b1a 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
@@ -65,10 +65,10 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo(half %src0, half %s
; SDAG-GFX11-TRUE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
; SDAG-GFX11-TRUE16: ; %bb.0:
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0x3c00
-; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX11-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
-; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x3c00
+; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX11-TRUE16-NEXT: v_fma_mixhi_f16 v0, v3, v1, v2 op_sel_hi:[1,1,1]
; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX11-FAKE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
@@ -137,13 +137,22 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo(half %src0, half %s
}
define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src1, half %src2, half %lo) #0 {
-; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX11-TRUE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
+; SDAG-GFX11-TRUE16: ; %bb.0:
+; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX11-TRUE16-NEXT: v_fma_mixhi_f16 v0, v4, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX11-FAKE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
+; SDAG-GFX11-FAKE16: ; %bb.0:
+; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-FAKE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
; GFX9: ; %bb.0:
@@ -172,6 +181,14 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src
; SDAG-CI-NEXT: v_mov_b32_e32 v0, v3
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
+; GISEL-GFX11: ; %bb.0:
+; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, v3
+; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
; GISEL-CI: ; %bb.0:
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -499,14 +516,25 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half
}
define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use(half %src0, half %src1, half %src2) #0 {
-; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
-; GFX11-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
-; GFX11-NEXT: global_store_b16 v[0:1], v3, off dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX11-TRUE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
+; SDAG-GFX11-TRUE16: ; %bb.0:
+; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
+; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX11-TRUE16-NEXT: v_fma_mixlo_f16 v1, v0, v3, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX11-TRUE16-NEXT: v_fma_mixhi_f16 v0, v0, v3, v2 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v1, off dlc
+; SDAG-GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX11-FAKE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
+; SDAG-GFX11-FAKE16: ; %bb.0:
+; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-FAKE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX11-FAKE16-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v3, off dlc
+; SDAG-GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
; GFX9: ; %bb.0:
@@ -542,6 +570,15 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi
; SDAG-CI-NEXT: s_waitcnt vmcnt(0)
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
+; GISEL-GFX11: ; %bb.0:
+; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GISEL-GFX11-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX11-NEXT: global_store_b16 v[0:1], v3, off dlc
+; GISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
; GISEL-CI: ; %bb.0:
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
index a4878539b1c74..95df131e21358 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
@@ -2253,9 +2253,10 @@ define float @v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo(i32 %src0.arg, half %
; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo:
; SDAG-GFX1100-TRUE16: ; %bb.0:
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; SDAG-GFX1100-TRUE16-NEXT: v_xor_b16 v2.l, 0x8000, v0.h
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v0, |v2|, v1, v0 op_sel_hi:[1,1,1]
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo:
More information about the llvm-commits
mailing list