[llvm] [AMDGPU][True16][CodeGen] optimize codengen for mad-mix in true16 (PR #124995)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 27 10:30:45 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Brox Chen (broxigarchen)
<details>
<summary>Changes</summary>
---
Patch is 31.16 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/124995.diff
4 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (+4)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+8)
- (modified) llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll (+58-105)
- (modified) llvm/test/CodeGen/AMDGPU/mad-mix.ll (+43-122)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 8e90754103ff1..57cf206a025ec 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3651,6 +3651,10 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
// TODO: Should we try to look for neg/abs here?
}
+ // Prevent unnecessary subreg COPY to VGPR_16
+ if (Subtarget->useRealTrue16Insts() && Src.getOpcode() == ISD::TRUNCATE) {
+ Src = Src.getOperand(0);
+ }
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index a787c10a9421c..479caec2aa628 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -5841,6 +5841,14 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
CheckAbsNeg();
}
+ // Since we looked through FPEXT and removed it, we must also remove
+ // G_TRUNC. G_TRUNC to 16-bits would have a destination in RC VGPR_16, which
+ // is not compatible with MadMix instructions
+ Register PeekSrc = Src;
+ if (Subtarget->useRealTrue16Insts() &&
+ mi_match(PeekSrc, *MRI, m_GTrunc(m_Reg(PeekSrc))))
+ Src = PeekSrc;
+
Matched = true;
}
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index 9949b823dfec1..884ebd9c0ea08 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -424,10 +424,11 @@ define <2 x half> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half
; SDAG-GFX1100-TRUE16: ; %bb.0:
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.h, v3.l
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v4, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32:
@@ -546,11 +547,12 @@ define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half
; SDAG-GFX1100-TRUE16: ; %bb.0:
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.h, v6.l
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v7, v2, v4 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v6
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v3f32:
@@ -716,14 +718,14 @@ define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.l
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.h, v6.l
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v1.h, v7.l
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v8, v2, v4 op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v1, v6, v3, v5 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v4f32:
@@ -924,27 +926,14 @@ define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half
; FIXME (DAG): Fold clamp
define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
-; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32_clamp_postcvt:
-; SDAG-GFX1100-TRUE16: ; %bb.0:
-; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v3, v4, v5 op_sel_hi:[1,1,1] clamp
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3
-; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_clamp_postcvt:
-; SDAG-GFX1100-FAKE16: ; %bb.0:
-; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
-; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX1100-FAKE16-NEXT: v_mov_b32_e32 v0, v3
-; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX1100-NEXT: v_mov_b32_e32 v0, v3
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt:
; GFX900: ; %bb.0:
@@ -1001,15 +990,6 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %s
; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v1 clamp
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt:
-; GISEL-GFX1100: ; %bb.0:
-; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
-; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, v3
-; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
-;
; GISEL-VI-LABEL: v_mad_mix_v2f32_clamp_postcvt:
; GISEL-VI: ; %bb.0:
; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1073,17 +1053,14 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v2.h, v6.l
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v2, v3, v5, v4 op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v1, v0.l, 0
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.h, v6.l
+; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v1, v1.l, 0
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-TRUE16-NEXT: v_pk_max_f16 v0, v2, v2 clamp
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1]
; SDAG-GFX1100-TRUE16-NEXT: v_pk_max_f16 v1, v1, v1 clamp
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; SDAG-GFX1100-TRUE16-NEXT: v_pk_max_f16 v0, v3, v3 clamp
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v3f32_clamp_postcvt:
@@ -1287,19 +1264,17 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s
; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v4f32_clamp_postcvt:
; SDAG-GFX1100-TRUE16: ; %bb.0:
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v7, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v2.h, v6.l
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.h, v7.l
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v6, v7, v4 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1]
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-TRUE16-NEXT: v_pk_max_f16 v1, v2, v2 clamp
-; SDAG-GFX1100-TRUE16-NEXT: v_pk_max_f16 v0, v0, v0 clamp
+; SDAG-GFX1100-TRUE16-NEXT: v_pk_max_f16 v0, v6, v6 clamp
+; SDAG-GFX1100-TRUE16-NEXT: v_pk_max_f16 v1, v7, v7 clamp
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v4f32_clamp_postcvt:
@@ -1542,14 +1517,11 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half>
; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo:
; SDAG-GFX1100-TRUE16: ; %bb.0:
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v3, v4, v5 op_sel_hi:[1,1,1]
; SDAG-GFX1100-TRUE16-NEXT: v_max_f16_e64 v3.l, v3.l, v3.l clamp
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -1712,13 +1684,9 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half>
; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi:
; SDAG-GFX1100-TRUE16: ; %bb.0:
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v3, v4, v5 op_sel_hi:[1,1,1]
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -1888,15 +1856,12 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr
; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32_clamp_precvt:
; SDAG-GFX1100-TRUE16: ; %bb.0:
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v1, v0, v1, v2 op_sel_hi:[1,1,1] clamp
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v1, v3, v4, v5 op_sel_hi:[1,1,1] clamp
-; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v3
; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -2049,18 +2014,13 @@ define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %sr
; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v3f32_clamp_precvt:
; SDAG-GFX1100-TRUE16: ; %bb.0:
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v8.l, v4.l
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v4.l, v5.l
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v2, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v2, v6, v7, v8 op_sel_hi:[1,1,1] clamp
-; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v1, v1, v3, v4 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v6
; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v2
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1
; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
@@ -2248,20 +2208,13 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr
; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v4f32_clamp_precvt:
; SDAG-GFX1100-TRUE16: ; %bb.0:
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.l
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.l
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v9.l, v2.l
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v10.l, v4.l
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v7, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v2, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v3, v1, v3, v5 op_sel_hi:[1,1,1] clamp
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v2, v8, v9, v10 op_sel_hi:[1,1,1] clamp
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v3, v6, v7, v11 op_sel_hi:[1,1,1] clamp
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1
-; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v6
+; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v7
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v2
; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.h, v3
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
index 4c2a16c17b38a..232b9eb063df9 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
@@ -198,26 +198,14 @@ define float @v_mad_mix_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x half> %
}
define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
-; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32:
-; SDAG-GFX1100-TRUE16: ; %bb.0:
-; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v1, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/124995
More information about the llvm-commits
mailing list