[llvm] [LLVM] Combine v_cvt_f32_f16 and v_add_f32/v_mul_f32 into v_fma_mix_f32 (PR #160151)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 22 09:51:18 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Acim Maravic (Acim-Maravic)
<details>
<summary>Changes</summary>
---
Patch is 377.86 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/160151.diff
14 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (+18)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h (+6)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+2)
- (modified) llvm/lib/Target/AMDGPU/VOP3PInstructions.td (+25)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll (+46-84)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll (+21-32)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll (+579-629)
- (modified) llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll (+14-18)
- (modified) llvm/test/CodeGen/AMDGPU/fdiv.f16.ll (+17-20)
- (modified) llvm/test/CodeGen/AMDGPU/fpext-free.ll (+76-92)
- (modified) llvm/test/CodeGen/AMDGPU/frem.ll (+162-184)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll (+136-190)
- (modified) llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll (+643-7)
- (modified) llvm/test/CodeGen/AMDGPU/mad-mix.ll (+2926-69)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index d4210b8bc9a87..9654a6e1fbd5f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -4127,6 +4127,24 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
return true;
}
+bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsNeg(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ unsigned Mods = 0;
+ SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16);
+ Mods ^= SISrcMods::NEG;
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16ModsNeg(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ unsigned Mods = 0;
+ SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16);
+ Mods ^= SISrcMods::NEG;
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+}
+
// Match BITOP3 operation and return a number of matched instructions plus
// truth table.
static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 4fa0d3f72e1c7..b122b5cd310b3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -272,6 +272,12 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
bool SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
SDValue &SrcMods) const;
+ bool SelectVOP3PMadMixModsNeg(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const;
+
+ bool SelectVOP3PMadMixBF16ModsNeg(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const;
+
bool SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1, SDValue &Src2,
SDValue &Tbl) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index fb2cd04b364d7..af6d4ff319fd9 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1704,6 +1704,8 @@ def VOP3PMadMixModsExt : ComplexPattern<untyped, 2, "SelectVOP3PMadMixModsExt">;
def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">;
def VOP3PMadMixBF16ModsExt : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16ModsExt">;
def VOP3PMadMixBF16Mods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16Mods">;
+def VOP3PMadMixModsNeg : ComplexPattern<untyped, 2, "SelectVOP3PMadMixModsNeg">;
+def VOP3PMadMixBF16ModsNeg : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16ModsNeg">;
def VINTERPMods : ComplexPattern<untyped, 2, "SelectVINTERPMods">;
def VINTERPModsHi : ComplexPattern<untyped, 2, "SelectVINTERPModsHi">;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index f7279b664ed27..5ea8dbe9a1b7f 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -168,6 +168,7 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
ValueType vecVT = v2f16> {
defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods);
defvar VOP3PMadMixModsExtPat = !if (!eq(VT, bf16), VOP3PMadMixBF16ModsExt, VOP3PMadMixModsExt);
+ defvar VOP3PMadMixModsNegPat = !if (!eq(VT, bf16), VOP3PMadMixBF16ModsNeg, VOP3PMadMixModsNeg);
// At least one of the operands needs to be an fpextend of an f16
// for this to be worthwhile, so we need three patterns here.
// TODO: Could we use a predicate to inspect src1/2/3 instead?
@@ -190,6 +191,30 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
(mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
DSTCLAMP.NONE)>;
+ def : GCNPat <
+ (f32 (fadd (f32 (VOP3PMadMixModsExtPat VT:$src0, i32:$src0_mods)),
+ (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_mods)))),
+ (mix_inst $src0_mods, $src0, (i32 0), (i32 1), $src1_mods, $src1,
+ DSTCLAMP.NONE)>;
+
+ def : GCNPat <
+ (f32 (fmul (f32 (VOP3PMadMixModsExtPat VT:$src0, i32:$src0_mods)),
+ (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_mods)))),
+ (mix_inst $src0_mods, $src0, $src1_mods, $src1, (i32 0), (i32 0),
+ DSTCLAMP.NONE)>;
+
+ def : GCNPat <
+ (f32 (fsub (f32 (VOP3PMadMixModsExtPat VT:$src0, i32:$src0_mods)),
+ (f32 (VOP3PMadMixModsNegPat f32:$src1, i32:$src1_mods)))),
+ (mix_inst $src0_mods, $src0, (i32 0), (i32 1), $src1_mods, $src1,
+ DSTCLAMP.NONE)>;
+
+ def : GCNPat <
+ (f32 (fsub (f32 (VOP3PMadMixModsNegPat f32:$src0, i32:$src0_mods)),
+ (f32 (VOP3PMadMixModsExtPat VT:$src1, i32:$src1_mods)))),
+ (mix_inst $src0_mods, $src0, (i32 0), (i32 1), $src1_mods, $src1,
+ DSTCLAMP.NONE)>;
+
def : GCNPat <
(AMDGPUclamp (build_vector
(VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$lo_src0, i32:$lo_src0_modifiers)),
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll
index b2b433167fe4d..f9b63ef8e96e5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll
@@ -53,16 +53,14 @@ define amdgpu_vs float @test_f16_f32_add_ext_fma_mul(half %x, half %y, float %z,
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: v_mul_f16_e32 v3, v3, v4
; GFX10-NEXT: v_fmac_f16_e32 v3, v0, v1
-; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v3
-; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_fma_mix_f32 v0, v3, 1, v2 op_sel_hi:[1,0,0]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX10-CONTRACT-LABEL: test_f16_f32_add_ext_fma_mul:
; GFX10-CONTRACT: ; %bb.0: ; %.entry
; GFX10-CONTRACT-NEXT: v_mul_f16_e32 v3, v3, v4
; GFX10-CONTRACT-NEXT: v_fmac_f16_e32 v3, v0, v1
-; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v0, v3
-; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v3, 1, v2 op_sel_hi:[1,0,0]
; GFX10-CONTRACT-NEXT: ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_f16_f32_add_ext_fma_mul:
@@ -70,8 +68,7 @@ define amdgpu_vs float @test_f16_f32_add_ext_fma_mul(half %x, half %y, float %z,
; GFX10-DENORM-NEXT: v_mul_f16_e32 v3, v3, v4
; GFX10-DENORM-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX10-DENORM-NEXT: v_add_f16_e32 v0, v0, v3
-; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v0, 1, v2 op_sel_hi:[1,0,0]
; GFX10-DENORM-NEXT: ; return to shader part epilog
.entry:
%a = fmul half %u, %v
@@ -129,16 +126,14 @@ define amdgpu_vs float @test_f16_f32_add_ext_fma_mul_rhs(float %x, half %y, half
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: v_mul_f16_e32 v3, v3, v4
; GFX10-NEXT: v_fmac_f16_e32 v3, v1, v2
-; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v3
-; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10-NEXT: v_fma_mix_f32 v0, v3, 1, v0 op_sel_hi:[1,0,0]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX10-CONTRACT-LABEL: test_f16_f32_add_ext_fma_mul_rhs:
; GFX10-CONTRACT: ; %bb.0: ; %.entry
; GFX10-CONTRACT-NEXT: v_mul_f16_e32 v3, v3, v4
; GFX10-CONTRACT-NEXT: v_fmac_f16_e32 v3, v1, v2
-; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v1, v3
-; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v3, 1, v0 op_sel_hi:[1,0,0]
; GFX10-CONTRACT-NEXT: ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_f16_f32_add_ext_fma_mul_rhs:
@@ -146,8 +141,7 @@ define amdgpu_vs float @test_f16_f32_add_ext_fma_mul_rhs(float %x, half %y, half
; GFX10-DENORM-NEXT: v_mul_f16_e32 v3, v3, v4
; GFX10-DENORM-NEXT: v_mul_f16_e32 v1, v1, v2
; GFX10-DENORM-NEXT: v_add_f16_e32 v1, v1, v3
-; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v1, 1, v0 op_sel_hi:[1,0,0]
; GFX10-DENORM-NEXT: ; return to shader part epilog
.entry:
%a = fmul half %u, %v
@@ -230,48 +224,36 @@ define amdgpu_vs <4 x float> @test_v4f16_v4f32_add_ext_fma_mul(<4 x half> %x, <4
; GFX9-DENORM-NEXT: v_pk_mul_f16 v9, v9, v11
; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX9-DENORM-NEXT: v_pk_add_f16 v0, v0, v8
-; GFX9-DENORM-NEXT: v_pk_add_f16 v1, v1, v9
-; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX9-DENORM-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v8, v1
-; GFX9-DENORM-NEXT: v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-DENORM-NEXT: v_add_f32_e32 v0, v2, v4
-; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v3, v5
-; GFX9-DENORM-NEXT: v_add_f32_e32 v2, v8, v6
-; GFX9-DENORM-NEXT: v_add_f32_e32 v3, v9, v7
+; GFX9-DENORM-NEXT: v_pk_add_f16 v2, v0, v8
+; GFX9-DENORM-NEXT: v_pk_add_f16 v3, v1, v9
+; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v2, 1, v4 op_sel_hi:[1,0,0]
+; GFX9-DENORM-NEXT: v_mad_mix_f32 v1, v2, 1, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX9-DENORM-NEXT: v_mad_mix_f32 v2, v3, 1, v6 op_sel_hi:[1,0,0]
+; GFX9-DENORM-NEXT: v_mad_mix_f32 v3, v3, 1, v7 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX9-DENORM-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: test_v4f16_v4f32_add_ext_fma_mul:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: v_pk_mul_f16 v8, v8, v10
; GFX10-NEXT: v_pk_mul_f16 v9, v9, v11
-; GFX10-NEXT: v_pk_fma_f16 v0, v0, v2, v8
-; GFX10-NEXT: v_pk_fma_f16 v1, v1, v3, v9
-; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f32_f16_e32 v8, v1
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_add_f32_e32 v0, v2, v4
-; GFX10-NEXT: v_add_f32_e32 v1, v3, v5
-; GFX10-NEXT: v_add_f32_e32 v2, v8, v6
-; GFX10-NEXT: v_add_f32_e32 v3, v9, v7
+; GFX10-NEXT: v_pk_fma_f16 v2, v0, v2, v8
+; GFX10-NEXT: v_pk_fma_f16 v3, v1, v3, v9
+; GFX10-NEXT: v_fma_mix_f32 v0, v2, 1, v4 op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_fma_mix_f32 v1, v2, 1, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_fma_mix_f32 v2, v3, 1, v6 op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_fma_mix_f32 v3, v3, 1, v7 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX10-CONTRACT-LABEL: test_v4f16_v4f32_add_ext_fma_mul:
; GFX10-CONTRACT: ; %bb.0: ; %.entry
; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v8, v8, v10
; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v9, v9, v11
-; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v8
-; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v9
-; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v8, v1
-; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v2, v4
-; GFX10-CONTRACT-NEXT: v_add_f32_e32 v1, v3, v5
-; GFX10-CONTRACT-NEXT: v_add_f32_e32 v2, v8, v6
-; GFX10-CONTRACT-NEXT: v_add_f32_e32 v3, v9, v7
+; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v2, v0, v2, v8
+; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v3, v1, v3, v9
+; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v2, 1, v4 op_sel_hi:[1,0,0]
+; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v1, v2, 1, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v2, v3, 1, v6 op_sel_hi:[1,0,0]
+; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v3, v3, 1, v7 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX10-CONTRACT-NEXT: ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_v4f16_v4f32_add_ext_fma_mul:
@@ -280,16 +262,12 @@ define amdgpu_vs <4 x float> @test_v4f16_v4f32_add_ext_fma_mul(<4 x half> %x, <4
; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
; GFX10-DENORM-NEXT: v_pk_mul_f16 v2, v9, v11
; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v0, v8
-; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v1, v2
-; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v8, v1
-; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v2, v4
-; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v3, v5
-; GFX10-DENORM-NEXT: v_add_f32_e32 v2, v8, v6
-; GFX10-DENORM-NEXT: v_add_f32_e32 v3, v9, v7
+; GFX10-DENORM-NEXT: v_pk_add_f16 v3, v0, v8
+; GFX10-DENORM-NEXT: v_pk_add_f16 v8, v1, v2
+; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v3, 1, v4 op_sel_hi:[1,0,0]
+; GFX10-DENORM-NEXT: v_fma_mix_f32 v1, v3, 1, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX10-DENORM-NEXT: v_fma_mix_f32 v2, v8, 1, v6 op_sel_hi:[1,0,0]
+; GFX10-DENORM-NEXT: v_fma_mix_f32 v3, v8, 1, v7 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX10-DENORM-NEXT: ; return to shader part epilog
.entry:
%a = fmul <4 x half> %u, %v
@@ -374,14 +352,10 @@ define amdgpu_vs <4 x float> @test_v4f16_v4f32_add_ext_fma_mul_rhs(<4 x float> %
; GFX9-DENORM-NEXT: v_pk_mul_f16 v5, v5, v7
; GFX9-DENORM-NEXT: v_pk_add_f16 v4, v4, v8
; GFX9-DENORM-NEXT: v_pk_add_f16 v5, v5, v9
-; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v6, v4
-; GFX9-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v7, v5
-; GFX9-DENORM-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-DENORM-NEXT: v_add_f32_e32 v0, v0, v6
-; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX9-DENORM-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX9-DENORM-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v4, 1, v0 op_sel_hi:[1,0,0]
+; GFX9-DENORM-NEXT: v_mad_mix_f32 v1, v4, 1, v1 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX9-DENORM-NEXT: v_mad_mix_f32 v2, v5, 1, v2 op_sel_hi:[1,0,0]
+; GFX9-DENORM-NEXT: v_mad_mix_f32 v3, v5, 1, v3 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX9-DENORM-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: test_v4f16_v4f32_add_ext_fma_mul_rhs:
@@ -390,14 +364,10 @@ define amdgpu_vs <4 x float> @test_v4f16_v4f32_add_ext_fma_mul_rhs(<4 x float> %
; GFX10-NEXT: v_pk_mul_f16 v9, v9, v11
; GFX10-NEXT: v_pk_fma_f16 v4, v4, v6, v8
; GFX10-NEXT: v_pk_fma_f16 v5, v5, v7, v9
-; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v4
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v5
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_add_f32_e32 v0, v0, v6
-; GFX10-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX10-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX10-NEXT: v_fma_mix_f32 v0, v4, 1, v0 op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_fma_mix_f32 v1, v4, 1, v1 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_fma_mix_f32 v2, v5, 1, v2 op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_fma_mix_f32 v3, v5, 1, v3 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX10-CONTRACT-LABEL: test_v4f16_v4f32_add_ext_fma_mul_rhs:
@@ -406,14 +376,10 @@ define amdgpu_vs <4 x float> @test_v4f16_v4f32_add_ext_fma_mul_rhs(<4 x float> %
; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v9, v9, v11
; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v4, v4, v6, v8
; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v5, v5, v7, v9
-; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v6, v4
-; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v7, v5
-; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v0, v6
-; GFX10-CONTRACT-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX10-CONTRACT-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-CONTRACT-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v4, 1, v0 op_sel_hi:[1,0,0]
+; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v1, v4, 1, v1 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v2, v5, 1, v2 op_sel_hi:[1,0,0]
+; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v3, v5, 1, v3 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX10-CONTRACT-NEXT: ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_v4f16_v4f32_add_ext_fma_mul_rhs:
@@ -424,14 +390,10 @@ define amdgpu_vs <4 x float> @test_v4f16_v4f32_add_ext_fma_mul_rhs(<4 x float> %
; GFX10-DENORM-NEXT: v_pk_mul_f16 v5, v5, v7
; GFX10-DENORM-NEXT: v_pk_add_f16 v4, v4, v8
; GFX10-DENORM-NEXT: v_pk_add_f16 v5, v5, v6
-; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v6, v4
-; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v7, v5
-; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v0, v6
-; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX10-DENORM-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-DENORM-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v4, 1, v0 op_sel_hi:[1,0,0]
+; GFX10-DENORM-NEXT: v_fma_mix_f32 v1, v4, 1, v1 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX10-DENORM-NEXT: v_fma_mix_f32 v2, v5, 1, v2 op_sel_hi:[1,0,0]
+; GFX10-DENORM-NEXT: v_fma_mix_f32 v3, v5, 1, v3 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX10-DENORM-NEXT: ; return to shader part epilog
.entry:
%a = fmul <4 x half> %u, %v
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll
index 4d603f7487754..26f8e41c9351a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll
@@ -49,21 +49,16 @@ define amdgpu_vs <5 x float> @test_5xf16_5xf32_add_ext_mul(<5 x half> inreg %x,
; GFX9-FAST-DENORM-LABEL: test_5xf16_5xf32_add_ext_mul:
; GFX9-FAST-DENORM: ; %bb.0: ; %.entry
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v0, s0, v0
-; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v1, s1, v1
-; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v2, s2, v2
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v5, v1
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s6, v3
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v1, s7, v4
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v2, s8, v5
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v3, s9, v6
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v4, s10, v7
+; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v1, s0, v0
+; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v3, s1, v0
+; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v4, s2, v0
+; GFX9-FAST-DENORM-NEXT: v_mad_mix_f32 v0, v1, 1, s6 op_sel_hi:[1,0,0]
+; GFX9-FAST-DENORM-NEXT: v_mad_mix_f32 v1, v1, 1, s7 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX9-FAST-DENORM-NEXT: v_mad_mix_f32 v2, v3, 1, s8 op_sel_hi:[1,0,0]
+; GFX9-FAST-DENORM-NEXT: v_mad_mix_f32 v3, v3, 1, s9 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX9-FAST-DENORM-NEXT: v_mad_mix_f32 v4, v4, 1, s10 op_sel_hi:[1,0,0]
; GFX9-FAST-DENORM-NEXT: ; return to shader part epilog
;
; GFX10-FAST-DENORM-LABEL: test_5x...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/160151
More information about the llvm-commits mailing list