[llvm] [AMDGPU] Fix FMA Combine (PR #119217)

Mon Dec 9 07:36:36 PST 2024

https://github.com/piotrAMD updated https://github.com/llvm/llvm-project/pull/119217

>From 1ebcb75b2afd4b507310a0f59547b7dc02b44802 Mon Sep 17 00:00:00 2001
From: Piotr Sobczak <piotr.sobczak at amd.com>
Date: Mon, 9 Dec 2024 15:16:37 +0100
Subject: [PATCH 1/2] [AMDGPU] Fix FMA Combine

Update the check in the FMA combine to check dot10-insts instead
of dot7-insts.

The target of the combine, v_dot2_f32_f16, is available only
if dot10-insts target feature is enabled.

The issue probably dates back to the change that split out dot10-insts
out of dot7-insts.

As far as I can see, this does not affect any current targets, but
if a future target has dot7-insts, but not dot10-insts that would
cause a crash ("cannot select") for the input ir in the test.
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  2 +-
 llvm/test/CodeGen/AMDGPU/fma_combine.ll   | 70 +++++++++++++++++++++++
 2 files changed, 71 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/fma_combine.ll

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f89fe8faa600ba..8dfebd36a962e1 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -14696,7 +14696,7 @@ SDValue SITargetLowering::performFMACombine(SDNode *N,
   EVT VT = N->getValueType(0);
   SDLoc SL(N);
 
-  if (!Subtarget->hasDot7Insts() || VT != MVT::f32)
+  if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
     return SDValue();
 
   // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
diff --git a/llvm/test/CodeGen/AMDGPU/fma_combine.ll b/llvm/test/CodeGen/AMDGPU/fma_combine.ll
new file mode 100644
index 00000000000000..0414018862d071
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fma_combine.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr="+dot7-insts,-dot10-insts,-dot5-insts" < %s | FileCheck %s -check-prefix=DISABLED
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr="+dot7-insts,+dot10-insts,-dot5-insts" < %s | FileCheck %s -check-prefix=ENABLED
+
+; Test that FMACombine is disabled for a target without dot10-insts, and enabled for a target with dot10-insts.
+
+define amdgpu_kernel void @func() {
+; DISABLED-LABEL: func:
+; DISABLED:       ; %bb.0: ; %bb
+; DISABLED-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
+; DISABLED-NEXT:    v_mov_b32_e32 v2, 0
+; DISABLED-NEXT:    s_mov_b32 vcc_lo, exec_lo
+; DISABLED-NEXT:  .LBB0_1: ; %main
+; DISABLED-NEXT:    ; =>This Inner Loop Header: Depth=1
+; DISABLED-NEXT:    ds_load_b128 v[3:6], v1
+; DISABLED-NEXT:    ds_store_b32 v1, v0
+; DISABLED-NEXT:    s_waitcnt lgkmcnt(1)
+; DISABLED-NEXT:    v_fma_mix_f32 v2, v4, v3, v2 op_sel_hi:[1,1,0]
+; DISABLED-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; DISABLED-NEXT:    v_fma_mix_f32 v2, v4, v3, v2 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; DISABLED-NEXT:    v_add_f32_e32 v2, 0, v2
+; DISABLED-NEXT:    s_cbranch_vccnz .LBB0_1
+; DISABLED-NEXT:  ; %bb.2: ; %DummyReturnBlock
+; DISABLED-NEXT:    s_endpgm
+;
+; ENABLED-LABEL: func:
+; ENABLED:       ; %bb.0: ; %bb
+; ENABLED-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
+; ENABLED-NEXT:    v_mov_b32_e32 v2, 0
+; ENABLED-NEXT:    s_mov_b32 vcc_lo, exec_lo
+; ENABLED-NEXT:  .LBB0_1: ; %main
+; ENABLED-NEXT:    ; =>This Inner Loop Header: Depth=1
+; ENABLED-NEXT:    ds_load_b128 v[3:6], v1
+; ENABLED-NEXT:    ds_store_b32 v1, v0
+; ENABLED-NEXT:    s_waitcnt lgkmcnt(1)
+; ENABLED-NEXT:    v_dot2_f32_f16 v2, v4, v3, v2
+; ENABLED-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; ENABLED-NEXT:    v_add_f32_e32 v2, 0, v2
+; ENABLED-NEXT:    s_cbranch_vccnz .LBB0_1
+; ENABLED-NEXT:  ; %bb.2: ; %DummyReturnBlock
+; ENABLED-NEXT:    s_endpgm
+bb:
+  br label %main
+
+main:                        ; preds = %main, %bb
+  %.sroa.3 = phi float [ 0.000000e+00, %bb ], [ %i17, %main ]
+  %.sroa.4 = phi float [ 0.000000e+00, %bb ], [ %.sroa.3, %main ]
+  %i = load <8 x half>, ptr addrspace(3) null, align 16
+  %i1 = bitcast <8 x half> %i to i128
+  %i2 = trunc i128 %i1 to i32
+  %i3 = bitcast i32 %i2 to <2 x half>
+  %.sroa.1 = extractelement <2 x half> %i3, i64 0
+  %.sroa.2 = extractelement <2 x half> %i3, i64 1
+  %i4 = shufflevector <8 x half> %i, <8 x half> zeroinitializer, <2 x i32> <i32 2, i32 3>
+  %i5 = fpext <2 x half> %i4 to <2 x float>
+  %i6 = fpext half %.sroa.1 to float
+  %i7 = fpext half %.sroa.2 to float
+  %i8 = extractelement <2 x float> %i5, i64 0
+  %i9 = fmul contract float %i8, %i6
+  %i10 = fadd contract float %.sroa.3, %i9
+  %i11 = extractelement <2 x float> %i5, i64 1
+  %i12 = fmul contract float %i11, %i7
+  %i13 = fadd contract float %i12, %i10
+  %i14 = insertelement <2 x float> zeroinitializer, float %i13, i64 0
+  %i15 = insertelement <2 x float> %i14, float %.sroa.4, i64 1
+  %i16 = fadd <2 x float> %i15, zeroinitializer
+  store <2 x half> zeroinitializer, ptr addrspace(3) null, align 4
+  %i17 = extractelement <2 x float> %i16, i64 0
+  br label %main
+}

>From 0f602eca96947995a0bca2b41287d36ed26814f6 Mon Sep 17 00:00:00 2001
From: Piotr Sobczak <piotr.sobczak at amd.com>
Date: Mon, 9 Dec 2024 16:33:37 +0100
Subject: [PATCH 2/2] Update fdot2.ll instead of adding new test

---
 llvm/test/CodeGen/AMDGPU/fdot2.ll       | 21 ++++++--
 llvm/test/CodeGen/AMDGPU/fma_combine.ll | 70 -------------------------
 2 files changed, 17 insertions(+), 74 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/fma_combine.ll

diff --git a/llvm/test/CodeGen/AMDGPU/fdot2.ll b/llvm/test/CodeGen/AMDGPU/fdot2.ll
index 695042d44d87b6..776816d6aa0e38 100644
--- a/llvm/test/CodeGen/AMDGPU/fdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdot2.ll
@@ -5,6 +5,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX906
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math=preserve-sign -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX906-CONTRACT
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX906-DENORM-CONTRACT
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -mattr="+dot7-insts,-dot10-insts" -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX906-DOT10-DISABLED
 ; (fadd (fmul S1.x, S2.x), (fadd (fmul (S1.y, S2.y), z))) -> (fdot2 S1, S2, z)
 
 ; Tests to make sure fdot2 is not generated when vector elements of dot-product expressions
@@ -21,6 +22,7 @@
 
 ; GFX906-CONTRACT: v_mac_f16_e32
 ; GFX906-DENORM-CONTRACT: v_fma_f16
+; GFX906-DOT10-DISABLED: v_fma_f16
 define amdgpu_kernel void @dotproduct_f16(ptr addrspace(1) %src1,
                                           ptr addrspace(1) %src2,
                                           ptr addrspace(1) nocapture %dst) {
@@ -44,8 +46,11 @@ entry:
 }
 
 
-; We only want to generate fdot2 if vector element of dot product is converted from f16 to f32
-; and the vectors are of type <2 x half>
+; We only want to generate fdot2 if:
+; - vector element of dot product is converted from f16 to f32, and
+; - the vectors are of type <2 x half>, and
+; - "dot10-insts" is enabled
+
 ; GCN-LABEL: {{^}}dotproduct_f16_f32
 ; GFX900: v_mad_mix_f32
 ; GFX900: v_mad_mix_f32
@@ -59,6 +64,7 @@ entry:
 ; GFX906-CONTRACT: v_dot2_f32_f16
 
 ; GFX906-DENORM-CONTRACT: v_dot2_f32_f16
+; GFX906-DOT10-DISABLED: v_fma_mix_f32
 define amdgpu_kernel void @dotproduct_f16_f32(ptr addrspace(1) %src1,
                                               ptr addrspace(1) %src2,
                                               ptr addrspace(1) nocapture %dst) {
@@ -85,8 +91,11 @@ entry:
   ret void
 }
 
-; We only want to generate fdot2 if vector element of dot product is converted from f16 to f32
-; and the vectors are of type <2 x half>
+; We only want to generate fdot2 if:
+; - vector element of dot product is converted from f16 to f32, and
+; - the vectors are of type <2 x half>, and
+; - "dot10-insts" is enabled
+
 ; GCN-LABEL: {{^}}dotproduct_diffvecorder
 ; GFX900: v_mad_mix_f32
 ; GFX900: v_mad_mix_f32
@@ -99,6 +108,7 @@ entry:
 
 ; GFX906-CONTRACT: v_dot2_f32_f16
 ; GFX906-DENORM-CONTRACT: v_dot2_f32_f16
+; GFX906-DOT10-DISABLED: v_fma_mix_f32
 define amdgpu_kernel void @dotproduct_diffvecorder(ptr addrspace(1) %src1,
                                                    ptr addrspace(1) %src2,
                                                    ptr addrspace(1) nocapture %dst) {
@@ -136,6 +146,7 @@ entry:
 
 ; GFX906-CONTRACT: v_fma_mix_f32
 ; GFX906-DENORM-CONTRACT: v_fma_mix_f32
+; GFX906-DOT10-DISABLED: v_fma_mix_f32
 define amdgpu_kernel void @dotproduct_v4f16(ptr addrspace(1) %src1,
                                             ptr addrspace(1) %src2,
                                             ptr addrspace(1) nocapture %dst) {
@@ -173,6 +184,7 @@ entry:
 
 ; GFX906-CONTRACT: v_fma_mix_f32
 ; GFX906-DENORM-CONTRACT: v_fma_mix_f32
+; GFX906-DOT10-DISABLED: v_fma_mix_f32
 define amdgpu_kernel void @NotAdotproduct(ptr addrspace(1) %src1,
                                           ptr addrspace(1) %src2,
                                           ptr addrspace(1) nocapture %dst) {
@@ -210,6 +222,7 @@ entry:
 
 ; GFX906-CONTRACT: v_fma_mix_f32
 ; GFX906-DENORM-CONTRACT: v_fma_mix_f32
+; GFX906-DOT10-DISABLED: v_fma_mix_f32
 define amdgpu_kernel void @Diff_Idx_NotAdotproduct(ptr addrspace(1) %src1,
                                                    ptr addrspace(1) %src2,
                                                    ptr addrspace(1) nocapture %dst) {
diff --git a/llvm/test/CodeGen/AMDGPU/fma_combine.ll b/llvm/test/CodeGen/AMDGPU/fma_combine.ll
deleted file mode 100644
index 0414018862d071..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/fma_combine.ll
+++ /dev/null
@@ -1,70 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr="+dot7-insts,-dot10-insts,-dot5-insts" < %s | FileCheck %s -check-prefix=DISABLED
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr="+dot7-insts,+dot10-insts,-dot5-insts" < %s | FileCheck %s -check-prefix=ENABLED
-
-; Test that FMACombine is disabled for a target without dot10-insts, and enabled for a target with dot10-insts.
-
-define amdgpu_kernel void @func() {
-; DISABLED-LABEL: func:
-; DISABLED:       ; %bb.0: ; %bb
-; DISABLED-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
-; DISABLED-NEXT:    v_mov_b32_e32 v2, 0
-; DISABLED-NEXT:    s_mov_b32 vcc_lo, exec_lo
-; DISABLED-NEXT:  .LBB0_1: ; %main
-; DISABLED-NEXT:    ; =>This Inner Loop Header: Depth=1
-; DISABLED-NEXT:    ds_load_b128 v[3:6], v1
-; DISABLED-NEXT:    ds_store_b32 v1, v0
-; DISABLED-NEXT:    s_waitcnt lgkmcnt(1)
-; DISABLED-NEXT:    v_fma_mix_f32 v2, v4, v3, v2 op_sel_hi:[1,1,0]
-; DISABLED-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; DISABLED-NEXT:    v_fma_mix_f32 v2, v4, v3, v2 op_sel:[1,1,0] op_sel_hi:[1,1,0]
-; DISABLED-NEXT:    v_add_f32_e32 v2, 0, v2
-; DISABLED-NEXT:    s_cbranch_vccnz .LBB0_1
-; DISABLED-NEXT:  ; %bb.2: ; %DummyReturnBlock
-; DISABLED-NEXT:    s_endpgm
-;
-; ENABLED-LABEL: func:
-; ENABLED:       ; %bb.0: ; %bb
-; ENABLED-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
-; ENABLED-NEXT:    v_mov_b32_e32 v2, 0
-; ENABLED-NEXT:    s_mov_b32 vcc_lo, exec_lo
-; ENABLED-NEXT:  .LBB0_1: ; %main
-; ENABLED-NEXT:    ; =>This Inner Loop Header: Depth=1
-; ENABLED-NEXT:    ds_load_b128 v[3:6], v1
-; ENABLED-NEXT:    ds_store_b32 v1, v0
-; ENABLED-NEXT:    s_waitcnt lgkmcnt(1)
-; ENABLED-NEXT:    v_dot2_f32_f16 v2, v4, v3, v2
-; ENABLED-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; ENABLED-NEXT:    v_add_f32_e32 v2, 0, v2
-; ENABLED-NEXT:    s_cbranch_vccnz .LBB0_1
-; ENABLED-NEXT:  ; %bb.2: ; %DummyReturnBlock
-; ENABLED-NEXT:    s_endpgm
-bb:
-  br label %main
-
-main:                        ; preds = %main, %bb
-  %.sroa.3 = phi float [ 0.000000e+00, %bb ], [ %i17, %main ]
-  %.sroa.4 = phi float [ 0.000000e+00, %bb ], [ %.sroa.3, %main ]
-  %i = load <8 x half>, ptr addrspace(3) null, align 16
-  %i1 = bitcast <8 x half> %i to i128
-  %i2 = trunc i128 %i1 to i32
-  %i3 = bitcast i32 %i2 to <2 x half>
-  %.sroa.1 = extractelement <2 x half> %i3, i64 0
-  %.sroa.2 = extractelement <2 x half> %i3, i64 1
-  %i4 = shufflevector <8 x half> %i, <8 x half> zeroinitializer, <2 x i32> <i32 2, i32 3>
-  %i5 = fpext <2 x half> %i4 to <2 x float>
-  %i6 = fpext half %.sroa.1 to float
-  %i7 = fpext half %.sroa.2 to float
-  %i8 = extractelement <2 x float> %i5, i64 0
-  %i9 = fmul contract float %i8, %i6
-  %i10 = fadd contract float %.sroa.3, %i9
-  %i11 = extractelement <2 x float> %i5, i64 1
-  %i12 = fmul contract float %i11, %i7
-  %i13 = fadd contract float %i12, %i10
-  %i14 = insertelement <2 x float> zeroinitializer, float %i13, i64 0
-  %i15 = insertelement <2 x float> %i14, float %.sroa.4, i64 1
-  %i16 = fadd <2 x float> %i15, zeroinitializer
-  store <2 x half> zeroinitializer, ptr addrspace(3) null, align 4
-  %i17 = extractelement <2 x float> %i16, i64 0
-  br label %main
-}