[llvm] [SelectionDAG] Flags are dropped when creating FMUL (PR #66646)

Mon Sep 18 06:43:15 PDT 2023

https://github.com/srpande created https://github.com/llvm/llvm-project/pull/66646

	While simplifying some vector operators in DAG combine, we may
	need to regenerate operators for simplified vectors. At that time,
	we need to make sure that all the flags of the new operator
	are copied/modified.

	Here's an example where "contract" flag is dropped when FMUL
	is createted.

	Replacing.2 t42: v2f32 = fmul contract t41, t38
	With: t48: v2f32 = fmul t38, t38



>From 61487b3c48d1af14dd0d3e4dba7e5bc0076544da Mon Sep 17 00:00:00 2001
From: Sirish Pande <sirish.pande at amd.com>
Date: Fri, 15 Sep 2023 13:01:09 -0500
Subject: [PATCH] [SelectionDAG] Flags are dropped when creating a new FMUL

While simplifying some vector operators in DAG combine, we may
need to regenerate operators for simplified vectors. At that time,
we need to make sure that all the flags of the new operator
are copied/modified.

Here's an example where "contract" flag is dropped when FMUL
is createted.

Replacing.2 t42: v2f32 = fmul contract t41, t38
With: t48: v2f32 = fmul t38, t38
---
 .../CodeGen/SelectionDAG/TargetLowering.cpp   |  5 ++--
 llvm/test/CodeGen/AMDGPU/fma.ll               | 23 +++++++++++++++++++
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 23c1486f711d727..608bd9427b2a40f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -2990,8 +2990,9 @@ bool TargetLowering::SimplifyDemandedVectorElts(
     SDValue NewOp1 = SimplifyMultipleUseDemandedVectorElts(Op1, DemandedElts,
                                                            TLO.DAG, Depth + 1);
     if (NewOp0 || NewOp1) {
-      SDValue NewOp = TLO.DAG.getNode(
-          Opcode, SDLoc(Op), VT, NewOp0 ? NewOp0 : Op0, NewOp1 ? NewOp1 : Op1);
+      SDValue NewOp =
+          TLO.DAG.getNode(Opcode, SDLoc(Op), VT, NewOp0 ? NewOp0 : Op0,
+                          NewOp1 ? NewOp1 : Op1, Op->getFlags());
       return TLO.CombineTo(Op, NewOp);
     }
     return false;
diff --git a/llvm/test/CodeGen/AMDGPU/fma.ll b/llvm/test/CodeGen/AMDGPU/fma.ll
index b1db04a7fd8863a..db221aedf754cba 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.ll
@@ -154,3 +154,26 @@ define float @fold_fmul_distributive(float %x, float %y) {
   %fmul = fmul contract float %fadd, %x
   ret float %fmul
 }
+
+; test to make sure contract is not dropped such that we can generate fma from following sequence.
+define amdgpu_kernel void @vec_mul_scalar_add_fma(<2 x float> %a, <2 x float> %b, float %c1, ptr addrspace(1) %inptr) {
+; GFX906-LABEL: vec_mul_scalar_add_fma:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_load_dword s8, s[0:1], 0x34
+; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; GFX906-NEXT:    v_mov_b32_e32 v0, 0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    v_mov_b32_e32 v1, s8
+; GFX906-NEXT:    v_mov_b32_e32 v2, s6
+; GFX906-NEXT:    v_fmac_f32_e32 v1, s4, v2
+; GFX906-NEXT:    global_store_dword v0, v1, s[2:3] offset:4
+; GFX906-NEXT:    s_endpgm
+  %gep = getelementptr float, ptr addrspace(1) %inptr, i32 1
+  %c = shufflevector <2 x float> %a, <2 x float> poison, <2 x i32> zeroinitializer
+  %mul = fmul contract <2 x float> %c, %b
+  %elv = extractelement <2 x float> %mul, i64 0
+  %add = fadd contract float %elv, %c1
+  store float %add, ptr addrspace(1) %gep, align 4
+  ret void
+}