[llvm] [SelectionDAG] Flags are dropped when creating FMUL (PR #66646)

Mon Sep 18 09:44:50 PDT 2023

https://github.com/srpande updated https://github.com/llvm/llvm-project/pull/66646

>From 6f91d4d2b8eaa779b346a821b21edc73e62e85f5 Mon Sep 17 00:00:00 2001
From: Sirish Pande <sirish.pande at amd.com>
Date: Fri, 15 Sep 2023 13:01:09 -0500
Subject: [PATCH] [SelectionDAG] Flags are dropped when creating a new FMUL

While simplifying some vector operators in DAG combine, we may
need to regenerate instructions for simplified vectors. At that time,
we need to make sure that all the flags of the new operator
are copied/modified.

Here's an example where "contract" flag is dropped when FMUL
is createted.

Replacing.2 t42: v2f32 = fmul contract t41, t38
With: t48: v2f32 = fmul t38, t38
---
 .../CodeGen/SelectionDAG/TargetLowering.cpp   |  5 ++--
 llvm/test/CodeGen/AMDGPU/fma.ll               | 23 +++++++++++++++++++
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 23c1486f711d727..608bd9427b2a40f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -2990,8 +2990,9 @@ bool TargetLowering::SimplifyDemandedVectorElts(
     SDValue NewOp1 = SimplifyMultipleUseDemandedVectorElts(Op1, DemandedElts,
                                                            TLO.DAG, Depth + 1);
     if (NewOp0 || NewOp1) {
-      SDValue NewOp = TLO.DAG.getNode(
-          Opcode, SDLoc(Op), VT, NewOp0 ? NewOp0 : Op0, NewOp1 ? NewOp1 : Op1);
+      SDValue NewOp =
+          TLO.DAG.getNode(Opcode, SDLoc(Op), VT, NewOp0 ? NewOp0 : Op0,
+                          NewOp1 ? NewOp1 : Op1, Op->getFlags());
       return TLO.CombineTo(Op, NewOp);
     }
     return false;
diff --git a/llvm/test/CodeGen/AMDGPU/fma.ll b/llvm/test/CodeGen/AMDGPU/fma.ll
index b1db04a7fd8863a..db221aedf754cba 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.ll
@@ -154,3 +154,26 @@ define float @fold_fmul_distributive(float %x, float %y) {
   %fmul = fmul contract float %fadd, %x
   ret float %fmul
 }
+
+; test to make sure contract is not dropped such that we can generate fma from following sequence.
+define amdgpu_kernel void @vec_mul_scalar_add_fma(<2 x float> %a, <2 x float> %b, float %c1, ptr addrspace(1) %inptr) {
+; GFX906-LABEL: vec_mul_scalar_add_fma:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_load_dword s8, s[0:1], 0x34
+; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; GFX906-NEXT:    v_mov_b32_e32 v0, 0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    v_mov_b32_e32 v1, s8
+; GFX906-NEXT:    v_mov_b32_e32 v2, s6
+; GFX906-NEXT:    v_fmac_f32_e32 v1, s4, v2
+; GFX906-NEXT:    global_store_dword v0, v1, s[2:3] offset:4
+; GFX906-NEXT:    s_endpgm
+  %gep = getelementptr float, ptr addrspace(1) %inptr, i32 1
+  %c = shufflevector <2 x float> %a, <2 x float> poison, <2 x i32> zeroinitializer
+  %mul = fmul contract <2 x float> %c, %b
+  %elv = extractelement <2 x float> %mul, i64 0
+  %add = fadd contract float %elv, %c1
+  store float %add, ptr addrspace(1) %gep, align 4
+  ret void
+}