[clang-tools-extra] [SelectionDAG] Flags are dropped when creating a new FMUL (PR #66701)

Wed Sep 20 09:40:57 PDT 2023

https://github.com/srpande updated https://github.com/llvm/llvm-project/pull/66701

>From a9fe01d82743879c41982aa170fef517dee99256 Mon Sep 17 00:00:00 2001
From: Sirish Pande <sirish.pande at amd.com>
Date: Fri, 15 Sep 2023 13:01:09 -0500
Subject: [PATCH] [SelectionDAG] Flags are dropped when creating a new FMUL

While simplifying some vector operators in DAG combine, we may
need to create new instructions for simplified vectors. At that time,
we need to make sure that all the flags of the new instruction
are copied/modified from the old instruction.

Here's an example where "contract" flag is dropped when FMUL
is createted.

Replacing.2 t42: v2f32 = fmul contract t41, t38
With: t48: v2f32 = fmul t38, t38
---
 .../CodeGen/SelectionDAG/TargetLowering.cpp   |  5 ++--
 llvm/test/CodeGen/AMDGPU/fma.ll               | 23 +++++++++++++++++++
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 23c1486f711d727..608bd9427b2a40f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -2990,8 +2990,9 @@ bool TargetLowering::SimplifyDemandedVectorElts(
     SDValue NewOp1 = SimplifyMultipleUseDemandedVectorElts(Op1, DemandedElts,
                                                            TLO.DAG, Depth + 1);
     if (NewOp0 || NewOp1) {
-      SDValue NewOp = TLO.DAG.getNode(
-          Opcode, SDLoc(Op), VT, NewOp0 ? NewOp0 : Op0, NewOp1 ? NewOp1 : Op1);
+      SDValue NewOp =
+          TLO.DAG.getNode(Opcode, SDLoc(Op), VT, NewOp0 ? NewOp0 : Op0,
+                          NewOp1 ? NewOp1 : Op1, Op->getFlags());
       return TLO.CombineTo(Op, NewOp);
     }
     return false;
diff --git a/llvm/test/CodeGen/AMDGPU/fma.ll b/llvm/test/CodeGen/AMDGPU/fma.ll
index b1db04a7fd8863a..db221aedf754cba 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.ll
@@ -154,3 +154,26 @@ define float @fold_fmul_distributive(float %x, float %y) {
   %fmul = fmul contract float %fadd, %x
   ret float %fmul
 }
+
+; test to make sure contract is not dropped such that we can generate fma from following sequence.
+define amdgpu_kernel void @vec_mul_scalar_add_fma(<2 x float> %a, <2 x float> %b, float %c1, ptr addrspace(1) %inptr) {
+; GFX906-LABEL: vec_mul_scalar_add_fma:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_load_dword s8, s[0:1], 0x34
+; GFX906-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX906-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; GFX906-NEXT:    v_mov_b32_e32 v0, 0
+; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX906-NEXT:    v_mov_b32_e32 v1, s8
+; GFX906-NEXT:    v_mov_b32_e32 v2, s6
+; GFX906-NEXT:    v_fmac_f32_e32 v1, s4, v2
+; GFX906-NEXT:    global_store_dword v0, v1, s[2:3] offset:4
+; GFX906-NEXT:    s_endpgm
+  %gep = getelementptr float, ptr addrspace(1) %inptr, i32 1
+  %c = shufflevector <2 x float> %a, <2 x float> poison, <2 x i32> zeroinitializer
+  %mul = fmul contract <2 x float> %c, %b
+  %elv = extractelement <2 x float> %mul, i64 0
+  %add = fadd contract float %elv, %c1
+  store float %add, ptr addrspace(1) %gep, align 4
+  ret void
+}