[llvm] [CodeGen] [AMDGPU] Attempt DAGCombine for fmul with select to ldexp (PR #111109)

Vikash Gupta via llvm-commits llvm-commits at lists.llvm.org
Tue Oct 8 05:01:52 PDT 2024


https://github.com/vg0204 updated https://github.com/llvm/llvm-project/pull/111109

>From 4b0885d85487ce6682493276e1d00e6be1b2848b Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Wed, 2 Oct 2024 09:18:24 +0000
Subject: [PATCH 1/4] [CodeGen] [AMDGPU] Attempt DAGCombine for fmul with
 select to ldexp

For f32/f16 this combine brings no improvement, but for f64 this
specific fmul-with-select pattern is more costly to materialize than an
ldexp, so the following DAG combine performs the rewrite:

fmul x, select(y, 2.0, 1.0) -> ldexp x, zext(i1 y)
fmul x, select(y, 0.5, 1.0) -> ldexp x, sext(i1 y)

This resolves issue #104900.
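
As an illustration only (the function names below are made up, not taken
from the test file), the combine targets IR like @src; since the rewrite
happens on the SelectionDAG, the ldexp form is shown as roughly
equivalent IR rather than as an actual IR-to-IR transform:

  declare double @llvm.ldexp.f64.i32(double, i32)

  define double @src(double %x, i1 %y) {
    %scale = select i1 %y, double 2.000000e+00, double 1.000000e+00
    %mul = fmul double %x, %scale
    ret double %mul
  }

  ; conceptually handled as ldexp(x, zext(i1 y)):
  define double @src_expected(double %x, i1 %y) {
    %exp = zext i1 %y to i32
    %r = call double @llvm.ldexp.f64.i32(double %x, i32 %exp)
    ret double %r
  }

As the checks in combine-fmul-sel.ll show, the f64 cases then lower to a
single v_ldexp_f64 plus one small integer op instead of a 64-bit select.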
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp    |  54 ++++
 llvm/lib/Target/AMDGPU/SIISelLowering.h      |   1 +
 llvm/test/CodeGen/AMDGPU/combine-fmul-sel.ll | 280 +++++++++++++++++++
 3 files changed, 335 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/combine-fmul-sel.ll

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5e4cf705cc9e47..1456e7b3a20339 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -899,6 +899,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
                        ISD::FADD,
                        ISD::FSUB,
                        ISD::FDIV,
+                       ISD::FMUL,
                        ISD::FMINNUM,
                        ISD::FMAXNUM,
                        ISD::FMINNUM_IEEE,
@@ -14476,6 +14477,57 @@ SDValue SITargetLowering::performFDivCombine(SDNode *N,
   return SDValue();
 }
 
+SDValue SITargetLowering::performFMulCombine(SDNode *N,
+                                             DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+
+  SDLoc SL(N);
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  // ldexp(x, zext(i1 y)) -> fmul x, (select y, 2.0, 1.0)
+  // ldexp(x, sext(i1 y)) -> fmul x, (select y, 0.5, 1.0)
+  //
+  // The ldexp folding above works fine for
+  // f16/f32, but for f64 it creates an f64 select, which
+  // is costly to materialize compared to an f64 ldexp,
+  // so here we undo the transform for f64 as follows:
+  //
+  // fmul x, (select y, 2.0, 1.0) -> ldexp(x, zext(i1 y))
+  // fmul x, (select y, 0.5, 1.0) -> ldexp(x, sext(i1 y))
+  // TODO : Need to handle vector of f64 type.
+  if (VT == MVT::f64) {
+    if (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT) {
+      const ConstantFPSDNode *TrueNode =
+          dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
+      const ConstantFPSDNode *FalseNode =
+          dyn_cast<ConstantFPSDNode>(RHS.getOperand(2));
+
+      if (!TrueNode || !FalseNode)
+        return SDValue();
+
+      const double TrueVal = TrueNode->getValueAPF().convertToDouble();
+      const double FalseVal = FalseNode->getValueAPF().convertToDouble();
+      unsigned ExtOp;
+
+      if (FalseVal == 1.0) {
+        if (TrueVal == 2.0)
+          ExtOp = ISD::ZERO_EXTEND;
+        else if (TrueVal == 0.5)
+          ExtOp = ISD::SIGN_EXTEND;
+        else
+          return SDValue();
+
+        SDValue ExtNode = DAG.getNode(ExtOp, SL, MVT::i32, RHS.getOperand(0));
+        return DAG.getNode(ISD::FLDEXP, SL, MVT::f64, LHS, ExtNode);
+      }
+    }
+  }
+
+  return SDValue();
+}
+
 SDValue SITargetLowering::performFMACombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -14765,6 +14817,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
     return performFSubCombine(N, DCI);
   case ISD::FDIV:
     return performFDivCombine(N, DCI);
+  case ISD::FMUL:
+    return performFMulCombine(N, DCI);
   case ISD::SETCC:
     return performSetCCCombine(N, DCI);
   case ISD::FMAXNUM:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 6c3edf37945e24..1ead2e4fc916bb 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -218,6 +218,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performFSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performFDivCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performFMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performFMACombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const;
diff --git a/llvm/test/CodeGen/AMDGPU/combine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/combine-fmul-sel.ll
new file mode 100644
index 00000000000000..0748aa0a0abec0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/combine-fmul-sel.ll
@@ -0,0 +1,280 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+;RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s
+;RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -verify-machineinstrs | FileCheck -check-prefix=GFX1030 %s
+;RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefix=GFX1100 %s
+
+define float @fmul_select_f32_test1(float %x, i1 %bool) {
+; GFX9-LABEL: fmul_select_f32_test1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f32_test1:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo
+; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f32_test1:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo
+; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %1 = select i1 %bool, float 2.000000e+00, float 1.000000e+00
+  %ldexp = fmul float %x, %1
+  ret float %ldexp
+}
+
+define float @fmul_select_f32_test2(float %x, i1 %bool) {
+; GFX9-LABEL: fmul_select_f32_test2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0.5, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f32_test2:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0.5, vcc_lo
+; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f32_test2:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0.5, vcc_lo
+; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %1 = select i1 %bool, float 0.500000e+00, float 1.000000e+00
+  %ldexp = fmul float %x, %1
+  ret float %ldexp
+}
+
+define <2 x float> @fmul_select_v2f32_test1(<2 x float> %x, <2 x i1> %bool) {
+; GFX9-LABEL: fmul_select_v2f32_test1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_v2f32_test1:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX1030-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX1030-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc_lo
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX1030-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc_lo
+; GFX1030-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_v2f32_test1:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX1100-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc_lo
+; GFX1100-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v3, 1, v3
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX1100-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc_lo
+; GFX1100-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %1 = select <2 x i1> %bool, <2 x float> <float 2.000000e+00, float 2.000000e+00>, <2 x float> <float 1.000000e+00, float 1.000000e+00>
+  %ldexp = fmul <2 x float> %x, %1
+  ret <2 x float> %ldexp
+}
+
+define <2 x float> @fmul_select_v2f32_test2(<2 x float> %x, <2 x i1> %bool) {
+; GFX9-LABEL: fmul_select_v2f32_test2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_v2f32_test2:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX1030-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX1030-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc_lo
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX1030-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc_lo
+; GFX1030-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_v2f32_test2:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX1100-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc_lo
+; GFX1100-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v3, 1, v3
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX1100-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc_lo
+; GFX1100-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %1 = select <2 x i1> %bool, <2 x float> <float 0.500000e+00, float 0.500000e+00>, <2 x float> <float 1.000000e+00, float 1.000000e+00>
+  %ldexp = fmul <2 x float> %x, %1
+  ret <2 x float> %ldexp
+}
+
+define double @fmul_select_f64_test1(double %x, i1 %bool) {
+; GFX9-LABEL: fmul_select_f64_test1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX9-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f64_test1:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX1030-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f64_test1:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %1 = select i1 %bool, double 2.000000e+00, double 1.000000e+00
+  %ldexp = fmul double %x, %1
+  ret double %ldexp
+}
+
+define double @fmul_select_f64_test2(double %x, i1 %bool) {
+; GFX9-LABEL: fmul_select_f64_test2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_bfe_i32 v2, v2, 0, 1
+; GFX9-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f64_test2:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_bfe_i32 v2, v2, 0, 1
+; GFX1030-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f64_test2:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_bfe_i32 v2, v2, 0, 1
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %1 = select i1 %bool, double 0.500000e+00, double 1.000000e+00
+  %ldexp = fmul double %x, %1
+  ret double %ldexp
+}
+
+define <2 x double> @fmul_select_v2f64_test1(<2 x double> %x, <2 x i1> %bool) {
+; GFX9-LABEL: fmul_select_v2f64_test1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX9-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; GFX9-NEXT:    v_and_b32_e32 v4, 1, v5
+; GFX9-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_v2f64_test1:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX1030-NEXT:    v_and_b32_e32 v5, 1, v5
+; GFX1030-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; GFX1030-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_v2f64_test1:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX1100-NEXT:    v_and_b32_e32 v5, 1, v5
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; GFX1100-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %1 = select <2 x i1> %bool, <2 x double> <double 2.000000e+00, double 2.000000e+00>, <2 x double> <double 1.000000e+00, double 1.000000e+00>
+  %ldexp = fmul <2 x double> %x, %1
+  ret <2 x double> %ldexp
+}
+
+define <2 x double> @fmul_select_v2f64_test2(<2 x double> %x, <2 x i1> %bool) {
+; GFX9-LABEL: fmul_select_v2f64_test2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_bfe_i32 v4, v4, 0, 1
+; GFX9-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; GFX9-NEXT:    v_bfe_i32 v4, v5, 0, 1
+; GFX9-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_v2f64_test2:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_bfe_i32 v4, v4, 0, 1
+; GFX1030-NEXT:    v_bfe_i32 v5, v5, 0, 1
+; GFX1030-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; GFX1030-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_v2f64_test2:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_bfe_i32 v4, v4, 0, 1
+; GFX1100-NEXT:    v_bfe_i32 v5, v5, 0, 1
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; GFX1100-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %1 = select <2 x i1> %bool, <2 x double> <double 0.500000e+00, double 0.500000e+00>, <2 x double> <double 1.000000e+00, double 1.000000e+00>
+  %ldexp = fmul <2 x double> %x, %1
+  ret <2 x double> %ldexp
+}

>From 90db0a0c1e59b8b14ac562e1b02677f0d26817dc Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Mon, 7 Oct 2024 12:00:22 +0000
Subject: [PATCH 2/4] Added support to handle the negative constant case, and
 addressed the review suggestions.
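
A rough sketch of the new case (the function names are illustrative, not
from the patch; the actual rewrite still happens on the SelectionDAG):

  declare double @llvm.ldexp.f64.i32(double, i32)

  define double @neg_case(double %x, i1 %y) {
    %scale = select i1 %y, double -2.000000e+00, double -1.000000e+00
    %mul = fmul double %x, %scale
    ret double %mul
  }

  ; conceptually handled as ldexp((fneg x), zext(i1 y)):
  define double @neg_case_expected(double %x, i1 %y) {
    %neg = fneg double %x
    %exp = zext i1 %y to i32
    %r = call double @llvm.ldexp.f64.i32(double %neg, i32 %exp)
    ret double %r
  }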

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 30 ++++++++++++++---------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1456e7b3a20339..329cdb877c6d15 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -14494,32 +14494,40 @@ SDValue SITargetLowering::performFMulCombine(SDNode *N,
   // is costly to materialize compared to an f64 ldexp,
   // so here we undo the transform for f64 as follows:
   //
-  // fmul x, (select y, 2.0, 1.0) -> ldexp(x, zext(i1 y))
-  // fmul x, (select y, 0.5, 1.0) -> ldexp(x, sext(i1 y))
-  // TODO : Need to handle vector of f64 type.
+  // fmul x, (select y, 2.0, 1.0)   -> ldexp(  x, zext(i1 y) )
+  // fmul x, (select y, -2.0, -1.0) -> ldexp( (fneg x), zext(i1 y) )
+  // fmul x, (select y, 0.5, 1.0)   -> ldexp(  x, sext(i1 y) )
+  // fmul x, (select y, -0.5, -1.0) -> ldexp( (fneg x), sext(i1 y) )
   if (VT == MVT::f64) {
     if (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT) {
       const ConstantFPSDNode *TrueNode =
-          dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
+          isConstOrConstSplatFP(RHS.getOperand(1));
       const ConstantFPSDNode *FalseNode =
-          dyn_cast<ConstantFPSDNode>(RHS.getOperand(2));
+          isConstOrConstSplatFP(RHS.getOperand(2));
+      bool isNeg;
 
       if (!TrueNode || !FalseNode)
         return SDValue();
 
-      const double TrueVal = TrueNode->getValueAPF().convertToDouble();
-      const double FalseVal = FalseNode->getValueAPF().convertToDouble();
-      unsigned ExtOp;
+      if (TrueNode->isNegative() && FalseNode->isNegative())
+        isNeg = true;
+      else if (!TrueNode->isNegative() && !FalseNode->isNegative())
+        isNeg = false;
+      else
+        return SDValue();
 
-      if (FalseVal == 1.0) {
-        if (TrueVal == 2.0)
+      unsigned ExtOp;
+      if (FalseNode->isExactlyValue(1.0) || FalseNode->isExactlyValue(-1.0)) {
+        if (TrueNode->isExactlyValue(2.0) || TrueNode->isExactlyValue(-2.0))
           ExtOp = ISD::ZERO_EXTEND;
-        else if (TrueVal == 0.5)
+        else if (TrueNode->isExactlyValue(0.5) ||
+                 TrueNode->isExactlyValue(-0.5))
           ExtOp = ISD::SIGN_EXTEND;
         else
           return SDValue();
 
         SDValue ExtNode = DAG.getNode(ExtOp, SL, MVT::i32, RHS.getOperand(0));
+        LHS = isNeg ? DAG.getNode(ISD::FNEG, SL, VT, LHS) : LHS;
         return DAG.getNode(ISD::FLDEXP, SL, MVT::f64, LHS, ExtNode);
       }
     }

>From 80b3b73bbb1686cc0bc02570e2705d37f9102b80 Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Tue, 8 Oct 2024 08:42:23 +0000
Subject: [PATCH 3/4] Rectified the handling of vector types, along with some
 suggested changes
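
An illustrative IR-level sketch of the vector case (names made up; the
<2 x i32> exponent type mirrors the i32 extension the code builds, and
the rewrite itself still happens on the SelectionDAG):

  declare <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double>, <2 x i32>)

  define <2 x double> @vec_case(<2 x double> %x, <2 x i1> %y) {
    %scale = select <2 x i1> %y, <2 x double> <double 2.000000e+00, double 2.000000e+00>, <2 x double> <double 1.000000e+00, double 1.000000e+00>
    %mul = fmul <2 x double> %x, %scale
    ret <2 x double> %mul
  }

  ; conceptually handled as ldexp(x, zext(<2 x i1> y)):
  define <2 x double> @vec_case_expected(<2 x double> %x, <2 x i1> %y) {
    %exp = zext <2 x i1> %y to <2 x i32>
    %r = call <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double> %x, <2 x i32> %exp)
    ret <2 x double> %r
  }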

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 329cdb877c6d15..6d10dc161f1806 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -14498,23 +14498,19 @@ SDValue SITargetLowering::performFMulCombine(SDNode *N,
   // fmul x, (select y, -2.0, -1.0) -> ldexp( (fneg x), zext(i1 y) )
   // fmul x, (select y, 0.5, 1.0)   -> ldexp(  x, sext(i1 y) )
   // fmul x, (select y, -0.5, -1.0) -> ldexp( (fneg x), sext(i1 y) )
-  if (VT == MVT::f64) {
+  if (VT.getScalarType() == MVT::f64) {
     if (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT) {
       const ConstantFPSDNode *TrueNode =
           isConstOrConstSplatFP(RHS.getOperand(1));
       const ConstantFPSDNode *FalseNode =
           isConstOrConstSplatFP(RHS.getOperand(2));
-      bool isNeg;
 
       if (!TrueNode || !FalseNode)
         return SDValue();
 
-      if (TrueNode->isNegative() && FalseNode->isNegative())
-        isNeg = true;
-      else if (!TrueNode->isNegative() && !FalseNode->isNegative())
-        isNeg = false;
-      else
+      if (TrueNode->isNegative() != FalseNode->isNegative())
         return SDValue();
+      bool isNeg = TrueNode->isNegative();
 
       unsigned ExtOp;
       if (FalseNode->isExactlyValue(1.0) || FalseNode->isExactlyValue(-1.0)) {
@@ -14526,9 +14522,13 @@ SDValue SITargetLowering::performFMulCombine(SDNode *N,
         else
           return SDValue();
 
-        SDValue ExtNode = DAG.getNode(ExtOp, SL, MVT::i32, RHS.getOperand(0));
+        EVT ExtVT = VT.isVector()
+                        ? EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+                                           VT.getVectorNumElements())
+                        : MVT::i32;
+        SDValue ExtNode = DAG.getNode(ExtOp, SL, ExtVT, RHS.getOperand(0));
         LHS = isNeg ? DAG.getNode(ISD::FNEG, SL, VT, LHS) : LHS;
-        return DAG.getNode(ISD::FLDEXP, SL, MVT::f64, LHS, ExtNode);
+        return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, ExtNode);
       }
     }
   }

>From 097c751c26e810ce199b9a5f5058a6d3bf6dc83b Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Tue, 8 Oct 2024 12:01:28 +0000
Subject: [PATCH 4/4] Generalized the DAG combine to handle any pair of select
 constants that are exact powers of 2.
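
A rough sketch of the generalized case (names made up; 8.0 = 2^3 and
0.25 = 2^-2, so both select arms are exact powers of two, and the
rewrite itself happens on the SelectionDAG):

  declare double @llvm.ldexp.f64.i32(double, i32)

  define double @pow2_case(double %x, i1 %y) {
    %scale = select i1 %y, double 8.000000e+00, double 2.500000e-01
    %mul = fmul double %x, %scale
    ret double %mul
  }

  ; conceptually handled as ldexp(x, select(y, 3, -2)):
  define double @pow2_case_expected(double %x, i1 %y) {
    %exp = select i1 %y, i32 3, i32 -2
    %r = call double @llvm.ldexp.f64.i32(double %x, i32 %exp)
    ret double %r
  }

The exponents come from APFloat::getExactLog2Abs(), which returns
INT_MIN when the value is not an exact power of two.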

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 51 +++++++++++++----------
 1 file changed, 29 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 6d10dc161f1806..af847f913aa71c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -14481,6 +14481,7 @@ SDValue SITargetLowering::performFMulCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
   EVT VT = N->getValueType(0);
+  EVT i32VT = VT.changeElementType(MVT::i32);
 
   SDLoc SL(N);
   SDValue LHS = N->getOperand(0);
@@ -14492,12 +14493,7 @@ SDValue SITargetLowering::performFMulCombine(SDNode *N,
   // The ldexp folding above works fine for
   // f16/f32, but for f64 it creates an f64 select, which
   // is costly to materialize compared to an f64 ldexp,
-  // so here we undo the transform for f64 as follows:
-  //
-  // fmul x, (select y, 2.0, 1.0)   -> ldexp(  x, zext(i1 y) )
-  // fmul x, (select y, -2.0, -1.0) -> ldexp( (fneg x), zext(i1 y) )
-  // fmul x, (select y, 0.5, 1.0)   -> ldexp(  x, sext(i1 y) )
-  // fmul x, (select y, -0.5, -1.0) -> ldexp( (fneg x), sext(i1 y) )
+  // so here we undo the transform for f64 datatype.
   if (VT.getScalarType() == MVT::f64) {
     if (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT) {
       const ConstantFPSDNode *TrueNode =
@@ -14510,25 +14506,36 @@ SDValue SITargetLowering::performFMulCombine(SDNode *N,
 
       if (TrueNode->isNegative() != FalseNode->isNegative())
         return SDValue();
-      bool isNeg = TrueNode->isNegative();
+      LHS = TrueNode->isNegative() ? DAG.getNode(ISD::FNEG, SL, VT, LHS) : LHS;
 
-      unsigned ExtOp;
+      // fmul x, (select y, 2.0, 1.0)   -> ldexp(  x, zext(i1 y) )
+      // fmul x, (select y, -2.0, -1.0) -> ldexp( (fneg x), zext(i1 y) )
+      // fmul x, (select y, 0.5, 1.0)   -> ldexp(  x, sext(i1 y) )
+      // fmul x, (select y, -0.5, -1.0) -> ldexp( (fneg x), sext(i1 y) )
       if (FalseNode->isExactlyValue(1.0) || FalseNode->isExactlyValue(-1.0)) {
-        if (TrueNode->isExactlyValue(2.0) || TrueNode->isExactlyValue(-2.0))
-          ExtOp = ISD::ZERO_EXTEND;
-        else if (TrueNode->isExactlyValue(0.5) ||
-                 TrueNode->isExactlyValue(-0.5))
-          ExtOp = ISD::SIGN_EXTEND;
-        else
-          return SDValue();
+        if (TrueNode->isExactlyValue(2.0) || TrueNode->isExactlyValue(-2.0)) {
+          SDValue ZExtNode =
+              DAG.getNode(ISD::ZERO_EXTEND, SL, i32VT, RHS.getOperand(0));
+          return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, ZExtNode);
+        } else if (TrueNode->isExactlyValue(0.5) ||
+                   TrueNode->isExactlyValue(-0.5)) {
+          SDValue SExtNode =
+              DAG.getNode(ISD::SIGN_EXTEND, SL, i32VT, RHS.getOperand(0));
+          return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SExtNode);
+        }
+      }
 
-        EVT ExtVT = VT.isVector()
-                        ? EVT::getVectorVT(*DAG.getContext(), MVT::i32,
-                                           VT.getVectorNumElements())
-                        : MVT::i32;
-        SDValue ExtNode = DAG.getNode(ExtOp, SL, ExtVT, RHS.getOperand(0));
-        LHS = isNeg ? DAG.getNode(ISD::FNEG, SL, VT, LHS) : LHS;
-        return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, ExtNode);
+      // Given A = 2^a and B = 2^b, where a and b are integers:
+      // fmul x, (select y, A, B)     -> ldexp( x, (select i32 y, a, b) )
+      // fmul x, (select y, -A, -B)   -> ldexp( (fneg x), (select i32 y, a, b) )
+      int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
+      int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
+      if (TrueNodeExpVal != INT_MIN && FalseNodeExpVal != INT_MIN) {
+        SDValue SelectNode =
+            DAG.getNode(ISD::SELECT, SL, i32VT, RHS.getOperand(0),
+                        DAG.getConstant(TrueNodeExpVal, SL, i32VT),
+                        DAG.getConstant(FalseNodeExpVal, SL, i32VT));
+        return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode);
       }
     }
   }


