[llvm] [CodeGen] [AMDGPU] Attempt DAGCombine for fmul with select to ldexp (PR #111109)

Vikash Gupta via llvm-commits llvm-commits at lists.llvm.org
Mon Nov 4 00:18:11 PST 2024


https://github.com/vg0204 updated https://github.com/llvm/llvm-project/pull/111109

>From 4b0885d85487ce6682493276e1d00e6be1b2848b Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Wed, 2 Oct 2024 09:18:24 +0000
Subject: [PATCH 1/8] [CodeGen] [AMDGPU] Attempt DAGCombine for fmul with
 select to ldexp

For f32/f16, this combine brings no improvement, but for f64 this
specific case of fmul with select is more costly to materialize than
ldexp, so the following DAG combine rewrites it as an ldexp.

fmul x, select(y, 2.0, 1.0) -> ldexp x, zext(i1 y)
fmul x, select(y, 0.5, 1.0) -> ldexp x, sext(i1 y)

Thus, it solves the issue #104900.
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp    |  54 ++++
 llvm/lib/Target/AMDGPU/SIISelLowering.h      |   1 +
 llvm/test/CodeGen/AMDGPU/combine-fmul-sel.ll | 280 +++++++++++++++++++
 3 files changed, 335 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/combine-fmul-sel.ll

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5e4cf705cc9e47..1456e7b3a20339 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -899,6 +899,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
                        ISD::FADD,
                        ISD::FSUB,
                        ISD::FDIV,
+                       ISD::FMUL,
                        ISD::FMINNUM,
                        ISD::FMAXNUM,
                        ISD::FMINNUM_IEEE,
@@ -14476,6 +14477,57 @@ SDValue SITargetLowering::performFDivCombine(SDNode *N,
   return SDValue();
 }
 
+SDValue SITargetLowering::performFMulCombine(SDNode *N,
+                                             DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+
+  SDLoc SL(N);
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  // ldexp(x, zext(i1 y)) -> fmul x, (select y, 2.0, 1.0)
+  // ldexp(x, sext(i1 y)) -> fmul x, (select y, 0.5, 1.0)
+  //
+  // The above mentioned ldexp folding works fine for
+  // f16/f32, but as for f64 it creates f64 select which
+  // is costly to materealize as compared to f64 ldexp
+  // so here we undo the transform for f64 as follows :
+  //
+  // fmul x, (select y, 2.0, 1.0) -> ldexp(x, zext(i1 y))
+  // fmul x, (select y, 0.5, 1.0) -> ldexp(x, sext(i1 y))
+  // TODO : Need to handle vector of f64 type.
+  if (VT == MVT::f64) {
+    if (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT) {
+      const ConstantFPSDNode *TrueNode =
+          dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
+      const ConstantFPSDNode *FalseNode =
+          dyn_cast<ConstantFPSDNode>(RHS.getOperand(2));
+
+      if (!TrueNode || !FalseNode)
+        return SDValue();
+
+      const double TrueVal = TrueNode->getValueAPF().convertToDouble();
+      const double FalseVal = FalseNode->getValueAPF().convertToDouble();
+      unsigned ExtOp;
+
+      if (FalseVal == 1.0) {
+        if (TrueVal == 2.0)
+          ExtOp = ISD::ZERO_EXTEND;
+        else if (TrueVal == 0.5)
+          ExtOp = ISD::SIGN_EXTEND;
+        else
+          return SDValue();
+
+        SDValue ExtNode = DAG.getNode(ExtOp, SL, MVT::i32, RHS.getOperand(0));
+        return DAG.getNode(ISD::FLDEXP, SL, MVT::f64, LHS, ExtNode);
+      }
+    }
+  }
+
+  return SDValue();
+}
+
 SDValue SITargetLowering::performFMACombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -14765,6 +14817,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
     return performFSubCombine(N, DCI);
   case ISD::FDIV:
     return performFDivCombine(N, DCI);
+  case ISD::FMUL:
+    return performFMulCombine(N, DCI);
   case ISD::SETCC:
     return performSetCCCombine(N, DCI);
   case ISD::FMAXNUM:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 6c3edf37945e24..1ead2e4fc916bb 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -218,6 +218,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performFSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performFDivCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performFMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performFMACombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const;
diff --git a/llvm/test/CodeGen/AMDGPU/combine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/combine-fmul-sel.ll
new file mode 100644
index 00000000000000..0748aa0a0abec0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/combine-fmul-sel.ll
@@ -0,0 +1,280 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+;RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s
+;RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -verify-machineinstrs | FileCheck -check-prefix=GFX1030 %s
+;RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefix=GFX1100 %s
+
+define float @fmul_select_f32_test1(float %x, i1 %bool) {
+; GFX9-LABEL: fmul_select_f32_test1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f32_test1:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo
+; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f32_test1:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo
+; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %1 = select i1 %bool, float 2.000000e+00, float 1.000000e+00
+  %ldexp = fmul float %x, %1
+  ret float %ldexp
+}
+
+define float @fmul_select_f32_test2(float %x, i1 %bool) {
+; GFX9-LABEL: fmul_select_f32_test2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0.5, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f32_test2:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0.5, vcc_lo
+; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f32_test2:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0.5, vcc_lo
+; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %1 = select i1 %bool, float 0.500000e+00, float 1.000000e+00
+  %ldexp = fmul float %x, %1
+  ret float %ldexp
+}
+
+define <2 x float> @fmul_select_v2f32_test1(<2 x float> %x, <2 x i1> %bool) {
+; GFX9-LABEL: fmul_select_v2f32_test1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_v2f32_test1:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX1030-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX1030-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc_lo
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX1030-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc_lo
+; GFX1030-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_v2f32_test1:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX1100-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc_lo
+; GFX1100-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v3, 1, v3
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX1100-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc_lo
+; GFX1100-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %1 = select <2 x i1> %bool, <2 x float> <float 2.000000e+00, float 2.000000e+00>, <2 x float> <float 1.000000e+00, float 1.000000e+00>
+  %ldexp = fmul <2 x float> %x, %1
+  ret <2 x float> %ldexp
+}
+
+define <2 x float> @fmul_select_v2f32_test2(<2 x float> %x, <2 x i1> %bool) {
+; GFX9-LABEL: fmul_select_v2f32_test2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_v2f32_test2:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX1030-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX1030-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc_lo
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX1030-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc_lo
+; GFX1030-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_v2f32_test2:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX1100-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc_lo
+; GFX1100-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v3, 1, v3
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX1100-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc_lo
+; GFX1100-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %1 = select <2 x i1> %bool, <2 x float> <float 0.500000e+00, float 0.500000e+00>, <2 x float> <float 1.000000e+00, float 1.000000e+00>
+  %ldexp = fmul <2 x float> %x, %1
+  ret <2 x float> %ldexp
+}
+
+define double @fmul_select_f64_test1(double %x, i1 %bool) {
+; GFX9-LABEL: fmul_select_f64_test1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX9-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f64_test1:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX1030-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f64_test1:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %1 = select i1 %bool, double 2.000000e+00, double 1.000000e+00
+  %ldexp = fmul double %x, %1
+  ret double %ldexp
+}
+
+define double @fmul_select_f64_test2(double %x, i1 %bool) {
+; GFX9-LABEL: fmul_select_f64_test2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_bfe_i32 v2, v2, 0, 1
+; GFX9-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f64_test2:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_bfe_i32 v2, v2, 0, 1
+; GFX1030-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f64_test2:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_bfe_i32 v2, v2, 0, 1
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %1 = select i1 %bool, double 0.500000e+00, double 1.000000e+00
+  %ldexp = fmul double %x, %1
+  ret double %ldexp
+}
+
+define <2 x double> @fmul_select_v2f64_test1(<2 x double> %x, <2 x i1> %bool) {
+; GFX9-LABEL: fmul_select_v2f64_test1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX9-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; GFX9-NEXT:    v_and_b32_e32 v4, 1, v5
+; GFX9-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_v2f64_test1:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX1030-NEXT:    v_and_b32_e32 v5, 1, v5
+; GFX1030-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; GFX1030-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_v2f64_test1:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX1100-NEXT:    v_and_b32_e32 v5, 1, v5
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; GFX1100-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %1 = select <2 x i1> %bool, <2 x double> <double 2.000000e+00, double 2.000000e+00>, <2 x double> <double 1.000000e+00, double 1.000000e+00>
+  %ldexp = fmul <2 x double> %x, %1
+  ret <2 x double> %ldexp
+}
+
+define <2 x double> @fmul_select_v2f64_test2(<2 x double> %x, <2 x i1> %bool) {
+; GFX9-LABEL: fmul_select_v2f64_test2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_bfe_i32 v4, v4, 0, 1
+; GFX9-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; GFX9-NEXT:    v_bfe_i32 v4, v5, 0, 1
+; GFX9-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_v2f64_test2:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_bfe_i32 v4, v4, 0, 1
+; GFX1030-NEXT:    v_bfe_i32 v5, v5, 0, 1
+; GFX1030-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; GFX1030-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_v2f64_test2:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_bfe_i32 v4, v4, 0, 1
+; GFX1100-NEXT:    v_bfe_i32 v5, v5, 0, 1
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; GFX1100-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %1 = select <2 x i1> %bool, <2 x double> <double 0.500000e+00, double 0.500000e+00>, <2 x double> <double 1.000000e+00, double 1.000000e+00>
+  %ldexp = fmul <2 x double> %x, %1
+  ret <2 x double> %ldexp
+}

>From 90db0a0c1e59b8b14ac562e1b02677f0d26817dc Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Mon, 7 Oct 2024 12:00:22 +0000
Subject: [PATCH 2/8] Added support to handle negative constant case, &
 addressed the review suggestion.

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 30 ++++++++++++++---------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1456e7b3a20339..329cdb877c6d15 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -14494,32 +14494,40 @@ SDValue SITargetLowering::performFMulCombine(SDNode *N,
   // is costly to materealize as compared to f64 ldexp
   // so here we undo the transform for f64 as follows :
   //
-  // fmul x, (select y, 2.0, 1.0) -> ldexp(x, zext(i1 y))
-  // fmul x, (select y, 0.5, 1.0) -> ldexp(x, sext(i1 y))
-  // TODO : Need to handle vector of f64 type.
+  // fmul x, (select y, 2.0, 1.0)   -> ldexp(  x, zext(i1 y) )
+  // fmul x, (select y, -2.0, -1.0) -> ldexp( (fneg x), zext(i1 y) )
+  // fmul x, (select y, 0.5, 1.0)   -> ldexp(  x, sext(i1 y) )
+  // fmul x, (select y, -0.5, -1.0) -> ldexp( (fneg x), sext(i1 y) )
   if (VT == MVT::f64) {
     if (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT) {
       const ConstantFPSDNode *TrueNode =
-          dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
+          isConstOrConstSplatFP(RHS.getOperand(1));
       const ConstantFPSDNode *FalseNode =
-          dyn_cast<ConstantFPSDNode>(RHS.getOperand(2));
+          isConstOrConstSplatFP(RHS.getOperand(2));
+      bool isNeg;
 
       if (!TrueNode || !FalseNode)
         return SDValue();
 
-      const double TrueVal = TrueNode->getValueAPF().convertToDouble();
-      const double FalseVal = FalseNode->getValueAPF().convertToDouble();
-      unsigned ExtOp;
+      if (TrueNode->isNegative() && FalseNode->isNegative())
+        isNeg = true;
+      else if (!TrueNode->isNegative() && !FalseNode->isNegative())
+        isNeg = false;
+      else
+        return SDValue();
 
-      if (FalseVal == 1.0) {
-        if (TrueVal == 2.0)
+      unsigned ExtOp;
+      if (FalseNode->isExactlyValue(1.0) || FalseNode->isExactlyValue(-1.0)) {
+        if (TrueNode->isExactlyValue(2.0) || TrueNode->isExactlyValue(-2.0))
           ExtOp = ISD::ZERO_EXTEND;
-        else if (TrueVal == 0.5)
+        else if (TrueNode->isExactlyValue(0.5) ||
+                 TrueNode->isExactlyValue(-0.5))
           ExtOp = ISD::SIGN_EXTEND;
         else
           return SDValue();
 
         SDValue ExtNode = DAG.getNode(ExtOp, SL, MVT::i32, RHS.getOperand(0));
+        LHS = isNeg ? DAG.getNode(ISD::FNEG, SL, VT, LHS) : LHS;
         return DAG.getNode(ISD::FLDEXP, SL, MVT::f64, LHS, ExtNode);
       }
     }

>From 80b3b73bbb1686cc0bc02570e2705d37f9102b80 Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Tue, 8 Oct 2024 08:42:23 +0000
Subject: [PATCH 3/8] Rectified the handling of vector types, along with some
 suggested changes

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 329cdb877c6d15..6d10dc161f1806 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -14498,23 +14498,19 @@ SDValue SITargetLowering::performFMulCombine(SDNode *N,
   // fmul x, (select y, -2.0, -1.0) -> ldexp( (fneg x), zext(i1 y) )
   // fmul x, (select y, 0.5, 1.0)   -> ldexp(  x, sext(i1 y) )
   // fmul x, (select y, -0.5, -1.0) -> ldexp( (fneg x), sext(i1 y) )
-  if (VT == MVT::f64) {
+  if (VT.getScalarType() == MVT::f64) {
     if (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT) {
       const ConstantFPSDNode *TrueNode =
           isConstOrConstSplatFP(RHS.getOperand(1));
       const ConstantFPSDNode *FalseNode =
           isConstOrConstSplatFP(RHS.getOperand(2));
-      bool isNeg;
 
       if (!TrueNode || !FalseNode)
         return SDValue();
 
-      if (TrueNode->isNegative() && FalseNode->isNegative())
-        isNeg = true;
-      else if (!TrueNode->isNegative() && !FalseNode->isNegative())
-        isNeg = false;
-      else
+      if (TrueNode->isNegative() != FalseNode->isNegative())
         return SDValue();
+      bool isNeg = TrueNode->isNegative();
 
       unsigned ExtOp;
       if (FalseNode->isExactlyValue(1.0) || FalseNode->isExactlyValue(-1.0)) {
@@ -14526,9 +14522,13 @@ SDValue SITargetLowering::performFMulCombine(SDNode *N,
         else
           return SDValue();
 
-        SDValue ExtNode = DAG.getNode(ExtOp, SL, MVT::i32, RHS.getOperand(0));
+        EVT ExtVT = VT.isVector()
+                        ? EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+                                           VT.getVectorNumElements())
+                        : MVT::i32;
+        SDValue ExtNode = DAG.getNode(ExtOp, SL, ExtVT, RHS.getOperand(0));
         LHS = isNeg ? DAG.getNode(ISD::FNEG, SL, VT, LHS) : LHS;
-        return DAG.getNode(ISD::FLDEXP, SL, MVT::f64, LHS, ExtNode);
+        return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, ExtNode);
       }
     }
   }

>From 097c751c26e810ce199b9a5f5058a6d3bf6dc83b Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Tue, 8 Oct 2024 12:01:28 +0000
Subject: [PATCH 4/8] Generalized the DAG combine to handle any pair of select
 operands that are exact powers of 2.

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 51 +++++++++++++----------
 1 file changed, 29 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 6d10dc161f1806..af847f913aa71c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -14481,6 +14481,7 @@ SDValue SITargetLowering::performFMulCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
   EVT VT = N->getValueType(0);
+  EVT i32VT = VT.changeElementType(MVT::i32);
 
   SDLoc SL(N);
   SDValue LHS = N->getOperand(0);
@@ -14492,12 +14493,7 @@ SDValue SITargetLowering::performFMulCombine(SDNode *N,
   // The above mentioned ldexp folding works fine for
   // f16/f32, but as for f64 it creates f64 select which
   // is costly to materealize as compared to f64 ldexp
-  // so here we undo the transform for f64 as follows :
-  //
-  // fmul x, (select y, 2.0, 1.0)   -> ldexp(  x, zext(i1 y) )
-  // fmul x, (select y, -2.0, -1.0) -> ldexp( (fneg x), zext(i1 y) )
-  // fmul x, (select y, 0.5, 1.0)   -> ldexp(  x, sext(i1 y) )
-  // fmul x, (select y, -0.5, -1.0) -> ldexp( (fneg x), sext(i1 y) )
+  // so here we undo the transform for f64 datatype.
   if (VT.getScalarType() == MVT::f64) {
     if (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT) {
       const ConstantFPSDNode *TrueNode =
@@ -14510,25 +14506,36 @@ SDValue SITargetLowering::performFMulCombine(SDNode *N,
 
       if (TrueNode->isNegative() != FalseNode->isNegative())
         return SDValue();
-      bool isNeg = TrueNode->isNegative();
+      LHS = TrueNode->isNegative() ? DAG.getNode(ISD::FNEG, SL, VT, LHS) : LHS;
 
-      unsigned ExtOp;
+      // fmul x, (select y, 2.0, 1.0)   -> ldexp(  x, zext(i1 y) )
+      // fmul x, (select y, -2.0, -1.0) -> ldexp( (fneg x), zext(i1 y) )
+      // fmul x, (select y, 0.5, 1.0)   -> ldexp(  x, sext(i1 y) )
+      // fmul x, (select y, -0.5, -1.0) -> ldexp( (fneg x), sext(i1 y) )
       if (FalseNode->isExactlyValue(1.0) || FalseNode->isExactlyValue(-1.0)) {
-        if (TrueNode->isExactlyValue(2.0) || TrueNode->isExactlyValue(-2.0))
-          ExtOp = ISD::ZERO_EXTEND;
-        else if (TrueNode->isExactlyValue(0.5) ||
-                 TrueNode->isExactlyValue(-0.5))
-          ExtOp = ISD::SIGN_EXTEND;
-        else
-          return SDValue();
+        if (TrueNode->isExactlyValue(2.0) || TrueNode->isExactlyValue(-2.0)) {
+          SDValue ZExtNode =
+              DAG.getNode(ISD::ZERO_EXTEND, SL, i32VT, RHS.getOperand(0));
+          return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, ZExtNode);
+        } else if (TrueNode->isExactlyValue(0.5) ||
+                   TrueNode->isExactlyValue(-0.5)) {
+          SDValue SExtNode =
+              DAG.getNode(ISD::SIGN_EXTEND, SL, i32VT, RHS.getOperand(0));
+          return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SExtNode);
+        }
+      }
 
-        EVT ExtVT = VT.isVector()
-                        ? EVT::getVectorVT(*DAG.getContext(), MVT::i32,
-                                           VT.getVectorNumElements())
-                        : MVT::i32;
-        SDValue ExtNode = DAG.getNode(ExtOp, SL, ExtVT, RHS.getOperand(0));
-        LHS = isNeg ? DAG.getNode(ISD::FNEG, SL, VT, LHS) : LHS;
-        return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, ExtNode);
+      // Given : A = 2^a  &  B = 2^b ; where a and b are integers.
+      // fmul x, (select y, A, B)     -> ldexp( x, (select i32 y, a, b) )
+      // fmul x, (select y, -A, -B)   -> ldexp( (fneg x), (select i32 y, a, b) )
+      int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
+      int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
+      if (TrueNodeExpVal != INT_MIN && FalseNodeExpVal != INT_MIN) {
+        SDValue SelectNode =
+            DAG.getNode(ISD::SELECT, SL, i32VT, RHS.getOperand(0),
+                        DAG.getConstant(TrueNodeExpVal, SL, i32VT),
+                        DAG.getConstant(FalseNodeExpVal, SL, i32VT));
+        return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode);
       }
     }
   }

>From de6e306f2695541d5d3908dec1c35bc8ea08c130 Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Tue, 15 Oct 2024 06:30:27 +0000
Subject: [PATCH 5/8] Removed the special case handling & added bf16 support
 for this combine.

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp    |  34 +--
 llvm/test/CodeGen/AMDGPU/combine-fmul-sel.ll | 280 -------------------
 2 files changed, 11 insertions(+), 303 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/combine-fmul-sel.ll

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index af847f913aa71c..dd7871a4fa7339 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -14491,10 +14491,18 @@ SDValue SITargetLowering::performFMulCombine(SDNode *N,
   // ldexp(x, sext(i1 y)) -> fmul x, (select y, 0.5, 1.0)
   //
   // The above mentioned ldexp folding works fine for
-  // f16/f32, but as for f64 it creates f64 select which
-  // is costly to materealize as compared to f64 ldexp
+  // bf16/f32, but as for f64 it creates f64 select which
+  // is costly to materialize as compared to f64 ldexp
   // so here we undo the transform for f64 datatype.
-  if (VT.getScalarType() == MVT::f64) {
+  // Also in case of f16, its cheaper to materialize inline
+  // 32 bit-constant (via ldexp use) rather than using fmul.
+  //
+  // Given : A = 2^a  &  B = 2^b ; where a and b are integers.
+  // fmul x, (select y, A, B)     -> ldexp( x, (select i32 y, a, b) )
+  // fmul x, (select y, -A, -B)   -> ldexp( (fneg x), (select i32 y, a, b) )
+  // Note : It takes care of generic scenario which covers undoing
+  // of special case(zext/sext) as mentioned.
+  if (VT.getScalarType() == MVT::f64 || VT.getScalarType() == MVT::f16) {
     if (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT) {
       const ConstantFPSDNode *TrueNode =
           isConstOrConstSplatFP(RHS.getOperand(1));
@@ -14508,26 +14516,6 @@ SDValue SITargetLowering::performFMulCombine(SDNode *N,
         return SDValue();
       LHS = TrueNode->isNegative() ? DAG.getNode(ISD::FNEG, SL, VT, LHS) : LHS;
 
-      // fmul x, (select y, 2.0, 1.0)   -> ldexp(  x, zext(i1 y) )
-      // fmul x, (select y, -2.0, -1.0) -> ldexp( (fneg x), zext(i1 y) )
-      // fmul x, (select y, 0.5, 1.0)   -> ldexp(  x, sext(i1 y) )
-      // fmul x, (select y, -0.5, -1.0) -> ldexp( (fneg x), sext(i1 y) )
-      if (FalseNode->isExactlyValue(1.0) || FalseNode->isExactlyValue(-1.0)) {
-        if (TrueNode->isExactlyValue(2.0) || TrueNode->isExactlyValue(-2.0)) {
-          SDValue ZExtNode =
-              DAG.getNode(ISD::ZERO_EXTEND, SL, i32VT, RHS.getOperand(0));
-          return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, ZExtNode);
-        } else if (TrueNode->isExactlyValue(0.5) ||
-                   TrueNode->isExactlyValue(-0.5)) {
-          SDValue SExtNode =
-              DAG.getNode(ISD::SIGN_EXTEND, SL, i32VT, RHS.getOperand(0));
-          return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SExtNode);
-        }
-      }
-
-      // Given : A = 2^a  &  B = 2^b ; where a and b are integers.
-      // fmul x, (select y, A, B)     -> ldexp( x, (select i32 y, a, b) )
-      // fmul x, (select y, -A, -B)   -> ldexp( (fneg x), (select i32 y, a, b) )
       int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
       int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
       if (TrueNodeExpVal != INT_MIN && FalseNodeExpVal != INT_MIN) {
diff --git a/llvm/test/CodeGen/AMDGPU/combine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/combine-fmul-sel.ll
deleted file mode 100644
index 0748aa0a0abec0..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/combine-fmul-sel.ll
+++ /dev/null
@@ -1,280 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-;RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s
-;RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -verify-machineinstrs | FileCheck -check-prefix=GFX1030 %s
-;RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefix=GFX1100 %s
-
-define float @fmul_select_f32_test1(float %x, i1 %bool) {
-; GFX9-LABEL: fmul_select_f32_test1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f32_test1:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f32_test1:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo
-; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
-  %1 = select i1 %bool, float 2.000000e+00, float 1.000000e+00
-  %ldexp = fmul float %x, %1
-  ret float %ldexp
-}
-
-define float @fmul_select_f32_test2(float %x, i1 %bool) {
-; GFX9-LABEL: fmul_select_f32_test2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0.5, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f32_test2:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0.5, vcc_lo
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f32_test2:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0.5, vcc_lo
-; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
-  %1 = select i1 %bool, float 0.500000e+00, float 1.000000e+00
-  %ldexp = fmul float %x, %1
-  ret float %ldexp
-}
-
-define <2 x float> @fmul_select_v2f32_test1(<2 x float> %x, <2 x i1> %bool) {
-; GFX9-LABEL: fmul_select_v2f32_test1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_v2f32_test1:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX1030-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX1030-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc_lo
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX1030-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc_lo
-; GFX1030-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_v2f32_test1:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX1100-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc_lo
-; GFX1100-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v3, 1, v3
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX1100-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc_lo
-; GFX1100-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
-  %1 = select <2 x i1> %bool, <2 x float> <float 2.000000e+00, float 2.000000e+00>, <2 x float> <float 1.000000e+00, float 1.000000e+00>
-  %ldexp = fmul <2 x float> %x, %1
-  ret <2 x float> %ldexp
-}
-
-define <2 x float> @fmul_select_v2f32_test2(<2 x float> %x, <2 x i1> %bool) {
-; GFX9-LABEL: fmul_select_v2f32_test2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_v2f32_test2:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX1030-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX1030-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc_lo
-; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX1030-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc_lo
-; GFX1030-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_v2f32_test2:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX1100-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc_lo
-; GFX1100-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v3, 1, v3
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX1100-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc_lo
-; GFX1100-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
-  %1 = select <2 x i1> %bool, <2 x float> <float 0.500000e+00, float 0.500000e+00>, <2 x float> <float 1.000000e+00, float 1.000000e+00>
-  %ldexp = fmul <2 x float> %x, %1
-  ret <2 x float> %ldexp
-}
-
-define double @fmul_select_f64_test1(double %x, i1 %bool) {
-; GFX9-LABEL: fmul_select_f64_test1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX9-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f64_test1:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX1030-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f64_test1:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
-  %1 = select i1 %bool, double 2.000000e+00, double 1.000000e+00
-  %ldexp = fmul double %x, %1
-  ret double %ldexp
-}
-
-define double @fmul_select_f64_test2(double %x, i1 %bool) {
-; GFX9-LABEL: fmul_select_f64_test2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_bfe_i32 v2, v2, 0, 1
-; GFX9-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_f64_test2:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_bfe_i32 v2, v2, 0, 1
-; GFX1030-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_f64_test2:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_bfe_i32 v2, v2, 0, 1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
-  %1 = select i1 %bool, double 0.500000e+00, double 1.000000e+00
-  %ldexp = fmul double %x, %1
-  ret double %ldexp
-}
-
-define <2 x double> @fmul_select_v2f64_test1(<2 x double> %x, <2 x i1> %bool) {
-; GFX9-LABEL: fmul_select_v2f64_test1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX9-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
-; GFX9-NEXT:    v_and_b32_e32 v4, 1, v5
-; GFX9-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_v2f64_test1:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX1030-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX1030-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
-; GFX1030-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_v2f64_test1:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX1100-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
-; GFX1100-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
-  %1 = select <2 x i1> %bool, <2 x double> <double 2.000000e+00, double 2.000000e+00>, <2 x double> <double 1.000000e+00, double 1.000000e+00>
-  %ldexp = fmul <2 x double> %x, %1
-  ret <2 x double> %ldexp
-}
-
-define <2 x double> @fmul_select_v2f64_test2(<2 x double> %x, <2 x i1> %bool) {
-; GFX9-LABEL: fmul_select_v2f64_test2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_bfe_i32 v4, v4, 0, 1
-; GFX9-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
-; GFX9-NEXT:    v_bfe_i32 v4, v5, 0, 1
-; GFX9-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-LABEL: fmul_select_v2f64_test2:
-; GFX1030:       ; %bb.0:
-; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-NEXT:    v_bfe_i32 v4, v4, 0, 1
-; GFX1030-NEXT:    v_bfe_i32 v5, v5, 0, 1
-; GFX1030-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
-; GFX1030-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
-; GFX1030-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1100-LABEL: fmul_select_v2f64_test2:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_bfe_i32 v4, v4, 0, 1
-; GFX1100-NEXT:    v_bfe_i32 v5, v5, 0, 1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
-; GFX1100-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
-  %1 = select <2 x i1> %bool, <2 x double> <double 0.500000e+00, double 0.500000e+00>, <2 x double> <double 1.000000e+00, double 1.000000e+00>
-  %ldexp = fmul <2 x double> %x, %1
-  ret <2 x double> %ldexp
-}

>From b70fdea90de61c404d2f1e4ad230b33e5cd5929f Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Tue, 15 Oct 2024 09:09:00 +0000
Subject: [PATCH 6/8] Added support to handle f32 non-inline constant case, &
 addressed review changes

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 44 +++++++++++++----------
 1 file changed, 25 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index dd7871a4fa7339..2c36f10ecdb2e1 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -14481,49 +14481,55 @@ SDValue SITargetLowering::performFMulCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
   EVT VT = N->getValueType(0);
-  EVT i32VT = VT.changeElementType(MVT::i32);
+  EVT IntVT = VT.changeElementType(MVT::i32);
 
   SDLoc SL(N);
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
 
-  // ldexp(x, zext(i1 y)) -> fmul x, (select y, 2.0, 1.0)
-  // ldexp(x, sext(i1 y)) -> fmul x, (select y, 0.5, 1.0)
-  //
-  // The above mentioned ldexp folding works fine for
-  // bf16/f32, but as for f64 it creates f64 select which
-  // is costly to materialize as compared to f64 ldexp
-  // so here we undo the transform for f64 datatype.
-  // Also in case of f16, its cheaper to materialize inline
-  // 32 bit-constant (via ldexp use) rather than using fmul.
+  SDNodeFlags Flags = N->getFlags();
+  SDNodeFlags LHSFlags = LHS->getFlags();
+
+  // It is cheaper to realize i32 inline constants as compared against
+  // as materializing f16 or f64 (or even non-inline f32) values,
+  // possible via ldexp usage, as shown below :
   //
   // Given : A = 2^a  &  B = 2^b ; where a and b are integers.
   // fmul x, (select y, A, B)     -> ldexp( x, (select i32 y, a, b) )
   // fmul x, (select y, -A, -B)   -> ldexp( (fneg x), (select i32 y, a, b) )
-  // Note : It takes care of generic scenario which covers undoing
-  // of special case(zext/sext) as mentioned.
-  if (VT.getScalarType() == MVT::f64 || VT.getScalarType() == MVT::f16) {
+  if (VT.getScalarType() == MVT::f64 || VT.getScalarType() == MVT::f32 ||
+      VT.getScalarType() == MVT::f16) {
     if (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT) {
       const ConstantFPSDNode *TrueNode =
           isConstOrConstSplatFP(RHS.getOperand(1));
       const ConstantFPSDNode *FalseNode =
           isConstOrConstSplatFP(RHS.getOperand(2));
 
-      if (!TrueNode || !FalseNode)
+      bool AreNodesFP = TrueNode && FalseNode;
+      if (!AreNodesFP)
         return SDValue();
 
       if (TrueNode->isNegative() != FalseNode->isNegative())
         return SDValue();
-      LHS = TrueNode->isNegative() ? DAG.getNode(ISD::FNEG, SL, VT, LHS) : LHS;
 
+      // For f32, only non-inline constants should be transformed.
+      const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+      if (VT.getScalarType() == MVT::f32 &&
+          TII->isInlineConstant(TrueNode->getValueAPF()) &&
+          TII->isInlineConstant(FalseNode->getValueAPF()))
+        return SDValue();
+
+      LHS = TrueNode->isNegative()
+                ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHSFlags)
+                : LHS;
       int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
       int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
       if (TrueNodeExpVal != INT_MIN && FalseNodeExpVal != INT_MIN) {
         SDValue SelectNode =
-            DAG.getNode(ISD::SELECT, SL, i32VT, RHS.getOperand(0),
-                        DAG.getConstant(TrueNodeExpVal, SL, i32VT),
-                        DAG.getConstant(FalseNodeExpVal, SL, i32VT));
-        return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode);
+            DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
+                        DAG.getConstant(TrueNodeExpVal, SL, IntVT),
+                        DAG.getConstant(FalseNodeExpVal, SL, IntVT));
+        return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, Flags);
       }
     }
   }

>From 4b9d3a082527977d56f1877177e9ed14ccf07370 Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Fri, 4 Oct 2024 06:53:37 +0000
Subject: [PATCH 7/8] Add test cases for fmul-select combine

This adds f32/f64/f16/bf16 test cases for the pattern below to show the
effect of the DAG combine:

fmul x, select(y, A, B)

where A & B could be inline/non-inline values.
---
 .../CodeGen/AMDGPU/dagcombine-fmul-sel.ll     | 2616 +++++++++++++++++
 1 file changed, 2616 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll

diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
new file mode 100644
index 00000000000000..fa38b9e2b357fe
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
@@ -0,0 +1,2616 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefix=GFX7 %s
+;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX1030 %s
+;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX1100 %s
+
+define float @fmul_select_f32_test1(float %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_f32_test1:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_f32_test1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f32_test1:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo
+; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f32_test1:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, float 2.000000e+00, float 1.000000e+00
+  %ldexp = fmul float %x, %y
+  ret float %ldexp
+}
+
+define float @fmul_select_f32_test2(float %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_f32_test2:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0.5, vcc
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_f32_test2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0.5, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f32_test2:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0.5, vcc_lo
+; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f32_test2:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0.5, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, float 5.000000e-01, float 1.000000e+00
+  %ldexp = fmul float %x, %y
+  ret float %ldexp
+}
+
+define <2 x float> @fmul_select_v2f32_test3(<2 x float> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) {
+; GFX7-LABEL: fmul_select_v2f32_test3:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_v2f32_test3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_v2f32_test3:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX1030-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc_lo
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v5
+; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX1030-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc_lo
+; GFX1030-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_v2f32_test3:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX1100-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc_lo
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v5
+; GFX1100-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
+  %y = select <2 x i1> %bool, <2 x float> <float 2.000000e+00, float 2.000000e+00>, <2 x float> <float 1.000000e+00, float 1.000000e+00>
+  %ldexp = fmul <2 x float> %x, %y
+  ret <2 x float> %ldexp
+}
+
+define <2 x float> @fmul_select_v2f32_test4(<2 x float> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) {
+; GFX7-LABEL: fmul_select_v2f32_test4:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_v2f32_test4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_v2f32_test4:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX1030-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc_lo
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v5
+; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX1030-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc_lo
+; GFX1030-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_v2f32_test4:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX1100-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc_lo
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v5
+; GFX1100-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
+  %y = select <2 x i1> %bool, <2 x float> <float 5.000000e-01, float 5.000000e-01>, <2 x float> <float 1.000000e+00, float 1.000000e+00>
+  %ldexp = fmul <2 x float> %x, %y
+  ret <2 x float> %ldexp
+}
+
+define float @fmul_select_f32_test5(float %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_f32_test5:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, -1.0, -2.0, vcc
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_f32_test5:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, -1.0, -2.0, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f32_test5:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1030-NEXT:    v_cndmask_b32_e64 v1, -1.0, -2.0, vcc_lo
+; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f32_test5:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1100-NEXT:    v_cndmask_b32_e64 v1, -1.0, -2.0, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, float -2.000000e+00, float -1.000000e+00
+  %ldexp = fmul float %x, %y
+  ret float %ldexp
+}
+
+define float @fmul_select_f32_test6(float %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_f32_test6:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0x41000000
+; GFX7-NEXT:    v_mov_b32_e32 v4, 0xc0400000
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_f32_test6:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x41000000
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0xc0400000
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f32_test6:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_mov_b32_e32 v3, 0xc0400000
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0x41000000, v3, vcc_lo
+; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f32_test6:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_mov_b32_e32 v3, 0xc0400000
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x41000000, v3, vcc_lo
+; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, float -3.000000e+00, float 8.000000e+00
+  %ldexp = fmul float %x, %y
+  ret float %ldexp
+}
+
+define float @fmul_select_f32_test7(float %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_f32_test7:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0x5c
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v3, 59, vcc
+; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_f32_test7:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x5c
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, 59, vcc
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f32_test7:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 0x5c, 59, vcc_lo
+; GFX1030-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f32_test7:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 0x5c, 59, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, float 0x43A0000000000000, float 0x45B0000000000000
+  %ldexp = fmul float %x, %y
+  ret float %ldexp
+}
+
+define float @fmul_select_f32_test8(float %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_f32_test8:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0xc1000000
+; GFX7-NEXT:    v_mov_b32_e32 v4, 0x41800000
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_f32_test8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0xc1000000
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x41800000
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f32_test8:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_mov_b32_e32 v3, 0x41800000
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0xc1000000, v3, vcc_lo
+; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f32_test8:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_mov_b32_e32 v3, 0x41800000
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0xc1000000, v3, vcc_lo
+; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, float 1.600000e+01, float -8.000000e+00
+  %ldexp = fmul float %x, %y
+  ret float %ldexp
+}
+
+define float @fmul_select_f32_test9(float %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_f32_test9:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, 2.0, 0, vcc
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_f32_test9:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 2.0, 0, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f32_test9:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 2.0, 0, vcc_lo
+; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f32_test9:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 2.0, 0, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, float 0.000000e+00, float 2.000000e+00
+  %ldexp = fmul float %x, %y
+  ret float %ldexp
+}
+
+define float @fmul_select_f32_test10(float %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_f32_test10:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_f32_test10:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f32_test10:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
+; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f32_test10:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, float -0.000000e+00, float 0.000000e+00
+  %ldexp = fmul float %x, %y
+  ret float %ldexp
+}
+
+define float @fmul_select_f32_test11(float %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_f32_test11:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0x4e
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, 56, v3, vcc
+; GFX7-NEXT:    v_ldexp_f32_e64 v0, -v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_f32_test11:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4e
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, 56, v3, vcc
+; GFX9-NEXT:    v_ldexp_f32 v0, -v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f32_test11:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 56, 0x4e, vcc_lo
+; GFX1030-NEXT:    v_ldexp_f32 v0, -v0, v1
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f32_test11:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 56, 0x4e, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_ldexp_f32 v0, -v0, v1
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, float 0xC4D0000000000000, float 0xC370000000000000
+  %ldexp = fmul float %x, %y
+  ret float %ldexp
+}
+
+define float @fmul_select_f32_test12(float %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_f32_test12:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0x44
+; GFX7-NEXT:    v_not_b32_e32 v4, 47
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_f32_test12:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x44
+; GFX9-NEXT:    v_not_b32_e32 v4, 47
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f32_test12:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_not_b32_e32 v3, 47
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0x44, v3, vcc_lo
+; GFX1030-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f32_test12:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_not_b32_e32 v3, 47
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x44, v3, vcc_lo
+; GFX1100-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, float 0x3CF0000000000000, float 0x4430000000000000
+  %ldexp = fmul float %x, %y
+  ret float %ldexp
+}
+
+define double @fmul_select_f64_test1(double %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_f64_test1:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX7-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_f64_test1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f64_test1:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX1030-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX1030-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f64_test1:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX1100-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, double 2.000000e+00, double 1.000000e+00
+  %ldexp = fmul double %x, %y
+  ret double %ldexp
+}
+
+define double @fmul_select_f64_test2(double %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_f64_test2:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GFX7-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_f64_test2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GFX9-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f64_test2:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX1030-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc_lo
+; GFX1030-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f64_test2:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX1100-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, double 5.000000e-01, double 1.000000e+00
+  %ldexp = fmul double %x, %y
+  ret double %ldexp
+}
+
+define <2 x double> @fmul_select_v2f64_test3(<2 x double> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) {
+; GFX7-LABEL: fmul_select_v2f64_test3:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX7-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX7-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_v2f64_test3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX9-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX9-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_v2f64_test3:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX1030-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX1030-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; GFX1030-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX1030-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_v2f64_test3:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX1100-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; GFX1100-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX1100-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
+  %y = select <2 x i1> %bool, <2 x double> <double 2.000000e+00, double 2.000000e+00>, <2 x double> <double 1.000000e+00, double 1.000000e+00>
+  %ldexp = fmul <2 x double> %x, %y
+  ret <2 x double> %ldexp
+}
+
+define <2 x double> @fmul_select_v2f64_test4(<2 x double> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) {
+; GFX7-LABEL: fmul_select_v2f64_test4:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX7-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
+; GFX7-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_v2f64_test4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX9-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
+; GFX9-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_v2f64_test4:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX1030-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc_lo
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX1030-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; GFX1030-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc_lo
+; GFX1030-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_v2f64_test4:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX1100-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc_lo
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; GFX1100-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc_lo
+; GFX1100-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
+  %y = select <2 x i1> %bool, <2 x double> <double 5.000000e-01, double 5.000000e-01>, <2 x double> <double 1.000000e+00, double 1.000000e+00>
+  %ldexp = fmul <2 x double> %x, %y
+  ret <2 x double> %ldexp
+}
+
+define double @fmul_select_f64_test5(double %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_f64_test5:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GFX7-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_f64_test5:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GFX9-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f64_test5:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX1030-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc_lo
+; GFX1030-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f64_test5:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX1100-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, double -5.000000e-01, double -1.000000e+00
+  %ldexp = fmul double %x, %y
+  ret double %ldexp
+}
+
+define double @fmul_select_f64_test6(double %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_f64_test6:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX7-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_f64_test6:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f64_test6:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX1030-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX1030-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f64_test6:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX1100-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, double -2.000000e+00, double -1.000000e+00
+  %ldexp = fmul double %x, %y
+  ret double %ldexp
+}
+
+define double @fmul_select_f64_test7(double %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_f64_test7:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v4, 0xbff00000
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v4, 2.0, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0
+; GFX7-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_f64_test7:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0xbff00000
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v4, 2.0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f64_test7:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX1030-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1030-NEXT:    v_cndmask_b32_e64 v5, 0xbff00000, 2.0, vcc_lo
+; GFX1030-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f64_test7:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX1100-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1100-NEXT:    v_cndmask_b32_e64 v5, 0xbff00000, 2.0, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, double 2.000000e+00, double -1.000000e+00
+  %ldexp = fmul double %x, %y
+  ret double %ldexp
+}
+
+define double @fmul_select_f64_test8(double %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_f64_test8:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, 5, 2, vcc
+; GFX7-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_f64_test8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 5, 2, vcc
+; GFX9-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f64_test8:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX1030-NEXT:    v_cndmask_b32_e64 v2, 5, 2, vcc_lo
+; GFX1030-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f64_test8:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX1100-NEXT:    v_cndmask_b32_e64 v2, 5, 2, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, double -4.000000e+00, double -3.200000e+01
+  %ldexp = fmul double %x, %y
+  ret double %ldexp
+}
+
+define <2 x double> @fmul_select_v2f64_test9(<2 x double> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) {
+; GFX7-LABEL: fmul_select_v2f64_test9:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX7-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v4
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX7-NEXT:    v_ldexp_f64 v[2:3], -v[2:3], v4
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_v2f64_test9:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX9-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX9-NEXT:    v_ldexp_f64 v[2:3], -v[2:3], v4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_v2f64_test9:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX1030-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX1030-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v4
+; GFX1030-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX1030-NEXT:    v_ldexp_f64 v[2:3], -v[2:3], v5
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_v2f64_test9:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX1100-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v4
+; GFX1100-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX1100-NEXT:    v_ldexp_f64 v[2:3], -v[2:3], v5
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
+  %y = select <2 x i1> %bool, <2 x double> <double -2.000000e+00, double -2.000000e+00>, <2 x double> <double -1.000000e+00, double -1.000000e+00>
+  %ldexp = fmul <2 x double> %x, %y
+  ret <2 x double> %ldexp
+}
+
+define <2 x double> @fmul_select_v2f64_test10(<2 x double> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) {
+; GFX7-LABEL: fmul_select_v2f64_test10:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v8, 0xbff00000
+; GFX7-NEXT:    v_mov_b32_e32 v9, 0x3fe00000
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX7-NEXT:    v_cndmask_b32_e32 v9, v8, v9, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX7-NEXT:    v_mov_b32_e32 v8, 0
+; GFX7-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX7-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
+; GFX7-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_v2f64_test10:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0xbff00000
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0x3fe00000
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v8, v9, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX9-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
+; GFX9-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_v2f64_test10:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_mov_b32_e32 v8, 0x3fe00000
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX1030-NEXT:    v_cndmask_b32_e32 v9, 0xbff00000, v8, vcc_lo
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX1030-NEXT:    v_mov_b32_e32 v8, 0
+; GFX1030-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX1030-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
+; GFX1030-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_v2f64_test10:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_mov_b32_e32 v8, 0x3fe00000
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-NEXT:    v_dual_cndmask_b32 v9, 0xbff00000, v8 :: v_dual_mov_b32 v8, 0
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX1100-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
+; GFX1100-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
+  %y = select <2 x i1> %bool, <2 x double> <double 5.000000e-01, double 2.000000e+00>, <2 x double> <double -1.000000e+00, double 1.000000e+00>
+  %ldexp = fmul <2 x double> %x, %y
+  ret <2 x double> %ldexp
+}
+
+define double @fmul_select_f64_test11(double %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_f64_test11:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, v4, -2.0, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0
+; GFX7-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_f64_test11:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v4, -2.0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f64_test11:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX1030-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1030-NEXT:    v_cndmask_b32_e64 v5, 0x80000000, -2.0, vcc_lo
+; GFX1030-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f64_test11:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX1100-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1100-NEXT:    v_cndmask_b32_e64 v5, 0x80000000, -2.0, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, double -2.000000e+00, double -0.000000e+00
+  %ldexp = fmul double %x, %y
+  ret double %ldexp
+}
+
+define double @fmul_select_f64_test12(double %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_f64_test12:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, v2, v3
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 31, v2
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0
+; GFX7-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_f64_test12:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v2, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 31, v2
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f64_test12:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v3
+; GFX1030-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1030-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v3, 31, v3
+; GFX1030-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f64_test12:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v3
+; GFX1100-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 31, v3
+; GFX1100-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, double 0.000000e+00, double -0.000000e+00
+  %ldexp = fmul double %x, %y
+  ret double %ldexp
+}
+
+define double @fmul_select_f64_test13(double %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_f64_test13:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mov_b32_e32 v5, 0x40300000
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-NEXT:    v_mov_b32_e32 v4, 0
+; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
+; GFX7-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_f64_test13:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x40300000
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
+; GFX9-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f64_test13:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX1030-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1030-NEXT:    v_cndmask_b32_e64 v5, 0x40300000, 0, vcc_lo
+; GFX1030-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f64_test13:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX1100-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1100-NEXT:    v_cndmask_b32_e64 v5, 0x40300000, 0, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, double 0.000000e+00, double 1.600000e+01 ; operands 0.0 and 16.0; 0.0 is not a power of two, and the checks expect a plain v_mul_f64 (no ldexp)
+  %ldexp = fmul double %x, %y
+  ret double %ldexp
+}
+
+define double @fmul_select_f64_test14(double %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_f64_test14:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_not_b32_e32 v4, 26
+; GFX7-NEXT:    v_mov_b32_e32 v5, 0x5c
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
+; GFX7-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_f64_test14:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_not_b32_e32 v4, 26
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x5c
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
+; GFX9-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f64_test14:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_mov_b32_e32 v4, 0x5c
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX1030-NEXT:    v_cndmask_b32_e32 v2, 0xffffffe5, v4, vcc_lo
+; GFX1030-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f64_test14:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_mov_b32_e32 v4, 0x5c
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cndmask_b32_e32 v2, 0xffffffe5, v4, vcc_lo
+; GFX1100-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, double 0x45B0000000000000, double 0x3E40000000000000 ; operands 2^92 and 2^-27; the checks expect v_ldexp_f64 with a selected exponent of 92 (0x5c) or -27
+  %ldexp = fmul double %x, %y
+  ret double %ldexp
+}
+
+define double @fmul_select_f64_test15(double %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_f64_test15:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_not_b32_e32 v4, 32
+; GFX7-NEXT:    v_not_b32_e32 v5, 41
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
+; GFX7-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_f64_test15:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_not_b32_e32 v4, 32
+; GFX9-NEXT:    v_not_b32_e32 v5, 41
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
+; GFX9-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f64_test15:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_not_b32_e32 v4, 41
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX1030-NEXT:    v_cndmask_b32_e32 v2, 0xffffffdf, v4, vcc_lo
+; GFX1030-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f64_test15:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_not_b32_e32 v4, 41
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cndmask_b32_e32 v2, 0xffffffdf, v4, vcc_lo
+; GFX1100-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, double 0x3D50000000000000, double 0x3DE0000000000000 ; operands 2^-42 and 2^-33 (both negative exponents); the checks expect v_ldexp_f64 with a selected exponent of -42 or -33
+  %ldexp = fmul double %x, %y
+  ret double %ldexp
+}
+
+
+define half @fmul_select_f16_test1(half %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_f16_test1:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_f16_test1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f16_test1:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1030-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f16_test1:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, half 2.000000e+00, half 1.000000e+00 ; operands 2.0 and 1.0; the checks expect an ldexp (v_ldexp_f32 or v_ldexp_f16) with a selected exponent of 1 or 0
+  %ldexp = fmul half %x, %y
+  ret half %ldexp
+}
+
+define half @fmul_select_f16_test2(half %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_f16_test2:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_f16_test2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX9-NEXT:    s_movk_i32 s4, 0x8000
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX9-NEXT:    v_med3_i32 v1, v1, s4, v2
+; GFX9-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f16_test2:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1030-NEXT:    s_movk_i32 s4, 0x8000
+; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX1030-NEXT:    v_med3_i32 v1, v1, s4, 0x7fff
+; GFX1030-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f16_test2:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1100-NEXT:    s_movk_i32 s0, 0x8000
+; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_med3_i32 v1, v1, s0, 0x7fff
+; GFX1100-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, half 5.000000e-01, half 1.000000e+00 ; operands 0.5 and 1.0; the checks expect an ldexp with a selected exponent of -1 or 0, preceded by a v_med3_i32 against 0x8000/0x7fff where v_ldexp_f16 is used
+  %ldexp = fmul half %x, %y
+  ret half %ldexp
+}
+
+define <2 x half> @fmul_select_v2f16_test3(<2 x half> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) {
+; GFX7-LABEL: fmul_select_v2f16_test3:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_v2f16_test3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x3c00
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x4000
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_v2f16_test3:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_mov_b32_e32 v5, 0x4000
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX1030-NEXT:    v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
+; GFX1030-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX1030-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_v2f16_test3:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_mov_b32_e32 v5, 0x4000
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
+; GFX1100-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
+  %y = select <2 x i1> %bool, <2 x half> <half 2.000000e+00, half 2.000000e+00>, <2 x half> <half 1.000000e+00, half 1.000000e+00> ; vector splat of 2.0 vs 1.0; the checks expect the select and multiply to remain (v_mul_f32 / v_pk_mul_f16), no ldexp
+  %ldexp = fmul <2 x half> %x, %y
+  ret <2 x half> %ldexp
+}
+
+define <2 x half> @fmul_select_v2f16_test4(<2 x half> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) {
+; GFX7-LABEL: fmul_select_v2f16_test4:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_v2f16_test4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x3c00
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x3800
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_v2f16_test4:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_mov_b32_e32 v5, 0x3800
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX1030-NEXT:    v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
+; GFX1030-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX1030-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_v2f16_test4:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_mov_b32_e32 v5, 0x3800
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
+; GFX1100-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
+  %y = select <2 x i1> %bool, <2 x half> <half 5.000000e-01, half 5.000000e-01>, <2 x half> <half 1.000000e+00, half 1.000000e+00> ; vector splat of 0.5 vs 1.0; the checks expect the select and multiply to remain (v_mul_f32 / v_pk_mul_f16), no ldexp
+  %ldexp = fmul <2 x half> %x, %y
+  ret <2 x half> %ldexp
+}
+
+define half @fmul_select_f16_test5(half %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_f16_test5:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, 3, 1, vcc
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_f16_test5:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 3, 1, vcc
+; GFX9-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f16_test5:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 3, 1, vcc_lo
+; GFX1030-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f16_test5:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 3, 1, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, half 2.000000e+00, half 8.000000e+00 ; operands 2^1 and 2^3; the checks expect an ldexp with a selected exponent of 1 or 3
+  %ldexp = fmul half %x, %y
+  ret half %ldexp
+}
+
+define half @fmul_select_f16_test6(half %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_f16_test6:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0x40400000
+; GFX7-NEXT:    v_mov_b32_e32 v4, 0xc1000000
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_f16_test6:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4200
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0xc800
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f16_test6:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_mov_b32_e32 v3, 0xc800
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0x4200, v3, vcc_lo
+; GFX1030-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f16_test6:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_mov_b32_e32 v3, 0xc800
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x4200, v3, vcc_lo
+; GFX1100-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, half -8.000000e+00, half 3.000000e+00 ; operands -8.0 and 3.0; 3.0 is not a power of two, and the checks expect a plain multiply (no ldexp)
+  %ldexp = fmul half %x, %y
+  ret half %ldexp
+}
+
+define half @fmul_select_f16_test7(half %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_f16_test7:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0x41000000
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, -4.0, v3, vcc
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_f16_test7:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0xc400
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x4800
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f16_test7:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_mov_b32_e32 v3, 0x4800
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0xc400, v3, vcc_lo
+; GFX1030-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f16_test7:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_mov_b32_e32 v3, 0x4800
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0xc400, v3, vcc_lo
+; GFX1100-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, half 8.000000e+00, half -4.000000e+00 ; operands 8.0 and -4.0 are powers of two with opposite signs; the checks expect a plain multiply (no ldexp)
+  %ldexp = fmul half %x, %y
+  ret half %ldexp
+}
+
+define half @fmul_select_f16_test8(half %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_f16_test8:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_f16_test8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x8000
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX9-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f16_test8:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 0, 0x8000, vcc_lo
+; GFX1030-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f16_test8:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 0, 0x8000, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, half -0.000000e+00, half 0.000000e+00 ; operands -0.0 and +0.0; the checks expect a plain multiply by the selected signed zero (no ldexp)
+  %ldexp = fmul half %x, %y
+  ret half %ldexp
+}
+
+define half @fmul_select_f16_test9(half %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_f16_test9:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e64 v0, -v0
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, 5, 4, vcc
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_f16_test9:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 5, 4, vcc
+; GFX9-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f16_test9:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 5, 4, vcc_lo
+; GFX1030-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f16_test9:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 5, 4, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, half -1.600000e+01, half -3.200000e+01 ; operands -(2^4) and -(2^5), both negative; the checks expect an ldexp of the negated input (-v0) with a selected exponent of 4 or 5
+  %ldexp = fmul half %x, %y
+  ret half %ldexp
+}
+
+define half @fmul_select_f16_test10(half %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_f16_test10:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, 11, -11, vcc
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_f16_test10:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 11, -11, vcc
+; GFX9-NEXT:    s_movk_i32 s4, 0x8000
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX9-NEXT:    v_med3_i32 v1, v1, s4, v2
+; GFX9-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f16_test10:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1030-NEXT:    s_movk_i32 s4, 0x8000
+; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 11, -11, vcc_lo
+; GFX1030-NEXT:    v_med3_i32 v1, v1, s4, 0x7fff
+; GFX1030-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f16_test10:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1100-NEXT:    s_movk_i32 s0, 0x8000
+; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 11, -11, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_med3_i32 v1, v1, s0, 0x7fff
+; GFX1100-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, half 0xH1000, half 0xH6800 ; 0xH1000 is 2^-11 and 0xH6800 is 2^11; the checks expect an ldexp with a selected exponent of -11 or 11
+  %ldexp = fmul half %x, %y
+  ret half %ldexp
+}
+
+define half @fmul_select_f16_test11(half %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_f16_test11:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, -14, 7, vcc
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_f16_test11:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, -14, 7, vcc
+; GFX9-NEXT:    s_movk_i32 s4, 0x8000
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX9-NEXT:    v_med3_i32 v1, v1, s4, v2
+; GFX9-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_f16_test11:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1030-NEXT:    s_movk_i32 s4, 0x8000
+; GFX1030-NEXT:    v_cndmask_b32_e64 v1, -14, 7, vcc_lo
+; GFX1030-NEXT:    v_med3_i32 v1, v1, s4, 0x7fff
+; GFX1030-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_f16_test11:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1100-NEXT:    s_movk_i32 s0, 0x8000
+; GFX1100-NEXT:    v_cndmask_b32_e64 v1, -14, 7, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_med3_i32 v1, v1, s0, 0x7fff
+; GFX1100-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, half 0xH5800, half 0xH0400 ; 0xH5800 is 2^7 and 0xH0400 is 2^-14; the checks expect an ldexp with a selected exponent of 7 or -14
+  %ldexp = fmul half %x, %y
+  ret half %ldexp
+}
+
+define bfloat @fmul_select_bf16_test1(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_bf16_test1:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_bf16_test1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3f80
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x4000
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_bf16_test1:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_mov_b32_e32 v3, 0x4000
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1030-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX1030-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX1030-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_bf16_test1:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
+; GFX1100-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX1100-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX1100-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, bfloat 2.000000e+00, bfloat 1.000000e+00 ; bfloat operands 2.0 and 1.0; the checks expect the select plus a v_mul_f32-based expansion, no ldexp
+  %ldexp = fmul bfloat %x, %y
+  ret bfloat %ldexp
+}
+
+define bfloat @fmul_select_bf16_test2(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_bf16_test2:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0.5, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_bf16_test2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3f80
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x3f00
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_bf16_test2:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_mov_b32_e32 v3, 0x3f00
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1030-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX1030-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX1030-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_bf16_test2:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_dual_mov_b32 v3, 0x3f00 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
+; GFX1100-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX1100-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX1100-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, bfloat 5.000000e-01, bfloat 1.000000e+00 ; bfloat operands 0.5 and 1.0; the checks expect the select plus a v_mul_f32-based expansion, no ldexp
+  %ldexp = fmul bfloat %x, %y
+  ret bfloat %ldexp
+}
+
+define <2 x bfloat> @fmul_select_v2bf16_test3(<2 x bfloat> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) {
+; GFX7-LABEL: fmul_select_v2bf16_test3:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_v2bf16_test3:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x3f80
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x4000
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_v2bf16_test3:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_mov_b32_e32 v5, 0x4000
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX1030-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX1030-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
+; GFX1030-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX1030-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX1030-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX1030-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX1030-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX1030-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX1030-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX1030-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_v2bf16_test3:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_mov_b32_e32 v5, 0x4000
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX1100-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX1100-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX1100-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
+; GFX1100-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX1100-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX1100-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX1100-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX1100-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1100-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX1100-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX1100-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX1100-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
+  %y = select <2 x i1> %bool, <2 x bfloat> <bfloat 2.000000e+00, bfloat 2.000000e+00>, <2 x bfloat> <bfloat 1.000000e+00, bfloat 1.000000e+00> ; per-lane choice between splat 2.0 and splat 1.0
+  %ldexp = fmul <2 x bfloat> %x, %y ; vector bfloat case; the checks above keep f32 multiplies per lane and form no ldexp
+  ret <2 x bfloat> %ldexp
+}
+
+define <2 x bfloat> @fmul_select_v2bf16_test4(<2 x bfloat> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) {
+; GFX7-LABEL: fmul_select_v2bf16_test4:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_v2bf16_test4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x3f80
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0x3f00
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_v2bf16_test4:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_mov_b32_e32 v5, 0x3f00
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX1030-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX1030-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
+; GFX1030-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX1030-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX1030-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX1030-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX1030-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX1030-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX1030-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX1030-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_v2bf16_test4:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_mov_b32_e32 v5, 0x3f00
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX1100-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX1100-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX1100-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
+; GFX1100-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX1100-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX1100-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX1100-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX1100-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1100-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX1100-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX1100-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX1100-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
+  %y = select <2 x i1> %bool, <2 x bfloat> <bfloat 5.000000e-01, bfloat 5.000000e-01>, <2 x bfloat> <bfloat 1.000000e+00, bfloat 1.000000e+00> ; per-lane choice between splat 0.5 and splat 1.0
+  %ldexp = fmul <2 x bfloat> %x, %y ; vector bfloat case; the checks above keep f32 multiplies per lane and form no ldexp
+  ret <2 x bfloat> %ldexp
+}
+
+define bfloat @fmul_select_bf16_test5(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_bf16_test5:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0x41000000
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v3, 2.0, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_bf16_test5:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4100
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x4000
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_bf16_test5:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_mov_b32_e32 v3, 0x4000
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0x4100, v3, vcc_lo
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1030-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX1030-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX1030-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_bf16_test5:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x4100, v3, vcc_lo
+; GFX1100-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX1100-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX1100-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, bfloat 2.000000e+00, bfloat 8.000000e+00 ; both arms are positive powers of two (2^1, 2^3)
+  %ldexp = fmul bfloat %x, %y ; bfloat case; the checks above keep an f32 multiply and form no ldexp
+  ret bfloat %ldexp
+}
+
+define bfloat @fmul_select_bf16_test6(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_bf16_test6:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0x40400000
+; GFX7-NEXT:    v_mov_b32_e32 v4, 0xc1000000
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_bf16_test6:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4040
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffffc100
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_bf16_test6:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_mov_b32_e32 v3, 0xffffc100
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0x4040, v3, vcc_lo
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1030-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX1030-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX1030-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_bf16_test6:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_dual_mov_b32 v3, 0xffffc100 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x4040, v3, vcc_lo
+; GFX1100-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX1100-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX1100-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, bfloat -8.000000e+00, bfloat 3.000000e+00 ; arms differ in sign, and 3.0 is not a power of two
+  %ldexp = fmul bfloat %x, %y ; bfloat case; the checks above keep an f32 multiply and form no ldexp
+  ret bfloat %ldexp
+}
+
+define bfloat @fmul_select_bf16_test7(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_bf16_test7:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0x41000000
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, -4.0, v3, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_bf16_test7:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffffc080
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x4100
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_bf16_test7:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_mov_b32_e32 v3, 0x4100
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0xffffc080, v3, vcc_lo
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1030-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX1030-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX1030-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_bf16_test7:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_dual_mov_b32 v3, 0x4100 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0xffffc080, v3, vcc_lo
+; GFX1100-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX1100-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX1100-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, bfloat 8.000000e+00, bfloat -4.000000e+00 ; power-of-two magnitudes (2^3, 2^2) but opposite signs
+  %ldexp = fmul bfloat %x, %y ; bfloat case; the checks above keep an f32 multiply and form no ldexp
+  ret bfloat %ldexp
+}
+
+define bfloat @fmul_select_bf16_test8(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_bf16_test8:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_bf16_test8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, 15
+; GFX9-NEXT:    v_lshlrev_b16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_bf16_test8:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1030-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1030-NEXT:    v_lshlrev_b16 v1, 15, v1
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1030-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX1030-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX1030-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_bf16_test8:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1100-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_lshlrev_b16 v1, 15, v1
+; GFX1100-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX1100-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX1100-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, bfloat -0.000000e+00, bfloat 0.000000e+00 ; signed zeros only; the checks above build the operand by shifting the condition bit into the sign position
+  %ldexp = fmul bfloat %x, %y ; bfloat case; the checks above keep an f32 multiply and form no ldexp
+  ret bfloat %ldexp
+}
+
+define bfloat @fmul_select_bf16_test9(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_bf16_test9:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0xc2000000
+; GFX7-NEXT:    v_mov_b32_e32 v4, 0xc1800000
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_bf16_test9:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffffc200
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffffc180
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_bf16_test9:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_mov_b32_e32 v3, 0xffffc180
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0xffffc200, v3, vcc_lo
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1030-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX1030-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX1030-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_bf16_test9:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_dual_mov_b32 v3, 0xffffc180 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0xffffc200, v3, vcc_lo
+; GFX1100-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX1100-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX1100-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, bfloat -1.600000e+01, bfloat -3.200000e+01 ; both arms are negative powers of two (-2^4, -2^5)
+  %ldexp = fmul bfloat %x, %y ; bfloat case; the checks above keep an f32 multiply and form no ldexp
+  ret bfloat %ldexp
+}
+
+define bfloat @fmul_select_bf16_test10(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_bf16_test10:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_mov_b32_e32 v3, 0xdb800000
+; GFX7-NEXT:    v_bfrev_b32_e32 v4, 7
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_bf16_test10:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffffdb80
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffffe000
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_bf16_test10:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_mov_b32_e32 v3, 0xffffe000
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0xffffdb80, v3, vcc_lo
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1030-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX1030-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX1030-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_bf16_test10:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_dual_mov_b32 v3, 0xffffe000 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0xffffdb80, v3, vcc_lo
+; GFX1100-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX1100-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX1100-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, bfloat 0xRE000, bfloat 0xRDB80 ; bfloat bit patterns 0xRE000 = -2^65 and 0xRDB80 = -2^56 (large negative powers of two)
+  %ldexp = fmul bfloat %x, %y ; bfloat case; the checks above keep an f32 multiply and form no ldexp
+  ret bfloat %ldexp
+}
+
+define bfloat @fmul_select_bf16_test11(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) {
+; GFX7-LABEL: fmul_select_bf16_test11:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_bfrev_b32_e32 v3, 50
+; GFX7-NEXT:    v_mov_b32_e32 v4, 0x34800000
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fmul_select_bf16_test11:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4c00
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x3480
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s4
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-LABEL: fmul_select_bf16_test11:
+; GFX1030:       ; %bb.0:
+; GFX1030-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-NEXT:    v_mov_b32_e32 v3, 0x3480
+; GFX1030-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1030-NEXT:    v_cndmask_b32_e32 v1, 0x4c00, v3, vcc_lo
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX1030-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1030-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX1030-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX1030-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX1030-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX1030-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: fmul_select_bf16_test11:
+; GFX1100:       ; %bb.0:
+; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT:    v_dual_mov_b32 v3, 0x3480 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x4c00, v3, vcc_lo
+; GFX1100-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX1100-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX1100-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX1100-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+  %bool = icmp eq i32 %bool.arg1, %bool.arg2
+  %y = select i1 %bool, bfloat 0xR3480, bfloat 0xR4C00 ; bfloat bit patterns 0xR3480 = 2^-22 and 0xR4C00 = 2^25 (small and large positive powers of two)
+  %ldexp = fmul bfloat %x, %y ; bfloat case; the checks above keep an f32 multiply and form no ldexp
+  ret bfloat %ldexp
+}
+

>From 832eb0f627a30dc9d07a02f324339e9107febc85 Mon Sep 17 00:00:00 2001
From: vikashgu <Vikash.Gupta at amd.com>
Date: Mon, 4 Nov 2024 08:14:58 +0000
Subject: [PATCH 8/8] Addressed review comments regarding early exits in
 performFMulCombine.

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 30 +++++++++++++----------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 2c36f10ecdb2e1..94964d0c8619c3 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -14481,6 +14481,7 @@ SDValue SITargetLowering::performFMulCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
   EVT VT = N->getValueType(0);
+  EVT scalarVT = VT.getScalarType();
   EVT IntVT = VT.changeElementType(MVT::i32);
 
   SDLoc SL(N);
@@ -14497,16 +14498,15 @@ SDValue SITargetLowering::performFMulCombine(SDNode *N,
   // Given : A = 2^a  &  B = 2^b ; where a and b are integers.
   // fmul x, (select y, A, B)     -> ldexp( x, (select i32 y, a, b) )
   // fmul x, (select y, -A, -B)   -> ldexp( (fneg x), (select i32 y, a, b) )
-  if (VT.getScalarType() == MVT::f64 || VT.getScalarType() == MVT::f32 ||
-      VT.getScalarType() == MVT::f16) {
+  if (scalarVT == MVT::f64 || scalarVT == MVT::f32 || scalarVT == MVT::f16) {
     if (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT) {
       const ConstantFPSDNode *TrueNode =
           isConstOrConstSplatFP(RHS.getOperand(1));
+      if (!TrueNode)
+        return SDValue();
       const ConstantFPSDNode *FalseNode =
           isConstOrConstSplatFP(RHS.getOperand(2));
-
-      bool AreNodesFP = TrueNode && FalseNode;
-      if (!AreNodesFP)
+      if (!FalseNode)
         return SDValue();
 
       if (TrueNode->isNegative() != FalseNode->isNegative())
@@ -14514,7 +14514,7 @@ SDValue SITargetLowering::performFMulCombine(SDNode *N,
 
       // For f32, only non-inline constants should be transformed.
       const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
-      if (VT.getScalarType() == MVT::f32 &&
+      if (scalarVT == MVT::f32 &&
           TII->isInlineConstant(TrueNode->getValueAPF()) &&
           TII->isInlineConstant(FalseNode->getValueAPF()))
         return SDValue();
@@ -14522,15 +14522,19 @@ SDValue SITargetLowering::performFMulCombine(SDNode *N,
       LHS = TrueNode->isNegative()
                 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHSFlags)
                 : LHS;
+
       int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
+      if (TrueNodeExpVal == INT_MIN)
+        return SDValue();
       int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
-      if (TrueNodeExpVal != INT_MIN && FalseNodeExpVal != INT_MIN) {
-        SDValue SelectNode =
-            DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
-                        DAG.getConstant(TrueNodeExpVal, SL, IntVT),
-                        DAG.getConstant(FalseNodeExpVal, SL, IntVT));
-        return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, Flags);
-      }
+      if (FalseNodeExpVal == INT_MIN)
+        return SDValue();
+
+      SDValue SelectNode =
+          DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
+                      DAG.getConstant(TrueNodeExpVal, SL, IntVT),
+                      DAG.getConstant(FalseNodeExpVal, SL, IntVT));
+      return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, Flags);
     }
   }
 



More information about the llvm-commits mailing list