[llvm] [AArch64] Spare N2I roundtrip when splatting float comparison (PR #141806)

Guy David via llvm-commits llvm-commits at lists.llvm.org
Fri Jun 6 05:45:22 PDT 2025


https://github.com/guy-david updated https://github.com/llvm/llvm-project/pull/141806

From 8fa8f130e68a6c3bfdc7890e86a4c62653321884 Mon Sep 17 00:00:00 2001
From: Guy David <guyda96 at gmail.com>
Date: Wed, 28 May 2025 19:55:44 +0300
Subject: [PATCH] [AArch64] Spare N2I roundtrip when splatting float comparison

Transform `select_cc t1, t2, -1, 0` for floats into a vector comparison that
produces a mask directly in a SIMD register, which is later combined with any
vectorized DUPs of the result.
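For illustration, a rough before/after sketch of the splat case this targets,
modeled on the new dup_v4i32 test and the old CHECK lines in
arm64-neon-v1i1-setcc.ll (register numbers are illustrative, not taken from
actual output):

  ; IR: splat of a float comparison result
  %cmp  = fcmp oge float %a, %b
  %mask = sext i1 %cmp to i32
  %vec  = insertelement <4 x i32> poison, i32 %mask, i64 0
  %res  = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer

  ; Before: the mask roundtrips through a GPR
  fcmp  s0, s1
  csetm w8, ge
  dup   v0.4s, w8

  ; After: the comparison produces the mask directly in a SIMD register
  fcmge s0, s0, s1
  dup   v0.4s, v0.s[0]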
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 225 ++++++++---
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |   4 +-
 .../CodeGen/AArch64/arm64-neon-v1i1-setcc.ll  |   6 +-
 .../CodeGen/AArch64/build-vector-dup-simd.ll  | 378 ++++++++++++++++++
 4 files changed, 557 insertions(+), 56 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/build-vector-dup-simd.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 9f51caef6d228..58e77ea0e6dcc 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11002,10 +11002,126 @@ SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
                      Cmp.getValue(1));
 }
 
-SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
-                                              SDValue RHS, SDValue TVal,
-                                              SDValue FVal, const SDLoc &dl,
-                                              SelectionDAG &DAG) const {
+/// Emit vector comparison for floating-point values, producing a mask.
+static SDValue emitVectorComparison(SDValue LHS, SDValue RHS,
+                                    AArch64CC::CondCode CC, bool NoNans, EVT VT,
+                                    const SDLoc &DL, SelectionDAG &DAG) {
+  EVT SrcVT = LHS.getValueType();
+  assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
+         "function only supposed to emit natural comparisons");
+
+  switch (CC) {
+  default:
+    return SDValue();
+  case AArch64CC::NE: {
+    SDValue Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, DL, VT, LHS, RHS);
+    // Use vector semantics for the inversion to potentially save a copy between
+    // SIMD and regular registers.
+    if (!LHS.getValueType().isVector()) {
+      EVT VecVT =
+          EVT::getVectorVT(*DAG.getContext(), VT, 128 / VT.getSizeInBits());
+      SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+      SDValue MaskVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT,
+                                    DAG.getUNDEF(VecVT), Fcmeq, Zero);
+      SDValue InvertedMask = DAG.getNOT(DL, MaskVec, VecVT);
+      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, InvertedMask, Zero);
+    }
+    return DAG.getNOT(DL, Fcmeq, VT);
+  }
+  case AArch64CC::EQ:
+    return DAG.getNode(AArch64ISD::FCMEQ, DL, VT, LHS, RHS);
+  case AArch64CC::GE:
+    return DAG.getNode(AArch64ISD::FCMGE, DL, VT, LHS, RHS);
+  case AArch64CC::GT:
+    return DAG.getNode(AArch64ISD::FCMGT, DL, VT, LHS, RHS);
+  case AArch64CC::LE:
+    if (!NoNans)
+      return SDValue();
+    // If we ignore NaNs then we can use the LS implementation.
+    [[fallthrough]];
+  case AArch64CC::LS:
+    return DAG.getNode(AArch64ISD::FCMGE, DL, VT, RHS, LHS);
+  case AArch64CC::LT:
+    if (!NoNans)
+      return SDValue();
+    // If we ignore NaNs then we can use the MI implementation.
+    [[fallthrough]];
+  case AArch64CC::MI:
+    return DAG.getNode(AArch64ISD::FCMGT, DL, VT, RHS, LHS);
+  }
+}
+
+/// For SELECT_CC, when the true/false values are (-1, 0) and the compared
+/// values are scalars, try to emit a mask-generating vector instruction.
+static SDValue emitFloatCompareMask(SDValue LHS, SDValue RHS, SDValue TVal,
+                                    SDValue FVal, ISD::CondCode CC, bool NoNaNs,
+                                    const SDLoc &DL, SelectionDAG &DAG) {
+  assert(!LHS.getValueType().isVector());
+  assert(!RHS.getValueType().isVector());
+
+  auto *CTVal = dyn_cast<ConstantSDNode>(TVal);
+  auto *CFVal = dyn_cast<ConstantSDNode>(FVal);
+  if (!CTVal || !CFVal)
+    return {};
+  if (!(CTVal->isAllOnes() && CFVal->isZero()) &&
+      !(CTVal->isZero() && CFVal->isAllOnes()))
+    return {};
+
+  if (CTVal->isZero())
+    CC = ISD::getSetCCInverse(CC, LHS.getValueType());
+
+  EVT VT = TVal.getValueType();
+  if (VT.getSizeInBits() != LHS.getValueType().getSizeInBits())
+    return {};
+
+  if (!NoNaNs && (CC == ISD::SETUO || CC == ISD::SETO)) {
+    bool OneNaN = false;
+    if (LHS == RHS) {
+      OneNaN = true;
+    } else if (DAG.isKnownNeverNaN(RHS)) {
+      OneNaN = true;
+      RHS = LHS;
+    } else if (DAG.isKnownNeverNaN(LHS)) {
+      OneNaN = true;
+      LHS = RHS;
+    }
+    if (OneNaN)
+      CC = (CC == ISD::SETUO) ? ISD::SETUNE : ISD::SETOEQ;
+  }
+
+  AArch64CC::CondCode CC1;
+  AArch64CC::CondCode CC2;
+  bool ShouldInvert = false;
+  changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
+  SDValue Cmp = emitVectorComparison(LHS, RHS, CC1, NoNaNs, VT, DL, DAG);
+  SDValue Cmp2;
+  if (CC2 != AArch64CC::AL) {
+    Cmp2 = emitVectorComparison(LHS, RHS, CC2, NoNaNs, VT, DL, DAG);
+    if (!Cmp2)
+      return {};
+  }
+  if (!Cmp2 && !ShouldInvert)
+    return Cmp;
+
+  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), VT, 128 / VT.getSizeInBits());
+  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+  Cmp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, DAG.getUNDEF(VecVT), Cmp,
+                    Zero);
+  if (Cmp2) {
+    Cmp2 = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, DAG.getUNDEF(VecVT),
+                       Cmp2, Zero);
+    Cmp = DAG.getNode(ISD::OR, DL, VecVT, Cmp, Cmp2);
+  }
+  if (ShouldInvert)
+    Cmp = DAG.getNOT(DL, Cmp, VecVT);
+  Cmp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Cmp, Zero);
+  return Cmp;
+}
+
+SDValue AArch64TargetLowering::LowerSELECT_CC(
+    ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal,
+    iterator_range<SDNode::user_iterator> Users, bool HasNoNaNs,
+    const SDLoc &dl, SelectionDAG &DAG) const {
   // Handle f128 first, because it will result in a comparison of some RTLIB
   // call result against zero.
   if (LHS.getValueType() == MVT::f128) {
@@ -11188,6 +11304,27 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
          LHS.getValueType() == MVT::f64);
   assert(LHS.getValueType() == RHS.getValueType());
   EVT VT = TVal.getValueType();
+
+  // If the purpose of the comparison is to select between all ones
+  // or all zeros, try to use a vector comparison because the operands are
+  // already stored in SIMD registers.
+  if (Subtarget->isNeonAvailable() && all_of(Users, [](const SDNode *U) {
+        switch (U->getOpcode()) {
+        default:
+          return false;
+        case ISD::INSERT_VECTOR_ELT:
+        case ISD::SCALAR_TO_VECTOR:
+        case AArch64ISD::DUP:
+          return true;
+        }
+      })) {
+    bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || HasNoNaNs;
+    SDValue VectorCmp =
+        emitFloatCompareMask(LHS, RHS, TVal, FVal, CC, NoNaNs, dl, DAG);
+    if (VectorCmp)
+      return VectorCmp;
+  }
+
   SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
 
   // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
@@ -11274,8 +11411,10 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
   SDValue RHS = Op.getOperand(1);
   SDValue TVal = Op.getOperand(2);
   SDValue FVal = Op.getOperand(3);
+  bool HasNoNans = Op->getFlags().hasNoNaNs();
   SDLoc DL(Op);
-  return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
+  return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), HasNoNans, DL,
+                        DAG);
 }
 
 SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
@@ -11283,6 +11422,7 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
   SDValue CCVal = Op->getOperand(0);
   SDValue TVal = Op->getOperand(1);
   SDValue FVal = Op->getOperand(2);
+  bool HasNoNans = Op->getFlags().hasNoNaNs();
   SDLoc DL(Op);
 
   EVT Ty = Op.getValueType();
@@ -11349,7 +11489,8 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
                                      DAG.getUNDEF(MVT::f32), FVal);
   }
 
-  SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
+  SDValue Res =
+      LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), HasNoNans, DL, DAG);
 
   if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
     return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
@@ -15602,47 +15743,6 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
   llvm_unreachable("unexpected shift opcode");
 }
 
-static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
-                                    AArch64CC::CondCode CC, bool NoNans, EVT VT,
-                                    const SDLoc &dl, SelectionDAG &DAG) {
-  EVT SrcVT = LHS.getValueType();
-  assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
-         "function only supposed to emit natural comparisons");
-
-  if (SrcVT.getVectorElementType().isFloatingPoint()) {
-    switch (CC) {
-    default:
-      return SDValue();
-    case AArch64CC::NE: {
-      SDValue Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
-      return DAG.getNOT(dl, Fcmeq, VT);
-    }
-    case AArch64CC::EQ:
-      return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
-    case AArch64CC::GE:
-      return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
-    case AArch64CC::GT:
-      return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
-    case AArch64CC::LE:
-      if (!NoNans)
-        return SDValue();
-      // If we ignore NaNs then we can use to the LS implementation.
-      [[fallthrough]];
-    case AArch64CC::LS:
-      return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
-    case AArch64CC::LT:
-      if (!NoNans)
-        return SDValue();
-      // If we ignore NaNs then we can use to the MI implementation.
-      [[fallthrough]];
-    case AArch64CC::MI:
-      return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
-    }
-  }
-
-  return SDValue();
-}
-
 SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
                                            SelectionDAG &DAG) const {
   if (Op.getValueType().isScalableVector())
@@ -15691,15 +15791,14 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
   bool ShouldInvert;
   changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
 
-  bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
-  SDValue Cmp =
-      EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
+  bool NoNaNs =
+      getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
+  SDValue Cmp = emitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
   if (!Cmp.getNode())
     return SDValue();
 
   if (CC2 != AArch64CC::AL) {
-    SDValue Cmp2 =
-        EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
+    SDValue Cmp2 = emitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
     if (!Cmp2.getNode())
       return SDValue();
 
@@ -25456,6 +25555,28 @@ static SDValue performDUPCombine(SDNode *N,
   }
 
   if (N->getOpcode() == AArch64ISD::DUP) {
+    // If the instruction is known to produce a scalar in SIMD registers, we can
+    // duplicate it across the vector lanes using DUPLANE instead of moving it
+    // to a GPR first. For example, this allows us to handle:
+    //   v4i32 = DUP (i32 (FCMGT (f32, f32)))
+    SDValue Op = N->getOperand(0);
+    // FIXME: Ideally, we should be able to handle all instructions that
+    // produce a scalar value in FPRs.
+    if (Op.getOpcode() == AArch64ISD::FCMEQ ||
+        Op.getOpcode() == AArch64ISD::FCMGE ||
+        Op.getOpcode() == AArch64ISD::FCMGT) {
+      EVT ElemVT = VT.getVectorElementType();
+      EVT ExpandedVT = VT;
+      // Insert into a 128-bit vector to match DUPLANE's pattern.
+      if (VT.getSizeInBits() != 128)
+        ExpandedVT = EVT::getVectorVT(*DCI.DAG.getContext(), ElemVT,
+                                      128 / ElemVT.getSizeInBits());
+      SDValue Zero = DCI.DAG.getConstant(0, DL, MVT::i64);
+      SDValue Vec = DCI.DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpandedVT,
+                                    DCI.DAG.getUNDEF(ExpandedVT), Op, Zero);
+      return DCI.DAG.getNode(getDUPLANEOp(ElemVT), DL, VT, Vec, Zero);
+    }
+
     if (DCI.isAfterLegalizeDAG()) {
       // If scalar dup's operand is extract_vector_elt, try to combine them into
       // duplane. For example,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index b2174487c2fe8..8048cca203b66 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -643,7 +643,9 @@ class AArch64TargetLowering : public TargetLowering {
   SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS,
-                         SDValue TVal, SDValue FVal, const SDLoc &dl,
+                         SDValue TVal, SDValue FVal,
+                         iterator_range<SDNode::user_iterator> Users,
+                         bool HasNoNans, const SDLoc &dl,
                          SelectionDAG &DAG) const;
   SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-v1i1-setcc.ll b/llvm/test/CodeGen/AArch64/arm64-neon-v1i1-setcc.ll
index 6c70d19a977a5..1f2b1f8dd28ff 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-v1i1-setcc.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-v1i1-setcc.ll
@@ -174,9 +174,9 @@ define <1 x i16> @test_select_f16_i16(half %i105, half %in, <1 x i16> %x, <1 x i
 ; CHECK-LABEL: test_select_f16_i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcvt s0, h0
-; CHECK-NEXT:    fcmp s0, s0
-; CHECK-NEXT:    csetm w8, vs
-; CHECK-NEXT:    dup v0.4h, w8
+; CHECK-NEXT:    fcmeq s0, s0, s0
+; CHECK-NEXT:    mvn v0.16b, v0.16b
+; CHECK-NEXT:    dup v0.4h, v0.h[0]
 ; CHECK-NEXT:    bsl v0.8b, v2.8b, v3.8b
 ; CHECK-NEXT:    ret
   %i179 = fcmp uno half %i105, zeroinitializer
diff --git a/llvm/test/CodeGen/AArch64/build-vector-dup-simd.ll b/llvm/test/CodeGen/AArch64/build-vector-dup-simd.ll
new file mode 100644
index 0000000000000..ac0b8e89519dd
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/build-vector-dup-simd.ll
@@ -0,0 +1,378 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s --check-prefixes=CHECK,CHECK-NOFULLFP16
+; RUN: llc < %s -mtriple=aarch64 --enable-no-nans-fp-math | FileCheck %s --check-prefixes=CHECK,CHECK-NONANS
+; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16  | FileCheck %s --check-prefixes=CHECK,CHECK-FULLFP16
+
+define <1 x float> @dup_v1i32_oeq(float %a, float %b) {
+; CHECK-LABEL: dup_v1i32_oeq:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcmeq s0, s0, s1
+; CHECK-NEXT:    ret
+entry:
+  %0 = fcmp oeq float %a, %b
+  %vcmpd.i = sext i1 %0 to i32
+  %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0
+  %1 = bitcast <1 x i32> %vecinit.i to <1 x float>
+  ret <1 x float> %1
+}
+
+define <1 x float> @dup_v1i32_ogt(float %a, float %b) {
+; CHECK-LABEL: dup_v1i32_ogt:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcmgt s0, s0, s1
+; CHECK-NEXT:    ret
+entry:
+  %0 = fcmp ogt float %a, %b
+  %vcmpd.i = sext i1 %0 to i32
+  %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0
+  %1 = bitcast <1 x i32> %vecinit.i to <1 x float>
+  ret <1 x float> %1
+}
+
+define <1 x float> @dup_v1i32_oge(float %a, float %b) {
+; CHECK-LABEL: dup_v1i32_oge:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcmge s0, s0, s1
+; CHECK-NEXT:    ret
+entry:
+  %0 = fcmp oge float %a, %b
+  %vcmpd.i = sext i1 %0 to i32
+  %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0
+  %1 = bitcast <1 x i32> %vecinit.i to <1 x float>
+  ret <1 x float> %1
+}
+
+define <1 x float> @dup_v1i32_olt(float %a, float %b) {
+; CHECK-LABEL: dup_v1i32_olt:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcmgt s0, s1, s0
+; CHECK-NEXT:    ret
+entry:
+  %0 = fcmp olt float %a, %b
+  %vcmpd.i = sext i1 %0 to i32
+  %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0
+  %1 = bitcast <1 x i32> %vecinit.i to <1 x float>
+  ret <1 x float> %1
+}
+
+define <1 x float> @dup_v1i32_ole(float %a, float %b) {
+; CHECK-LABEL: dup_v1i32_ole:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcmge s0, s1, s0
+; CHECK-NEXT:    ret
+entry:
+  %0 = fcmp ole float %a, %b
+  %vcmpd.i = sext i1 %0 to i32
+  %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0
+  %1 = bitcast <1 x i32> %vecinit.i to <1 x float>
+  ret <1 x float> %1
+}
+
+define <1 x float> @dup_v1i32_one(float %a, float %b) {
+; CHECK-NOFULLFP16-LABEL: dup_v1i32_one:
+; CHECK-NOFULLFP16:       // %bb.0: // %entry
+; CHECK-NOFULLFP16-NEXT:    fcmgt s2, s0, s1
+; CHECK-NOFULLFP16-NEXT:    fcmgt s0, s1, s0
+; CHECK-NOFULLFP16-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NOFULLFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NOFULLFP16-NEXT:    ret
+;
+; CHECK-NONANS-LABEL: dup_v1i32_one:
+; CHECK-NONANS:       // %bb.0: // %entry
+; CHECK-NONANS-NEXT:    fcmeq s0, s0, s1
+; CHECK-NONANS-NEXT:    mvn v0.8b, v0.8b
+; CHECK-NONANS-NEXT:    ret
+;
+; CHECK-FULLFP16-LABEL: dup_v1i32_one:
+; CHECK-FULLFP16:       // %bb.0: // %entry
+; CHECK-FULLFP16-NEXT:    fcmgt s2, s0, s1
+; CHECK-FULLFP16-NEXT:    fcmgt s0, s1, s0
+; CHECK-FULLFP16-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-FULLFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-FULLFP16-NEXT:    ret
+entry:
+  %0 = fcmp one float %a, %b
+  %vcmpd.i = sext i1 %0 to i32
+  %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0
+  %1 = bitcast <1 x i32> %vecinit.i to <1 x float>
+  ret <1 x float> %1
+}
+
+define <1 x float> @dup_v1i32_ord(float %a, float %b) {
+; CHECK-LABEL: dup_v1i32_ord:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcmge s2, s0, s1
+; CHECK-NEXT:    fcmgt s0, s1, s0
+; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+entry:
+  %0 = fcmp ord float %a, %b
+  %vcmpd.i = sext i1 %0 to i32
+  %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0
+  %1 = bitcast <1 x i32> %vecinit.i to <1 x float>
+  ret <1 x float> %1
+}
+
+define <1 x float> @dup_v1i32_ueq(float %a, float %b) {
+; CHECK-NOFULLFP16-LABEL: dup_v1i32_ueq:
+; CHECK-NOFULLFP16:       // %bb.0: // %entry
+; CHECK-NOFULLFP16-NEXT:    fcmgt s2, s0, s1
+; CHECK-NOFULLFP16-NEXT:    fcmgt s0, s1, s0
+; CHECK-NOFULLFP16-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NOFULLFP16-NEXT:    mvn v0.8b, v0.8b
+; CHECK-NOFULLFP16-NEXT:    ret
+;
+; CHECK-NONANS-LABEL: dup_v1i32_ueq:
+; CHECK-NONANS:       // %bb.0: // %entry
+; CHECK-NONANS-NEXT:    fcmeq s0, s0, s1
+; CHECK-NONANS-NEXT:    ret
+;
+; CHECK-FULLFP16-LABEL: dup_v1i32_ueq:
+; CHECK-FULLFP16:       // %bb.0: // %entry
+; CHECK-FULLFP16-NEXT:    fcmgt s2, s0, s1
+; CHECK-FULLFP16-NEXT:    fcmgt s0, s1, s0
+; CHECK-FULLFP16-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-FULLFP16-NEXT:    mvn v0.8b, v0.8b
+; CHECK-FULLFP16-NEXT:    ret
+entry:
+  %0 = fcmp ueq float %a, %b
+  %vcmpd.i = sext i1 %0 to i32
+  %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0
+  %1 = bitcast <1 x i32> %vecinit.i to <1 x float>
+  ret <1 x float> %1
+}
+
+define <1 x float> @dup_v1i32_ugt(float %a, float %b) {
+; CHECK-NOFULLFP16-LABEL: dup_v1i32_ugt:
+; CHECK-NOFULLFP16:       // %bb.0: // %entry
+; CHECK-NOFULLFP16-NEXT:    fcmge s0, s1, s0
+; CHECK-NOFULLFP16-NEXT:    mvn v0.8b, v0.8b
+; CHECK-NOFULLFP16-NEXT:    ret
+;
+; CHECK-NONANS-LABEL: dup_v1i32_ugt:
+; CHECK-NONANS:       // %bb.0: // %entry
+; CHECK-NONANS-NEXT:    fcmgt s0, s0, s1
+; CHECK-NONANS-NEXT:    ret
+;
+; CHECK-FULLFP16-LABEL: dup_v1i32_ugt:
+; CHECK-FULLFP16:       // %bb.0: // %entry
+; CHECK-FULLFP16-NEXT:    fcmge s0, s1, s0
+; CHECK-FULLFP16-NEXT:    mvn v0.8b, v0.8b
+; CHECK-FULLFP16-NEXT:    ret
+entry:
+  %0 = fcmp ugt float %a, %b
+  %vcmpd.i = sext i1 %0 to i32
+  %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0
+  %1 = bitcast <1 x i32> %vecinit.i to <1 x float>
+  ret <1 x float> %1
+}
+
+define <1 x float> @dup_v1i32_uge(float %a, float %b) {
+; CHECK-NOFULLFP16-LABEL: dup_v1i32_uge:
+; CHECK-NOFULLFP16:       // %bb.0: // %entry
+; CHECK-NOFULLFP16-NEXT:    fcmgt s0, s1, s0
+; CHECK-NOFULLFP16-NEXT:    mvn v0.8b, v0.8b
+; CHECK-NOFULLFP16-NEXT:    ret
+;
+; CHECK-NONANS-LABEL: dup_v1i32_uge:
+; CHECK-NONANS:       // %bb.0: // %entry
+; CHECK-NONANS-NEXT:    fcmge s0, s0, s1
+; CHECK-NONANS-NEXT:    ret
+;
+; CHECK-FULLFP16-LABEL: dup_v1i32_uge:
+; CHECK-FULLFP16:       // %bb.0: // %entry
+; CHECK-FULLFP16-NEXT:    fcmgt s0, s1, s0
+; CHECK-FULLFP16-NEXT:    mvn v0.8b, v0.8b
+; CHECK-FULLFP16-NEXT:    ret
+entry:
+  %0 = fcmp uge float %a, %b
+  %vcmpd.i = sext i1 %0 to i32
+  %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0
+  %1 = bitcast <1 x i32> %vecinit.i to <1 x float>
+  ret <1 x float> %1
+}
+
+define <1 x float> @dup_v1i32_ult(float %a, float %b) {
+; CHECK-NOFULLFP16-LABEL: dup_v1i32_ult:
+; CHECK-NOFULLFP16:       // %bb.0: // %entry
+; CHECK-NOFULLFP16-NEXT:    fcmge s0, s0, s1
+; CHECK-NOFULLFP16-NEXT:    mvn v0.8b, v0.8b
+; CHECK-NOFULLFP16-NEXT:    ret
+;
+; CHECK-NONANS-LABEL: dup_v1i32_ult:
+; CHECK-NONANS:       // %bb.0: // %entry
+; CHECK-NONANS-NEXT:    fcmgt s0, s1, s0
+; CHECK-NONANS-NEXT:    ret
+;
+; CHECK-FULLFP16-LABEL: dup_v1i32_ult:
+; CHECK-FULLFP16:       // %bb.0: // %entry
+; CHECK-FULLFP16-NEXT:    fcmge s0, s0, s1
+; CHECK-FULLFP16-NEXT:    mvn v0.8b, v0.8b
+; CHECK-FULLFP16-NEXT:    ret
+entry:
+  %0 = fcmp ult float %a, %b
+  %vcmpd.i = sext i1 %0 to i32
+  %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0
+  %1 = bitcast <1 x i32> %vecinit.i to <1 x float>
+  ret <1 x float> %1
+}
+
+define <1 x float> @dup_v1i32_ule(float %a, float %b) {
+; CHECK-NOFULLFP16-LABEL: dup_v1i32_ule:
+; CHECK-NOFULLFP16:       // %bb.0: // %entry
+; CHECK-NOFULLFP16-NEXT:    fcmgt s0, s0, s1
+; CHECK-NOFULLFP16-NEXT:    mvn v0.8b, v0.8b
+; CHECK-NOFULLFP16-NEXT:    ret
+;
+; CHECK-NONANS-LABEL: dup_v1i32_ule:
+; CHECK-NONANS:       // %bb.0: // %entry
+; CHECK-NONANS-NEXT:    fcmge s0, s1, s0
+; CHECK-NONANS-NEXT:    ret
+;
+; CHECK-FULLFP16-LABEL: dup_v1i32_ule:
+; CHECK-FULLFP16:       // %bb.0: // %entry
+; CHECK-FULLFP16-NEXT:    fcmgt s0, s0, s1
+; CHECK-FULLFP16-NEXT:    mvn v0.8b, v0.8b
+; CHECK-FULLFP16-NEXT:    ret
+entry:
+  %0 = fcmp ule float %a, %b
+  %vcmpd.i = sext i1 %0 to i32
+  %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0
+  %1 = bitcast <1 x i32> %vecinit.i to <1 x float>
+  ret <1 x float> %1
+}
+
+define <1 x float> @dup_v1i32_une(float %a, float %b) {
+; CHECK-LABEL: dup_v1i32_une:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcmeq s0, s0, s1
+; CHECK-NEXT:    mvn v0.8b, v0.8b
+; CHECK-NEXT:    ret
+entry:
+  %0 = fcmp une float %a, %b
+  %vcmpd.i = sext i1 %0 to i32
+  %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0
+  %1 = bitcast <1 x i32> %vecinit.i to <1 x float>
+  ret <1 x float> %1
+}
+
+define <1 x float> @dup_v1i32_uno(float %a, float %b) {
+; CHECK-LABEL: dup_v1i32_uno:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcmge s2, s0, s1
+; CHECK-NEXT:    fcmgt s0, s1, s0
+; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    mvn v0.8b, v0.8b
+; CHECK-NEXT:    ret
+entry:
+  %0 = fcmp uno float %a, %b
+  %vcmpd.i = sext i1 %0 to i32
+  %vecinit.i = insertelement <1 x i32> poison, i32 %vcmpd.i, i64 0
+  %1 = bitcast <1 x i32> %vecinit.i to <1 x float>
+  ret <1 x float> %1
+}
+
+define <4 x float> @dup_v4i32(float %a, float %b) {
+; CHECK-LABEL: dup_v4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcmge s0, s0, s1
+; CHECK-NEXT:    dup v0.4s, v0.s[0]
+; CHECK-NEXT:    ret
+entry:
+  %0 = fcmp oge float %a, %b
+  %vcmpd.i = sext i1 %0 to i32
+  %vecinit.i = insertelement <4 x i32> poison, i32 %vcmpd.i, i64 0
+  %1 = bitcast <4 x i32> %vecinit.i to <4 x float>
+  %2 = shufflevector <4 x float> %1, <4 x float> poison, <4 x i32> zeroinitializer
+  ret <4 x float> %2
+}
+
+define <4 x float> @dup_v4i32_reversed(float %a, float %b) {
+; CHECK-LABEL: dup_v4i32_reversed:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcmgt s0, s1, s0
+; CHECK-NEXT:    dup v0.4s, v0.s[0]
+; CHECK-NEXT:    ret
+entry:
+  %0 = fcmp ogt float %b, %a
+  %vcmpd.i = sext i1 %0 to i32
+  %vecinit.i = insertelement <4 x i32> poison, i32 %vcmpd.i, i64 0
+  %1 = bitcast <4 x i32> %vecinit.i to <4 x float>
+  %2 = shufflevector <4 x float> %1, <4 x float> poison, <4 x i32> zeroinitializer
+  ret <4 x float> %2
+}
+
+define <2 x double> @dup_v2i64(double %a, double %b) {
+; CHECK-LABEL: dup_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcmgt d0, d0, d1
+; CHECK-NEXT:    dup v0.2d, v0.d[0]
+; CHECK-NEXT:    ret
+entry:
+  %0 = fcmp ogt double %a, %b
+  %vcmpd.i = sext i1 %0 to i64
+  %vecinit.i = insertelement <2 x i64> poison, i64 %vcmpd.i, i64 0
+  %1 = bitcast <2 x i64> %vecinit.i to <2 x double>
+  %2 = shufflevector <2 x double> %1, <2 x double> poison, <2 x i32> zeroinitializer
+  ret <2 x double> %2
+}
+
+define <8 x half> @dup_v8i16(half %a, half %b) {
+; CHECK-NOFULLFP16-LABEL: dup_v8i16:
+; CHECK-NOFULLFP16:       // %bb.0: // %entry
+; CHECK-NOFULLFP16-NEXT:    fcvt s1, h1
+; CHECK-NOFULLFP16-NEXT:    fcvt s0, h0
+; CHECK-NOFULLFP16-NEXT:    fcmeq s0, s0, s1
+; CHECK-NOFULLFP16-NEXT:    ret
+;
+; CHECK-NONANS-LABEL: dup_v8i16:
+; CHECK-NONANS:       // %bb.0: // %entry
+; CHECK-NONANS-NEXT:    fcvt s1, h1
+; CHECK-NONANS-NEXT:    fcvt s0, h0
+; CHECK-NONANS-NEXT:    fcmeq s0, s0, s1
+; CHECK-NONANS-NEXT:    ret
+;
+; CHECK-FULLFP16-LABEL: dup_v8i16:
+; CHECK-FULLFP16:       // %bb.0: // %entry
+; CHECK-FULLFP16-NEXT:    fcmp h0, h1
+; CHECK-FULLFP16-NEXT:    csetm w8, eq
+; CHECK-FULLFP16-NEXT:    fmov s0, w8
+; CHECK-FULLFP16-NEXT:    ret
+; FIXME: Could be replaced with fcmeq + dup, but the type of the former is
+; promoted to i32 during selection, so the optimization does not apply.
+
+entry:
+  %0 = fcmp oeq half %a, %b
+  %vcmpd.i = sext i1 %0 to i16
+  %vecinit.i = insertelement <8 x i16> poison, i16 %vcmpd.i, i64 0
+  %1 = bitcast <8 x i16> %vecinit.i to <8 x half>
+  ret <8 x half> %1
+}
+
+; Check that a mask is not generated for non-vectorized users.
+define i32 @mask_i32(float %a, float %b) {
+; CHECK-LABEL: mask_i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcmp s0, s1
+; CHECK-NEXT:    csetm w0, eq
+; CHECK-NEXT:    ret
+entry:
+  %0 = fcmp oeq float %a, %b
+  %vcmpd.i = sext i1 %0 to i32
+  ret i32 %vcmpd.i
+}
+
+; Verify that a mask is not emitted when (allOnes, allZeros) are not the
+; operands for the SELECT_CC.
+define i32 @bool_i32(float %a, float %b) {
+; CHECK-LABEL: bool_i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcmp s0, s1
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+entry:
+  %0 = fcmp oeq float %a, %b
+  %vcmpd.i = zext i1 %0 to i32
+  ret i32 %vcmpd.i
+}


