[llvm-branch-commits] [llvm] [ConstantTime] Native ct.select support for ARM64 (PR #166706)
Julius Alexandre via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Sat Dec 6 22:06:41 PST 2025
https://github.com/wizardengineer updated https://github.com/llvm/llvm-project/pull/166706
>From a709071b6b356df84b4579cd09ee32e7895e1b5a Mon Sep 17 00:00:00 2001
From: wizardengineer <juliuswoosebert at gmail.com>
Date: Wed, 5 Nov 2025 17:09:45 -0500
Subject: [PATCH] [LLVM][AArch64] Add native ct.select support for ARM64
This patch implements architecture-specific lowering for ct.select on AArch64
using CSEL (conditional select) instructions for constant-time selection.
Implementation details:
- Uses the CSEL family of instructions for scalar integer types
- Uses FCSEL for floating-point types (F16, BF16, F32, F64)
- Post-RA expansion and MC lowering convert the pseudo-instructions to real CSEL/FCSEL
- Promotes or expands vector types to per-element conditional selects
- Comprehensive test coverage for AArch64
The implementation includes:
- ISelLowering: Custom lowering of ISD::CTSELECT to AArch64ISD::CTSELECT nodes, which are matched to CTSELECT pseudo-instructions
- InstrInfo: Pseudo-instruction definitions, selection patterns, and post-RA expansion
- MCInstLower: Post-RA lowering of any remaining pseudo-instructions to actual CSEL/FCSEL
- Condition codes: the i1 condition is compared against zero (NE), so the selection stays branch-free
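As a concrete example of the mapping this patch establishes, the i32 case from the new test file lowers as follows (taken from the autogenerated checks below):

    define i32 @ct_i32(i1 %cond, i32 %a, i32 %b) {
      %1 = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
      ret i32 %1
    }

    ; tst w0, #0x1
    ; csel w0, w1, w2, ne
    ; ret

The condition is tested once and the selection is a single data-dependent CSEL, with no branch on the condition.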
---
.../Target/AArch64/AArch64ISelLowering.cpp | 56 +++++++
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 11 ++
llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 39 ++++-
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 40 +++++
.../lib/Target/AArch64/AArch64MCInstLower.cpp | 18 +++
llvm/test/CodeGen/AArch64/ctselect.ll | 153 ++++++++++++++++++
6 files changed, 313 insertions(+), 4 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/ctselect.ll
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7199319ccdd9f..884f7ef59d5b0 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -505,12 +505,36 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BR_CC, MVT::f64, Custom);
setOperationAction(ISD::SELECT, MVT::i32, Custom);
setOperationAction(ISD::SELECT, MVT::i64, Custom);
+ setOperationAction(ISD::CTSELECT, MVT::i8, Promote);
+ setOperationAction(ISD::CTSELECT, MVT::i16, Promote);
+ setOperationAction(ISD::CTSELECT, MVT::i32, Custom);
+ setOperationAction(ISD::CTSELECT, MVT::i64, Custom);
if (Subtarget->hasFPARMv8()) {
setOperationAction(ISD::SELECT, MVT::f16, Custom);
setOperationAction(ISD::SELECT, MVT::bf16, Custom);
}
+ if (Subtarget->hasFullFP16()) {
+ setOperationAction(ISD::CTSELECT, MVT::f16, Custom);
+ setOperationAction(ISD::CTSELECT, MVT::bf16, Custom);
+ } else {
+ setOperationAction(ISD::CTSELECT, MVT::f16, Promote);
+ setOperationAction(ISD::CTSELECT, MVT::bf16, Promote);
+ }
setOperationAction(ISD::SELECT, MVT::f32, Custom);
setOperationAction(ISD::SELECT, MVT::f64, Custom);
+ setOperationAction(ISD::CTSELECT, MVT::f32, Custom);
+ setOperationAction(ISD::CTSELECT, MVT::f64, Custom);
+ for (MVT VT : MVT::vector_valuetypes()) {
+ MVT ElemType = VT.getVectorElementType();
+ if (ElemType == MVT::i8 || ElemType == MVT::i16) {
+ setOperationAction(ISD::CTSELECT, VT, Promote);
+ } else if ((ElemType == MVT::f16 || ElemType == MVT::bf16) &&
+ !Subtarget->hasFullFP16()) {
+ setOperationAction(ISD::CTSELECT, VT, Promote);
+ } else {
+ setOperationAction(ISD::CTSELECT, VT, Expand);
+ }
+ }
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
@@ -3375,6 +3399,20 @@ void AArch64TargetLowering::fixupPtrauthDiscriminator(
IntDiscOp.setImm(IntDisc);
}
+MachineBasicBlock *AArch64TargetLowering::EmitCTSELECT(MachineInstr &MI,
+ MachineBasicBlock *MBB,
+ unsigned Opcode) const {
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
+ MachineInstrBuilder Builder = BuildMI(*MBB, MI, DL, TII->get(Opcode));
+ for (unsigned Idx = 0; Idx < MI.getNumOperands(); ++Idx)
+ Builder.add(MI.getOperand(Idx));
+ Builder->setFlag(MachineInstr::NoMerge);
+ MI.eraseFromParent();
+ return MBB;
+}
+
MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
MachineInstr &MI, MachineBasicBlock *BB) const {
@@ -7862,6 +7900,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerSELECT(Op, DAG);
case ISD::SELECT_CC:
return LowerSELECT_CC(Op, DAG);
+ case ISD::CTSELECT:
+ return LowerCTSELECT(Op, DAG);
case ISD::JumpTable:
return LowerJumpTable(Op, DAG);
case ISD::BR_JT:
@@ -12439,6 +12479,22 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
return Res;
}
+SDValue AArch64TargetLowering::LowerCTSELECT(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue CCVal = Op->getOperand(0);
+ SDValue TVal = Op->getOperand(1);
+ SDValue FVal = Op->getOperand(2);
+ SDLoc DL(Op);
+
+ EVT VT = Op.getValueType();
+
+ SDValue Zero = DAG.getConstant(0, DL, CCVal.getValueType());
+ SDValue CC;
+ SDValue Cmp = getAArch64Cmp(CCVal, Zero, ISD::SETNE, CC, DAG, DL);
+
+ return DAG.getNode(AArch64ISD::CTSELECT, DL, VT, TVal, FVal, CC, Cmp);
+}
+
SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
SelectionDAG &DAG) const {
// Jump table entries as PC relative offsets. No additional tweaking
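A note on the Promote actions registered above: i8 and i16 have no CSEL at their native width, so the legalizer widens them to i32 and the select happens on W registers. The i8 case from the test file below shows the result:

    define i8 @ct_i8(i1 %cond, i8 %a, i8 %b) {
      %1 = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b)
      ret i8 %1
    }

    ; tst w0, #0x1
    ; csel w0, w1, w2, ne
    ; ret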
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index e8c026d989eb8..9a78e516cfec0 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -23,6 +23,11 @@
namespace llvm {
+namespace AArch64ISD {
+// Forward declare the enum from the generated file
+enum GenNodeType : unsigned;
+} // namespace AArch64ISD
+
class AArch64TargetMachine;
namespace AArch64 {
@@ -202,6 +207,9 @@ class AArch64TargetLowering : public TargetLowering {
MachineOperand &AddrDiscOp,
const TargetRegisterClass *AddrDiscRC) const;
+ MachineBasicBlock *EmitCTSELECT(MachineInstr &MI, MachineBasicBlock *BB,
+ unsigned Opcode) const;
+
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const override;
@@ -691,6 +699,7 @@ class AArch64TargetLowering : public TargetLowering {
iterator_range<SDNode::user_iterator> Users,
SDNodeFlags Flags, const SDLoc &dl,
SelectionDAG &DAG) const;
+ SDValue LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
@@ -927,6 +936,8 @@ class AArch64TargetLowering : public TargetLowering {
bool hasMultipleConditionRegisters(EVT VT) const override {
return VT.isScalableVector();
}
+
+ bool isSelectSupported(SelectSupportKind Kind) const override { return true; }
};
namespace AArch64 {
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 904577b8233d5..cb8257265723a 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -2334,16 +2334,47 @@ bool AArch64InstrInfo::removeCmpToZeroOrOne(
return true;
}
-bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
- if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
- MI.getOpcode() != AArch64::CATCHRET)
- return false;
+static void expandCtSelect(MachineBasicBlock &MBB, MachineInstr &MI,
+ const DebugLoc &DL, const MCInstrDesc &MCID) {
+ MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, MCID);
+ for (unsigned Idx = 0; Idx < MI.getNumOperands(); ++Idx)
+ Builder.add(MI.getOperand(Idx));
+ Builder->setFlag(MachineInstr::NoMerge);
+ MI.eraseFromParent();
+}
+bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MachineBasicBlock &MBB = *MI.getParent();
auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
auto TRI = Subtarget.getRegisterInfo();
DebugLoc DL = MI.getDebugLoc();
+ switch (MI.getOpcode()) {
+ case AArch64::I32CTSELECT:
+ expandCtSelect(MBB, MI, DL, get(AArch64::CSELWr));
+ return true;
+ case AArch64::I64CTSELECT:
+ expandCtSelect(MBB, MI, DL, get(AArch64::CSELXr));
+ return true;
+ case AArch64::BF16CTSELECT:
+ case AArch64::F16CTSELECT:
+ expandCtSelect(MBB, MI, DL, get(AArch64::FCSELHrrr));
+ return true;
+ case AArch64::F32CTSELECT:
+ expandCtSelect(MBB, MI, DL, get(AArch64::FCSELSrrr));
+ return true;
+ case AArch64::F64CTSELECT:
+ expandCtSelect(MBB, MI, DL, get(AArch64::FCSELDrrr));
+ return true;
+ }
+
+ if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
+ MI.getOpcode() != AArch64::CATCHRET)
+ return false;
+
if (MI.getOpcode() == AArch64::CATCHRET) {
// Skip to the first instruction before the epilog.
const TargetInstrInfo *TII =
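A hand-written sketch (not compiler output) of what the new expandPostRAPseudo cases do, in approximate MIR; the condition-code immediate 1 encodes NE:

    $w0 = I32CTSELECT $w1, $w2, 1, implicit $nzcv
    ; expands in place to:
    $w0 = CSELWr $w1, $w2, 1, implicit $nzcv

The NoMerge flag set on the replacement keeps later passes from combining it with neighboring instructions.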
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 64017d7cafca3..82b7496a2ed83 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -482,6 +482,9 @@ def SDT_AArch64cbz : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>;
def SDT_AArch64tbz : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>,
SDTCisVT<2, OtherVT>]>;
+def SDT_AArch64CtSelect : SDTypeProfile<1, 4,
+ [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+ SDTCisInt<3>, SDTCisVT<4, i32>]>;
def SDT_AArch64CSel : SDTypeProfile<1, 4,
[SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>,
@@ -855,6 +858,7 @@ def AArch64tbz : SDNode<"AArch64ISD::TBZ", SDT_AArch64tbz,
def AArch64tbnz : SDNode<"AArch64ISD::TBNZ", SDT_AArch64tbz,
[SDNPHasChain]>;
+def AArch64ctselect : SDNode<"AArch64ISD::CTSELECT", SDT_AArch64CtSelect>;
def AArch64csel : SDNode<"AArch64ISD::CSEL", SDT_AArch64CSel>;
// Conditional select invert.
@@ -5673,6 +5677,42 @@ def F128CSEL : Pseudo<(outs FPR128:$Rd),
let hasNoSchedulingInfo = 1;
}
+//===----------------------------------------------------------------------===//
+// Constant-time conditional selection instructions
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 1, isPseudo = 1, hasNoSchedulingInfo = 1,
+ Uses = [NZCV] in {
+ def I32CTSELECT
+ : Pseudo<(outs GPR32:$dst), (ins GPR32:$tval, GPR32:$fval, i32imm:$cc),
+ [(set (i32 GPR32:$dst), (AArch64ctselect GPR32:$tval,
+ GPR32:$fval, (i32 imm:$cc), NZCV))]>;
+ def I64CTSELECT
+ : Pseudo<(outs GPR64:$dst), (ins GPR64:$tval, GPR64:$fval, i32imm:$cc),
+ [(set (i64 GPR64:$dst), (AArch64ctselect GPR64:$tval,
+ GPR64:$fval, (i32 imm:$cc), NZCV))]>;
+ let Predicates = [HasFullFP16] in {
+ def F16CTSELECT
+ : Pseudo<(outs FPR16:$dst), (ins FPR16:$tval, FPR16:$fval, i32imm:$cc),
+ [(set (f16 FPR16:$dst), (AArch64ctselect (f16 FPR16:$tval),
+ (f16 FPR16:$fval), (i32 imm:$cc),
+ NZCV))]>;
+ def BF16CTSELECT
+ : Pseudo<(outs FPR16:$dst), (ins FPR16:$tval, FPR16:$fval, i32imm:$cc),
+ [(set (bf16 FPR16:$dst), (AArch64ctselect (bf16 FPR16:$tval),
+ (bf16 FPR16:$fval), (i32 imm:$cc),
+ NZCV))]>;
+ }
+ def F32CTSELECT
+ : Pseudo<(outs FPR32:$dst), (ins FPR32:$tval, FPR32:$fval, i32imm:$cc),
+ [(set (f32 FPR32:$dst), (AArch64ctselect FPR32:$tval,
+ FPR32:$fval, (i32 imm:$cc), NZCV))]>;
+ def F64CTSELECT
+ : Pseudo<(outs FPR64:$dst), (ins FPR64:$tval, FPR64:$fval, i32imm:$cc),
+ [(set (f64 FPR64:$dst), (AArch64ctselect FPR64:$tval,
+ FPR64:$fval, (i32 imm:$cc), NZCV))]>;
+}
+
//===----------------------------------------------------------------------===//
// Instructions used for emitting unwind opcodes on ARM64 Windows.
//===----------------------------------------------------------------------===//
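The HasFullFP16 predicate above is visible in the generated code: with +fullfp16 the f16 pseudo selects directly on H registers, while without it the operands are promoted through f32 (from the ct_f16 test below):

    ; FP16:
    ;   tst w0, #0x1
    ;   fcsel h0, h0, h1, ne

    ; NOFP16:
    ;   fcvt s1, h1
    ;   fcvt s0, h0
    ;   tst w0, #0x1
    ;   fcsel s0, s0, s1, ne
    ;   fcvt h0, s0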
diff --git a/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp b/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
index 39946633603f6..e2ec9118eb5ee 100644
--- a/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -393,5 +393,23 @@ void AArch64MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
OutMI.setOpcode(AArch64::RET);
OutMI.addOperand(MCOperand::createReg(AArch64::LR));
break;
+ case AArch64::I32CTSELECT:
+ OutMI.setOpcode(AArch64::CSELWr);
+ break;
+ case AArch64::I64CTSELECT:
+ OutMI.setOpcode(AArch64::CSELXr);
+ break;
+ case AArch64::BF16CTSELECT:
+ case AArch64::F16CTSELECT:
+ OutMI.setOpcode(AArch64::FCSELHrrr);
+ break;
+ case AArch64::F32CTSELECT:
+ OutMI.setOpcode(AArch64::FCSELSrrr);
+ break;
+ case AArch64::F64CTSELECT:
+ OutMI.setOpcode(AArch64::FCSELDrrr);
+ break;
}
}
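The bare opcode swap works because the pseudos' operand order (dst, tval, fval, cc) matches CSEL/FCSEL's (Rd, Rn, Rm, cond), so the operands lowered earlier in this function carry over unchanged. Since expandPostRAPseudo above already rewrites these pseudos, these cases act as a backstop for any that reach MC lowering; for I32CTSELECT with cc = 1 (NE) and registers as allocated in the tests, the emitted instruction is:

    csel w0, w1, w2, ne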
diff --git a/llvm/test/CodeGen/AArch64/ctselect.ll b/llvm/test/CodeGen/AArch64/ctselect.ll
new file mode 100644
index 0000000000000..77e9cf24e56cf
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/ctselect.ll
@@ -0,0 +1,153 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-eabi | FileCheck %s --check-prefixes=DEFAULT,NOFP16
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-eabi -mattr=+fullfp16 | FileCheck %s --check-prefixes=DEFAULT,FP16
+
+define i1 @ct_i1(i1 %cond, i1 %a, i1 %b) {
+; DEFAULT-LABEL: ct_i1:
+; DEFAULT: // %bb.0:
+; DEFAULT-NEXT: tst w0, #0x1
+; DEFAULT-NEXT: csel w8, w1, w2, ne
+; DEFAULT-NEXT: and w0, w8, #0x1
+; DEFAULT-NEXT: ret
+ %1 = call i1 @llvm.ct.select.i1(i1 %cond, i1 %a, i1 %b)
+ ret i1 %1
+}
+
+define i8 @ct_i8(i1 %cond, i8 %a, i8 %b) {
+; DEFAULT-LABEL: ct_i8:
+; DEFAULT: // %bb.0:
+; DEFAULT-NEXT: tst w0, #0x1
+; DEFAULT-NEXT: csel w0, w1, w2, ne
+; DEFAULT-NEXT: ret
+ %1 = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b)
+ ret i8 %1
+}
+
+define i16 @ct_i16(i1 %cond, i16 %a, i16 %b) {
+; DEFAULT-LABEL: ct_i16:
+; DEFAULT: // %bb.0:
+; DEFAULT-NEXT: tst w0, #0x1
+; DEFAULT-NEXT: csel w0, w1, w2, ne
+; DEFAULT-NEXT: ret
+ %1 = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b)
+ ret i16 %1
+}
+
+define i32 @ct_i32(i1 %cond, i32 %a, i32 %b) {
+; DEFAULT-LABEL: ct_i32:
+; DEFAULT: // %bb.0:
+; DEFAULT-NEXT: tst w0, #0x1
+; DEFAULT-NEXT: csel w0, w1, w2, ne
+; DEFAULT-NEXT: ret
+ %1 = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+ ret i32 %1
+}
+
+define i64 @ct_i64(i1 %cond, i64 %a, i64 %b) {
+; DEFAULT-LABEL: ct_i64:
+; DEFAULT: // %bb.0:
+; DEFAULT-NEXT: tst w0, #0x1
+; DEFAULT-NEXT: csel x0, x1, x2, ne
+; DEFAULT-NEXT: ret
+ %1 = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b)
+ ret i64 %1
+}
+
+define i128 @ct_i128(i1 %cond, i128 %a, i128 %b) {
+; DEFAULT-LABEL: ct_i128:
+; DEFAULT: // %bb.0:
+; DEFAULT-NEXT: tst w0, #0x1
+; DEFAULT-NEXT: csel x0, x2, x4, ne
+; DEFAULT-NEXT: csel x1, x3, x5, ne
+; DEFAULT-NEXT: ret
+ %1 = call i128 @llvm.ct.select.i128(i1 %cond, i128 %a, i128 %b)
+ ret i128 %1
+}
+
+define half @ct_f16(i1 %cond, half %a, half %b) {
+; NOFP16-LABEL: ct_f16:
+; NOFP16: // %bb.0:
+; NOFP16-NEXT: fcvt s1, h1
+; NOFP16-NEXT: fcvt s0, h0
+; NOFP16-NEXT: tst w0, #0x1
+; NOFP16-NEXT: fcsel s0, s0, s1, ne
+; NOFP16-NEXT: fcvt h0, s0
+; NOFP16-NEXT: ret
+;
+; FP16-LABEL: ct_f16:
+; FP16: // %bb.0:
+; FP16-NEXT: tst w0, #0x1
+; FP16-NEXT: fcsel h0, h0, h1, ne
+; FP16-NEXT: ret
+ %1 = call half @llvm.ct.select.f16(i1 %cond, half %a, half %b)
+ ret half %1
+}
+
+define float @ct_f32(i1 %cond, float %a, float %b) {
+; DEFAULT-LABEL: ct_f32:
+; DEFAULT: // %bb.0:
+; DEFAULT-NEXT: tst w0, #0x1
+; DEFAULT-NEXT: fcsel s0, s0, s1, ne
+; DEFAULT-NEXT: ret
+ %1 = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b)
+ ret float %1
+}
+
+define double @ct_f64(i1 %cond, double %a, double %b) {
+; DEFAULT-LABEL: ct_f64:
+; DEFAULT: // %bb.0:
+; DEFAULT-NEXT: tst w0, #0x1
+; DEFAULT-NEXT: fcsel d0, d0, d1, ne
+; DEFAULT-NEXT: ret
+ %1 = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b)
+ ret double %1
+}
+
+define <4 x i32> @ct_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) {
+; DEFAULT-LABEL: ct_v4i32:
+; DEFAULT: // %bb.0:
+; DEFAULT-NEXT: mov w8, v1.s[1]
+; DEFAULT-NEXT: mov w9, v0.s[1]
+; DEFAULT-NEXT: tst w0, #0x1
+; DEFAULT-NEXT: fmov w10, s1
+; DEFAULT-NEXT: fmov w11, s0
+; DEFAULT-NEXT: csel w8, w9, w8, ne
+; DEFAULT-NEXT: csel w9, w11, w10, ne
+; DEFAULT-NEXT: mov w10, v1.s[2]
+; DEFAULT-NEXT: fmov s2, w9
+; DEFAULT-NEXT: mov w11, v0.s[2]
+; DEFAULT-NEXT: mov w9, v0.s[3]
+; DEFAULT-NEXT: mov v2.s[1], w8
+; DEFAULT-NEXT: mov w8, v1.s[3]
+; DEFAULT-NEXT: csel w10, w11, w10, ne
+; DEFAULT-NEXT: mov v2.s[2], w10
+; DEFAULT-NEXT: csel w8, w9, w8, ne
+; DEFAULT-NEXT: mov v2.s[3], w8
+; DEFAULT-NEXT: mov v0.16b, v2.16b
+; DEFAULT-NEXT: ret
+ %1 = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b)
+ ret <4 x i32> %1
+}
+
+define <4 x float> @ct_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) {
+; DEFAULT-LABEL: ct_v4f32:
+; DEFAULT: // %bb.0:
+; DEFAULT-NEXT: mov s2, v1.s[1]
+; DEFAULT-NEXT: mov s3, v0.s[1]
+; DEFAULT-NEXT: tst w0, #0x1
+; DEFAULT-NEXT: mov s4, v1.s[2]
+; DEFAULT-NEXT: mov s5, v0.s[2]
+; DEFAULT-NEXT: fcsel s3, s3, s2, ne
+; DEFAULT-NEXT: fcsel s2, s0, s1, ne
+; DEFAULT-NEXT: mov s1, v1.s[3]
+; DEFAULT-NEXT: mov s0, v0.s[3]
+; DEFAULT-NEXT: mov v2.s[1], v3.s[0]
+; DEFAULT-NEXT: fcsel s3, s5, s4, ne
+; DEFAULT-NEXT: fcsel s0, s0, s1, ne
+; DEFAULT-NEXT: mov v2.s[2], v3.s[0]
+; DEFAULT-NEXT: mov v2.s[3], v0.s[0]
+; DEFAULT-NEXT: mov v0.16b, v2.16b
+; DEFAULT-NEXT: ret
+ %1 = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, <4 x float> %b)
+ ret <4 x float> %1
+}
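For reference, the test can be exercised locally with the same invocations as its RUN lines, and the checks regenerated with the script named in its header:

    llc < llvm/test/CodeGen/AArch64/ctselect.ll -verify-machineinstrs \
        -mtriple=aarch64-none-eabi | \
        FileCheck llvm/test/CodeGen/AArch64/ctselect.ll --check-prefixes=DEFAULT,NOFP16
    llvm/utils/update_llc_test_checks.py llvm/test/CodeGen/AArch64/ctselect.ll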