[llvm] [ConstantTime][LLVM] Add llvm.ct.select intrinsic with generic SelectionDAG lowering (PR #166702)

Julius Alexandre via llvm-commits llvm-commits at lists.llvm.org
Sat Mar 7 13:29:41 PST 2026


https://github.com/wizardengineer updated https://github.com/llvm/llvm-project/pull/166702

>From f5bcda493e973a22b2e3ffc1d7dd7af097430590 Mon Sep 17 00:00:00 2001
From: wizardengineer <juliuswoosebert at gmail.com>
Date: Wed, 5 Nov 2025 10:51:08 -0500
Subject: [PATCH 1/6] [ConstantTime][LLVM] Add llvm.ct.select intrinsic with
 generic SelectionDAG lowering

---
 llvm/include/llvm/CodeGen/ISDOpcodes.h        |   4 +
 llvm/include/llvm/CodeGen/SelectionDAG.h      |   7 +
 llvm/include/llvm/CodeGen/SelectionDAGNodes.h |   4 +-
 llvm/include/llvm/CodeGen/TargetLowering.h    |  18 +-
 llvm/include/llvm/IR/Intrinsics.td            |   9 +
 .../include/llvm/Target/TargetSelectionDAG.td |   6 +
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 112 ++-
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp |  46 +-
 .../SelectionDAG/LegalizeFloatTypes.cpp       |  17 +-
 .../SelectionDAG/LegalizeIntegerTypes.cpp     |  20 +
 llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h |   6 +-
 .../SelectionDAG/LegalizeTypesGeneric.cpp     |  14 +
 .../SelectionDAG/LegalizeVectorTypes.cpp      |  13 +
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp |   1 +
 .../SelectionDAG/SelectionDAGBuilder.cpp      | 131 +++
 .../SelectionDAG/SelectionDAGBuilder.h        |   3 +
 .../SelectionDAG/SelectionDAGDumper.cpp       |   1 +
 llvm/test/CodeGen/RISCV/ctselect-fallback.ll  | 330 ++++++++
 llvm/test/CodeGen/X86/ctselect.ll             | 779 ++++++++++++++++++
 19 files changed, 1507 insertions(+), 14 deletions(-)
 create mode 100644 llvm/test/CodeGen/RISCV/ctselect-fallback.ll
 create mode 100644 llvm/test/CodeGen/X86/ctselect.ll

diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index b32f3dacbb3a4..e843a78549315 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -783,6 +783,10 @@ enum NodeType {
   /// i1 then the high bits must conform to getBooleanContents.
   SELECT,
 
+  /// Constant-time select: selects between ops #1 and #2 based on the i1
+  /// condition (op #0), lowered branch-free (e.g. CMOV) to avoid timing leaks.
+  CTSELECT,
+
   /// Select with a vector condition (op #0) and two vector operands (ops #1
   /// and #2), returning a vector result.  All vectors have the same length.
   /// Much like the scalar select and setcc, each bit in the condition selects
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 18dee551a76b2..0bc63a0315fb8 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1366,6 +1366,13 @@ class SelectionDAG {
     return getNode(Opcode, DL, VT, Cond, LHS, RHS, Flags);
   }
 
+  SDValue getCTSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS,
+                      SDValue RHS, SDNodeFlags Flags = SDNodeFlags()) {
+    assert(LHS.getValueType() == VT && RHS.getValueType() == VT &&
+           "Cannot use select on differing types");
+    return getNode(ISD::CTSELECT, DL, VT, Cond, LHS, RHS, Flags);
+  }
+
   /// Helper function to make it easier to build SelectCC's if you just have an
   /// ISD::CondCode instead of an SDValue.
   SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True,
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index aa72e81b2ab54..8a9e3335b2453 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -435,6 +435,9 @@ struct SDNodeFlags {
                             NonNeg | NoNaNs | NoInfs | SameSign | InBounds,
     FastMathFlags = NoNaNs | NoInfs | NoSignedZeros | AllowReciprocal |
                     AllowContract | ApproximateFuncs | AllowReassociation,
+
+    // Prevents nodes carrying this flag from being merged/CSE'd together.
+    NoMerge = 1 << 15,
   };
 
   /// Default constructor turns off all optimization flags.
@@ -486,7 +489,6 @@ struct SDNodeFlags {
   bool hasNoFPExcept() const { return Flags & NoFPExcept; }
   bool hasUnpredictable() const { return Flags & Unpredictable; }
   bool hasInBounds() const { return Flags & InBounds; }
-
   bool operator==(const SDNodeFlags &Other) const {
     return Flags == Other.Flags;
   }
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 621cef5ef6230..c1714c3cc073f 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -242,11 +242,15 @@ class LLVM_ABI TargetLoweringBase {
 
   /// Enum that describes what type of support for selects the target has.
   enum SelectSupportKind {
-    ScalarValSelect,      // The target supports scalar selects (ex: cmov).
-    ScalarCondVectorVal,  // The target supports selects with a scalar condition
-                          // and vector values (ex: cmov).
-    VectorMaskSelect      // The target supports vector selects with a vector
-                          // mask (ex: x86 blends).
+    ScalarValSelect,     // The target supports scalar selects (ex: cmov).
+    ScalarCondVectorVal, // The target supports selects with a scalar condition
+                         // and vector values (ex: cmov).
+    VectorMaskSelect,    // The target supports vector selects with a vector
+                         // mask (ex: x86 blends).
+    CtSelect,            // The target implements a custom constant-time select.
+    ScalarCondVectorValCtSelect, // The target supports constant-time selects
+                                 // with a scalar condition and vector values.
+    VectorMaskValCtSelect, // Constant-time vector selects with a vector mask.
   };
 
   /// Enum that specifies what an atomic load/AtomicRMWInst is expanded
@@ -477,8 +481,8 @@ class LLVM_ABI TargetLoweringBase {
   MachineMemOperand::Flags
   getVPIntrinsicMemOperandFlags(const VPIntrinsic &VPIntrin) const;
 
-  virtual bool isSelectSupported(SelectSupportKind /*kind*/) const {
-    return true;
+  virtual bool isSelectSupported(SelectSupportKind kind) const {
+    return kind != CtSelect;
   }
 
   /// Return true if the @llvm.get.active.lane.mask intrinsic should be expanded
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index c3c4718c3548f..4fe2736b78023 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1863,6 +1863,15 @@ def int_coro_subfn_addr : DefaultAttrsIntrinsic<
     [IntrReadMem, IntrArgMemOnly, ReadOnly<ArgIndex<0>>,
      NoCapture<ArgIndex<0>>]>;
 
+///===--------------------- Constant Time Intrinsics ----------------------===//
+//
+//
+// Intrinsic to support constant time select
+def int_ct_select
+    : DefaultAttrsIntrinsic<[llvm_any_ty],
+                            [llvm_i1_ty, LLVMMatchType<0>, LLVMMatchType<0>],
+                            [IntrNoMem, IntrWillReturn, NoUndef<RetIndex>]>;
+
 ///===-------------------------- Other Intrinsics --------------------------===//
 //
 // TODO: We should introduce a new memory kind fo traps (and other side effects
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index a9750a5ab03f9..e3d424e51ca48 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -214,6 +214,11 @@ def SDTSelect : SDTypeProfile<1, 3, [       // select
   SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>
 ]>;
 
+def SDTCtSelect
+    : SDTypeProfile<1, 3,
+                    [ // ctselect
+                        SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>]>;
+
 def SDTVSelect : SDTypeProfile<1, 3, [       // vselect
   SDTCisVec<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>, SDTCisSameNumEltsAs<0, 1>
 ]>;
@@ -719,6 +724,7 @@ def reset_fpmode   : SDNode<"ISD::RESET_FPMODE", SDTNone, [SDNPHasChain]>;
 
 def setcc      : SDNode<"ISD::SETCC"      , SDTSetCC>;
 def select     : SDNode<"ISD::SELECT"     , SDTSelect>;
+def ctselect : SDNode<"ISD::CTSELECT", SDTCtSelect>;
 def vselect    : SDNode<"ISD::VSELECT"    , SDTVSelect>;
 def selectcc   : SDNode<"ISD::SELECT_CC"  , SDTSelectCC>;
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 126867a6eac96..7cec739015756 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -484,6 +484,7 @@ namespace {
     SDValue visitCTTZ_ZERO_UNDEF(SDNode *N);
     SDValue visitCTPOP(SDNode *N);
     SDValue visitSELECT(SDNode *N);
+    SDValue visitCTSELECT(SDNode *N);
     SDValue visitVSELECT(SDNode *N);
     SDValue visitVP_SELECT(SDNode *N);
     SDValue visitSELECT_CC(SDNode *N);
@@ -1904,6 +1905,7 @@ void DAGCombiner::Run(CombineLevel AtLevel) {
 }
 
 SDValue DAGCombiner::visit(SDNode *N) {
+
   // clang-format off
   switch (N->getOpcode()) {
   default: break;
@@ -1974,6 +1976,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::CTTZ_ZERO_UNDEF:    return visitCTTZ_ZERO_UNDEF(N);
   case ISD::CTPOP:              return visitCTPOP(N);
   case ISD::SELECT:             return visitSELECT(N);
+  case ISD::CTSELECT:           return visitCTSELECT(N);
   case ISD::VSELECT:            return visitVSELECT(N);
   case ISD::SELECT_CC:          return visitSELECT_CC(N);
   case ISD::SETCC:              return visitSETCC(N);
@@ -6047,6 +6050,7 @@ static SDValue isSaturatingMinMax(SDValue N0, SDValue N1, SDValue N2,
     N0CC = cast<CondCodeSDNode>(N0.getOperand(4))->get();
     break;
   case ISD::SELECT:
+  case ISD::CTSELECT:
   case ISD::VSELECT:
     if (N0.getOperand(0).getOpcode() != ISD::SETCC)
       return SDValue();
@@ -12215,8 +12219,9 @@ template <class MatchContextClass>
 static SDValue foldBoolSelectToLogic(SDNode *N, const SDLoc &DL,
                                      SelectionDAG &DAG) {
   assert((N->getOpcode() == ISD::SELECT || N->getOpcode() == ISD::VSELECT ||
-          N->getOpcode() == ISD::VP_SELECT) &&
-         "Expected a (v)(vp.)select");
+          N->getOpcode() == ISD::VP_SELECT ||
+          N->getOpcode() == ISD::CTSELECT) &&
+         "Expected a (v)(vp.)(ct) select");
   SDValue Cond = N->getOperand(0);
   SDValue T = N->getOperand(1), F = N->getOperand(2);
   EVT VT = N->getValueType(0);
@@ -12578,6 +12583,109 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
   return SDValue();
 }
 
+SDValue DAGCombiner::visitCTSELECT(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue N2 = N->getOperand(2);
+  EVT VT = N->getValueType(0);
+  EVT VT0 = N0.getValueType();
+  SDLoc DL(N);
+  SDNodeFlags Flags = N->getFlags();
+
+  if (SDValue V = foldBoolSelectToLogic<EmptyMatchContext>(N, DL, DAG))
+    return V;
+
+  // ctselect (not Cond), N1, N2 -> ctselect Cond, N2, N1
+  if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false)) {
+    SDValue SelectOp = DAG.getNode(ISD::CTSELECT, DL, VT, F, N2, N1);
+    SelectOp->setFlags(Flags);
+    return SelectOp;
+  }
+
+  if (VT0 == MVT::i1) {
+    // The code in this block deals with the following 2 equivalences:
+    //    select(C0|C1, x, y) <=> select(C0, x, select(C1, x, y))
+    //    select(C0&C1, x, y) <=> select(C0, select(C1, x, y), y)
+    // The target can specify its preferred form with the
+    // shouldNormalizeToSelectSequence() callback. However we always transform
+    // to the right anyway if we find the inner select exists in the DAG anyway
+    // and we always transform to the left side if we know that we can further
+    // optimize the combination of the conditions.
+    bool normalizeToSequence =
+        TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT);
+    // ctselect (and Cond0, Cond1), X, Y
+    //   -> ctselect Cond0, (ctselect Cond1, X, Y), Y
+    if (N0->getOpcode() == ISD::AND && N0->hasOneUse()) {
+      SDValue Cond0 = N0->getOperand(0);
+      SDValue Cond1 = N0->getOperand(1);
+      SDValue InnerSelect = DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(),
+                                        Cond1, N1, N2, Flags);
+      if (normalizeToSequence || !InnerSelect.use_empty())
+        return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Cond0,
+                           InnerSelect, N2, Flags);
+      // Cleanup on failure.
+      if (InnerSelect.use_empty())
+        recursivelyDeleteUnusedNodes(InnerSelect.getNode());
+    }
+    // ctselect (or Cond0, Cond1), X, Y -> ctselect Cond0, X, (ctselect Cond1,
+    // X, Y)
+    if (N0->getOpcode() == ISD::OR && N0->hasOneUse()) {
+      SDValue Cond0 = N0->getOperand(0);
+      SDValue Cond1 = N0->getOperand(1);
+      SDValue InnerSelect = DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(),
+                                        Cond1, N1, N2, Flags);
+      if (normalizeToSequence || !InnerSelect.use_empty())
+        return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Cond0, N1,
+                           InnerSelect, Flags);
+      // Cleanup on failure.
+      if (InnerSelect.use_empty())
+        recursivelyDeleteUnusedNodes(InnerSelect.getNode());
+    }
+
+    // ctselect Cond0, (ctselect Cond1, X, Y), Y -> ctselect (and Cond0, Cond1),
+    // X, Y
+    if (N1->getOpcode() == ISD::CTSELECT && N1->hasOneUse()) {
+      SDValue N1_0 = N1->getOperand(0);
+      SDValue N1_1 = N1->getOperand(1);
+      SDValue N1_2 = N1->getOperand(2);
+      if (N1_2 == N2 && N0.getValueType() == N1_0.getValueType()) {
+        // Create the actual and node if we can generate good code for it.
+        if (!normalizeToSequence) {
+          SDValue And = DAG.getNode(ISD::AND, DL, N0.getValueType(), N0, N1_0);
+          return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), And, N1_1,
+                             N2, Flags);
+        }
+        // Otherwise see if we can optimize the "and" to a better pattern.
+        if (SDValue Combined = visitANDLike(N0, N1_0, N)) {
+          return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Combined,
+                             N1_1, N2, Flags);
+        }
+      }
+    }
+    // ctselect Cond0, X, (ctselect Cond1, X, Y) -> ctselect (or Cond0, Cond1),
+    // X, Y
+    if (N2->getOpcode() == ISD::CTSELECT && N2->hasOneUse()) {
+      SDValue N2_0 = N2->getOperand(0);
+      SDValue N2_1 = N2->getOperand(1);
+      SDValue N2_2 = N2->getOperand(2);
+      if (N2_1 == N1 && N0.getValueType() == N2_0.getValueType()) {
+        // Create the actual or node if we can generate good code for it.
+        if (!normalizeToSequence) {
+          SDValue Or = DAG.getNode(ISD::OR, DL, N0.getValueType(), N0, N2_0);
+          return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Or, N1, N2_2,
+                             Flags);
+        }
+        // Otherwise see if we can optimize to a better pattern.
+        if (SDValue Combined = visitORLike(N0, N2_0, DL))
+          return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Combined, N1,
+                             N2_2, Flags);
+      }
+    }
+  }
+
+  return SDValue();
+}
+
 // This function assumes all the vselect's arguments are CONCAT_VECTOR
 // nodes and that the condition is a BV of ConstantSDNodes (or undefs).
 static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index c46264e09bc46..ae5a47fa74844 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -4243,6 +4243,46 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     }
     Results.push_back(Tmp1);
     break;
+  case ISD::CTSELECT: {
+    Tmp1 = Node->getOperand(0);
+    Tmp2 = Node->getOperand(1);
+    Tmp3 = Node->getOperand(2);
+    EVT VT = Tmp2.getValueType();
+    if (VT.isVector()) {
+      // Expand element-wise so each lane keeps the constant-time property.
+      SmallVector<SDValue> Elements;
+      unsigned NumElements = VT.getVectorNumElements();
+      EVT ScalarVT = VT.getScalarType();
+      for (unsigned Idx = 0; Idx < NumElements; ++Idx) {
+        SDValue IdxVal = DAG.getVectorIdxConstant(Idx, dl);
+        SDValue TVal =
+            DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Tmp2, IdxVal);
+        SDValue FVal =
+            DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Tmp3, IdxVal);
+        Elements.push_back(
+            DAG.getCTSelect(dl, ScalarVT, Tmp1, TVal, FVal, Node->getFlags()));
+      }
+      Tmp1 = DAG.getBuildVector(VT, dl, Elements);
+    } else if (VT.isFloatingPoint()) {
+      // Select floats as same-width integers, then bitcast back.
+      EVT IntegerVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+      Tmp2 = DAG.getBitcast(IntegerVT, Tmp2);
+      Tmp3 = DAG.getBitcast(IntegerVT, Tmp3);
+      Tmp1 = DAG.getBitcast(VT, DAG.getCTSelect(dl, IntegerVT, Tmp1, Tmp2, Tmp3,
+                                                Node->getFlags()));
+    } else {
+      assert(VT.isInteger());
+      EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
+      auto [Tmp2Lo, Tmp2Hi] = DAG.SplitScalar(Tmp2, dl, HalfVT, HalfVT);
+      auto [Tmp3Lo, Tmp3Hi] = DAG.SplitScalar(Tmp3, dl, HalfVT, HalfVT);
+      SDValue ResLo =
+          DAG.getCTSelect(dl, HalfVT, Tmp1, Tmp2Lo, Tmp3Lo, Node->getFlags());
+      SDValue ResHi =
+          DAG.getCTSelect(dl, HalfVT, Tmp1, Tmp2Hi, Tmp3Hi, Node->getFlags());
+      Tmp1 = DAG.getNode(ISD::BUILD_PAIR, dl, VT, ResLo, ResHi);
+      Tmp1->setFlags(Node->getFlags());
+    }
+    Results.push_back(Tmp1);
+    break;
+  }
   case ISD::BR_JT: {
     SDValue Chain = Node->getOperand(0);
     SDValue Table = Node->getOperand(1);
@@ -5599,7 +5639,8 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
     Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp2));
     break;
   }
-  case ISD::SELECT: {
+  case ISD::SELECT:
+  case ISD::CTSELECT: {
     unsigned ExtOp, TruncOp;
     if (Node->getValueType(0).isVector() ||
         Node->getValueType(0).getSizeInBits() == NVT.getSizeInBits()) {
@@ -5617,7 +5658,8 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
     Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(1));
     Tmp3 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(2));
     // Perform the larger operation, then round down.
-    Tmp1 = DAG.getSelect(dl, NVT, Tmp1, Tmp2, Tmp3);
+    Tmp1 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2, Tmp3);
+    Tmp1->setFlags(Node->getFlags());
     if (TruncOp != ISD::FP_ROUND)
       Tmp1 = DAG.getNode(TruncOp, dl, Node->getValueType(0), Tmp1);
     else
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 545b7f5120d35..6be66663e5c74 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -159,6 +159,7 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
     case ISD::ATOMIC_LOAD: R = SoftenFloatRes_ATOMIC_LOAD(N); break;
     case ISD::ATOMIC_SWAP: R = BitcastToInt_ATOMIC_SWAP(N); break;
     case ISD::SELECT:      R = SoftenFloatRes_SELECT(N); break;
+    case ISD::CTSELECT:    R = SoftenFloatRes_CTSELECT(N); break;
     case ISD::SELECT_CC:   R = SoftenFloatRes_SELECT_CC(N); break;
     case ISD::FREEZE:      R = SoftenFloatRes_FREEZE(N); break;
     case ISD::STRICT_SINT_TO_FP:
@@ -1044,6 +1045,13 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT(SDNode *N) {
                        LHS.getValueType(), N->getOperand(0), LHS, RHS);
 }
 
+SDValue DAGTypeLegalizer::SoftenFloatRes_CTSELECT(SDNode *N) {
+  SDValue LHS = GetSoftenedFloat(N->getOperand(1));
+  SDValue RHS = GetSoftenedFloat(N->getOperand(2));
+  return DAG.getCTSelect(SDLoc(N), LHS.getValueType(), N->getOperand(0), LHS,
+                         RHS);
+}
+
 SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT_CC(SDNode *N) {
   SDValue LHS = GetSoftenedFloat(N->getOperand(2));
   SDValue RHS = GetSoftenedFloat(N->getOperand(3));
@@ -1564,6 +1572,7 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) {
   case ISD::POISON:
   case ISD::UNDEF:        SplitRes_UNDEF(N, Lo, Hi); break;
   case ISD::SELECT:       SplitRes_Select(N, Lo, Hi); break;
+  case ISD::CTSELECT:     SplitRes_Select(N, Lo, Hi); break;
   case ISD::SELECT_CC:    SplitRes_SELECT_CC(N, Lo, Hi); break;
 
   case ISD::MERGE_VALUES:       ExpandRes_MERGE_VALUES(N, ResNo, Lo, Hi); break;
@@ -2920,6 +2929,9 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) {
       R = PromoteFloatRes_ATOMIC_LOAD(N);
       break;
     case ISD::SELECT:     R = PromoteFloatRes_SELECT(N); break;
+    case ISD::CTSELECT:
+      R = PromoteFloatRes_SELECT(N);
+      break;
     case ISD::SELECT_CC:  R = PromoteFloatRes_SELECT_CC(N); break;
 
     case ISD::SINT_TO_FP:
@@ -3222,7 +3234,7 @@ SDValue DAGTypeLegalizer::PromoteFloatRes_SELECT(SDNode *N) {
   SDValue TrueVal = GetPromotedFloat(N->getOperand(1));
   SDValue FalseVal = GetPromotedFloat(N->getOperand(2));
 
-  return DAG.getNode(ISD::SELECT, SDLoc(N), TrueVal->getValueType(0),
+  return DAG.getNode(N->getOpcode(), SDLoc(N), TrueVal->getValueType(0),
                      N->getOperand(0), TrueVal, FalseVal);
 }
 
@@ -3406,6 +3418,9 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) {
     R = SoftPromoteHalfRes_ATOMIC_LOAD(N);
     break;
   case ISD::SELECT:      R = SoftPromoteHalfRes_SELECT(N); break;
+  case ISD::CTSELECT:
+    R = SoftPromoteHalfRes_SELECT(N);
+    break;
   case ISD::SELECT_CC:   R = SoftPromoteHalfRes_SELECT_CC(N); break;
   case ISD::STRICT_SINT_TO_FP:
   case ISD::STRICT_UINT_TO_FP:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 08c6b90cc8a74..ac20f8a009600 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -95,6 +95,7 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
     Res = PromoteIntRes_VECTOR_COMPRESS(N);
     break;
   case ISD::SELECT:
+  case ISD::CTSELECT:
   case ISD::VSELECT:
   case ISD::VP_SELECT:
   case ISD::VP_MERGE:
@@ -2013,6 +2014,9 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
     break;
   case ISD::VSELECT:
   case ISD::SELECT:       Res = PromoteIntOp_SELECT(N, OpNo); break;
+  case ISD::CTSELECT:
+    Res = PromoteIntOp_CTSELECT(N, OpNo);
+    break;
   case ISD::SELECT_CC:    Res = PromoteIntOp_SELECT_CC(N, OpNo); break;
   case ISD::VP_SETCC:
   case ISD::SETCC:        Res = PromoteIntOp_SETCC(N, OpNo); break;
@@ -2390,6 +2394,19 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SELECT(SDNode *N, unsigned OpNo) {
                                         N->getOperand(2)), 0);
 }
 
+SDValue DAGTypeLegalizer::PromoteIntOp_CTSELECT(SDNode *N, unsigned OpNo) {
+  assert(OpNo == 0 && "Only know how to promote the condition!");
+  SDValue Cond = N->getOperand(0);
+  EVT OpTy = N->getOperand(1).getValueType();
+
+  // Promote the condition all the way up to the canonical setcc boolean type.
+  EVT OpVT = OpTy.getScalarType();
+  Cond = PromoteTargetBoolean(Cond, OpVT);
+
+  return SDValue(
+      DAG.UpdateNodeOperands(N, Cond, N->getOperand(1), N->getOperand(2)), 0);
+}
+
 SDValue DAGTypeLegalizer::PromoteIntOp_SELECT_CC(SDNode *N, unsigned OpNo) {
   assert(OpNo == 0 && "Don't know how to promote this operand!");
 
@@ -2988,6 +3005,9 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
   case ISD::ARITH_FENCE:  SplitRes_ARITH_FENCE(N, Lo, Hi); break;
   case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break;
   case ISD::SELECT:       SplitRes_Select(N, Lo, Hi); break;
+  case ISD::CTSELECT:
+    SplitRes_Select(N, Lo, Hi);
+    break;
   case ISD::SELECT_CC:    SplitRes_SELECT_CC(N, Lo, Hi); break;
   case ISD::POISON:
   case ISD::UNDEF:        SplitRes_UNDEF(N, Lo, Hi); break;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index ede522eff6df3..62069b4fb03a3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -401,6 +401,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue PromoteIntOp_CONCAT_VECTORS(SDNode *N);
   SDValue PromoteIntOp_ScalarOp(SDNode *N);
   SDValue PromoteIntOp_SELECT(SDNode *N, unsigned OpNo);
+  SDValue PromoteIntOp_CTSELECT(SDNode *N, unsigned OpNo);
   SDValue PromoteIntOp_SELECT_CC(SDNode *N, unsigned OpNo);
   SDValue PromoteIntOp_SETCC(SDNode *N, unsigned OpNo);
   SDValue PromoteIntOp_Shift(SDNode *N);
@@ -633,6 +634,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue SoftenFloatRes_LOAD(SDNode *N);
   SDValue SoftenFloatRes_ATOMIC_LOAD(SDNode *N);
   SDValue SoftenFloatRes_SELECT(SDNode *N);
+  SDValue SoftenFloatRes_CTSELECT(SDNode *N);
   SDValue SoftenFloatRes_SELECT_CC(SDNode *N);
   SDValue SoftenFloatRes_UNDEF(SDNode *N);
   SDValue SoftenFloatRes_VAARG(SDNode *N);
@@ -896,6 +898,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N);
   SDValue ScalarizeVecRes_VSELECT(SDNode *N);
   SDValue ScalarizeVecRes_SELECT(SDNode *N);
+  SDValue ScalarizeVecRes_CTSELECT(SDNode *N);
   SDValue ScalarizeVecRes_SELECT_CC(SDNode *N);
   SDValue ScalarizeVecRes_SETCC(SDNode *N);
   SDValue ScalarizeVecRes_UNDEF(SDNode *N);
@@ -1224,7 +1227,8 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
                              SDValue &Lo, SDValue &Hi);
   void SplitVecRes_AssertZext  (SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitRes_ARITH_FENCE (SDNode *N, SDValue &Lo, SDValue &Hi);
-  void SplitRes_Select      (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitRes_Select(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitRes_CTSELECT(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitRes_SELECT_CC   (SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitRes_UNDEF       (SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitRes_FREEZE      (SDNode *N, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
index 88c1af20a321e..098368ef2f6b3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -570,6 +570,20 @@ void DAGTypeLegalizer::SplitRes_Select(SDNode *N, SDValue &Lo, SDValue &Hi) {
   Hi = DAG.getNode(Opcode, dl, LH.getValueType(), CH, LH, RH, EVLHi);
 }
 
+void DAGTypeLegalizer::SplitRes_CTSELECT(SDNode *N, SDValue &Lo, SDValue &Hi) {
+  SDValue LL, LH, RL, RH, CL, CH;
+  SDLoc dl(N);
+  GetSplitOp(N->getOperand(1), LL, LH);
+  GetSplitOp(N->getOperand(2), RL, RH);
+
+  SDValue Cond = N->getOperand(0);
+  CL = CH = Cond;
+  assert(!Cond.getValueType().isVector() && "Unsupported vector type");
+
+  Lo = DAG.getNode(N->getOpcode(), dl, LL.getValueType(), CL, LL, RL);
+  Hi = DAG.getNode(N->getOpcode(), dl, LH.getValueType(), CH, LH, RH);
+}
+
 void DAGTypeLegalizer::SplitRes_SELECT_CC(SDNode *N, SDValue &Lo,
                                           SDValue &Hi) {
   SDValue LL, LH, RL, RH;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 6e1e02f38113e..561e0ff4d5d09 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -77,6 +77,9 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::SIGN_EXTEND_INREG: R = ScalarizeVecRes_InregOp(N); break;
   case ISD::VSELECT:           R = ScalarizeVecRes_VSELECT(N); break;
   case ISD::SELECT:            R = ScalarizeVecRes_SELECT(N); break;
+  case ISD::CTSELECT:
+    R = ScalarizeVecRes_CTSELECT(N);
+    break;
   case ISD::SELECT_CC:         R = ScalarizeVecRes_SELECT_CC(N); break;
   case ISD::SETCC:             R = ScalarizeVecRes_SETCC(N); break;
   case ISD::POISON:
@@ -670,6 +673,12 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_SELECT(SDNode *N) {
                        GetScalarizedVector(N->getOperand(2)));
 }
 
+SDValue DAGTypeLegalizer::ScalarizeVecRes_CTSELECT(SDNode *N) {
+  SDValue LHS = GetScalarizedVector(N->getOperand(1));
+  return DAG.getCTSelect(SDLoc(N), LHS.getValueType(), N->getOperand(0), LHS,
+                         GetScalarizedVector(N->getOperand(2)));
+}
+
 SDValue DAGTypeLegalizer::ScalarizeVecRes_SELECT_CC(SDNode *N) {
   SDValue LHS = GetScalarizedVector(N->getOperand(2));
   return DAG.getNode(ISD::SELECT_CC, SDLoc(N), LHS.getValueType(),
@@ -1204,6 +1213,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::SELECT:
   case ISD::VP_MERGE:
   case ISD::VP_SELECT:    SplitRes_Select(N, Lo, Hi); break;
+  case ISD::CTSELECT:
+    SplitRes_CTSELECT(N, Lo, Hi);
+    break;
   case ISD::SELECT_CC:    SplitRes_SELECT_CC(N, Lo, Hi); break;
   case ISD::POISON:
   case ISD::UNDEF:        SplitRes_UNDEF(N, Lo, Hi); break;
@@ -4881,6 +4893,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::SIGN_EXTEND_INREG: Res = WidenVecRes_InregOp(N); break;
   case ISD::VSELECT:
   case ISD::SELECT:
+  case ISD::CTSELECT:
   case ISD::VP_SELECT:
   case ISD::VP_MERGE:
     Res = WidenVecRes_Select(N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index b7e78b94c0687..6eee61da2e0d1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -8179,6 +8179,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
       return V;
     break;
   }
+
   case ISD::SELECT:
   case ISD::VSELECT:
     if (SDValue V = simplifySelect(N1, N2, N3))
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 09a0673bfe1bb..eaecb87979648 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6580,6 +6580,105 @@ void SelectionDAGBuilder::visitVectorExtractLastActive(const CallInst &I,
   setValue(&I, Result);
 }
 
+/// Fallback implementation for constant-time select using DAG chaining.
+/// This implementation uses data dependencies through virtual registers to
+/// prevent optimizations from breaking the constant-time property.
+/// It handles scalars, vectors (fixed and scalable), and floating-point types.
+SDValue SelectionDAGBuilder::createProtectedCtSelectFallback(
+    SelectionDAG &DAG, const SDLoc &DL, SDValue Cond, SDValue T, SDValue F,
+    EVT VT) {
+
+  SDValue WorkingT = T;
+  SDValue WorkingF = F;
+  EVT WorkingVT = VT;
+
+  SDValue Chain = DAG.getEntryNode();
+  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+
+  // Handle vector condition: splat scalar condition to vector
+  if (VT.isVector() && !Cond.getValueType().isVector()) {
+    ElementCount NumElems = VT.getVectorElementCount();
+    EVT CondVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElems);
+
+    if (VT.isScalableVector()) {
+      Cond = DAG.getSplatVector(CondVT, DL, Cond);
+    } else {
+      Cond = DAG.getSplatBuildVector(CondVT, DL, Cond);
+    }
+  }
+
+  // Handle floating-point types: bitcast to integer for bitwise operations
+  if (VT.isFloatingPoint()) {
+    if (VT.isVector()) {
+      // float vector -> int vector
+      EVT ElemVT = VT.getVectorElementType();
+      unsigned int ElemBitWidth = ElemVT.getScalarSizeInBits();
+      EVT IntElemVT = EVT::getIntegerVT(*DAG.getContext(), ElemBitWidth);
+
+      WorkingVT = EVT::getVectorVT(*DAG.getContext(), IntElemVT,
+                                   VT.getVectorElementCount());
+    } else {
+      WorkingVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+    }
+
+    WorkingT = DAG.getBitcast(WorkingVT, T);
+    WorkingF = DAG.getBitcast(WorkingVT, F);
+  }
+
+  // Create mask: sign-extend condition to all bits
+  SDValue Mask = DAG.getSExtOrTrunc(Cond, DL, WorkingVT);
+
+  // Create all-ones constant for inversion
+  SDValue AllOnes;
+  if (WorkingVT.isScalableVector()) {
+    unsigned BitWidth = WorkingVT.getScalarSizeInBits();
+    APInt AllOnesVal = APInt::getAllOnes(BitWidth);
+    SDValue ScalarAllOnes =
+        DAG.getConstant(AllOnesVal, DL, WorkingVT.getScalarType());
+    AllOnes = DAG.getSplatVector(WorkingVT, DL, ScalarAllOnes);
+  } else {
+    AllOnes = DAG.getAllOnesConstant(DL, WorkingVT);
+  }
+
+  // Invert mask for false value
+  SDValue Invert = DAG.getNode(ISD::XOR, DL, WorkingVT, Mask, AllOnes);
+
+  // Compute: (T & Mask) | (F & ~Mask)
+  // This is constant-time because both branches are always computed
+  SDValue TM = DAG.getNode(ISD::AND, DL, WorkingVT, Mask, WorkingT);
+
+  // DAG chaining: create data dependency through virtual register
+  // This prevents optimizations from reordering or eliminating operations
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  bool CanUseChaining = false;
+
+  if (!WorkingVT.isScalableVector()) {
+    // For fixed-size vectors and scalars, check if type is legal
+    CanUseChaining = TLI.isTypeLegal(WorkingVT.getSimpleVT());
+  } else {
+    // For scalable vectors, disable chaining (conservative approach)
+    CanUseChaining = false;
+  }
+
+  if (CanUseChaining) {
+    // Apply chaining through registers for additional protection
+    const TargetRegisterClass *RC = TLI.getRegClassFor(WorkingVT.getSimpleVT());
+    Register TMReg = MRI.createVirtualRegister(RC);
+    Chain = DAG.getCopyToReg(Chain, DL, TMReg, TM);
+    TM = DAG.getCopyFromReg(Chain, DL, TMReg, WorkingVT);
+  }
+
+  SDValue FM = DAG.getNode(ISD::AND, DL, WorkingVT, Invert, WorkingF);
+  SDValue Result = DAG.getNode(ISD::OR, DL, WorkingVT, TM, FM);
+
+  // Convert back to original type if needed
+  if (WorkingVT != VT) {
+    Result = DAG.getBitcast(VT, Result);
+  }
+
+  return Result;
+}
+
 /// Lower the call to the specified intrinsic function.
 void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
                                              unsigned Intrinsic) {
@@ -6758,6 +6857,38 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     updateDAGForMaybeTailCall(MC);
     return;
   }
+  case Intrinsic::ct_select: {
+    // Set function attribute to indicate ct.select usage
+    Function &F = DAG.getMachineFunction().getFunction();
+    F.addFnAttr("ct-select");
+
+    SDLoc DL = getCurSDLoc();
+
+    SDValue Cond = getValue(I.getArgOperand(0)); // i1
+    SDValue A = getValue(I.getArgOperand(1));    // T
+    SDValue B = getValue(I.getArgOperand(2));    // T
+
+    assert((A.getValueType() == B.getValueType()) &&
+           "Operands are of different types");
+
+    EVT VT = A.getValueType();
+    EVT CondVT = Cond.getValueType();
+
+    // assert if Cond type is Vector
+    assert(!CondVT.isVector() && "Vector type cond not supported yet");
+
+    // Handle scalar types
+    if (TLI.isSelectSupported(
+            TargetLoweringBase::SelectSupportKind::CtSelect) &&
+        !CondVT.isVector()) {
+      SDValue Result = DAG.getNode(ISD::CTSELECT, DL, VT, Cond, A, B);
+      setValue(&I, Result);
+      return;
+    }
+
+    setValue(&I, createProtectedCtSelectFallback(DAG, DL, Cond, A, B, VT));
+    return;
+  }
   case Intrinsic::call_preallocated_setup: {
     const CallBase *PreallocatedCall = FindPreallocatedCall(&I);
     SDValue SrcValue = DAG.getSrcValue(PreallocatedCall);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index 6f3e7a63bb170..1c9b26ae80062 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -219,6 +219,9 @@ class SelectionDAGBuilder {
   peelDominantCaseCluster(const SwitchInst &SI,
                           SwitchCG::CaseClusterVector &Clusters,
                           BranchProbability &PeeledCaseProb);
+  SDValue createProtectedCtSelectFallback(SelectionDAG &DAG, const SDLoc &DL,
+                                          SDValue Cond, SDValue T, SDValue F,
+                                          EVT VT);
 
 private:
   const TargetMachine &TM;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index ec5edd5f13978..f6858c64ba7e3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -332,6 +332,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::FPOWI:                      return "fpowi";
   case ISD::STRICT_FPOWI:               return "strict_fpowi";
   case ISD::SETCC:                      return "setcc";
+  case ISD::CTSELECT:                   return "ctselect";
   case ISD::SETCCCARRY:                 return "setcccarry";
   case ISD::STRICT_FSETCC:              return "strict_fsetcc";
   case ISD::STRICT_FSETCCS:             return "strict_fsetccs";
diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback.ll
new file mode 100644
index 0000000000000..f46bde0a05b8b
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/ctselect-fallback.ll
@@ -0,0 +1,330 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=riscv64 -O3 | FileCheck %s --check-prefix=RV64
+; RUN: llc < %s -mtriple=riscv32 -O3 | FileCheck %s --check-prefix=RV32
+
+; Test basic ct.select functionality for scalar types
+define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) {
+; RV64-LABEL: test_ctselect_i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    xor a1, a1, a2
+; RV64-NEXT:    slli a0, a0, 63
+; RV64-NEXT:    srai a0, a0, 63
+; RV64-NEXT:    and a0, a1, a0
+; RV64-NEXT:    xor a0, a0, a2
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_ctselect_i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    xor a1, a1, a2
+; RV32-NEXT:    slli a0, a0, 31
+; RV32-NEXT:    srai a0, a0, 31
+; RV32-NEXT:    and a0, a1, a0
+; RV32-NEXT:    xor a0, a0, a2
+; RV32-NEXT:    ret
+  %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b)
+  ret i8 %result
+}
+
+define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) {
+; RV64-LABEL: test_ctselect_i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    xor a1, a1, a2
+; RV64-NEXT:    slli a0, a0, 63
+; RV64-NEXT:    srai a0, a0, 63
+; RV64-NEXT:    and a0, a1, a0
+; RV64-NEXT:    xor a0, a0, a2
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_ctselect_i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    xor a1, a1, a2
+; RV32-NEXT:    slli a0, a0, 31
+; RV32-NEXT:    srai a0, a0, 31
+; RV32-NEXT:    and a0, a1, a0
+; RV32-NEXT:    xor a0, a0, a2
+; RV32-NEXT:    ret
+  %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b)
+  ret i16 %result
+}
+
+define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) {
+; RV64-LABEL: test_ctselect_i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    xor a1, a1, a2
+; RV64-NEXT:    slli a0, a0, 63
+; RV64-NEXT:    srai a0, a0, 63
+; RV64-NEXT:    and a0, a1, a0
+; RV64-NEXT:    xor a0, a0, a2
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_ctselect_i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    andi a0, a0, 1
+; RV32-NEXT:    neg a3, a0
+; RV32-NEXT:    addi a0, a0, -1
+; RV32-NEXT:    and a1, a3, a1
+; RV32-NEXT:    and a0, a0, a2
+; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    ret
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+  ret i32 %result
+}
+
+define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) {
+; RV64-LABEL: test_ctselect_i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    andi a0, a0, 1
+; RV64-NEXT:    neg a3, a0
+; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    and a1, a3, a1
+; RV64-NEXT:    and a0, a0, a2
+; RV64-NEXT:    or a0, a1, a0
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_ctselect_i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    xor a1, a1, a3
+; RV32-NEXT:    slli a0, a0, 31
+; RV32-NEXT:    xor a2, a2, a4
+; RV32-NEXT:    srai a0, a0, 31
+; RV32-NEXT:    and a1, a1, a0
+; RV32-NEXT:    and a2, a2, a0
+; RV32-NEXT:    xor a0, a1, a3
+; RV32-NEXT:    xor a1, a2, a4
+; RV32-NEXT:    ret
+  %result = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b)
+  ret i64 %result
+}
+
+define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) {
+; RV64-LABEL: test_ctselect_ptr:
+; RV64:       # %bb.0:
+; RV64-NEXT:    andi a0, a0, 1
+; RV64-NEXT:    neg a3, a0
+; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    and a1, a3, a1
+; RV64-NEXT:    and a0, a0, a2
+; RV64-NEXT:    or a0, a1, a0
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_ctselect_ptr:
+; RV32:       # %bb.0:
+; RV32-NEXT:    andi a0, a0, 1
+; RV32-NEXT:    neg a3, a0
+; RV32-NEXT:    addi a0, a0, -1
+; RV32-NEXT:    and a1, a3, a1
+; RV32-NEXT:    and a0, a0, a2
+; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    ret
+  %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b)
+  ret ptr %result
+}
+
+; Test with constant conditions
+define i32 @test_ctselect_const_true(i32 %a, i32 %b) {
+; RV64-LABEL: test_ctselect_const_true:
+; RV64:       # %bb.0:
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_ctselect_const_true:
+; RV32:       # %bb.0:
+; RV32-NEXT:    ret
+  %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b)
+  ret i32 %result
+}
+
+define i32 @test_ctselect_const_false(i32 %a, i32 %b) {
+; RV64-LABEL: test_ctselect_const_false:
+; RV64:       # %bb.0:
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_ctselect_const_false:
+; RV32:       # %bb.0:
+; RV32-NEXT:    mv a0, a1
+; RV32-NEXT:    ret
+  %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b)
+  ret i32 %result
+}
+
+; Test with comparison conditions
+define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) {
+; RV64-LABEL: test_ctselect_icmp_eq:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sext.w a1, a1
+; RV64-NEXT:    sext.w a0, a0
+; RV64-NEXT:    xor a0, a0, a1
+; RV64-NEXT:    snez a0, a0
+; RV64-NEXT:    xor a2, a2, a3
+; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    and a0, a2, a0
+; RV64-NEXT:    xor a0, a0, a3
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_ctselect_icmp_eq:
+; RV32:       # %bb.0:
+; RV32-NEXT:    xor a0, a0, a1
+; RV32-NEXT:    snez a0, a0
+; RV32-NEXT:    addi a0, a0, -1
+; RV32-NEXT:    and a2, a0, a2
+; RV32-NEXT:    not a0, a0
+; RV32-NEXT:    and a0, a0, a3
+; RV32-NEXT:    or a0, a2, a0
+; RV32-NEXT:    ret
+  %cond = icmp eq i32 %x, %y
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+  ret i32 %result
+}
+
+define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) {
+; RV64-LABEL: test_ctselect_icmp_ne:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sext.w a1, a1
+; RV64-NEXT:    sext.w a0, a0
+; RV64-NEXT:    xor a0, a0, a1
+; RV64-NEXT:    seqz a0, a0
+; RV64-NEXT:    xor a2, a2, a3
+; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    and a0, a2, a0
+; RV64-NEXT:    xor a0, a0, a3
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_ctselect_icmp_ne:
+; RV32:       # %bb.0:
+; RV32-NEXT:    xor a0, a0, a1
+; RV32-NEXT:    seqz a0, a0
+; RV32-NEXT:    addi a0, a0, -1
+; RV32-NEXT:    and a2, a0, a2
+; RV32-NEXT:    not a0, a0
+; RV32-NEXT:    and a0, a0, a3
+; RV32-NEXT:    or a0, a2, a0
+; RV32-NEXT:    ret
+  %cond = icmp ne i32 %x, %y
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+  ret i32 %result
+}
+
+define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) {
+; RV64-LABEL: test_ctselect_icmp_slt:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sext.w a1, a1
+; RV64-NEXT:    sext.w a0, a0
+; RV64-NEXT:    slt a0, a0, a1
+; RV64-NEXT:    xor a2, a2, a3
+; RV64-NEXT:    neg a0, a0
+; RV64-NEXT:    and a0, a2, a0
+; RV64-NEXT:    xor a0, a0, a3
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_ctselect_icmp_slt:
+; RV32:       # %bb.0:
+; RV32-NEXT:    slt a0, a0, a1
+; RV32-NEXT:    neg a1, a0
+; RV32-NEXT:    addi a0, a0, -1
+; RV32-NEXT:    and a1, a1, a2
+; RV32-NEXT:    and a0, a0, a3
+; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    ret
+  %cond = icmp slt i32 %x, %y
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+  ret i32 %result
+}
+
+define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) {
+; RV64-LABEL: test_ctselect_icmp_ult:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sext.w a1, a1
+; RV64-NEXT:    sext.w a0, a0
+; RV64-NEXT:    sltu a0, a0, a1
+; RV64-NEXT:    xor a2, a2, a3
+; RV64-NEXT:    neg a0, a0
+; RV64-NEXT:    and a0, a2, a0
+; RV64-NEXT:    xor a0, a0, a3
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_ctselect_icmp_ult:
+; RV32:       # %bb.0:
+; RV32-NEXT:    sltu a0, a0, a1
+; RV32-NEXT:    neg a1, a0
+; RV32-NEXT:    addi a0, a0, -1
+; RV32-NEXT:    and a1, a1, a2
+; RV32-NEXT:    and a0, a0, a3
+; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    ret
+  %cond = icmp ult i32 %x, %y
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+  ret i32 %result
+}
+
+; Test with memory operands
+define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) {
+; RV64-LABEL: test_ctselect_load:
+; RV64:       # %bb.0:
+; RV64-NEXT:    lw a1, 0(a1)
+; RV64-NEXT:    lw a2, 0(a2)
+; RV64-NEXT:    slli a0, a0, 63
+; RV64-NEXT:    xor a1, a1, a2
+; RV64-NEXT:    srai a0, a0, 63
+; RV64-NEXT:    and a0, a1, a0
+; RV64-NEXT:    xor a0, a0, a2
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_ctselect_load:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    lw a2, 0(a2)
+; RV32-NEXT:    andi a0, a0, 1
+; RV32-NEXT:    neg a3, a0
+; RV32-NEXT:    addi a0, a0, -1
+; RV32-NEXT:    and a1, a3, a1
+; RV32-NEXT:    and a0, a0, a2
+; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    ret
+  %a = load i32, ptr %p1
+  %b = load i32, ptr %p2
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+  ret i32 %result
+}
+
+; Test nested ctselect calls
+define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) {
+; RV64-LABEL: test_ctselect_nested:
+; RV64:       # %bb.0:
+; RV64-NEXT:    xor a2, a2, a3
+; RV64-NEXT:    slli a1, a1, 63
+; RV64-NEXT:    xor a3, a3, a4
+; RV64-NEXT:    slli a0, a0, 63
+; RV64-NEXT:    srai a1, a1, 63
+; RV64-NEXT:    and a1, a2, a1
+; RV64-NEXT:    xor a1, a1, a3
+; RV64-NEXT:    srai a0, a0, 63
+; RV64-NEXT:    and a0, a1, a0
+; RV64-NEXT:    xor a0, a0, a4
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_ctselect_nested:
+; RV32:       # %bb.0:
+; RV32-NEXT:    andi a1, a1, 1
+; RV32-NEXT:    andi a0, a0, 1
+; RV32-NEXT:    neg a5, a1
+; RV32-NEXT:    addi a1, a1, -1
+; RV32-NEXT:    and a2, a5, a2
+; RV32-NEXT:    neg a5, a0
+; RV32-NEXT:    addi a0, a0, -1
+; RV32-NEXT:    and a1, a1, a3
+; RV32-NEXT:    or a1, a2, a1
+; RV32-NEXT:    and a1, a5, a1
+; RV32-NEXT:    and a0, a0, a4
+; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    ret
+  %inner = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b)
+  %result = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %inner, i32 %c)
+  ret i32 %result
+}
+
+; Declare the intrinsics
+declare i8 @llvm.ct.select.i8(i1, i8, i8)
+declare i16 @llvm.ct.select.i16(i1, i16, i16)
+declare i32 @llvm.ct.select.i32(i1, i32, i32)
+declare i64 @llvm.ct.select.i64(i1, i64, i64)
+declare ptr @llvm.ct.select.p0(i1, ptr, ptr)
diff --git a/llvm/test/CodeGen/X86/ctselect.ll b/llvm/test/CodeGen/X86/ctselect.ll
new file mode 100644
index 0000000000000..095787a5e2a4b
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ctselect.ll
@@ -0,0 +1,779 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov | FileCheck %s --check-prefix=X32-NOCMOV
+
+; Test basic ct.select functionality for scalar types
+
+define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) {
+; X64-LABEL: test_ctselect_i8:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    andb $1, %dil
+; X64-NEXT:    leal -1(%rdi), %eax
+; X64-NEXT:    movl %edi, %ecx
+; X64-NEXT:    negb %cl
+; X64-NEXT:    andb %sil, %cl
+; X64-NEXT:    andb %dl, %al
+; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_i8:
+; X32:       # %bb.0:
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    andb $1, %al
+; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    negb %cl
+; X32-NEXT:    andb {{[0-9]+}}(%esp), %cl
+; X32-NEXT:    decb %al
+; X32-NEXT:    andb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    orb %cl, %al
+; X32-NEXT:    retl
+;
+; X32-NOCMOV-LABEL: test_ctselect_i8:
+; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    andb $1, %al
+; X32-NOCMOV-NEXT:    movl %eax, %ecx
+; X32-NOCMOV-NEXT:    negb %cl
+; X32-NOCMOV-NEXT:    andb {{[0-9]+}}(%esp), %cl
+; X32-NOCMOV-NEXT:    decb %al
+; X32-NOCMOV-NEXT:    andb {{[0-9]+}}(%esp), %al
+; X32-NOCMOV-NEXT:    orb %cl, %al
+; X32-NOCMOV-NEXT:    retl
+  %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b)
+  ret i8 %result
+}
+
+define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) {
+; X64-LABEL: test_ctselect_i16:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    andl $1, %edi
+; X64-NEXT:    leal -1(%rdi), %ecx
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    negl %eax
+; X64-NEXT:    andl %esi, %eax
+; X64-NEXT:    andl %edx, %ecx
+; X64-NEXT:    orl %ecx, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_i16:
+; X32:       # %bb.0:
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    leal -1(%eax), %ecx
+; X32-NEXT:    andw {{[0-9]+}}(%esp), %cx
+; X32-NEXT:    negl %eax
+; X32-NEXT:    andw {{[0-9]+}}(%esp), %ax
+; X32-NEXT:    orl %ecx, %eax
+; X32-NEXT:    # kill: def $ax killed $ax killed $eax
+; X32-NEXT:    retl
+;
+; X32-NOCMOV-LABEL: test_ctselect_i16:
+; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    andl $1, %eax
+; X32-NOCMOV-NEXT:    leal -1(%eax), %ecx
+; X32-NOCMOV-NEXT:    andw {{[0-9]+}}(%esp), %cx
+; X32-NOCMOV-NEXT:    negl %eax
+; X32-NOCMOV-NEXT:    andw {{[0-9]+}}(%esp), %ax
+; X32-NOCMOV-NEXT:    orl %ecx, %eax
+; X32-NOCMOV-NEXT:    # kill: def $ax killed $ax killed $eax
+; X32-NOCMOV-NEXT:    retl
+  %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b)
+  ret i16 %result
+}
+
+define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) {
+; X64-LABEL: test_ctselect_i32:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    andl $1, %edi
+; X64-NEXT:    leal -1(%rdi), %eax
+; X64-NEXT:    movl %edi, %ecx
+; X64-NEXT:    negl %ecx
+; X64-NEXT:    andl %esi, %ecx
+; X64-NEXT:    andl %edx, %eax
+; X64-NEXT:    orl %ecx, %eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_i32:
+; X32:       # %bb.0:
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    negl %ecx
+; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    decl %eax
+; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    orl %ecx, %eax
+; X32-NEXT:    retl
+;
+; X32-NOCMOV-LABEL: test_ctselect_i32:
+; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    andl $1, %eax
+; X32-NOCMOV-NEXT:    movl %eax, %ecx
+; X32-NOCMOV-NEXT:    negl %ecx
+; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    decl %eax
+; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    orl %ecx, %eax
+; X32-NOCMOV-NEXT:    retl
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+  ret i32 %result
+}
+
+define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) {
+; X64-LABEL: test_ctselect_i64:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    andl $1, %edi
+; X64-NEXT:    leaq -1(%rdi), %rax
+; X64-NEXT:    negq %rdi
+; X64-NEXT:    andq %rsi, %rdi
+; X64-NEXT:    andq %rdx, %rax
+; X64-NEXT:    orq %rdi, %rax
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_i64:
+; X32:       # %bb.0:
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    .cfi_offset %esi, -8
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    xorl %edx, %eax
+; X32-NEXT:    andl $1, %esi
+; X32-NEXT:    negl %esi
+; X32-NEXT:    andl %esi, %eax
+; X32-NEXT:    xorl %edx, %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    xorl %ecx, %edx
+; X32-NEXT:    andl %esi, %edx
+; X32-NEXT:    xorl %ecx, %edx
+; X32-NEXT:    popl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    retl
+;
+; X32-NOCMOV-LABEL: test_ctselect_i64:
+; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    pushl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    .cfi_offset %esi, -8
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %esi
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    xorl %edx, %eax
+; X32-NOCMOV-NEXT:    andl $1, %esi
+; X32-NOCMOV-NEXT:    negl %esi
+; X32-NOCMOV-NEXT:    andl %esi, %eax
+; X32-NOCMOV-NEXT:    xorl %edx, %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    xorl %ecx, %edx
+; X32-NOCMOV-NEXT:    andl %esi, %edx
+; X32-NOCMOV-NEXT:    xorl %ecx, %edx
+; X32-NOCMOV-NEXT:    popl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
+; X32-NOCMOV-NEXT:    retl
+  %result = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b)
+  ret i64 %result
+}
+
+define float @test_ctselect_f32(i1 %cond, float %a, float %b) {
+; X64-LABEL: test_ctselect_f32:
+; X64:       # %bb.0:
+; X64-NEXT:    movd %xmm1, %eax
+; X64-NEXT:    movd %xmm0, %ecx
+; X64-NEXT:    andl $1, %edi
+; X64-NEXT:    movl %edi, %edx
+; X64-NEXT:    negl %edx
+; X64-NEXT:    andl %ecx, %edx
+; X64-NEXT:    decl %edi
+; X64-NEXT:    andl %eax, %edi
+; X64-NEXT:    orl %edx, %edi
+; X64-NEXT:    movd %edi, %xmm0
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_f32:
+; X32:       # %bb.0:
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    negl %ecx
+; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    decl %eax
+; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    orl %ecx, %eax
+; X32-NEXT:    movl %eax, (%esp)
+; X32-NEXT:    flds (%esp)
+; X32-NEXT:    popl %eax
+; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    retl
+;
+; X32-NOCMOV-LABEL: test_ctselect_f32:
+; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    pushl %eax
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    andl $1, %eax
+; X32-NOCMOV-NEXT:    movl %eax, %ecx
+; X32-NOCMOV-NEXT:    negl %ecx
+; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    decl %eax
+; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    orl %ecx, %eax
+; X32-NOCMOV-NEXT:    movl %eax, (%esp)
+; X32-NOCMOV-NEXT:    flds (%esp)
+; X32-NOCMOV-NEXT:    popl %eax
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
+; X32-NOCMOV-NEXT:    retl
+  %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b)
+  ret float %result
+}
+
+define double @test_ctselect_f64(i1 %cond, double %a, double %b) {
+; X64-LABEL: test_ctselect_f64:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    movq %xmm1, %rax
+; X64-NEXT:    movq %xmm0, %rcx
+; X64-NEXT:    andl $1, %edi
+; X64-NEXT:    movq %rdi, %rdx
+; X64-NEXT:    negq %rdx
+; X64-NEXT:    andq %rcx, %rdx
+; X64-NEXT:    decq %rdi
+; X64-NEXT:    andq %rax, %rdi
+; X64-NEXT:    orq %rdx, %rdi
+; X64-NEXT:    movq %rdi, %xmm0
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_f64:
+; X32:       # %bb.0:
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    subl $8, %esp
+; X32-NEXT:    .cfi_def_cfa_offset 16
+; X32-NEXT:    .cfi_offset %esi, -8
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    xorl %edx, %esi
+; X32-NEXT:    andl $1, %ecx
+; X32-NEXT:    negl %ecx
+; X32-NEXT:    andl %ecx, %esi
+; X32-NEXT:    xorl %edx, %esi
+; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    xorl %eax, %edx
+; X32-NEXT:    andl %ecx, %edx
+; X32-NEXT:    xorl %eax, %edx
+; X32-NEXT:    movl %edx, (%esp)
+; X32-NEXT:    fldl (%esp)
+; X32-NEXT:    addl $8, %esp
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    popl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    retl
+;
+; X32-NOCMOV-LABEL: test_ctselect_f64:
+; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    pushl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    subl $8, %esp
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT:    .cfi_offset %esi, -8
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NOCMOV-NEXT:    xorl %edx, %esi
+; X32-NOCMOV-NEXT:    andl $1, %ecx
+; X32-NOCMOV-NEXT:    negl %ecx
+; X32-NOCMOV-NEXT:    andl %ecx, %esi
+; X32-NOCMOV-NEXT:    xorl %edx, %esi
+; X32-NOCMOV-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    xorl %eax, %edx
+; X32-NOCMOV-NEXT:    andl %ecx, %edx
+; X32-NOCMOV-NEXT:    xorl %eax, %edx
+; X32-NOCMOV-NEXT:    movl %edx, (%esp)
+; X32-NOCMOV-NEXT:    fldl (%esp)
+; X32-NOCMOV-NEXT:    addl $8, %esp
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    popl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
+; X32-NOCMOV-NEXT:    retl
+  %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b)
+  ret double %result
+}
+
+define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) {
+; X64-LABEL: test_ctselect_ptr:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    andl $1, %edi
+; X64-NEXT:    leaq -1(%rdi), %rax
+; X64-NEXT:    negq %rdi
+; X64-NEXT:    andq %rsi, %rdi
+; X64-NEXT:    andq %rdx, %rax
+; X64-NEXT:    orq %rdi, %rax
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_ptr:
+; X32:       # %bb.0:
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    negl %ecx
+; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    decl %eax
+; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    orl %ecx, %eax
+; X32-NEXT:    retl
+;
+; X32-NOCMOV-LABEL: test_ctselect_ptr:
+; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    andl $1, %eax
+; X32-NOCMOV-NEXT:    movl %eax, %ecx
+; X32-NOCMOV-NEXT:    negl %ecx
+; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    decl %eax
+; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    orl %ecx, %eax
+; X32-NOCMOV-NEXT:    retl
+  %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b)
+  ret ptr %result
+}
+
+; Test with constant conditions
+define i32 @test_ctselect_const_true(i32 %a, i32 %b) {
+; X64-LABEL: test_ctselect_const_true:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_const_true:
+; X32:       # %bb.0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    retl
+;
+; X32-NOCMOV-LABEL: test_ctselect_const_true:
+; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    retl
+  %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b)
+  ret i32 %result
+}
+
+define i32 @test_ctselect_const_false(i32 %a, i32 %b) {
+; X64-LABEL: test_ctselect_const_false:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_const_false:
+; X32:       # %bb.0:
+; X32-NEXT:    xorl %eax, %eax
+; X32-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    retl
+;
+; X32-NOCMOV-LABEL: test_ctselect_const_false:
+; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    xorl %eax, %eax
+; X32-NOCMOV-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    retl
+  %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b)
+  ret i32 %result
+}
+
+; Test with comparison conditions
+define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) {
+; X64-LABEL: test_ctselect_icmp_eq:
+; X64:       # %bb.0:
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpl %esi, %edi
+; X64-NEXT:    sete %al
+; X64-NEXT:    movl %eax, %esi
+; X64-NEXT:    negl %esi
+; X64-NEXT:    andl %edx, %esi
+; X64-NEXT:    decl %eax
+; X64-NEXT:    andl %ecx, %eax
+; X64-NEXT:    orl %esi, %eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_icmp_eq:
+; X32:       # %bb.0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    xorl %eax, %eax
+; X32-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    sete %al
+; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    negl %ecx
+; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    decl %eax
+; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    orl %ecx, %eax
+; X32-NEXT:    retl
+;
+; X32-NOCMOV-LABEL: test_ctselect_icmp_eq:
+; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    xorl %eax, %eax
+; X32-NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    sete %al
+; X32-NOCMOV-NEXT:    movl %eax, %ecx
+; X32-NOCMOV-NEXT:    negl %ecx
+; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    decl %eax
+; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    orl %ecx, %eax
+; X32-NOCMOV-NEXT:    retl
+  %cond = icmp eq i32 %x, %y
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+  ret i32 %result
+}
+
+define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) {
+; X64-LABEL: test_ctselect_icmp_ne:
+; X64:       # %bb.0:
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpl %esi, %edi
+; X64-NEXT:    setne %al
+; X64-NEXT:    movl %eax, %esi
+; X64-NEXT:    negl %esi
+; X64-NEXT:    andl %edx, %esi
+; X64-NEXT:    decl %eax
+; X64-NEXT:    andl %ecx, %eax
+; X64-NEXT:    orl %esi, %eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_icmp_ne:
+; X32:       # %bb.0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    xorl %eax, %eax
+; X32-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    setne %al
+; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    negl %ecx
+; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    decl %eax
+; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    orl %ecx, %eax
+; X32-NEXT:    retl
+;
+; X32-NOCMOV-LABEL: test_ctselect_icmp_ne:
+; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    xorl %eax, %eax
+; X32-NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    setne %al
+; X32-NOCMOV-NEXT:    movl %eax, %ecx
+; X32-NOCMOV-NEXT:    negl %ecx
+; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    decl %eax
+; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    orl %ecx, %eax
+; X32-NOCMOV-NEXT:    retl
+  %cond = icmp ne i32 %x, %y
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+  ret i32 %result
+}
+
+define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) {
+; X64-LABEL: test_ctselect_icmp_slt:
+; X64:       # %bb.0:
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpl %esi, %edi
+; X64-NEXT:    setl %al
+; X64-NEXT:    movl %eax, %esi
+; X64-NEXT:    negl %esi
+; X64-NEXT:    andl %edx, %esi
+; X64-NEXT:    decl %eax
+; X64-NEXT:    andl %ecx, %eax
+; X64-NEXT:    orl %esi, %eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_icmp_slt:
+; X32:       # %bb.0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    xorl %eax, %eax
+; X32-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    setl %al
+; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    negl %ecx
+; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    decl %eax
+; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    orl %ecx, %eax
+; X32-NEXT:    retl
+;
+; X32-NOCMOV-LABEL: test_ctselect_icmp_slt:
+; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    xorl %eax, %eax
+; X32-NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    setl %al
+; X32-NOCMOV-NEXT:    movl %eax, %ecx
+; X32-NOCMOV-NEXT:    negl %ecx
+; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    decl %eax
+; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    orl %ecx, %eax
+; X32-NOCMOV-NEXT:    retl
+  %cond = icmp slt i32 %x, %y
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+  ret i32 %result
+}
+
+define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) {
+; X64-LABEL: test_ctselect_icmp_ult:
+; X64:       # %bb.0:
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    cmpl %esi, %edi
+; X64-NEXT:    sbbl %eax, %eax
+; X64-NEXT:    andl %eax, %edx
+; X64-NEXT:    notl %eax
+; X64-NEXT:    andl %ecx, %eax
+; X64-NEXT:    orl %edx, %eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_icmp_ult:
+; X32:       # %bb.0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    xorl %eax, %eax
+; X32-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    sbbl %eax, %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    andl %eax, %ecx
+; X32-NEXT:    notl %eax
+; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    orl %ecx, %eax
+; X32-NEXT:    retl
+;
+; X32-NOCMOV-LABEL: test_ctselect_icmp_ult:
+; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    xorl %eax, %eax
+; X32-NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    sbbl %eax, %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    andl %eax, %ecx
+; X32-NOCMOV-NEXT:    notl %eax
+; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    orl %ecx, %eax
+; X32-NOCMOV-NEXT:    retl
+  %cond = icmp ult i32 %x, %y
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+  ret i32 %result
+}
+
+define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) {
+; X64-LABEL: test_ctselect_fcmp_oeq:
+; X64:       # %bb.0:
+; X64-NEXT:    movd %xmm3, %eax
+; X64-NEXT:    cmpeqss %xmm1, %xmm0
+; X64-NEXT:    movd %xmm0, %ecx
+; X64-NEXT:    pand %xmm2, %xmm0
+; X64-NEXT:    movd %xmm0, %edx
+; X64-NEXT:    notl %ecx
+; X64-NEXT:    andl %eax, %ecx
+; X64-NEXT:    orl %edx, %ecx
+; X64-NEXT:    movd %ecx, %xmm0
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_fcmp_oeq:
+; X32:       # %bb.0:
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    flds {{[0-9]+}}(%esp)
+; X32-NEXT:    flds {{[0-9]+}}(%esp)
+; X32-NEXT:    fucompi %st(1), %st
+; X32-NEXT:    fstp %st(0)
+; X32-NEXT:    setnp %al
+; X32-NEXT:    sete %cl
+; X32-NEXT:    andb %al, %cl
+; X32-NEXT:    movzbl %cl, %eax
+; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    negl %ecx
+; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    decl %eax
+; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    orl %ecx, %eax
+; X32-NEXT:    movl %eax, (%esp)
+; X32-NEXT:    flds (%esp)
+; X32-NEXT:    popl %eax
+; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    retl
+;
+; X32-NOCMOV-LABEL: test_ctselect_fcmp_oeq:
+; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    pushl %eax
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT:    fucompp
+; X32-NOCMOV-NEXT:    fnstsw %ax
+; X32-NOCMOV-NEXT:    # kill: def $ah killed $ah killed $ax
+; X32-NOCMOV-NEXT:    sahf
+; X32-NOCMOV-NEXT:    setnp %al
+; X32-NOCMOV-NEXT:    sete %cl
+; X32-NOCMOV-NEXT:    andb %al, %cl
+; X32-NOCMOV-NEXT:    movzbl %cl, %eax
+; X32-NOCMOV-NEXT:    movl %eax, %ecx
+; X32-NOCMOV-NEXT:    negl %ecx
+; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    decl %eax
+; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    orl %ecx, %eax
+; X32-NOCMOV-NEXT:    movl %eax, (%esp)
+; X32-NOCMOV-NEXT:    flds (%esp)
+; X32-NOCMOV-NEXT:    popl %eax
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
+; X32-NOCMOV-NEXT:    retl
+  %cond = fcmp oeq float %x, %y
+  %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b)
+  ret float %result
+}
+
+; Test with memory operands
+define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) {
+; X64-LABEL: test_ctselect_load:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    andl $1, %edi
+; X64-NEXT:    leal -1(%rdi), %eax
+; X64-NEXT:    movl %edi, %ecx
+; X64-NEXT:    negl %ecx
+; X64-NEXT:    andl (%rsi), %ecx
+; X64-NEXT:    andl (%rdx), %eax
+; X64-NEXT:    orl %ecx, %eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_load:
+; X32:       # %bb.0:
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    .cfi_offset %esi, -8
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    movl %eax, %esi
+; X32-NEXT:    negl %esi
+; X32-NEXT:    andl (%edx), %esi
+; X32-NEXT:    decl %eax
+; X32-NEXT:    andl (%ecx), %eax
+; X32-NEXT:    orl %esi, %eax
+; X32-NEXT:    popl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    retl
+;
+; X32-NOCMOV-LABEL: test_ctselect_load:
+; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    pushl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    .cfi_offset %esi, -8
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    andl $1, %eax
+; X32-NOCMOV-NEXT:    movl %eax, %esi
+; X32-NOCMOV-NEXT:    negl %esi
+; X32-NOCMOV-NEXT:    andl (%edx), %esi
+; X32-NOCMOV-NEXT:    decl %eax
+; X32-NOCMOV-NEXT:    andl (%ecx), %eax
+; X32-NOCMOV-NEXT:    orl %esi, %eax
+; X32-NOCMOV-NEXT:    popl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
+; X32-NOCMOV-NEXT:    retl
+  %a = load i32, ptr %p1
+  %b = load i32, ptr %p2
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+  ret i32 %result
+}
+
+; Test nested ctselect calls
+define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) {
+; X64-LABEL: test_ctselect_nested:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    andl $1, %esi
+; X64-NEXT:    leal -1(%rsi), %r9d
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    negl %eax
+; X64-NEXT:    andl %edx, %eax
+; X64-NEXT:    andl %ecx, %r9d
+; X64-NEXT:    orl %eax, %r9d
+; X64-NEXT:    andl $1, %edi
+; X64-NEXT:    leal -1(%rdi), %eax
+; X64-NEXT:    movl %edi, %ecx
+; X64-NEXT:    negl %ecx
+; X64-NEXT:    andl %r9d, %ecx
+; X64-NEXT:    andl %r8d, %eax
+; X64-NEXT:    orl %ecx, %eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_nested:
+; X32:       # %bb.0:
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    andl $1, %ecx
+; X32-NEXT:    movl %ecx, %edx
+; X32-NEXT:    negl %edx
+; X32-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    decl %ecx
+; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    orl %edx, %ecx
+; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    movl %eax, %edx
+; X32-NEXT:    negl %edx
+; X32-NEXT:    andl %ecx, %edx
+; X32-NEXT:    decl %eax
+; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    orl %edx, %eax
+; X32-NEXT:    retl
+;
+; X32-NOCMOV-LABEL: test_ctselect_nested:
+; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    andl $1, %ecx
+; X32-NOCMOV-NEXT:    movl %ecx, %edx
+; X32-NOCMOV-NEXT:    negl %edx
+; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    decl %ecx
+; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    orl %edx, %ecx
+; X32-NOCMOV-NEXT:    andl $1, %eax
+; X32-NOCMOV-NEXT:    movl %eax, %edx
+; X32-NOCMOV-NEXT:    negl %edx
+; X32-NOCMOV-NEXT:    andl %ecx, %edx
+; X32-NOCMOV-NEXT:    decl %eax
+; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    orl %edx, %eax
+; X32-NOCMOV-NEXT:    retl
+  %inner = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b)
+  %result = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %inner, i32 %c)
+  ret i32 %result
+}
+
+; Declare the intrinsics
+declare i8 @llvm.ct.select.i8(i1, i8, i8)
+declare i16 @llvm.ct.select.i16(i1, i16, i16)
+declare i32 @llvm.ct.select.i32(i1, i32, i32)
+declare i64 @llvm.ct.select.i64(i1, i64, i64)
+declare float @llvm.ct.select.f32(i1, float, float)
+declare double @llvm.ct.select.f64(i1, double, double)
+declare ptr @llvm.ct.select.p0(i1, ptr, ptr)

>From 3332192f36520a7a76c86343f2c65e40a7abb59e Mon Sep 17 00:00:00 2001
From: Akshay K <iit.akshay at gmail.com>
Date: Tue, 10 Feb 2026 09:43:36 -0500
Subject: [PATCH 2/6] [LLVM][CodeGen] Improve CTSELECT fallback lowering and
 target support modeling (#179395)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This pull request refactors and improves the **fallback handling** of
constant-time select (CTSELECT) in LLVM’s code generation
infrastructure. The changes clarify semantics, simplify
target-capability checks, and improve the correctness and
maintainability of fallback lowering, without changing the intended
constant-time guarantees.

### Summary of Changes

- **CTSELECT semantics**
- Clarified documentation for the `CTSELECT` node to explicitly describe
its operands and its role as the lowering target for the constant-time
select intrinsic.

- **TargetLowering cleanup**
  - Removed CTSELECT-specific entries from `SelectSupportKind`.
- Introduced a dedicated `isCtSelectSupported(EVT)` hook to cleanly
separate ISA select support from constant-time guarantees.
  - Updated intrinsic lowering to use this new capability check.

- **Fallback implementation improvements**
- Updated the generic SelectionDAG fallback lowering to use the
canonical bitwise formulation:
    ```
    F ^ ((T ^ F) & Mask)
    ```
- Simplified handling of vector splats and bitcasting in the fallback
path.

- **DAGCombiner refactor**
- Renamed and clarified CTSELECT-related DAGCombiner code paths for
improved readability.

- **Miscellaneous cleanups**
  - Improved documentation headers for constant-time intrinsics.
  - Removed unused flags and minor nits uncovered during review.

These changes primarily affect the **generic fallback path** used when
targets do not provide specialized CTSELECT lowering. Target-specific
implementations are handled in follow-up PRs.
---
 llvm/include/llvm/CodeGen/ISDOpcodes.h        |  4 +-
 llvm/include/llvm/CodeGen/SelectionDAGNodes.h |  2 -
 llvm/include/llvm/CodeGen/TargetLowering.h    | 11 ++--
 llvm/include/llvm/IR/Intrinsics.td            |  3 +-
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  8 +--
 .../SelectionDAG/SelectionDAGBuilder.cpp      | 61 ++++++-------------
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |  2 +
 llvm/lib/Target/ARM/ARMISelLowering.h         |  2 +
 llvm/lib/Target/X86/X86ISelLowering.h         |  2 +
 9 files changed, 35 insertions(+), 60 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index e843a78549315..84f5a62cfeb08 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -783,8 +783,8 @@ enum NodeType {
   /// i1 then the high bits must conform to getBooleanContents.
   SELECT,
 
-  /// Constant-time Select, implemented with CMOV instruction. This is used to
-  /// implement constant-time select.
+  /// CTSELECT(Cond, TrueVal, FalseVal). Cond is i1 and the value operands must
+  /// have the same type. Used to lower the constant-time select intrinsic.
   CTSELECT,
 
   /// Select with a vector condition (op #0) and two vector operands (ops #1
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index 8a9e3335b2453..fdb76a93bc5bb 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -436,8 +436,6 @@ struct SDNodeFlags {
     FastMathFlags = NoNaNs | NoInfs | NoSignedZeros | AllowReciprocal |
                     AllowContract | ApproximateFuncs | AllowReassociation,
 
-    // Flag for disabling optimization
-    NoMerge = 1 << 15,
   };
 
   /// Default constructor turns off all optimization flags.
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index c1714c3cc073f..084e08e76bd2e 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -247,10 +247,6 @@ class LLVM_ABI TargetLoweringBase {
                          // and vector values (ex: cmov).
     VectorMaskSelect,    // The target supports vector selects with a vector
                          // mask (ex: x86 blends).
-    CtSelect,            // The target implements a custom constant-time select.
-    ScalarCondVectorValCtSelect, // The target supports selects with a scalar
-                                 // condition and vector values.
-    VectorMaskValCtSelect, // The target supports vector selects with a vector
   };
 
   /// Enum that specifies what an atomic load/AtomicRMWInst is expanded
@@ -481,9 +477,10 @@ class LLVM_ABI TargetLoweringBase {
   MachineMemOperand::Flags
   getVPIntrinsicMemOperandFlags(const VPIntrinsic &VPIntrin) const;
 
-  virtual bool isSelectSupported(SelectSupportKind kind) const {
-    return kind != CtSelect;
-  }
+  virtual bool isSelectSupported(SelectSupportKind kind) const { return true; }
+
+  /// Return true if the target has custom lowering for constant-time select.
+  virtual bool isCtSelectSupported(EVT VT) const { return false; }
 
   /// Return true if the @llvm.get.active.lane.mask intrinsic should be expanded
   /// using generic code in SelectionDAGBuilder.
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 4fe2736b78023..abfa5bc279918 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1863,8 +1863,7 @@ def int_coro_subfn_addr : DefaultAttrsIntrinsic<
     [IntrReadMem, IntrArgMemOnly, ReadOnly<ArgIndex<0>>,
      NoCapture<ArgIndex<0>>]>;
 
-///===-------------------------- Constant Time Intrinsics
-///--------------------------===//
+///===---------------------- Constant Time Intrinsics ----------------------===//
 //
 // Intrinsic to support constant time select
 def int_ct_select
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 7cec739015756..646bc5e78c051 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -484,7 +484,8 @@ namespace {
     SDValue visitCTTZ_ZERO_UNDEF(SDNode *N);
     SDValue visitCTPOP(SDNode *N);
     SDValue visitSELECT(SDNode *N);
-    SDValue visitCTSELECT(SDNode *N);
+    // visit CTSELECT Node
+    SDValue visitConstantTimeSelect(SDNode *N);
     SDValue visitVSELECT(SDNode *N);
     SDValue visitVP_SELECT(SDNode *N);
     SDValue visitSELECT_CC(SDNode *N);
@@ -1905,7 +1906,6 @@ void DAGCombiner::Run(CombineLevel AtLevel) {
 }
 
 SDValue DAGCombiner::visit(SDNode *N) {
-
   // clang-format off
   switch (N->getOpcode()) {
   default: break;
@@ -1976,7 +1976,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::CTTZ_ZERO_UNDEF:    return visitCTTZ_ZERO_UNDEF(N);
   case ISD::CTPOP:              return visitCTPOP(N);
   case ISD::SELECT:             return visitSELECT(N);
-  case ISD::CTSELECT:           return visitCTSELECT(N);
+  case ISD::CTSELECT:           return visitConstantTimeSelect(N);
   case ISD::VSELECT:            return visitVSELECT(N);
   case ISD::SELECT_CC:          return visitSELECT_CC(N);
   case ISD::SETCC:              return visitSETCC(N);
@@ -12583,7 +12583,7 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
   return SDValue();
 }
 
-SDValue DAGCombiner::visitCTSELECT(SDNode *N) {
+SDValue DAGCombiner::visitConstantTimeSelect(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   SDValue N2 = N->getOperand(2);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index eaecb87979648..49e38f56d30ad 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6582,7 +6582,10 @@ void SelectionDAGBuilder::visitVectorExtractLastActive(const CallInst &I,
 
 /// Fallback implementation for constant-time select using DAG chaining.
 /// This implementation uses data dependencies through virtual registers to
-/// prevent optimizations from breaking the constant-time property.
+/// prevent optimizations from breaking the constant-time property. It is a
+/// best-effort safeguard; for stronger guarantees we prefer target-specific
+/// lowering pipelines that preserve the select pattern by construction.
+///
 /// It handles scalars, vectors (fixed and scalable), and floating-point types.
 SDValue SelectionDAGBuilder::createProtectedCtSelectFallback(
     SelectionDAG &DAG, const SDLoc &DL, SDValue Cond, SDValue T, SDValue F,
@@ -6599,28 +6602,12 @@ SDValue SelectionDAGBuilder::createProtectedCtSelectFallback(
   if (VT.isVector() && !Cond.getValueType().isVector()) {
     ElementCount NumElems = VT.getVectorElementCount();
     EVT CondVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElems);
-
-    if (VT.isScalableVector()) {
-      Cond = DAG.getSplatVector(CondVT, DL, Cond);
-    } else {
-      Cond = DAG.getSplatBuildVector(CondVT, DL, Cond);
-    }
+    Cond = DAG.getSplat(CondVT, DL, Cond);
   }
 
   // Handle floating-point types: bitcast to integer for bitwise operations
   if (VT.isFloatingPoint()) {
-    if (VT.isVector()) {
-      // float vector -> int vector
-      EVT ElemVT = VT.getVectorElementType();
-      unsigned int ElemBitWidth = ElemVT.getScalarSizeInBits();
-      EVT IntElemVT = EVT::getIntegerVT(*DAG.getContext(), ElemBitWidth);
-
-      WorkingVT = EVT::getVectorVT(*DAG.getContext(), IntElemVT,
-                                   VT.getVectorElementCount());
-    } else {
-      WorkingVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
-    }
-
+    WorkingVT = VT.changeTypeToInteger();
     WorkingT = DAG.getBitcast(WorkingVT, T);
     WorkingF = DAG.getBitcast(WorkingVT, F);
   }
@@ -6628,24 +6615,10 @@ SDValue SelectionDAGBuilder::createProtectedCtSelectFallback(
   // Create mask: sign-extend condition to all bits
   SDValue Mask = DAG.getSExtOrTrunc(Cond, DL, WorkingVT);
 
-  // Create all-ones constant for inversion
-  SDValue AllOnes;
-  if (WorkingVT.isScalableVector()) {
-    unsigned BitWidth = WorkingVT.getScalarSizeInBits();
-    APInt AllOnesVal = APInt::getAllOnes(BitWidth);
-    SDValue ScalarAllOnes =
-        DAG.getConstant(AllOnesVal, DL, WorkingVT.getScalarType());
-    AllOnes = DAG.getSplatVector(WorkingVT, DL, ScalarAllOnes);
-  } else {
-    AllOnes = DAG.getAllOnesConstant(DL, WorkingVT);
-  }
-
-  // Invert mask for false value
-  SDValue Invert = DAG.getNode(ISD::XOR, DL, WorkingVT, Mask, AllOnes);
-
-  // Compute: (T & Mask) | (F & ~Mask)
+  // Compute: F ^ ((T ^ F) & Mask)
   // This is constant-time because both branches are always computed
-  SDValue TM = DAG.getNode(ISD::AND, DL, WorkingVT, Mask, WorkingT);
+  SDValue XorTF = DAG.getNode(ISD::XOR, DL, WorkingVT, WorkingT, WorkingF);
+  SDValue TM = DAG.getNode(ISD::AND, DL, WorkingVT, XorTF, Mask);
 
   // DAG chaining: create data dependency through virtual register
   // This prevents optimizations from reordering or eliminating operations
@@ -6653,10 +6626,15 @@ SDValue SelectionDAGBuilder::createProtectedCtSelectFallback(
   bool CanUseChaining = false;
 
   if (!WorkingVT.isScalableVector()) {
-    // For fixed-size vectors and scalars, check if type is legal
+    // For fixed-size vectors and scalars, chaining is a best-effort hardening
+    // step. The CT guarantee comes from the dataflow-only select
+    // pattern (both sides computed, no control-flow). Chaining only adds an
+    // extra dependency to discourage later combines.
     CanUseChaining = TLI.isTypeLegal(WorkingVT.getSimpleVT());
   } else {
-    // For scalable vectors, disable chaining (conservative approach)
+    // For scalable vectors, skip chaining because there is no stable register
+    // class to copy through. CT behavior still relies on the masking/select
+    // pattern above.
     CanUseChaining = false;
   }
 
@@ -6668,8 +6646,7 @@ SDValue SelectionDAGBuilder::createProtectedCtSelectFallback(
     TM = DAG.getCopyFromReg(Chain, DL, TMReg, WorkingVT);
   }
 
-  SDValue FM = DAG.getNode(ISD::AND, DL, WorkingVT, Invert, WorkingF);
-  SDValue Result = DAG.getNode(ISD::OR, DL, WorkingVT, TM, FM);
+  SDValue Result = DAG.getNode(ISD::XOR, DL, WorkingVT, WorkingF, TM);
 
   // Convert back to original type if needed
   if (WorkingVT != VT) {
@@ -6878,9 +6855,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     assert(!CondVT.isVector() && "Vector type cond not supported yet");
 
     // Handle scalar types
-    if (TLI.isSelectSupported(
-            TargetLoweringBase::SelectSupportKind::CtSelect) &&
-        !CondVT.isVector()) {
+    if (TLI.isCtSelectSupported(VT) && !CondVT.isVector()) {
       SDValue Result = DAG.getNode(ISD::CTSELECT, DL, VT, Cond, A, B);
       setValue(&I, Result);
       return;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index e8c026d989eb8..292caf16a97e2 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -157,6 +157,8 @@ class AArch64TargetLowering : public TargetLowering {
   EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
                          EVT VT) const override;
 
+  bool isCtSelectSupported(EVT VT) const override { return false; }
+
   SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const;
 
   MachineBasicBlock *EmitF128CSEL(MachineInstr &MI,
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index d0fb58c764edd..faa12aba61ed3 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -119,6 +119,8 @@ class VectorType;
       return (Kind != ScalarCondVectorVal);
     }
 
+    bool isCtSelectSupported(EVT VT) const override { return false; }
+
     bool isReadOnly(const GlobalValue *GV) const;
 
     /// getSetCCResultType - Return the value type to use for ISD::SETCC.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 848fe4bf86d2c..4100ae9925781 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1078,6 +1078,8 @@ namespace llvm {
     unsigned getJumpTableEncoding() const override;
     bool useSoftFloat() const override;
 
+    bool isCtSelectSupported(EVT VT) const override { return false; }
+
     void markLibCallAttributes(MachineFunction *MF, unsigned CC,
                                ArgListTy &Args) const override;
 

>From f4b39b4d2e0e5b4ed04e130fdd2330dc81da1c8e Mon Sep 17 00:00:00 2001
From: Akshay K <iit.akshay at gmail.com>
Date: Wed, 11 Feb 2026 14:08:35 -0500
Subject: [PATCH 3/6] [LLVM][ConstantTime] Strengthen constant-time handling of
 CTSELECT in DAG combine and legalization (#180883)

This PR improves the handling of CTSELECT in SelectionDAG to strengthen
constant-time guarantees and ensure correctness during DAG combining and
type legalization.

Key changes:

- Introduce a dedicated visitConstantTimeSelect routine that performs
only CT-safe optimizations (e.g., negated condition canonicalization and
i1 AND/OR nesting merges) while avoiding unsafe generic rewrites.
- Remove aggressive generic combinations that could violate
constant-time intent.
- Add specialized type legalization routines (SplitRes_CTSELECT,
PromoteFloatRes_CTSELECT, SoftPromoteHalfRes_CTSELECT) to preserve
constant-time semantics during result splitting and promotion.
- Handling of vector expansion with constant-time bitwise blend
lowering.
- Remove redundant X86/RISCV tests and add vector coverage while
maintaining critical code path validation.

These changes improve correctness, maintainability, and the end-to-end
constant-time behavior of the fallback implementation across the
optimization and legalization stages.
---
 llvm/include/llvm/CodeGen/SelectionDAGNodes.h |    2 +-
 llvm/include/llvm/CodeGen/TargetLowering.h    |    2 +-
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  105 +-
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp |   56 +-
 .../SelectionDAG/LegalizeFloatTypes.cpp       |   18 +-
 .../SelectionDAG/LegalizeIntegerTypes.cpp     |    2 +-
 llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h |    2 +
 .../SelectionDAG/LegalizeTypesGeneric.cpp     |   14 +-
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp |    1 -
 llvm/test/CodeGen/RISCV/ctselect-fallback.ll  |  698 +++++++--
 llvm/test/CodeGen/X86/ctselect.ll             | 1268 ++++++++++++-----
 11 files changed, 1568 insertions(+), 600 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index fdb76a93bc5bb..aa72e81b2ab54 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -435,7 +435,6 @@ struct SDNodeFlags {
                             NonNeg | NoNaNs | NoInfs | SameSign | InBounds,
     FastMathFlags = NoNaNs | NoInfs | NoSignedZeros | AllowReciprocal |
                     AllowContract | ApproximateFuncs | AllowReassociation,
-
   };
 
   /// Default constructor turns off all optimization flags.
@@ -487,6 +486,7 @@ struct SDNodeFlags {
   bool hasNoFPExcept() const { return Flags & NoFPExcept; }
   bool hasUnpredictable() const { return Flags & Unpredictable; }
   bool hasInBounds() const { return Flags & InBounds; }
+
   bool operator==(const SDNodeFlags &Other) const {
     return Flags == Other.Flags;
   }
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 084e08e76bd2e..724a69bd26861 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -245,7 +245,7 @@ class LLVM_ABI TargetLoweringBase {
     ScalarValSelect,     // The target supports scalar selects (ex: cmov).
     ScalarCondVectorVal, // The target supports selects with a scalar condition
                          // and vector values (ex: cmov).
-    VectorMaskSelect,    // The target supports vector selects with a vector
+    VectorMaskSelect     // The target supports vector selects with a vector
                          // mask (ex: x86 blends).
   };
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 646bc5e78c051..ae1fa6e2cc523 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6050,7 +6050,6 @@ static SDValue isSaturatingMinMax(SDValue N0, SDValue N1, SDValue N2,
     N0CC = cast<CondCodeSDNode>(N0.getOperand(4))->get();
     break;
   case ISD::SELECT:
-  case ISD::CTSELECT:
   case ISD::VSELECT:
     if (N0.getOperand(0).getOpcode() != ISD::SETCC)
       return SDValue();
@@ -12219,8 +12218,7 @@ template <class MatchContextClass>
 static SDValue foldBoolSelectToLogic(SDNode *N, const SDLoc &DL,
                                      SelectionDAG &DAG) {
   assert((N->getOpcode() == ISD::SELECT || N->getOpcode() == ISD::VSELECT ||
-          N->getOpcode() == ISD::VP_SELECT ||
-          N->getOpcode() == ISD::CTSELECT) &&
+          N->getOpcode() == ISD::VP_SELECT) &&
          "Expected a (v)(vp.)(ct) select");
   SDValue Cond = N->getOperand(0);
   SDValue T = N->getOperand(1), F = N->getOperand(2);
@@ -12583,6 +12581,12 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
   return SDValue();
 }
 
+// Keep CTSELECT combines deliberately conservative to preserve constant-time
+// intent across generic DAG combines. We only accept:
+//  - canonicalization of negated conditions (flip true/false operands), and
+//  - i1 CTSELECT nesting merges via AND/OR that keep the result as CTSELECT.
+// Broader rewrites should be done in target-specific lowering when stronger
+// guarantees about legality and constant-time preservation are available.
 SDValue DAGCombiner::visitConstantTimeSelect(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -12592,10 +12596,10 @@ SDValue DAGCombiner::visitConstantTimeSelect(SDNode *N) {
   SDLoc DL(N);
   SDNodeFlags Flags = N->getFlags();
 
-  if (SDValue V = foldBoolSelectToLogic<EmptyMatchContext>(N, DL, DAG))
-    return V;
-
   // ctselect (not Cond), N1, N2 -> ctselect Cond, N2, N1
+  // This is a CT-safe canonicalization: flip negated condition by swapping
+  // arms. extractBooleanFlip only matches boolean xor-with-1, so this preserves
+  // dataflow semantics and does not introduce data-dependent control flow.
   if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false)) {
     SDValue SelectOp = DAG.getNode(ISD::CTSELECT, DL, VT, F, N2, N1);
     SelectOp->setFlags(Flags);
@@ -12603,82 +12607,43 @@ SDValue DAGCombiner::visitConstantTimeSelect(SDNode *N) {
   }
 
   if (VT0 == MVT::i1) {
-    // The code in this block deals with the following 2 equivalences:
-    //    select(C0|C1, x, y) <=> select(C0, x, select(C1, x, y))
-    //    select(C0&C1, x, y) <=> select(C0, select(C1, x, y), y)
-    // The target can specify its preferred form with the
-    // shouldNormalizeToSelectSequence() callback. However we always transform
-    // to the right anyway if we find the inner select exists in the DAG anyway
-    // and we always transform to the left side if we know that we can further
-    // optimize the combination of the conditions.
-    bool normalizeToSequence =
-        TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT);
-    // ctselect (and Cond0, Cond1), X, Y
-    //   -> ctselect Cond0, (ctselect Cond1, X, Y), Y
-    if (N0->getOpcode() == ISD::AND && N0->hasOneUse()) {
-      SDValue Cond0 = N0->getOperand(0);
-      SDValue Cond1 = N0->getOperand(1);
-      SDValue InnerSelect = DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(),
-                                        Cond1, N1, N2, Flags);
-      if (normalizeToSequence || !InnerSelect.use_empty())
-        return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Cond0,
-                           InnerSelect, N2, Flags);
-      // Cleanup on failure.
-      if (InnerSelect.use_empty())
-        recursivelyDeleteUnusedNodes(InnerSelect.getNode());
-    }
-    // ctselect (or Cond0, Cond1), X, Y -> ctselect Cond0, X, (ctselect Cond1,
-    // X, Y)
-    if (N0->getOpcode() == ISD::OR && N0->hasOneUse()) {
-      SDValue Cond0 = N0->getOperand(0);
-      SDValue Cond1 = N0->getOperand(1);
-      SDValue InnerSelect = DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(),
-                                        Cond1, N1, N2, Flags);
-      if (normalizeToSequence || !InnerSelect.use_empty())
-        return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Cond0, N1,
-                           InnerSelect, Flags);
-      // Cleanup on failure.
-      if (InnerSelect.use_empty())
-        recursivelyDeleteUnusedNodes(InnerSelect.getNode());
-    }
-
-    // ctselect Cond0, (ctselect Cond1, X, Y), Y -> ctselect (and Cond0, Cond1),
-    // X, Y
+    // Nested CTSELECT merging optimizations for i1 conditions.
+    // These are CT-safe because:
+    //   1. AND/OR are bitwise operations that execute in constant time
+    //   2. The optimization combines two sequential CTSELECTs into one,
+    //   reducing the total number of constant-time operations without
+    //   changing semantics
+    //   3. No data-dependent branches or memory accesses are introduced
+    //
+    // ctselect C0, (ctselect C1, X, Y), Y -> ctselect (C0 & C1), X, Y
+    // Semantic equivalence: if C0 is true, evaluate the inner select
+    // (C1 ? X : Y); if C0 is false, choose Y. Equivalent to (C0 && C1) ? X : Y.
     if (N1->getOpcode() == ISD::CTSELECT && N1->hasOneUse()) {
       SDValue N1_0 = N1->getOperand(0);
       SDValue N1_1 = N1->getOperand(1);
       SDValue N1_2 = N1->getOperand(2);
       if (N1_2 == N2 && N0.getValueType() == N1_0.getValueType()) {
-        // Create the actual and node if we can generate good code for it.
-        if (!normalizeToSequence) {
-          SDValue And = DAG.getNode(ISD::AND, DL, N0.getValueType(), N0, N1_0);
-          return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), And, N1_1,
-                             N2, Flags);
-        }
-        // Otherwise see if we can optimize the "and" to a better pattern.
-        if (SDValue Combined = visitANDLike(N0, N1_0, N)) {
-          return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Combined,
-                             N1_1, N2, Flags);
-        }
+        SDValue And = DAG.getNode(ISD::AND, DL, N0.getValueType(), N0, N1_0);
+        SDValue SelectOp =
+            DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), And, N1_1, N2);
+        SelectOp->setFlags(Flags);
+        return SelectOp;
       }
     }
-    // ctselect Cond0, X, (ctselect Cond1, X, Y) -> ctselect (or Cond0, Cond1),
-    // X, Y
+
+    // ctselect C0, X, (ctselect C1, X, Y) -> ctselect (C0 | C1), X, Y
+    // Semantic equivalence: If C0 is true, choose X. If C0 is false, evaluate
+    // inner select (C1 ? X : Y). This is equivalent to (C0 || C1) ? X : Y.
     if (N2->getOpcode() == ISD::CTSELECT && N2->hasOneUse()) {
       SDValue N2_0 = N2->getOperand(0);
       SDValue N2_1 = N2->getOperand(1);
       SDValue N2_2 = N2->getOperand(2);
       if (N2_1 == N1 && N0.getValueType() == N2_0.getValueType()) {
-        // Create the actual or node if we can generate good code for it.
-        if (!normalizeToSequence) {
-          SDValue Or = DAG.getNode(ISD::OR, DL, N0.getValueType(), N0, N2_0);
-          return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Or, N1, N2_2,
-                             Flags);
-        }
-        // Otherwise see if we can optimize to a better pattern.
-        if (SDValue Combined = visitORLike(N0, N2_0, DL))
-          return DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Combined, N1,
-                             N2_2, Flags);
+        SDValue Or = DAG.getNode(ISD::OR, DL, N0.getValueType(), N0, N2_0);
+        SDValue SelectOp =
+            DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Or, N1, N2_2);
+        SelectOp->setFlags(Flags);
+        return SelectOp;
       }
     }
   }
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index ae5a47fa74844..f9c0456651c07 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -4249,19 +4249,51 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     Tmp3 = Node->getOperand(2);
     EVT VT = Tmp2.getValueType();
     if (VT.isVector()) {
-      SmallVector<SDValue> Elements;
-      unsigned NumElements = VT.getVectorNumElements();
-      EVT ScalarVT = VT.getScalarType();
-      for (unsigned Idx = 0; Idx < NumElements; ++Idx) {
-        SDValue IdxVal = DAG.getConstant(Idx, dl, MVT::i64);
-        SDValue TVal =
-            DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Tmp2, IdxVal);
-        SDValue FVal =
-            DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Tmp3, IdxVal);
-        Elements.push_back(
-            DAG.getCTSelect(dl, ScalarVT, Tmp1, TVal, FVal, Node->getFlags()));
+      // Constant-time vector blending using pattern F ^ ((T ^ F) & Mask)
+      // where Mask = broadcast(i1 ? -1 : 0) to match vector element width.
+      //
+      // This formulation uses only XOR and AND operations, avoiding branches
+      // that would leak timing information. It's equivalent to:
+      //   Mask all-ones: F ^ ((T ^ F) & Mask) = F ^ (T ^ F) = T
+      //   Mask all-zero: F ^ ((T ^ F) & Mask) = F ^ 0 = F
+
+      EVT IntVT = VT;
+      SDValue T = Tmp2; // True value
+      SDValue F = Tmp3; // False value
+
+      // Step 1: Handle floating-point vectors by bitcasting to integer
+      if (VT.isFloatingPoint()) {
+        IntVT = EVT::getVectorVT(
+            *DAG.getContext(),
+            EVT::getIntegerVT(*DAG.getContext(), VT.getScalarSizeInBits()),
+            VT.getVectorElementCount());
+        T = DAG.getNode(ISD::BITCAST, dl, IntVT, T);
+        F = DAG.getNode(ISD::BITCAST, dl, IntVT, F);
       }
-      Tmp1 = DAG.getBuildVector(VT, dl, Elements);
+
+      // Step 2: Broadcast the i1 condition to a vector of i1s
+      // Creates [cond, cond, cond, ...] with i1 elements
+      EVT VecI1Ty = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+                                     VT.getVectorNumElements());
+      SDValue VecCond = DAG.getSplatBuildVector(VecI1Ty, dl, Tmp1);
+
+      // Step 3: Sign-extend i1 vector to get all-bits mask
+      // true (i1=1) -> 0xFFFFFFFF..., false (i1=0) -> 0x00000000
+      // Sign extension is constant-time: pure arithmetic, no branches
+      SDValue Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, IntVT, VecCond);
+
+      // Step 4: Compute constant-time blend: F ^ ((T ^ F) & Mask)
+      // All operations (XOR, AND) execute in constant time
+      SDValue TXorF = DAG.getNode(ISD::XOR, dl, IntVT, T, F);
+      SDValue MaskedDiff = DAG.getNode(ISD::AND, dl, IntVT, TXorF, Mask);
+      Tmp1 = DAG.getNode(ISD::XOR, dl, IntVT, F, MaskedDiff);
+
+      // Step 5: Bitcast back to original floating-point type if needed
+      if (VT.isFloatingPoint()) {
+        Tmp1 = DAG.getNode(ISD::BITCAST, dl, VT, Tmp1);
+      }
+
+      Tmp1->setFlags(Node->getFlags());
     } else if (VT.isFloatingPoint()) {
       EVT IntegerVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
       Tmp2 = DAG.getBitcast(IntegerVT, Tmp2);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 6be66663e5c74..1f2d50e8af8d5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -1572,7 +1572,7 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) {
   case ISD::POISON:
   case ISD::UNDEF:        SplitRes_UNDEF(N, Lo, Hi); break;
   case ISD::SELECT:       SplitRes_Select(N, Lo, Hi); break;
-  case ISD::CTSELECT:     SplitRes_Select(N, Lo, Hi); break;
+  case ISD::CTSELECT:     SplitRes_CTSELECT(N, Lo, Hi); break;
   case ISD::SELECT_CC:    SplitRes_SELECT_CC(N, Lo, Hi); break;
 
   case ISD::MERGE_VALUES:       ExpandRes_MERGE_VALUES(N, ResNo, Lo, Hi); break;
@@ -2930,7 +2930,7 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) {
       break;
     case ISD::SELECT:     R = PromoteFloatRes_SELECT(N); break;
     case ISD::CTSELECT:
-      R = PromoteFloatRes_SELECT(N);
+      R = PromoteFloatRes_CTSELECT(N);
       break;
     case ISD::SELECT_CC:  R = PromoteFloatRes_SELECT_CC(N); break;
 
@@ -3238,6 +3238,11 @@ SDValue DAGTypeLegalizer::PromoteFloatRes_SELECT(SDNode *N) {
                      N->getOperand(0), TrueVal, FalseVal);
 }
 
+SDValue DAGTypeLegalizer::PromoteFloatRes_CTSELECT(SDNode *N) {
+  // Keep CTSELECT behavior aligned with SELECT promotion logic.
+  return PromoteFloatRes_SELECT(N);
+}
+
 // Construct a new SELECT_CC node with the promoted true- and false- values.
 // The operands used for comparison are promoted by PromoteFloatOp_SELECT_CC.
 SDValue DAGTypeLegalizer::PromoteFloatRes_SELECT_CC(SDNode *N) {
@@ -3419,7 +3424,7 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) {
     break;
   case ISD::SELECT:      R = SoftPromoteHalfRes_SELECT(N); break;
   case ISD::CTSELECT:
-    R = SoftPromoteHalfRes_SELECT(N);
+    R = SoftPromoteHalfRes_CTSELECT(N);
     break;
   case ISD::SELECT_CC:   R = SoftPromoteHalfRes_SELECT_CC(N); break;
   case ISD::STRICT_SINT_TO_FP:
@@ -3665,6 +3670,13 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfRes_SELECT(SDNode *N) {
                        Op2);
 }
 
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_CTSELECT(SDNode *N) {
+  SDValue Op1 = GetSoftPromotedHalf(N->getOperand(1));
+  SDValue Op2 = GetSoftPromotedHalf(N->getOperand(2));
+  return DAG.getCTSelect(SDLoc(N), Op1.getValueType(), N->getOperand(0), Op1,
+                         Op2);
+}
+
 SDValue DAGTypeLegalizer::SoftPromoteHalfRes_SELECT_CC(SDNode *N) {
   SDValue Op2 = GetSoftPromotedHalf(N->getOperand(2));
   SDValue Op3 = GetSoftPromotedHalf(N->getOperand(3));
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index ac20f8a009600..2318ed7834dc0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -3006,7 +3006,7 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
   case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break;
   case ISD::SELECT:       SplitRes_Select(N, Lo, Hi); break;
   case ISD::CTSELECT:
-    SplitRes_Select(N, Lo, Hi);
+    SplitRes_CTSELECT(N, Lo, Hi);
     break;
   case ISD::SELECT_CC:    SplitRes_SELECT_CC(N, Lo, Hi); break;
   case ISD::POISON:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 62069b4fb03a3..33784418db499 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -784,6 +784,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue PromoteFloatRes_LOAD(SDNode *N);
   SDValue PromoteFloatRes_ATOMIC_LOAD(SDNode *N);
   SDValue PromoteFloatRes_SELECT(SDNode *N);
+  SDValue PromoteFloatRes_CTSELECT(SDNode *N);
   SDValue PromoteFloatRes_SELECT_CC(SDNode *N);
   SDValue PromoteFloatRes_UnaryOp(SDNode *N);
   SDValue PromoteFloatRes_AssertNoFPClass(SDNode *N);
@@ -834,6 +835,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue SoftPromoteHalfRes_LOAD(SDNode *N);
   SDValue SoftPromoteHalfRes_ATOMIC_LOAD(SDNode *N);
   SDValue SoftPromoteHalfRes_SELECT(SDNode *N);
+  SDValue SoftPromoteHalfRes_CTSELECT(SDNode *N);
   SDValue SoftPromoteHalfRes_SELECT_CC(SDNode *N);
   SDValue SoftPromoteHalfRes_UnaryOp(SDNode *N);
   SDValue SoftPromoteHalfRes_FABS(SDNode *N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
index 098368ef2f6b3..a7187f3ae2bc7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -571,17 +571,9 @@ void DAGTypeLegalizer::SplitRes_Select(SDNode *N, SDValue &Lo, SDValue &Hi) {
 }
 
 void DAGTypeLegalizer::SplitRes_CTSELECT(SDNode *N, SDValue &Lo, SDValue &Hi) {
-  SDValue LL, LH, RL, RH, CL, CH;
-  SDLoc dl(N);
-  GetSplitOp(N->getOperand(1), LL, LH);
-  GetSplitOp(N->getOperand(2), RL, RH);
-
-  SDValue Cond = N->getOperand(0);
-  CL = CH = Cond;
-  assert(!Cond.getValueType().isVector() && "Unsupported vector type");
-
-  Lo = DAG.getNode(N->getOpcode(), dl, LL.getValueType(), CL, LL, RL);
-  Hi = DAG.getNode(N->getOpcode(), dl, LH.getValueType(), CH, LH, RH);
+  // Reuse generic select splitting to support scalar and vector conditions.
+  // SplitRes_Select rebuilds with N->getOpcode(), so CTSELECT is preserved.
+  SplitRes_Select(N, Lo, Hi);
 }
 
 void DAGTypeLegalizer::SplitRes_SELECT_CC(SDNode *N, SDValue &Lo,
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 6eee61da2e0d1..b7e78b94c0687 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -8179,7 +8179,6 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
       return V;
     break;
   }
-
   case ISD::SELECT:
   case ISD::VSELECT:
     if (SDValue V = simplifySelect(N1, N2, N3))
diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback.ll
index f46bde0a05b8b..c624c17d7e33e 100644
--- a/llvm/test/CodeGen/RISCV/ctselect-fallback.ll
+++ b/llvm/test/CodeGen/RISCV/ctselect-fallback.ll
@@ -10,7 +10,7 @@ define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) {
 ; RV64-NEXT:    slli a0, a0, 63
 ; RV64-NEXT:    srai a0, a0, 63
 ; RV64-NEXT:    and a0, a1, a0
-; RV64-NEXT:    xor a0, a0, a2
+; RV64-NEXT:    xor a0, a2, a0
 ; RV64-NEXT:    ret
 ;
 ; RV32-LABEL: test_ctselect_i8:
@@ -19,52 +19,28 @@ define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) {
 ; RV32-NEXT:    slli a0, a0, 31
 ; RV32-NEXT:    srai a0, a0, 31
 ; RV32-NEXT:    and a0, a1, a0
-; RV32-NEXT:    xor a0, a0, a2
+; RV32-NEXT:    xor a0, a2, a0
 ; RV32-NEXT:    ret
   %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b)
   ret i8 %result
 }
-
-define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) {
-; RV64-LABEL: test_ctselect_i16:
+define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) {
+; RV64-LABEL: test_ctselect_i32:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    xor a1, a1, a2
 ; RV64-NEXT:    slli a0, a0, 63
 ; RV64-NEXT:    srai a0, a0, 63
 ; RV64-NEXT:    and a0, a1, a0
-; RV64-NEXT:    xor a0, a0, a2
+; RV64-NEXT:    xor a0, a2, a0
 ; RV64-NEXT:    ret
 ;
-; RV32-LABEL: test_ctselect_i16:
+; RV32-LABEL: test_ctselect_i32:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    xor a1, a1, a2
 ; RV32-NEXT:    slli a0, a0, 31
 ; RV32-NEXT:    srai a0, a0, 31
 ; RV32-NEXT:    and a0, a1, a0
-; RV32-NEXT:    xor a0, a0, a2
-; RV32-NEXT:    ret
-  %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b)
-  ret i16 %result
-}
-
-define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) {
-; RV64-LABEL: test_ctselect_i32:
-; RV64:       # %bb.0:
-; RV64-NEXT:    xor a1, a1, a2
-; RV64-NEXT:    slli a0, a0, 63
-; RV64-NEXT:    srai a0, a0, 63
-; RV64-NEXT:    and a0, a1, a0
-; RV64-NEXT:    xor a0, a0, a2
-; RV64-NEXT:    ret
-;
-; RV32-LABEL: test_ctselect_i32:
-; RV32:       # %bb.0:
-; RV32-NEXT:    andi a0, a0, 1
-; RV32-NEXT:    neg a3, a0
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a1, a3, a1
-; RV32-NEXT:    and a0, a0, a2
-; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    xor a0, a2, a0
 ; RV32-NEXT:    ret
   %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
   ret i32 %result
@@ -73,12 +49,11 @@ define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) {
 define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) {
 ; RV64-LABEL: test_ctselect_i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    andi a0, a0, 1
-; RV64-NEXT:    neg a3, a0
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a1, a3, a1
-; RV64-NEXT:    and a0, a0, a2
-; RV64-NEXT:    or a0, a1, a0
+; RV64-NEXT:    xor a1, a1, a2
+; RV64-NEXT:    slli a0, a0, 63
+; RV64-NEXT:    srai a0, a0, 63
+; RV64-NEXT:    and a0, a1, a0
+; RV64-NEXT:    xor a0, a2, a0
 ; RV64-NEXT:    ret
 ;
 ; RV32-LABEL: test_ctselect_i64:
@@ -89,8 +64,8 @@ define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) {
 ; RV32-NEXT:    srai a0, a0, 31
 ; RV32-NEXT:    and a1, a1, a0
 ; RV32-NEXT:    and a2, a2, a0
-; RV32-NEXT:    xor a0, a1, a3
-; RV32-NEXT:    xor a1, a2, a4
+; RV32-NEXT:    xor a0, a3, a1
+; RV32-NEXT:    xor a1, a4, a2
 ; RV32-NEXT:    ret
   %result = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b)
   ret i64 %result
@@ -99,22 +74,20 @@ define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) {
 define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) {
 ; RV64-LABEL: test_ctselect_ptr:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    andi a0, a0, 1
-; RV64-NEXT:    neg a3, a0
-; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a1, a3, a1
-; RV64-NEXT:    and a0, a0, a2
-; RV64-NEXT:    or a0, a1, a0
+; RV64-NEXT:    xor a1, a1, a2
+; RV64-NEXT:    slli a0, a0, 63
+; RV64-NEXT:    srai a0, a0, 63
+; RV64-NEXT:    and a0, a1, a0
+; RV64-NEXT:    xor a0, a2, a0
 ; RV64-NEXT:    ret
 ;
 ; RV32-LABEL: test_ctselect_ptr:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    andi a0, a0, 1
-; RV32-NEXT:    neg a3, a0
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a1, a3, a1
-; RV32-NEXT:    and a0, a0, a2
-; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    xor a1, a1, a2
+; RV32-NEXT:    slli a0, a0, 31
+; RV32-NEXT:    srai a0, a0, 31
+; RV32-NEXT:    and a0, a1, a0
+; RV32-NEXT:    xor a0, a2, a0
 ; RV32-NEXT:    ret
   %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b)
   ret ptr %result
@@ -128,6 +101,8 @@ define i32 @test_ctselect_const_true(i32 %a, i32 %b) {
 ;
 ; RV32-LABEL: test_ctselect_const_true:
 ; RV32:       # %bb.0:
+; RV32-NEXT:    xor a0, a0, a1
+; RV32-NEXT:    xor a0, a1, a0
 ; RV32-NEXT:    ret
   %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b)
   ret i32 %result
@@ -158,131 +133,199 @@ define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) {
 ; RV64-NEXT:    xor a2, a2, a3
 ; RV64-NEXT:    addi a0, a0, -1
 ; RV64-NEXT:    and a0, a2, a0
-; RV64-NEXT:    xor a0, a0, a3
+; RV64-NEXT:    xor a0, a3, a0
 ; RV64-NEXT:    ret
 ;
 ; RV32-LABEL: test_ctselect_icmp_eq:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    xor a0, a0, a1
 ; RV32-NEXT:    snez a0, a0
+; RV32-NEXT:    xor a2, a2, a3
 ; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a2, a0, a2
-; RV32-NEXT:    not a0, a0
-; RV32-NEXT:    and a0, a0, a3
-; RV32-NEXT:    or a0, a2, a0
+; RV32-NEXT:    and a0, a2, a0
+; RV32-NEXT:    xor a0, a3, a0
 ; RV32-NEXT:    ret
   %cond = icmp eq i32 %x, %y
   %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
   ret i32 %result
 }
-
-define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) {
-; RV64-LABEL: test_ctselect_icmp_ne:
+define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) {
+; RV64-LABEL: test_ctselect_icmp_ult:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    sext.w a1, a1
 ; RV64-NEXT:    sext.w a0, a0
-; RV64-NEXT:    xor a0, a0, a1
-; RV64-NEXT:    seqz a0, a0
+; RV64-NEXT:    sltu a0, a0, a1
 ; RV64-NEXT:    xor a2, a2, a3
-; RV64-NEXT:    addi a0, a0, -1
+; RV64-NEXT:    neg a0, a0
 ; RV64-NEXT:    and a0, a2, a0
-; RV64-NEXT:    xor a0, a0, a3
+; RV64-NEXT:    xor a0, a3, a0
 ; RV64-NEXT:    ret
 ;
-; RV32-LABEL: test_ctselect_icmp_ne:
+; RV32-LABEL: test_ctselect_icmp_ult:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    xor a0, a0, a1
-; RV32-NEXT:    seqz a0, a0
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a2, a0, a2
-; RV32-NEXT:    not a0, a0
-; RV32-NEXT:    and a0, a0, a3
-; RV32-NEXT:    or a0, a2, a0
+; RV32-NEXT:    sltu a0, a0, a1
+; RV32-NEXT:    xor a2, a2, a3
+; RV32-NEXT:    neg a0, a0
+; RV32-NEXT:    and a0, a2, a0
+; RV32-NEXT:    xor a0, a3, a0
 ; RV32-NEXT:    ret
-  %cond = icmp ne i32 %x, %y
+  %cond = icmp ult i32 %x, %y
   %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
   ret i32 %result
 }
 
-define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) {
-; RV64-LABEL: test_ctselect_icmp_slt:
+; Test with memory operands
+define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) {
+; RV64-LABEL: test_ctselect_load:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    sext.w a1, a1
-; RV64-NEXT:    sext.w a0, a0
-; RV64-NEXT:    slt a0, a0, a1
+; RV64-NEXT:    lw a1, 0(a1)
+; RV64-NEXT:    lw a2, 0(a2)
+; RV64-NEXT:    slli a0, a0, 63
+; RV64-NEXT:    xor a1, a1, a2
+; RV64-NEXT:    srai a0, a0, 63
+; RV64-NEXT:    and a0, a1, a0
+; RV64-NEXT:    xor a0, a2, a0
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_ctselect_load:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    lw a2, 0(a2)
+; RV32-NEXT:    slli a0, a0, 31
+; RV32-NEXT:    xor a1, a1, a2
+; RV32-NEXT:    srai a0, a0, 31
+; RV32-NEXT:    and a0, a1, a0
+; RV32-NEXT:    xor a0, a2, a0
+; RV32-NEXT:    ret
+  %a = load i32, ptr %p1
+  %b = load i32, ptr %p2
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+  ret i32 %result
+}
+
+; Test nested CTSELECT pattern with AND merging on i1 values
+; Pattern: ctselect C0, (ctselect C1, X, Y), Y -> ctselect (C0 & C1), X, Y
+define i32 @test_ctselect_nested_and_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
+; RV64-LABEL: test_ctselect_nested_and_i1_to_i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    and a0, a1, a0
 ; RV64-NEXT:    xor a2, a2, a3
-; RV64-NEXT:    neg a0, a0
+; RV64-NEXT:    slli a0, a0, 63
+; RV64-NEXT:    srai a0, a0, 63
 ; RV64-NEXT:    and a0, a2, a0
-; RV64-NEXT:    xor a0, a0, a3
+; RV64-NEXT:    xor a0, a3, a0
 ; RV64-NEXT:    ret
 ;
-; RV32-LABEL: test_ctselect_icmp_slt:
+; RV32-LABEL: test_ctselect_nested_and_i1_to_i32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    slt a0, a0, a1
-; RV32-NEXT:    neg a1, a0
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a1, a1, a2
-; RV32-NEXT:    and a0, a0, a3
-; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    and a0, a1, a0
+; RV32-NEXT:    xor a2, a2, a3
+; RV32-NEXT:    slli a0, a0, 31
+; RV32-NEXT:    srai a0, a0, 31
+; RV32-NEXT:    and a0, a2, a0
+; RV32-NEXT:    xor a0, a3, a0
 ; RV32-NEXT:    ret
-  %cond = icmp slt i32 %x, %y
-  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+  %inner = call i1 @llvm.ct.select.i1(i1 %c1, i1 true, i1 false)
+  %cond = call i1 @llvm.ct.select.i1(i1 %c0, i1 %inner, i1 false)
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %y)
   ret i32 %result
 }
 
-define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) {
-; RV64-LABEL: test_ctselect_icmp_ult:
+; Test nested CTSELECT pattern with OR merging on i1 values
+; Pattern: ctselect C0, X, (ctselect C1, X, Y) -> ctselect (C0 | C1), X, Y
+define i32 @test_ctselect_nested_or_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
+; RV64-LABEL: test_ctselect_nested_or_i1_to_i32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    sext.w a1, a1
-; RV64-NEXT:    sext.w a0, a0
-; RV64-NEXT:    sltu a0, a0, a1
+; RV64-NEXT:    or a0, a0, a1
 ; RV64-NEXT:    xor a2, a2, a3
-; RV64-NEXT:    neg a0, a0
+; RV64-NEXT:    slli a0, a0, 63
+; RV64-NEXT:    srai a0, a0, 63
 ; RV64-NEXT:    and a0, a2, a0
-; RV64-NEXT:    xor a0, a0, a3
+; RV64-NEXT:    xor a0, a3, a0
 ; RV64-NEXT:    ret
 ;
-; RV32-LABEL: test_ctselect_icmp_ult:
+; RV32-LABEL: test_ctselect_nested_or_i1_to_i32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    sltu a0, a0, a1
-; RV32-NEXT:    neg a1, a0
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a1, a1, a2
-; RV32-NEXT:    and a0, a0, a3
-; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    or a0, a0, a1
+; RV32-NEXT:    xor a2, a2, a3
+; RV32-NEXT:    slli a0, a0, 31
+; RV32-NEXT:    srai a0, a0, 31
+; RV32-NEXT:    and a0, a2, a0
+; RV32-NEXT:    xor a0, a3, a0
 ; RV32-NEXT:    ret
-  %cond = icmp ult i32 %x, %y
-  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+  %inner = call i1 @llvm.ct.select.i1(i1 %c1, i1 true, i1 false)
+  %cond = call i1 @llvm.ct.select.i1(i1 %c0, i1 true, i1 %inner)
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %y)
   ret i32 %result
 }
 
-; Test with memory operands
-define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) {
-; RV64-LABEL: test_ctselect_load:
+; Test double nested CTSELECT with recursive AND merging
+; Pattern: ctselect C0, (ctselect C1, (ctselect C2, X, Y), Y), Y
+;   -> ctselect (C0 & C1 & C2), X, Y
+define i32 @test_ctselect_double_nested_and_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x, i32 %y) {
+; RV64-LABEL: test_ctselect_double_nested_and_i1:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    lw a1, 0(a1)
-; RV64-NEXT:    lw a2, 0(a2)
+; RV64-NEXT:    and a1, a2, a1
+; RV64-NEXT:    and a0, a1, a0
+; RV64-NEXT:    xor a3, a3, a4
 ; RV64-NEXT:    slli a0, a0, 63
-; RV64-NEXT:    xor a1, a1, a2
 ; RV64-NEXT:    srai a0, a0, 63
+; RV64-NEXT:    and a0, a3, a0
+; RV64-NEXT:    xor a0, a4, a0
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_ctselect_double_nested_and_i1:
+; RV32:       # %bb.0:
+; RV32-NEXT:    and a1, a2, a1
+; RV32-NEXT:    and a0, a1, a0
+; RV32-NEXT:    xor a3, a3, a4
+; RV32-NEXT:    slli a0, a0, 31
+; RV32-NEXT:    srai a0, a0, 31
+; RV32-NEXT:    and a0, a3, a0
+; RV32-NEXT:    xor a0, a4, a0
+; RV32-NEXT:    ret
+  %inner2 = call i1 @llvm.ct.select.i1(i1 %c2, i1 true, i1 false)
+  %inner1 = call i1 @llvm.ct.select.i1(i1 %c1, i1 %inner2, i1 false)
+  %cond = call i1 @llvm.ct.select.i1(i1 %c0, i1 %inner1, i1 false)
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %y)
+  ret i32 %result
+}
+
+; Test double nested CTSELECT with mixed AND/OR patterns
+define i32 @test_ctselect_double_nested_mixed_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x, i32 %y, i32 %z) {
+; RV64-LABEL: test_ctselect_double_nested_mixed_i1:
+; RV64:       # %bb.0:
 ; RV64-NEXT:    and a0, a1, a0
-; RV64-NEXT:    xor a0, a0, a2
+; RV64-NEXT:    xor a3, a3, a4
+; RV64-NEXT:    or a0, a0, a2
+; RV64-NEXT:    slli a0, a0, 63
+; RV64-NEXT:    srai a0, a0, 63
+; RV64-NEXT:    and a3, a3, a0
+; RV64-NEXT:    xor a4, a4, a5
+; RV64-NEXT:    xor a3, a4, a3
+; RV64-NEXT:    and a0, a3, a0
+; RV64-NEXT:    xor a0, a5, a0
 ; RV64-NEXT:    ret
 ;
-; RV32-LABEL: test_ctselect_load:
+; RV32-LABEL: test_ctselect_double_nested_mixed_i1:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    lw a1, 0(a1)
-; RV32-NEXT:    lw a2, 0(a2)
-; RV32-NEXT:    andi a0, a0, 1
-; RV32-NEXT:    neg a3, a0
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a1, a3, a1
-; RV32-NEXT:    and a0, a0, a2
-; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    and a0, a1, a0
+; RV32-NEXT:    xor a3, a3, a4
+; RV32-NEXT:    or a0, a0, a2
+; RV32-NEXT:    slli a0, a0, 31
+; RV32-NEXT:    srai a0, a0, 31
+; RV32-NEXT:    and a3, a3, a0
+; RV32-NEXT:    xor a4, a4, a5
+; RV32-NEXT:    xor a3, a4, a3
+; RV32-NEXT:    and a0, a3, a0
+; RV32-NEXT:    xor a0, a5, a0
 ; RV32-NEXT:    ret
-  %a = load i32, ptr %p1
-  %b = load i32, ptr %p2
-  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+  %inner1 = call i1 @llvm.ct.select.i1(i1 %c1, i1 true, i1 false)
+  %and_cond = call i1 @llvm.ct.select.i1(i1 %c0, i1 %inner1, i1 false)
+  %inner2 = call i1 @llvm.ct.select.i1(i1 %c2, i1 true, i1 false)
+  %or_cond = call i1 @llvm.ct.select.i1(i1 %and_cond, i1 true, i1 %inner2)
+  %inner_result = call i32 @llvm.ct.select.i32(i1 %or_cond, i32 %x, i32 %y)
+  %result = call i32 @llvm.ct.select.i32(i1 %or_cond, i32 %inner_result, i32 %z)
   ret i32 %result
 }
 
@@ -296,35 +339,416 @@ define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) {
 ; RV64-NEXT:    slli a0, a0, 63
 ; RV64-NEXT:    srai a1, a1, 63
 ; RV64-NEXT:    and a1, a2, a1
-; RV64-NEXT:    xor a1, a1, a3
+; RV64-NEXT:    xor a1, a3, a1
 ; RV64-NEXT:    srai a0, a0, 63
 ; RV64-NEXT:    and a0, a1, a0
-; RV64-NEXT:    xor a0, a0, a4
+; RV64-NEXT:    xor a0, a4, a0
 ; RV64-NEXT:    ret
 ;
 ; RV32-LABEL: test_ctselect_nested:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    andi a1, a1, 1
-; RV32-NEXT:    andi a0, a0, 1
-; RV32-NEXT:    neg a5, a1
-; RV32-NEXT:    addi a1, a1, -1
-; RV32-NEXT:    and a2, a5, a2
-; RV32-NEXT:    neg a5, a0
-; RV32-NEXT:    addi a0, a0, -1
-; RV32-NEXT:    and a1, a1, a3
-; RV32-NEXT:    or a1, a2, a1
-; RV32-NEXT:    and a1, a5, a1
-; RV32-NEXT:    and a0, a0, a4
-; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    xor a2, a2, a3
+; RV32-NEXT:    slli a1, a1, 31
+; RV32-NEXT:    xor a3, a3, a4
+; RV32-NEXT:    slli a0, a0, 31
+; RV32-NEXT:    srai a1, a1, 31
+; RV32-NEXT:    and a1, a2, a1
+; RV32-NEXT:    xor a1, a3, a1
+; RV32-NEXT:    srai a0, a0, 31
+; RV32-NEXT:    and a0, a1, a0
+; RV32-NEXT:    xor a0, a4, a0
 ; RV32-NEXT:    ret
   %inner = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b)
   %result = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %inner, i32 %c)
   ret i32 %result
 }
 
+; Test floating-point ct.select selecting between NaN and Inf
+define float @test_ctselect_f32_nan_inf(i1 %cond) {
+; RV64-LABEL: test_ctselect_f32_nan_inf:
+; RV64:       # %bb.0:
+; RV64-NEXT:    slli a0, a0, 63
+; RV64-NEXT:    lui a1, 1024
+; RV64-NEXT:    srai a0, a0, 63
+; RV64-NEXT:    and a0, a0, a1
+; RV64-NEXT:    lui a1, 522240
+; RV64-NEXT:    or a0, a0, a1
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_ctselect_f32_nan_inf:
+; RV32:       # %bb.0:
+; RV32-NEXT:    slli a0, a0, 31
+; RV32-NEXT:    lui a1, 1024
+; RV32-NEXT:    srai a0, a0, 31
+; RV32-NEXT:    and a0, a0, a1
+; RV32-NEXT:    lui a1, 522240
+; RV32-NEXT:    xor a0, a0, a1
+; RV32-NEXT:    ret
+  %result = call float @llvm.ct.select.f32(i1 %cond, float 0x7FF8000000000000, float 0x7FF0000000000000)
+  ret float %result
+}
+
+define double @test_ctselect_f64_nan_inf(i1 %cond) {
+; RV64-LABEL: test_ctselect_f64_nan_inf:
+; RV64:       # %bb.0:
+; RV64-NEXT:    slli a0, a0, 63
+; RV64-NEXT:    li a1, 1
+; RV64-NEXT:    srai a0, a0, 63
+; RV64-NEXT:    slli a1, a1, 51
+; RV64-NEXT:    and a0, a0, a1
+; RV64-NEXT:    li a1, 2047
+; RV64-NEXT:    slli a1, a1, 52
+; RV64-NEXT:    xor a0, a0, a1
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_ctselect_f64_nan_inf:
+; RV32:       # %bb.0:
+; RV32-NEXT:    slli a0, a0, 31
+; RV32-NEXT:    lui a1, 128
+; RV32-NEXT:    srai a0, a0, 31
+; RV32-NEXT:    and a0, a0, a1
+; RV32-NEXT:    lui a1, 524032
+; RV32-NEXT:    or a1, a0, a1
+; RV32-NEXT:    li a0, 0
+; RV32-NEXT:    ret
+  %result = call double @llvm.ct.select.f64(i1 %cond, double 0x7FF8000000000000, double 0x7FF0000000000000)
+  ret double %result
+}
+
+; Test basic floating-point ct.select
+define float @test_ctselect_f32(i1 %cond, float %a, float %b) {
+; RV64-LABEL: test_ctselect_f32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    xor a1, a1, a2
+; RV64-NEXT:    slli a0, a0, 63
+; RV64-NEXT:    srai a0, a0, 63
+; RV64-NEXT:    and a0, a1, a0
+; RV64-NEXT:    xor a0, a2, a0
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_ctselect_f32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    xor a1, a1, a2
+; RV32-NEXT:    slli a0, a0, 31
+; RV32-NEXT:    srai a0, a0, 31
+; RV32-NEXT:    and a0, a1, a0
+; RV32-NEXT:    xor a0, a2, a0
+; RV32-NEXT:    ret
+  %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b)
+  ret float %result
+}
+
+define double @test_ctselect_f64(i1 %cond, double %a, double %b) {
+; RV64-LABEL: test_ctselect_f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    xor a1, a1, a2
+; RV64-NEXT:    slli a0, a0, 63
+; RV64-NEXT:    srai a0, a0, 63
+; RV64-NEXT:    and a0, a1, a0
+; RV64-NEXT:    xor a0, a2, a0
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_ctselect_f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    xor a1, a1, a3
+; RV32-NEXT:    slli a0, a0, 31
+; RV32-NEXT:    xor a2, a2, a4
+; RV32-NEXT:    srai a0, a0, 31
+; RV32-NEXT:    and a1, a1, a0
+; RV32-NEXT:    and a2, a2, a0
+; RV32-NEXT:    xor a0, a3, a1
+; RV32-NEXT:    xor a1, a4, a2
+; RV32-NEXT:    ret
+  %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b)
+  ret double %result
+}
+
+; Test vector ct.select with integer vectors
+define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) {
+; RV64-LABEL: test_ctselect_v4i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    lw a4, 0(a3)
+; RV64-NEXT:    lw a5, 8(a3)
+; RV64-NEXT:    lw a6, 16(a3)
+; RV64-NEXT:    lw a3, 24(a3)
+; RV64-NEXT:    lw a7, 0(a2)
+; RV64-NEXT:    lw t0, 8(a2)
+; RV64-NEXT:    lw t1, 16(a2)
+; RV64-NEXT:    lw a2, 24(a2)
+; RV64-NEXT:    slli a1, a1, 63
+; RV64-NEXT:    srai a1, a1, 63
+; RV64-NEXT:    xor a7, a7, a4
+; RV64-NEXT:    xor t0, t0, a5
+; RV64-NEXT:    xor t1, t1, a6
+; RV64-NEXT:    xor a2, a2, a3
+; RV64-NEXT:    and a7, a7, a1
+; RV64-NEXT:    and t0, t0, a1
+; RV64-NEXT:    and t1, t1, a1
+; RV64-NEXT:    and a1, a2, a1
+; RV64-NEXT:    xor a2, a4, a7
+; RV64-NEXT:    xor a4, a5, t0
+; RV64-NEXT:    xor a5, a6, t1
+; RV64-NEXT:    xor a1, a3, a1
+; RV64-NEXT:    sw a2, 0(a0)
+; RV64-NEXT:    sw a4, 4(a0)
+; RV64-NEXT:    sw a5, 8(a0)
+; RV64-NEXT:    sw a1, 12(a0)
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_ctselect_v4i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a4, 0(a3)
+; RV32-NEXT:    lw a5, 4(a3)
+; RV32-NEXT:    lw a6, 8(a3)
+; RV32-NEXT:    lw a3, 12(a3)
+; RV32-NEXT:    lw a7, 0(a2)
+; RV32-NEXT:    lw t0, 4(a2)
+; RV32-NEXT:    lw t1, 8(a2)
+; RV32-NEXT:    lw a2, 12(a2)
+; RV32-NEXT:    slli a1, a1, 31
+; RV32-NEXT:    srai a1, a1, 31
+; RV32-NEXT:    xor a7, a7, a4
+; RV32-NEXT:    xor t0, t0, a5
+; RV32-NEXT:    xor t1, t1, a6
+; RV32-NEXT:    xor a2, a2, a3
+; RV32-NEXT:    and a7, a7, a1
+; RV32-NEXT:    and t0, t0, a1
+; RV32-NEXT:    and t1, t1, a1
+; RV32-NEXT:    and a1, a2, a1
+; RV32-NEXT:    xor a2, a4, a7
+; RV32-NEXT:    xor a4, a5, t0
+; RV32-NEXT:    xor a5, a6, t1
+; RV32-NEXT:    xor a1, a3, a1
+; RV32-NEXT:    sw a2, 0(a0)
+; RV32-NEXT:    sw a4, 4(a0)
+; RV32-NEXT:    sw a5, 8(a0)
+; RV32-NEXT:    sw a1, 12(a0)
+; RV32-NEXT:    ret
+  %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b)
+  ret <4 x i32> %result
+}
+define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) {
+; RV64-LABEL: test_ctselect_v4f32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    lw a4, 0(a3)
+; RV64-NEXT:    lw a5, 8(a3)
+; RV64-NEXT:    lw a6, 16(a3)
+; RV64-NEXT:    lw a3, 24(a3)
+; RV64-NEXT:    lw a7, 0(a2)
+; RV64-NEXT:    lw t0, 8(a2)
+; RV64-NEXT:    lw t1, 16(a2)
+; RV64-NEXT:    lw a2, 24(a2)
+; RV64-NEXT:    slli a1, a1, 63
+; RV64-NEXT:    srai a1, a1, 63
+; RV64-NEXT:    xor a7, a7, a4
+; RV64-NEXT:    xor t0, t0, a5
+; RV64-NEXT:    xor t1, t1, a6
+; RV64-NEXT:    xor a2, a2, a3
+; RV64-NEXT:    and a7, a7, a1
+; RV64-NEXT:    and t0, t0, a1
+; RV64-NEXT:    and t1, t1, a1
+; RV64-NEXT:    and a1, a2, a1
+; RV64-NEXT:    xor a2, a4, a7
+; RV64-NEXT:    xor a4, a5, t0
+; RV64-NEXT:    xor a5, a6, t1
+; RV64-NEXT:    xor a1, a3, a1
+; RV64-NEXT:    sw a2, 0(a0)
+; RV64-NEXT:    sw a4, 4(a0)
+; RV64-NEXT:    sw a5, 8(a0)
+; RV64-NEXT:    sw a1, 12(a0)
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_ctselect_v4f32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lw a4, 0(a3)
+; RV32-NEXT:    lw a5, 4(a3)
+; RV32-NEXT:    lw a6, 8(a3)
+; RV32-NEXT:    lw a3, 12(a3)
+; RV32-NEXT:    lw a7, 0(a2)
+; RV32-NEXT:    lw t0, 4(a2)
+; RV32-NEXT:    lw t1, 8(a2)
+; RV32-NEXT:    lw a2, 12(a2)
+; RV32-NEXT:    slli a1, a1, 31
+; RV32-NEXT:    srai a1, a1, 31
+; RV32-NEXT:    xor a7, a7, a4
+; RV32-NEXT:    xor t0, t0, a5
+; RV32-NEXT:    xor t1, t1, a6
+; RV32-NEXT:    xor a2, a2, a3
+; RV32-NEXT:    and a7, a7, a1
+; RV32-NEXT:    and t0, t0, a1
+; RV32-NEXT:    and t1, t1, a1
+; RV32-NEXT:    and a1, a2, a1
+; RV32-NEXT:    xor a2, a4, a7
+; RV32-NEXT:    xor a4, a5, t0
+; RV32-NEXT:    xor a5, a6, t1
+; RV32-NEXT:    xor a1, a3, a1
+; RV32-NEXT:    sw a2, 0(a0)
+; RV32-NEXT:    sw a4, 4(a0)
+; RV32-NEXT:    sw a5, 8(a0)
+; RV32-NEXT:    sw a1, 12(a0)
+; RV32-NEXT:    ret
+  %result = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, <4 x float> %b)
+  ret <4 x float> %result
+}
+define <8 x i32> @test_ctselect_v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b) {
+; RV64-LABEL: test_ctselect_v8i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -32
+; RV64-NEXT:    .cfi_def_cfa_offset 32
+; RV64-NEXT:    sd s0, 24(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s1, 16(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s2, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT:    .cfi_offset s0, -8
+; RV64-NEXT:    .cfi_offset s1, -16
+; RV64-NEXT:    .cfi_offset s2, -24
+; RV64-NEXT:    lw a7, 32(a3)
+; RV64-NEXT:    lw a6, 40(a3)
+; RV64-NEXT:    lw a5, 48(a3)
+; RV64-NEXT:    lw a4, 56(a3)
+; RV64-NEXT:    lw t0, 32(a2)
+; RV64-NEXT:    lw t1, 40(a2)
+; RV64-NEXT:    lw t2, 48(a2)
+; RV64-NEXT:    lw t3, 56(a2)
+; RV64-NEXT:    lw t4, 0(a3)
+; RV64-NEXT:    lw t5, 8(a3)
+; RV64-NEXT:    lw t6, 16(a3)
+; RV64-NEXT:    lw a3, 24(a3)
+; RV64-NEXT:    lw s0, 0(a2)
+; RV64-NEXT:    lw s1, 8(a2)
+; RV64-NEXT:    lw s2, 16(a2)
+; RV64-NEXT:    lw a2, 24(a2)
+; RV64-NEXT:    slli a1, a1, 63
+; RV64-NEXT:    srai a1, a1, 63
+; RV64-NEXT:    xor s0, s0, t4
+; RV64-NEXT:    xor s1, s1, t5
+; RV64-NEXT:    xor s2, s2, t6
+; RV64-NEXT:    xor a2, a2, a3
+; RV64-NEXT:    xor t0, t0, a7
+; RV64-NEXT:    xor t1, t1, a6
+; RV64-NEXT:    xor t2, t2, a5
+; RV64-NEXT:    xor t3, t3, a4
+; RV64-NEXT:    and s0, s0, a1
+; RV64-NEXT:    and s1, s1, a1
+; RV64-NEXT:    and s2, s2, a1
+; RV64-NEXT:    and a2, a2, a1
+; RV64-NEXT:    and t0, t0, a1
+; RV64-NEXT:    and t1, t1, a1
+; RV64-NEXT:    and t2, t2, a1
+; RV64-NEXT:    and a1, t3, a1
+; RV64-NEXT:    xor t3, t4, s0
+; RV64-NEXT:    xor t4, t5, s1
+; RV64-NEXT:    xor t5, t6, s2
+; RV64-NEXT:    xor a2, a3, a2
+; RV64-NEXT:    xor a3, a7, t0
+; RV64-NEXT:    xor a6, a6, t1
+; RV64-NEXT:    xor a5, a5, t2
+; RV64-NEXT:    xor a1, a4, a1
+; RV64-NEXT:    sw a3, 16(a0)
+; RV64-NEXT:    sw a6, 20(a0)
+; RV64-NEXT:    sw a5, 24(a0)
+; RV64-NEXT:    sw a1, 28(a0)
+; RV64-NEXT:    sw t3, 0(a0)
+; RV64-NEXT:    sw t4, 4(a0)
+; RV64-NEXT:    sw t5, 8(a0)
+; RV64-NEXT:    sw a2, 12(a0)
+; RV64-NEXT:    ld s0, 24(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s1, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s2, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT:    .cfi_restore s0
+; RV64-NEXT:    .cfi_restore s1
+; RV64-NEXT:    .cfi_restore s2
+; RV64-NEXT:    addi sp, sp, 32
+; RV64-NEXT:    .cfi_def_cfa_offset 0
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: test_ctselect_v8i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s2, 4(sp) # 4-byte Folded Spill
+; RV32-NEXT:    .cfi_offset s0, -4
+; RV32-NEXT:    .cfi_offset s1, -8
+; RV32-NEXT:    .cfi_offset s2, -12
+; RV32-NEXT:    lw a7, 16(a3)
+; RV32-NEXT:    lw a6, 20(a3)
+; RV32-NEXT:    lw a5, 24(a3)
+; RV32-NEXT:    lw a4, 28(a3)
+; RV32-NEXT:    lw t0, 16(a2)
+; RV32-NEXT:    lw t1, 20(a2)
+; RV32-NEXT:    lw t2, 24(a2)
+; RV32-NEXT:    lw t3, 28(a2)
+; RV32-NEXT:    lw t4, 0(a3)
+; RV32-NEXT:    lw t5, 4(a3)
+; RV32-NEXT:    lw t6, 8(a3)
+; RV32-NEXT:    lw a3, 12(a3)
+; RV32-NEXT:    lw s0, 0(a2)
+; RV32-NEXT:    lw s1, 4(a2)
+; RV32-NEXT:    lw s2, 8(a2)
+; RV32-NEXT:    lw a2, 12(a2)
+; RV32-NEXT:    slli a1, a1, 31
+; RV32-NEXT:    srai a1, a1, 31
+; RV32-NEXT:    xor s0, s0, t4
+; RV32-NEXT:    xor s1, s1, t5
+; RV32-NEXT:    xor s2, s2, t6
+; RV32-NEXT:    xor a2, a2, a3
+; RV32-NEXT:    xor t0, t0, a7
+; RV32-NEXT:    xor t1, t1, a6
+; RV32-NEXT:    xor t2, t2, a5
+; RV32-NEXT:    xor t3, t3, a4
+; RV32-NEXT:    and s0, s0, a1
+; RV32-NEXT:    and s1, s1, a1
+; RV32-NEXT:    and s2, s2, a1
+; RV32-NEXT:    and a2, a2, a1
+; RV32-NEXT:    and t0, t0, a1
+; RV32-NEXT:    and t1, t1, a1
+; RV32-NEXT:    and t2, t2, a1
+; RV32-NEXT:    and a1, t3, a1
+; RV32-NEXT:    xor t3, t4, s0
+; RV32-NEXT:    xor t4, t5, s1
+; RV32-NEXT:    xor t5, t6, s2
+; RV32-NEXT:    xor a2, a3, a2
+; RV32-NEXT:    xor a3, a7, t0
+; RV32-NEXT:    xor a6, a6, t1
+; RV32-NEXT:    xor a5, a5, t2
+; RV32-NEXT:    xor a1, a4, a1
+; RV32-NEXT:    sw a3, 16(a0)
+; RV32-NEXT:    sw a6, 20(a0)
+; RV32-NEXT:    sw a5, 24(a0)
+; RV32-NEXT:    sw a1, 28(a0)
+; RV32-NEXT:    sw t3, 0(a0)
+; RV32-NEXT:    sw t4, 4(a0)
+; RV32-NEXT:    sw t5, 8(a0)
+; RV32-NEXT:    sw a2, 12(a0)
+; RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s2, 4(sp) # 4-byte Folded Reload
+; RV32-NEXT:    .cfi_restore s0
+; RV32-NEXT:    .cfi_restore s1
+; RV32-NEXT:    .cfi_restore s2
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    .cfi_def_cfa_offset 0
+; RV32-NEXT:    ret
+  %result = call <8 x i32> @llvm.ct.select.v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b)
+  ret <8 x i32> %result
+}
+
 ; Declare the intrinsics
+declare i1 @llvm.ct.select.i1(i1, i1, i1)
 declare i8 @llvm.ct.select.i8(i1, i8, i8)
 declare i16 @llvm.ct.select.i16(i1, i16, i16)
 declare i32 @llvm.ct.select.i32(i1, i32, i32)
 declare i64 @llvm.ct.select.i64(i1, i64, i64)
 declare ptr @llvm.ct.select.p0(i1, ptr, ptr)
+declare float @llvm.ct.select.f32(i1, float, float)
+declare double @llvm.ct.select.f64(i1, double, double)
+
+; Vector intrinsics
+declare <4 x i32> @llvm.ct.select.v4i32(i1, <4 x i32>, <4 x i32>)
+declare <2 x i64> @llvm.ct.select.v2i64(i1, <2 x i64>, <2 x i64>)
+declare <8 x i16> @llvm.ct.select.v8i16(i1, <8 x i16>, <8 x i16>)
+declare <16 x i8> @llvm.ct.select.v16i8(i1, <16 x i8>, <16 x i8>)
+declare <4 x float> @llvm.ct.select.v4f32(i1, <4 x float>, <4 x float>)
+declare <2 x double> @llvm.ct.select.v2f64(i1, <2 x double>, <2 x double>)
+declare <8 x i32> @llvm.ct.select.v8i32(i1, <8 x i32>, <8 x i32>)
diff --git a/llvm/test/CodeGen/X86/ctselect.ll b/llvm/test/CodeGen/X86/ctselect.ll
index 095787a5e2a4b..2b6091c880637 100644
--- a/llvm/test/CodeGen/X86/ctselect.ll
+++ b/llvm/test/CodeGen/X86/ctselect.ll
@@ -8,120 +8,75 @@
 define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) {
 ; X64-LABEL: test_ctselect_i8:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    andb $1, %dil
-; X64-NEXT:    leal -1(%rdi), %eax
-; X64-NEXT:    movl %edi, %ecx
-; X64-NEXT:    negb %cl
-; X64-NEXT:    andb %sil, %cl
-; X64-NEXT:    andb %dl, %al
-; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    xorl %edx, %esi
+; X64-NEXT:    andb $1, %al
+; X64-NEXT:    negb %al
+; X64-NEXT:    andb %sil, %al
+; X64-NEXT:    xorb %dl, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: test_ctselect_i8:
 ; X32:       # %bb.0:
 ; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    xorb %cl, %dl
 ; X32-NEXT:    andb $1, %al
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    negb %cl
-; X32-NEXT:    andb {{[0-9]+}}(%esp), %cl
-; X32-NEXT:    decb %al
-; X32-NEXT:    andb {{[0-9]+}}(%esp), %al
-; X32-NEXT:    orb %cl, %al
+; X32-NEXT:    negb %al
+; X32-NEXT:    andb %dl, %al
+; X32-NEXT:    xorb %cl, %al
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_i8:
 ; X32-NOCMOV:       # %bb.0:
 ; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    xorb %cl, %dl
 ; X32-NOCMOV-NEXT:    andb $1, %al
-; X32-NOCMOV-NEXT:    movl %eax, %ecx
-; X32-NOCMOV-NEXT:    negb %cl
-; X32-NOCMOV-NEXT:    andb {{[0-9]+}}(%esp), %cl
-; X32-NOCMOV-NEXT:    decb %al
-; X32-NOCMOV-NEXT:    andb {{[0-9]+}}(%esp), %al
-; X32-NOCMOV-NEXT:    orb %cl, %al
+; X32-NOCMOV-NEXT:    negb %al
+; X32-NOCMOV-NEXT:    andb %dl, %al
+; X32-NOCMOV-NEXT:    xorb %cl, %al
 ; X32-NOCMOV-NEXT:    retl
   %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b)
   ret i8 %result
 }
 
-define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) {
-; X64-LABEL: test_ctselect_i16:
+define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) {
+; X64-LABEL: test_ctselect_i32:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    andl $1, %edi
-; X64-NEXT:    leal -1(%rdi), %ecx
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    xorl %edx, %esi
+; X64-NEXT:    andl $1, %eax
 ; X64-NEXT:    negl %eax
 ; X64-NEXT:    andl %esi, %eax
-; X64-NEXT:    andl %edx, %ecx
-; X64-NEXT:    orl %ecx, %eax
-; X64-NEXT:    # kill: def $ax killed $ax killed $eax
-; X64-NEXT:    retq
-;
-; X32-LABEL: test_ctselect_i16:
-; X32:       # %bb.0:
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    andl $1, %eax
-; X32-NEXT:    leal -1(%eax), %ecx
-; X32-NEXT:    andw {{[0-9]+}}(%esp), %cx
-; X32-NEXT:    negl %eax
-; X32-NEXT:    andw {{[0-9]+}}(%esp), %ax
-; X32-NEXT:    orl %ecx, %eax
-; X32-NEXT:    # kill: def $ax killed $ax killed $eax
-; X32-NEXT:    retl
-;
-; X32-NOCMOV-LABEL: test_ctselect_i16:
-; X32-NOCMOV:       # %bb.0:
-; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    andl $1, %eax
-; X32-NOCMOV-NEXT:    leal -1(%eax), %ecx
-; X32-NOCMOV-NEXT:    andw {{[0-9]+}}(%esp), %cx
-; X32-NOCMOV-NEXT:    negl %eax
-; X32-NOCMOV-NEXT:    andw {{[0-9]+}}(%esp), %ax
-; X32-NOCMOV-NEXT:    orl %ecx, %eax
-; X32-NOCMOV-NEXT:    # kill: def $ax killed $ax killed $eax
-; X32-NOCMOV-NEXT:    retl
-  %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b)
-  ret i16 %result
-}
-
-define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) {
-; X64-LABEL: test_ctselect_i32:
-; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    andl $1, %edi
-; X64-NEXT:    leal -1(%rdi), %eax
-; X64-NEXT:    movl %edi, %ecx
-; X64-NEXT:    negl %ecx
-; X64-NEXT:    andl %esi, %ecx
-; X64-NEXT:    andl %edx, %eax
-; X64-NEXT:    orl %ecx, %eax
+; X64-NEXT:    xorl %edx, %eax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: test_ctselect_i32:
 ; X32:       # %bb.0:
 ; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    xorl %ecx, %edx
 ; X32-NEXT:    andl $1, %eax
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    negl %ecx
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    decl %eax
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    orl %ecx, %eax
+; X32-NEXT:    negl %eax
+; X32-NEXT:    andl %edx, %eax
+; X32-NEXT:    xorl %ecx, %eax
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_i32:
 ; X32-NOCMOV:       # %bb.0:
 ; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    xorl %ecx, %edx
 ; X32-NOCMOV-NEXT:    andl $1, %eax
-; X32-NOCMOV-NEXT:    movl %eax, %ecx
-; X32-NOCMOV-NEXT:    negl %ecx
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    decl %eax
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    orl %ecx, %eax
+; X32-NOCMOV-NEXT:    negl %eax
+; X32-NOCMOV-NEXT:    andl %edx, %eax
+; X32-NOCMOV-NEXT:    xorl %ecx, %eax
 ; X32-NOCMOV-NEXT:    retl
   %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
   ret i32 %result
@@ -130,13 +85,12 @@ define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) {
 define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) {
 ; X64-LABEL: test_ctselect_i64:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    andl $1, %edi
-; X64-NEXT:    leaq -1(%rdi), %rax
-; X64-NEXT:    negq %rdi
-; X64-NEXT:    andq %rsi, %rdi
-; X64-NEXT:    andq %rdx, %rax
-; X64-NEXT:    orq %rdi, %rax
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    xorq %rdx, %rsi
+; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    negq %rax
+; X64-NEXT:    andq %rsi, %rax
+; X64-NEXT:    xorq %rdx, %rax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: test_ctselect_i64:
@@ -190,14 +144,12 @@ define float @test_ctselect_f32(i1 %cond, float %a, float %b) {
 ; X64-LABEL: test_ctselect_f32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movd %xmm1, %eax
+; X64-NEXT:    pxor %xmm1, %xmm0
 ; X64-NEXT:    movd %xmm0, %ecx
 ; X64-NEXT:    andl $1, %edi
-; X64-NEXT:    movl %edi, %edx
-; X64-NEXT:    negl %edx
-; X64-NEXT:    andl %ecx, %edx
-; X64-NEXT:    decl %edi
-; X64-NEXT:    andl %eax, %edi
-; X64-NEXT:    orl %edx, %edi
+; X64-NEXT:    negl %edi
+; X64-NEXT:    andl %ecx, %edi
+; X64-NEXT:    xorl %eax, %edi
 ; X64-NEXT:    movd %edi, %xmm0
 ; X64-NEXT:    retq
 ;
@@ -206,13 +158,13 @@ define float @test_ctselect_f32(i1 %cond, float %a, float %b) {
 ; X32-NEXT:    pushl %eax
 ; X32-NEXT:    .cfi_def_cfa_offset 8
 ; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    xorl %ecx, %edx
 ; X32-NEXT:    andl $1, %eax
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    negl %ecx
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    decl %eax
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    orl %ecx, %eax
+; X32-NEXT:    negl %eax
+; X32-NEXT:    andl %edx, %eax
+; X32-NEXT:    xorl %ecx, %eax
 ; X32-NEXT:    movl %eax, (%esp)
 ; X32-NEXT:    flds (%esp)
 ; X32-NEXT:    popl %eax
@@ -224,13 +176,13 @@ define float @test_ctselect_f32(i1 %cond, float %a, float %b) {
 ; X32-NOCMOV-NEXT:    pushl %eax
 ; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
 ; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    xorl %ecx, %edx
 ; X32-NOCMOV-NEXT:    andl $1, %eax
-; X32-NOCMOV-NEXT:    movl %eax, %ecx
-; X32-NOCMOV-NEXT:    negl %ecx
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    decl %eax
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    orl %ecx, %eax
+; X32-NOCMOV-NEXT:    negl %eax
+; X32-NOCMOV-NEXT:    andl %edx, %eax
+; X32-NOCMOV-NEXT:    xorl %ecx, %eax
 ; X32-NOCMOV-NEXT:    movl %eax, (%esp)
 ; X32-NOCMOV-NEXT:    flds (%esp)
 ; X32-NOCMOV-NEXT:    popl %eax
@@ -245,14 +197,12 @@ define double @test_ctselect_f64(i1 %cond, double %a, double %b) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    movq %xmm1, %rax
+; X64-NEXT:    pxor %xmm1, %xmm0
 ; X64-NEXT:    movq %xmm0, %rcx
 ; X64-NEXT:    andl $1, %edi
-; X64-NEXT:    movq %rdi, %rdx
-; X64-NEXT:    negq %rdx
-; X64-NEXT:    andq %rcx, %rdx
-; X64-NEXT:    decq %rdi
-; X64-NEXT:    andq %rax, %rdi
-; X64-NEXT:    orq %rdx, %rdi
+; X64-NEXT:    negq %rdi
+; X64-NEXT:    andq %rcx, %rdi
+; X64-NEXT:    xorq %rax, %rdi
 ; X64-NEXT:    movq %rdi, %xmm0
 ; X64-NEXT:    retq
 ;
@@ -320,37 +270,36 @@ define double @test_ctselect_f64(i1 %cond, double %a, double %b) {
 define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) {
 ; X64-LABEL: test_ctselect_ptr:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    andl $1, %edi
-; X64-NEXT:    leaq -1(%rdi), %rax
-; X64-NEXT:    negq %rdi
-; X64-NEXT:    andq %rsi, %rdi
-; X64-NEXT:    andq %rdx, %rax
-; X64-NEXT:    orq %rdi, %rax
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    xorq %rdx, %rsi
+; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    negq %rax
+; X64-NEXT:    andq %rsi, %rax
+; X64-NEXT:    xorq %rdx, %rax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: test_ctselect_ptr:
 ; X32:       # %bb.0:
 ; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    xorl %ecx, %edx
 ; X32-NEXT:    andl $1, %eax
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    negl %ecx
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    decl %eax
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    orl %ecx, %eax
+; X32-NEXT:    negl %eax
+; X32-NEXT:    andl %edx, %eax
+; X32-NEXT:    xorl %ecx, %eax
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_ptr:
 ; X32-NOCMOV:       # %bb.0:
 ; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    xorl %ecx, %edx
 ; X32-NOCMOV-NEXT:    andl $1, %eax
-; X32-NOCMOV-NEXT:    movl %eax, %ecx
-; X32-NOCMOV-NEXT:    negl %ecx
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    decl %eax
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    orl %ecx, %eax
+; X32-NOCMOV-NEXT:    negl %eax
+; X32-NOCMOV-NEXT:    andl %edx, %eax
+; X32-NOCMOV-NEXT:    xorl %ecx, %eax
 ; X32-NOCMOV-NEXT:    retl
   %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b)
   ret ptr %result
@@ -361,16 +310,24 @@ define i32 @test_ctselect_const_true(i32 %a, i32 %b) {
 ; X64-LABEL: test_ctselect_const_true:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    xorl %esi, %eax
+; X64-NEXT:    xorl %esi, %eax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: test_ctselect_const_true:
 ; X32:       # %bb.0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    xorl %ecx, %eax
+; X32-NEXT:    xorl %ecx, %eax
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_const_true:
 ; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    xorl %ecx, %eax
+; X32-NOCMOV-NEXT:    xorl %ecx, %eax
 ; X32-NOCMOV-NEXT:    retl
   %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b)
   ret i32 %result
@@ -385,13 +342,13 @@ define i32 @test_ctselect_const_false(i32 %a, i32 %b) {
 ; X32-LABEL: test_ctselect_const_false:
 ; X32:       # %bb.0:
 ; X32-NEXT:    xorl %eax, %eax
-; X32-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_const_false:
 ; X32-NOCMOV:       # %bb.0:
 ; X32-NOCMOV-NEXT:    xorl %eax, %eax
-; X32-NOCMOV-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; X32-NOCMOV-NEXT:    retl
   %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b)
   ret i32 %result
@@ -404,174 +361,79 @@ define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) {
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpl %esi, %edi
 ; X64-NEXT:    sete %al
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    negl %esi
-; X64-NEXT:    andl %edx, %esi
-; X64-NEXT:    decl %eax
-; X64-NEXT:    andl %ecx, %eax
-; X64-NEXT:    orl %esi, %eax
+; X64-NEXT:    xorl %ecx, %edx
+; X64-NEXT:    negl %eax
+; X64-NEXT:    andl %edx, %eax
+; X64-NEXT:    xorl %ecx, %eax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: test_ctselect_icmp_eq:
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-NEXT:    xorl %eax, %eax
-; X32-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
 ; X32-NEXT:    sete %al
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    negl %ecx
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    decl %eax
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    orl %ecx, %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    xorl %ecx, %edx
+; X32-NEXT:    negl %eax
+; X32-NEXT:    andl %edx, %eax
+; X32-NEXT:    xorl %ecx, %eax
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_icmp_eq:
 ; X32-NOCMOV:       # %bb.0:
 ; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-NOCMOV-NEXT:    xorl %eax, %eax
-; X32-NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
 ; X32-NOCMOV-NEXT:    sete %al
-; X32-NOCMOV-NEXT:    movl %eax, %ecx
-; X32-NOCMOV-NEXT:    negl %ecx
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    decl %eax
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    orl %ecx, %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    xorl %ecx, %edx
+; X32-NOCMOV-NEXT:    negl %eax
+; X32-NOCMOV-NEXT:    andl %edx, %eax
+; X32-NOCMOV-NEXT:    xorl %ecx, %eax
 ; X32-NOCMOV-NEXT:    retl
   %cond = icmp eq i32 %x, %y
   %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
   ret i32 %result
 }
 
-define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) {
-; X64-LABEL: test_ctselect_icmp_ne:
-; X64:       # %bb.0:
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %esi, %edi
-; X64-NEXT:    setne %al
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    negl %esi
-; X64-NEXT:    andl %edx, %esi
-; X64-NEXT:    decl %eax
-; X64-NEXT:    andl %ecx, %eax
-; X64-NEXT:    orl %esi, %eax
-; X64-NEXT:    retq
-;
-; X32-LABEL: test_ctselect_icmp_ne:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    xorl %eax, %eax
-; X32-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    setne %al
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    negl %ecx
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    decl %eax
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    orl %ecx, %eax
-; X32-NEXT:    retl
-;
-; X32-NOCMOV-LABEL: test_ctselect_icmp_ne:
-; X32-NOCMOV:       # %bb.0:
-; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    xorl %eax, %eax
-; X32-NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    setne %al
-; X32-NOCMOV-NEXT:    movl %eax, %ecx
-; X32-NOCMOV-NEXT:    negl %ecx
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    decl %eax
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    orl %ecx, %eax
-; X32-NOCMOV-NEXT:    retl
-  %cond = icmp ne i32 %x, %y
-  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
-  ret i32 %result
-}
-
-define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) {
-; X64-LABEL: test_ctselect_icmp_slt:
-; X64:       # %bb.0:
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %esi, %edi
-; X64-NEXT:    setl %al
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    negl %esi
-; X64-NEXT:    andl %edx, %esi
-; X64-NEXT:    decl %eax
-; X64-NEXT:    andl %ecx, %eax
-; X64-NEXT:    orl %esi, %eax
-; X64-NEXT:    retq
-;
-; X32-LABEL: test_ctselect_icmp_slt:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    xorl %eax, %eax
-; X32-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    setl %al
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    negl %ecx
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    decl %eax
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    orl %ecx, %eax
-; X32-NEXT:    retl
-;
-; X32-NOCMOV-LABEL: test_ctselect_icmp_slt:
-; X32-NOCMOV:       # %bb.0:
-; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    xorl %eax, %eax
-; X32-NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    setl %al
-; X32-NOCMOV-NEXT:    movl %eax, %ecx
-; X32-NOCMOV-NEXT:    negl %ecx
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    decl %eax
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    orl %ecx, %eax
-; X32-NOCMOV-NEXT:    retl
-  %cond = icmp slt i32 %x, %y
-  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
-  ret i32 %result
-}
-
 define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) {
 ; X64-LABEL: test_ctselect_icmp_ult:
 ; X64:       # %bb.0:
+; X64-NEXT:    xorl %ecx, %edx
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    cmpl %esi, %edi
 ; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    andl %eax, %edx
-; X64-NEXT:    notl %eax
-; X64-NEXT:    andl %ecx, %eax
-; X64-NEXT:    orl %edx, %eax
+; X64-NEXT:    andl %edx, %eax
+; X64-NEXT:    xorl %ecx, %eax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: test_ctselect_icmp_ult:
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    xorl %eax, %eax
-; X32-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    sbbl %eax, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    andl %eax, %ecx
-; X32-NEXT:    notl %eax
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    orl %ecx, %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    xorl %edx, %edx
+; X32-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    sbbl %edx, %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    xorl %ecx, %eax
+; X32-NEXT:    andl %edx, %eax
+; X32-NEXT:    xorl %ecx, %eax
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_icmp_ult:
 ; X32-NOCMOV:       # %bb.0:
 ; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    xorl %eax, %eax
-; X32-NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    sbbl %eax, %eax
-; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    andl %eax, %ecx
-; X32-NOCMOV-NEXT:    notl %eax
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    orl %ecx, %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    xorl %edx, %edx
+; X32-NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    sbbl %edx, %edx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    xorl %ecx, %eax
+; X32-NOCMOV-NEXT:    andl %edx, %eax
+; X32-NOCMOV-NEXT:    xorl %ecx, %eax
 ; X32-NOCMOV-NEXT:    retl
   %cond = icmp ult i32 %x, %y
   %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
@@ -583,12 +445,10 @@ define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movd %xmm3, %eax
 ; X64-NEXT:    cmpeqss %xmm1, %xmm0
-; X64-NEXT:    movd %xmm0, %ecx
-; X64-NEXT:    pand %xmm2, %xmm0
-; X64-NEXT:    movd %xmm0, %edx
-; X64-NEXT:    notl %ecx
-; X64-NEXT:    andl %eax, %ecx
-; X64-NEXT:    orl %edx, %ecx
+; X64-NEXT:    pxor %xmm3, %xmm2
+; X64-NEXT:    pand %xmm0, %xmm2
+; X64-NEXT:    movd %xmm2, %ecx
+; X64-NEXT:    xorl %eax, %ecx
 ; X64-NEXT:    movd %ecx, %xmm0
 ; X64-NEXT:    retq
 ;
@@ -596,21 +456,21 @@ define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) {
 ; X32:       # %bb.0:
 ; X32-NEXT:    pushl %eax
 ; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    flds {{[0-9]+}}(%esp)
 ; X32-NEXT:    flds {{[0-9]+}}(%esp)
 ; X32-NEXT:    fucompi %st(1), %st
 ; X32-NEXT:    fstp %st(0)
-; X32-NEXT:    setnp %al
-; X32-NEXT:    sete %cl
-; X32-NEXT:    andb %al, %cl
-; X32-NEXT:    movzbl %cl, %eax
-; X32-NEXT:    movl %eax, %ecx
+; X32-NEXT:    setnp %cl
+; X32-NEXT:    sete %dl
+; X32-NEXT:    andb %cl, %dl
+; X32-NEXT:    movzbl %dl, %ecx
 ; X32-NEXT:    negl %ecx
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    decl %eax
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    orl %ecx, %eax
-; X32-NEXT:    movl %eax, (%esp)
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    xorl %eax, %edx
+; X32-NEXT:    andl %ecx, %edx
+; X32-NEXT:    xorl %eax, %edx
+; X32-NEXT:    movl %edx, (%esp)
 ; X32-NEXT:    flds (%esp)
 ; X32-NEXT:    popl %eax
 ; X32-NEXT:    .cfi_def_cfa_offset 4
@@ -620,6 +480,7 @@ define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) {
 ; X32-NOCMOV:       # %bb.0:
 ; X32-NOCMOV-NEXT:    pushl %eax
 ; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X32-NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X32-NOCMOV-NEXT:    fucompp
@@ -627,16 +488,15 @@ define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) {
 ; X32-NOCMOV-NEXT:    # kill: def $ah killed $ah killed $ax
 ; X32-NOCMOV-NEXT:    sahf
 ; X32-NOCMOV-NEXT:    setnp %al
-; X32-NOCMOV-NEXT:    sete %cl
-; X32-NOCMOV-NEXT:    andb %al, %cl
-; X32-NOCMOV-NEXT:    movzbl %cl, %eax
-; X32-NOCMOV-NEXT:    movl %eax, %ecx
-; X32-NOCMOV-NEXT:    negl %ecx
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    decl %eax
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    orl %ecx, %eax
-; X32-NOCMOV-NEXT:    movl %eax, (%esp)
+; X32-NOCMOV-NEXT:    sete %dl
+; X32-NOCMOV-NEXT:    andb %al, %dl
+; X32-NOCMOV-NEXT:    movzbl %dl, %eax
+; X32-NOCMOV-NEXT:    negl %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    xorl %ecx, %edx
+; X32-NOCMOV-NEXT:    andl %eax, %edx
+; X32-NOCMOV-NEXT:    xorl %ecx, %edx
+; X32-NOCMOV-NEXT:    movl %edx, (%esp)
 ; X32-NOCMOV-NEXT:    flds (%esp)
 ; X32-NOCMOV-NEXT:    popl %eax
 ; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
@@ -650,52 +510,41 @@ define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) {
 define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) {
 ; X64-LABEL: test_ctselect_load:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    movl (%rdx), %ecx
+; X64-NEXT:    movl (%rsi), %eax
+; X64-NEXT:    xorl %ecx, %eax
 ; X64-NEXT:    andl $1, %edi
-; X64-NEXT:    leal -1(%rdi), %eax
-; X64-NEXT:    movl %edi, %ecx
-; X64-NEXT:    negl %ecx
-; X64-NEXT:    andl (%rsi), %ecx
-; X64-NEXT:    andl (%rdx), %eax
-; X64-NEXT:    orl %ecx, %eax
+; X64-NEXT:    negl %edi
+; X64-NEXT:    andl %edi, %eax
+; X64-NEXT:    xorl %ecx, %eax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: test_ctselect_load:
 ; X32:       # %bb.0:
-; X32-NEXT:    pushl %esi
-; X32-NEXT:    .cfi_def_cfa_offset 8
-; X32-NEXT:    .cfi_offset %esi, -8
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl (%edx), %edx
+; X32-NEXT:    movl (%ecx), %ecx
+; X32-NEXT:    xorl %edx, %ecx
 ; X32-NEXT:    andl $1, %eax
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    negl %esi
-; X32-NEXT:    andl (%edx), %esi
-; X32-NEXT:    decl %eax
-; X32-NEXT:    andl (%ecx), %eax
-; X32-NEXT:    orl %esi, %eax
-; X32-NEXT:    popl %esi
-; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    negl %eax
+; X32-NEXT:    andl %ecx, %eax
+; X32-NEXT:    xorl %edx, %eax
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_load:
 ; X32-NOCMOV:       # %bb.0:
-; X32-NOCMOV-NEXT:    pushl %esi
-; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
-; X32-NOCMOV-NEXT:    .cfi_offset %esi, -8
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    movl (%edx), %edx
+; X32-NOCMOV-NEXT:    movl (%ecx), %ecx
+; X32-NOCMOV-NEXT:    xorl %edx, %ecx
 ; X32-NOCMOV-NEXT:    andl $1, %eax
-; X32-NOCMOV-NEXT:    movl %eax, %esi
-; X32-NOCMOV-NEXT:    negl %esi
-; X32-NOCMOV-NEXT:    andl (%edx), %esi
-; X32-NOCMOV-NEXT:    decl %eax
-; X32-NOCMOV-NEXT:    andl (%ecx), %eax
-; X32-NOCMOV-NEXT:    orl %esi, %eax
-; X32-NOCMOV-NEXT:    popl %esi
-; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
+; X32-NOCMOV-NEXT:    negl %eax
+; X32-NOCMOV-NEXT:    andl %ecx, %eax
+; X32-NOCMOV-NEXT:    xorl %edx, %eax
 ; X32-NOCMOV-NEXT:    retl
   %a = load i32, ptr %p1
   %b = load i32, ptr %p2
@@ -707,69 +556,753 @@ define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) {
 define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) {
 ; X64-LABEL: test_ctselect_nested:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $esi killed $esi def $rsi
-; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    xorl %ecx, %edx
 ; X64-NEXT:    andl $1, %esi
-; X64-NEXT:    leal -1(%rsi), %r9d
-; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    negl %esi
+; X64-NEXT:    andl %edx, %esi
+; X64-NEXT:    xorl %r8d, %ecx
+; X64-NEXT:    xorl %esi, %ecx
+; X64-NEXT:    andl $1, %eax
 ; X64-NEXT:    negl %eax
-; X64-NEXT:    andl %edx, %eax
-; X64-NEXT:    andl %ecx, %r9d
-; X64-NEXT:    orl %eax, %r9d
-; X64-NEXT:    andl $1, %edi
-; X64-NEXT:    leal -1(%rdi), %eax
-; X64-NEXT:    movl %edi, %ecx
-; X64-NEXT:    negl %ecx
-; X64-NEXT:    andl %r9d, %ecx
-; X64-NEXT:    andl %r8d, %eax
-; X64-NEXT:    orl %ecx, %eax
+; X64-NEXT:    andl %ecx, %eax
+; X64-NEXT:    xorl %r8d, %eax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: test_ctselect_nested:
 ; X32:       # %bb.0:
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    .cfi_offset %esi, -12
+; X32-NEXT:    .cfi_offset %edi, -8
 ; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    andl $1, %ecx
-; X32-NEXT:    movl %ecx, %edx
-; X32-NEXT:    negl %edx
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    decl %ecx
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    orl %edx, %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    xorl %edx, %edi
+; X32-NEXT:    andl $1, %esi
+; X32-NEXT:    negl %esi
+; X32-NEXT:    andl %edi, %esi
+; X32-NEXT:    xorl %ecx, %edx
+; X32-NEXT:    xorl %esi, %edx
 ; X32-NEXT:    andl $1, %eax
-; X32-NEXT:    movl %eax, %edx
-; X32-NEXT:    negl %edx
-; X32-NEXT:    andl %ecx, %edx
-; X32-NEXT:    decl %eax
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    orl %edx, %eax
+; X32-NEXT:    negl %eax
+; X32-NEXT:    andl %edx, %eax
+; X32-NEXT:    xorl %ecx, %eax
+; X32-NEXT:    popl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    popl %edi
+; X32-NEXT:    .cfi_def_cfa_offset 4
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_nested:
 ; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    pushl %edi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    pushl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT:    .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT:    .cfi_offset %edi, -8
 ; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    andl $1, %ecx
-; X32-NOCMOV-NEXT:    movl %ecx, %edx
-; X32-NOCMOV-NEXT:    negl %edx
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT:    decl %ecx
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    orl %edx, %ecx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %esi
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NOCMOV-NEXT:    xorl %edx, %edi
+; X32-NOCMOV-NEXT:    andl $1, %esi
+; X32-NOCMOV-NEXT:    negl %esi
+; X32-NOCMOV-NEXT:    andl %edi, %esi
+; X32-NOCMOV-NEXT:    xorl %ecx, %edx
+; X32-NOCMOV-NEXT:    xorl %esi, %edx
 ; X32-NOCMOV-NEXT:    andl $1, %eax
-; X32-NOCMOV-NEXT:    movl %eax, %edx
-; X32-NOCMOV-NEXT:    negl %edx
-; X32-NOCMOV-NEXT:    andl %ecx, %edx
-; X32-NOCMOV-NEXT:    decl %eax
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    orl %edx, %eax
+; X32-NOCMOV-NEXT:    negl %eax
+; X32-NOCMOV-NEXT:    andl %edx, %eax
+; X32-NOCMOV-NEXT:    xorl %ecx, %eax
+; X32-NOCMOV-NEXT:    popl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    popl %edi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
 ; X32-NOCMOV-NEXT:    retl
   %inner = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b)
   %result = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %inner, i32 %c)
   ret i32 %result
 }
 
+; Test nested CTSELECT pattern with AND merging on i1 values
+; Pattern: ctselect C0, (ctselect C1, X, Y), Y -> ctselect (C0 & C1), X, Y
+; This optimization only applies when selecting between i1 values (boolean logic)
+define i32 @test_ctselect_nested_and_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
+; X64-LABEL: test_ctselect_nested_and_i1_to_i32:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andl %esi, %eax
+; X64-NEXT:    xorl %ecx, %edx
+; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    negl %eax
+; X64-NEXT:    andl %edx, %eax
+; X64-NEXT:    xorl %ecx, %eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_nested_and_i1_to_i32:
+; X32:       # %bb.0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    andb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    movzbl %al, %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    xorl %ecx, %edx
+; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    negl %eax
+; X32-NEXT:    andl %edx, %eax
+; X32-NEXT:    xorl %ecx, %eax
+; X32-NEXT:    retl
+;
+; X32-NOCMOV-LABEL: test_ctselect_nested_and_i1_to_i32:
+; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    andb {{[0-9]+}}(%esp), %al
+; X32-NOCMOV-NEXT:    movzbl %al, %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    xorl %ecx, %edx
+; X32-NOCMOV-NEXT:    andl $1, %eax
+; X32-NOCMOV-NEXT:    negl %eax
+; X32-NOCMOV-NEXT:    andl %edx, %eax
+; X32-NOCMOV-NEXT:    xorl %ecx, %eax
+; X32-NOCMOV-NEXT:    retl
+  %inner = call i1 @llvm.ct.select.i1(i1 %c1, i1 true, i1 false)
+  %cond = call i1 @llvm.ct.select.i1(i1 %c0, i1 %inner, i1 false)
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %y)
+  ret i32 %result
+}
+
+; Test nested CTSELECT pattern with OR merging on i1 values
+; Pattern: ctselect C0, X, (ctselect C1, X, Y) -> ctselect (C0 | C1), X, Y
+; This optimization only applies when selecting between i1 values (boolean logic)
+define i32 @test_ctselect_nested_or_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
+; X64-LABEL: test_ctselect_nested_or_i1_to_i32:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    orl %esi, %eax
+; X64-NEXT:    xorl %ecx, %edx
+; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    negl %eax
+; X64-NEXT:    andl %edx, %eax
+; X64-NEXT:    xorl %ecx, %eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_nested_or_i1_to_i32:
+; X32:       # %bb.0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    orb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    movzbl %al, %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    xorl %ecx, %edx
+; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    negl %eax
+; X32-NEXT:    andl %edx, %eax
+; X32-NEXT:    xorl %ecx, %eax
+; X32-NEXT:    retl
+;
+; X32-NOCMOV-LABEL: test_ctselect_nested_or_i1_to_i32:
+; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    orb {{[0-9]+}}(%esp), %al
+; X32-NOCMOV-NEXT:    movzbl %al, %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    xorl %ecx, %edx
+; X32-NOCMOV-NEXT:    andl $1, %eax
+; X32-NOCMOV-NEXT:    negl %eax
+; X32-NOCMOV-NEXT:    andl %edx, %eax
+; X32-NOCMOV-NEXT:    xorl %ecx, %eax
+; X32-NOCMOV-NEXT:    retl
+  %inner = call i1 @llvm.ct.select.i1(i1 %c1, i1 true, i1 false)
+  %cond = call i1 @llvm.ct.select.i1(i1 %c0, i1 true, i1 %inner)
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %y)
+  ret i32 %result
+}
+
+; Test double nested CTSELECT with recursive AND merging
+; Pattern: ctselect C0, (ctselect C1, (ctselect C2, X, Y), Y), Y
+;   -> ctselect C0, (ctselect (C1 & C2), X, Y), Y
+;   -> ctselect (C0 & (C1 & C2)), X, Y
+; This tests that the optimization can be applied recursively
+define i32 @test_ctselect_double_nested_and_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x, i32 %y) {
+; X64-LABEL: test_ctselect_double_nested_and_i1:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    andl %edx, %eax
+; X64-NEXT:    andl %edi, %eax
+; X64-NEXT:    xorl %r8d, %ecx
+; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    negl %eax
+; X64-NEXT:    andl %ecx, %eax
+; X64-NEXT:    xorl %r8d, %eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_double_nested_and_i1:
+; X32:       # %bb.0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    andb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    andb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    movzbl %al, %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    xorl %ecx, %edx
+; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    negl %eax
+; X32-NEXT:    andl %edx, %eax
+; X32-NEXT:    xorl %ecx, %eax
+; X32-NEXT:    retl
+;
+; X32-NOCMOV-LABEL: test_ctselect_double_nested_and_i1:
+; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    andb {{[0-9]+}}(%esp), %al
+; X32-NOCMOV-NEXT:    andb {{[0-9]+}}(%esp), %al
+; X32-NOCMOV-NEXT:    movzbl %al, %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    xorl %ecx, %edx
+; X32-NOCMOV-NEXT:    andl $1, %eax
+; X32-NOCMOV-NEXT:    negl %eax
+; X32-NOCMOV-NEXT:    andl %edx, %eax
+; X32-NOCMOV-NEXT:    xorl %ecx, %eax
+; X32-NOCMOV-NEXT:    retl
+  %inner2 = call i1 @llvm.ct.select.i1(i1 %c2, i1 true, i1 false)
+  %inner1 = call i1 @llvm.ct.select.i1(i1 %c1, i1 %inner2, i1 false)
+  %cond = call i1 @llvm.ct.select.i1(i1 %c0, i1 %inner1, i1 false)
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %y)
+  ret i32 %result
+}
+
+; Vector CTSELECT Tests
+; ============================================================================
+
+; Test vector CTSELECT with v4i32 (128-bit vector with single i1 mask)
+; NOW CONSTANT-TIME: Uses bitwise XOR/AND operations instead of branches!
+define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) {
+; X64-LABEL: test_ctselect_v4i32:
+; X64:       # %bb.0:
+; X64-NEXT:    pxor %xmm1, %xmm0
+; X64-NEXT:    movd %edi, %xmm2
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; X64-NEXT:    pslld $31, %xmm2
+; X64-NEXT:    psrad $31, %xmm2
+; X64-NEXT:    pand %xmm2, %xmm0
+; X64-NEXT:    pxor %xmm1, %xmm0
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_v4i32:
+; X32:       # %bb.0:
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    .cfi_def_cfa_offset 16
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 20
+; X32-NEXT:    .cfi_offset %esi, -20
+; X32-NEXT:    .cfi_offset %edi, -16
+; X32-NEXT:    .cfi_offset %ebx, -12
+; X32-NEXT:    .cfi_offset %ebp, -8
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    xorl %ebx, %edx
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    andl $1, %edi
+; X32-NEXT:    negl %edi
+; X32-NEXT:    andl %edi, %edx
+; X32-NEXT:    xorl %ebx, %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT:    xorl %ebp, %ebx
+; X32-NEXT:    andl %edi, %ebx
+; X32-NEXT:    xorl %ebp, %ebx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT:    xorl %esi, %ebp
+; X32-NEXT:    andl %edi, %ebp
+; X32-NEXT:    xorl %esi, %ebp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    xorl %ecx, %esi
+; X32-NEXT:    andl %edi, %esi
+; X32-NEXT:    xorl %ecx, %esi
+; X32-NEXT:    movl %esi, 12(%eax)
+; X32-NEXT:    movl %ebp, 8(%eax)
+; X32-NEXT:    movl %ebx, 4(%eax)
+; X32-NEXT:    movl %edx, (%eax)
+; X32-NEXT:    popl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 16
+; X32-NEXT:    popl %edi
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    popl %ebx
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    retl $4
+;
+; X32-NOCMOV-LABEL: test_ctselect_v4i32:
+; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    pushl %ebp
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    pushl %ebx
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT:    pushl %edi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT:    pushl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 20
+; X32-NOCMOV-NEXT:    .cfi_offset %esi, -20
+; X32-NOCMOV-NEXT:    .cfi_offset %edi, -16
+; X32-NOCMOV-NEXT:    .cfi_offset %ebx, -12
+; X32-NOCMOV-NEXT:    .cfi_offset %ebp, -8
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    xorl %ebx, %edx
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %edi
+; X32-NOCMOV-NEXT:    andl $1, %edi
+; X32-NOCMOV-NEXT:    negl %edi
+; X32-NOCMOV-NEXT:    andl %edi, %edx
+; X32-NOCMOV-NEXT:    xorl %ebx, %edx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NOCMOV-NEXT:    xorl %ebp, %ebx
+; X32-NOCMOV-NEXT:    andl %edi, %ebx
+; X32-NOCMOV-NEXT:    xorl %ebp, %ebx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NOCMOV-NEXT:    xorl %esi, %ebp
+; X32-NOCMOV-NEXT:    andl %edi, %ebp
+; X32-NOCMOV-NEXT:    xorl %esi, %ebp
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NOCMOV-NEXT:    xorl %ecx, %esi
+; X32-NOCMOV-NEXT:    andl %edi, %esi
+; X32-NOCMOV-NEXT:    xorl %ecx, %esi
+; X32-NOCMOV-NEXT:    movl %esi, 12(%eax)
+; X32-NOCMOV-NEXT:    movl %ebp, 8(%eax)
+; X32-NOCMOV-NEXT:    movl %ebx, 4(%eax)
+; X32-NOCMOV-NEXT:    movl %edx, (%eax)
+; X32-NOCMOV-NEXT:    popl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT:    popl %edi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT:    popl %ebx
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    popl %ebp
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
+; X32-NOCMOV-NEXT:    retl $4
+  %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b)
+  ret <4 x i32> %result
+}
+define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) {
+; X64-LABEL: test_ctselect_v4f32:
+; X64:       # %bb.0:
+; X64-NEXT:    pxor %xmm1, %xmm0
+; X64-NEXT:    movd %edi, %xmm2
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; X64-NEXT:    pslld $31, %xmm2
+; X64-NEXT:    psrad $31, %xmm2
+; X64-NEXT:    pand %xmm2, %xmm0
+; X64-NEXT:    pxor %xmm1, %xmm0
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_v4f32:
+; X32:       # %bb.0:
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    .cfi_def_cfa_offset 16
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 20
+; X32-NEXT:    .cfi_offset %esi, -20
+; X32-NEXT:    .cfi_offset %edi, -16
+; X32-NEXT:    .cfi_offset %ebx, -12
+; X32-NEXT:    .cfi_offset %ebp, -8
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    xorl %ebx, %edx
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    andl $1, %edi
+; X32-NEXT:    negl %edi
+; X32-NEXT:    andl %edi, %edx
+; X32-NEXT:    xorl %ebx, %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT:    xorl %ebp, %ebx
+; X32-NEXT:    andl %edi, %ebx
+; X32-NEXT:    xorl %ebp, %ebx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT:    xorl %esi, %ebp
+; X32-NEXT:    andl %edi, %ebp
+; X32-NEXT:    xorl %esi, %ebp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    xorl %ecx, %esi
+; X32-NEXT:    andl %edi, %esi
+; X32-NEXT:    xorl %ecx, %esi
+; X32-NEXT:    movl %esi, 12(%eax)
+; X32-NEXT:    movl %ebp, 8(%eax)
+; X32-NEXT:    movl %ebx, 4(%eax)
+; X32-NEXT:    movl %edx, (%eax)
+; X32-NEXT:    popl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 16
+; X32-NEXT:    popl %edi
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    popl %ebx
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    retl $4
+;
+; X32-NOCMOV-LABEL: test_ctselect_v4f32:
+; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    pushl %ebp
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    pushl %ebx
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT:    pushl %edi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT:    pushl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 20
+; X32-NOCMOV-NEXT:    .cfi_offset %esi, -20
+; X32-NOCMOV-NEXT:    .cfi_offset %edi, -16
+; X32-NOCMOV-NEXT:    .cfi_offset %ebx, -12
+; X32-NOCMOV-NEXT:    .cfi_offset %ebp, -8
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    xorl %ebx, %edx
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %edi
+; X32-NOCMOV-NEXT:    andl $1, %edi
+; X32-NOCMOV-NEXT:    negl %edi
+; X32-NOCMOV-NEXT:    andl %edi, %edx
+; X32-NOCMOV-NEXT:    xorl %ebx, %edx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NOCMOV-NEXT:    xorl %ebp, %ebx
+; X32-NOCMOV-NEXT:    andl %edi, %ebx
+; X32-NOCMOV-NEXT:    xorl %ebp, %ebx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NOCMOV-NEXT:    xorl %esi, %ebp
+; X32-NOCMOV-NEXT:    andl %edi, %ebp
+; X32-NOCMOV-NEXT:    xorl %esi, %ebp
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NOCMOV-NEXT:    xorl %ecx, %esi
+; X32-NOCMOV-NEXT:    andl %edi, %esi
+; X32-NOCMOV-NEXT:    xorl %ecx, %esi
+; X32-NOCMOV-NEXT:    movl %esi, 12(%eax)
+; X32-NOCMOV-NEXT:    movl %ebp, 8(%eax)
+; X32-NOCMOV-NEXT:    movl %ebx, 4(%eax)
+; X32-NOCMOV-NEXT:    movl %edx, (%eax)
+; X32-NOCMOV-NEXT:    popl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT:    popl %edi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT:    popl %ebx
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    popl %ebp
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
+; X32-NOCMOV-NEXT:    retl $4
+  %result = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, <4 x float> %b)
+  ret <4 x float> %result
+}
+
+define <8 x i32> @test_ctselect_v8i32_avx(i1 %cond, <8 x i32> %a, <8 x i32> %b) {
+; X64-LABEL: test_ctselect_v8i32_avx:
+; X64:       # %bb.0:
+; X64-NEXT:    movd %edi, %xmm4
+; X64-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; X64-NEXT:    pslld $31, %xmm4
+; X64-NEXT:    psrad $31, %xmm4
+; X64-NEXT:    movdqa %xmm4, %xmm5
+; X64-NEXT:    pandn %xmm2, %xmm5
+; X64-NEXT:    pand %xmm4, %xmm0
+; X64-NEXT:    por %xmm5, %xmm0
+; X64-NEXT:    pand %xmm4, %xmm1
+; X64-NEXT:    pandn %xmm3, %xmm4
+; X64-NEXT:    por %xmm4, %xmm1
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_v8i32_avx:
+; X32:       # %bb.0:
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    .cfi_def_cfa_offset 16
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 20
+; X32-NEXT:    subl $8, %esp
+; X32-NEXT:    .cfi_def_cfa_offset 28
+; X32-NEXT:    .cfi_offset %esi, -20
+; X32-NEXT:    .cfi_offset %edi, -16
+; X32-NEXT:    .cfi_offset %ebx, -12
+; X32-NEXT:    .cfi_offset %ebp, -8
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    xorl %eax, %ecx
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    andl $1, %edx
+; X32-NEXT:    negl %edx
+; X32-NEXT:    andl %edx, %ecx
+; X32-NEXT:    xorl %eax, %ecx
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    xorl %esi, %eax
+; X32-NEXT:    andl %edx, %eax
+; X32-NEXT:    xorl %esi, %eax
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    xorl %ebx, %esi
+; X32-NEXT:    andl %edx, %esi
+; X32-NEXT:    xorl %ebx, %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT:    xorl %ebp, %ebx
+; X32-NEXT:    andl %edx, %ebx
+; X32-NEXT:    xorl %ebp, %ebx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT:    xorl %edi, %ebp
+; X32-NEXT:    andl %edx, %ebp
+; X32-NEXT:    xorl %edi, %ebp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    xorl %eax, %edi
+; X32-NEXT:    andl %edx, %edi
+; X32-NEXT:    xorl %eax, %edi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    xorl %eax, %ecx
+; X32-NEXT:    andl %edx, %ecx
+; X32-NEXT:    xorl %eax, %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    andl %edx, %eax
+; X32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl %eax, 28(%edx)
+; X32-NEXT:    movl %ecx, 24(%edx)
+; X32-NEXT:    movl %edi, 20(%edx)
+; X32-NEXT:    movl %ebp, 16(%edx)
+; X32-NEXT:    movl %ebx, 12(%edx)
+; X32-NEXT:    movl %esi, 8(%edx)
+; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl %eax, 4(%edx)
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    movl %eax, (%edx)
+; X32-NEXT:    movl %edx, %eax
+; X32-NEXT:    addl $8, %esp
+; X32-NEXT:    .cfi_def_cfa_offset 20
+; X32-NEXT:    popl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 16
+; X32-NEXT:    popl %edi
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    popl %ebx
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    retl $4
+;
+; X32-NOCMOV-LABEL: test_ctselect_v8i32_avx:
+; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    pushl %ebp
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    pushl %ebx
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT:    pushl %edi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT:    pushl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 20
+; X32-NOCMOV-NEXT:    subl $8, %esp
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 28
+; X32-NOCMOV-NEXT:    .cfi_offset %esi, -20
+; X32-NOCMOV-NEXT:    .cfi_offset %edi, -16
+; X32-NOCMOV-NEXT:    .cfi_offset %ebx, -12
+; X32-NOCMOV-NEXT:    .cfi_offset %ebp, -8
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    xorl %eax, %ecx
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    andl $1, %edx
+; X32-NOCMOV-NEXT:    negl %edx
+; X32-NOCMOV-NEXT:    andl %edx, %ecx
+; X32-NOCMOV-NEXT:    xorl %eax, %ecx
+; X32-NOCMOV-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    xorl %esi, %eax
+; X32-NOCMOV-NEXT:    andl %edx, %eax
+; X32-NOCMOV-NEXT:    xorl %esi, %eax
+; X32-NOCMOV-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NOCMOV-NEXT:    xorl %ebx, %esi
+; X32-NOCMOV-NEXT:    andl %edx, %esi
+; X32-NOCMOV-NEXT:    xorl %ebx, %esi
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NOCMOV-NEXT:    xorl %ebp, %ebx
+; X32-NOCMOV-NEXT:    andl %edx, %ebx
+; X32-NOCMOV-NEXT:    xorl %ebp, %ebx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NOCMOV-NEXT:    xorl %edi, %ebp
+; X32-NOCMOV-NEXT:    andl %edx, %ebp
+; X32-NOCMOV-NEXT:    xorl %edi, %ebp
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NOCMOV-NEXT:    xorl %eax, %edi
+; X32-NOCMOV-NEXT:    andl %edx, %edi
+; X32-NOCMOV-NEXT:    xorl %eax, %edi
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    xorl %eax, %ecx
+; X32-NOCMOV-NEXT:    andl %edx, %ecx
+; X32-NOCMOV-NEXT:    xorl %eax, %ecx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    andl %edx, %eax
+; X32-NOCMOV-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    movl %eax, 28(%edx)
+; X32-NOCMOV-NEXT:    movl %ecx, 24(%edx)
+; X32-NOCMOV-NEXT:    movl %edi, 20(%edx)
+; X32-NOCMOV-NEXT:    movl %ebp, 16(%edx)
+; X32-NOCMOV-NEXT:    movl %ebx, 12(%edx)
+; X32-NOCMOV-NEXT:    movl %esi, 8(%edx)
+; X32-NOCMOV-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-NOCMOV-NEXT:    movl %eax, 4(%edx)
+; X32-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NOCMOV-NEXT:    movl %eax, (%edx)
+; X32-NOCMOV-NEXT:    movl %edx, %eax
+; X32-NOCMOV-NEXT:    addl $8, %esp
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 20
+; X32-NOCMOV-NEXT:    popl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT:    popl %edi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT:    popl %ebx
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    popl %ebp
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
+; X32-NOCMOV-NEXT:    retl $4
+  %result = call <8 x i32> @llvm.ct.select.v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b)
+  ret <8 x i32> %result
+}
+
+define float @test_ctselect_f32_nan_inf(i1 %cond) {
+; X64-LABEL: test_ctselect_f32_nan_inf:
+; X64:       # %bb.0:
+; X64-NEXT:    andl $1, %edi
+; X64-NEXT:    negl %edi
+; X64-NEXT:    andl $4194304, %edi # imm = 0x400000
+; X64-NEXT:    xorl $2139095040, %edi # imm = 0x7F800000
+; X64-NEXT:    movd %edi, %xmm0
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_f32_nan_inf:
+; X32:       # %bb.0:
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    negl %eax
+; X32-NEXT:    andl $4194304, %eax # imm = 0x400000
+; X32-NEXT:    xorl $2139095040, %eax # imm = 0x7F800000
+; X32-NEXT:    movl %eax, (%esp)
+; X32-NEXT:    flds (%esp)
+; X32-NEXT:    popl %eax
+; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    retl
+;
+; X32-NOCMOV-LABEL: test_ctselect_f32_nan_inf:
+; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    pushl %eax
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    andl $1, %eax
+; X32-NOCMOV-NEXT:    negl %eax
+; X32-NOCMOV-NEXT:    andl $4194304, %eax # imm = 0x400000
+; X32-NOCMOV-NEXT:    xorl $2139095040, %eax # imm = 0x7F800000
+; X32-NOCMOV-NEXT:    movl %eax, (%esp)
+; X32-NOCMOV-NEXT:    flds (%esp)
+; X32-NOCMOV-NEXT:    popl %eax
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
+; X32-NOCMOV-NEXT:    retl
+  %result = call float @llvm.ct.select.f32(i1 %cond, float 0x7FF8000000000000, float 0x7FF0000000000000)
+  ret float %result
+}
+
+define double @test_ctselect_f64_nan_inf(i1 %cond) {
+; X64-LABEL: test_ctselect_f64_nan_inf:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    andl $1, %edi
+; X64-NEXT:    negq %rdi
+; X64-NEXT:    movabsq $2251799813685248, %rax # imm = 0x8000000000000
+; X64-NEXT:    andq %rdi, %rax
+; X64-NEXT:    movabsq $9218868437227405312, %rcx # imm = 0x7FF0000000000000
+; X64-NEXT:    xorq %rax, %rcx
+; X64-NEXT:    movq %rcx, %xmm0
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_f64_nan_inf:
+; X32:       # %bb.0:
+; X32-NEXT:    subl $12, %esp
+; X32-NEXT:    .cfi_def_cfa_offset 16
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    negl %eax
+; X32-NEXT:    andl $524288, %eax # imm = 0x80000
+; X32-NEXT:    orl $2146435072, %eax # imm = 0x7FF00000
+; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $0, (%esp)
+; X32-NEXT:    fldl (%esp)
+; X32-NEXT:    addl $12, %esp
+; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    retl
+;
+; X32-NOCMOV-LABEL: test_ctselect_f64_nan_inf:
+; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    subl $12, %esp
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    andl $1, %eax
+; X32-NOCMOV-NEXT:    negl %eax
+; X32-NOCMOV-NEXT:    andl $524288, %eax # imm = 0x80000
+; X32-NOCMOV-NEXT:    orl $2146435072, %eax # imm = 0x7FF00000
+; X32-NOCMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT:    movl $0, (%esp)
+; X32-NOCMOV-NEXT:    fldl (%esp)
+; X32-NOCMOV-NEXT:    addl $12, %esp
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
+; X32-NOCMOV-NEXT:    retl
+  %result = call double @llvm.ct.select.f64(i1 %cond, double 0x7FF8000000000000, double 0x7FF0000000000000)
+  ret double %result
+}
+
 ; Declare the intrinsics
+declare i1 @llvm.ct.select.i1(i1, i1, i1)
 declare i8 @llvm.ct.select.i8(i1, i8, i8)
 declare i16 @llvm.ct.select.i16(i1, i16, i16)
 declare i32 @llvm.ct.select.i32(i1, i32, i32)
@@ -777,3 +1310,12 @@ declare i64 @llvm.ct.select.i64(i1, i64, i64)
 declare float @llvm.ct.select.f32(i1, float, float)
 declare double @llvm.ct.select.f64(i1, double, double)
 declare ptr @llvm.ct.select.p0(i1, ptr, ptr)
+
+; Vector intrinsics
+declare <4 x i32> @llvm.ct.select.v4i32(i1, <4 x i32>, <4 x i32>)
+declare <2 x i64> @llvm.ct.select.v2i64(i1, <2 x i64>, <2 x i64>)
+declare <8 x i16> @llvm.ct.select.v8i16(i1, <8 x i16>, <8 x i16>)
+declare <16 x i8> @llvm.ct.select.v16i8(i1, <16 x i8>, <16 x i8>)
+declare <4 x float> @llvm.ct.select.v4f32(i1, <4 x float>, <4 x float>)
+declare <2 x double> @llvm.ct.select.v2f64(i1, <2 x double>, <2 x double>)
+declare <8 x i32> @llvm.ct.select.v8i32(i1, <8 x i32>, <8 x i32>)

>From bba396ea3b42769a5a968cb59dbee680903e0d73 Mon Sep 17 00:00:00 2001
From: Akshay K <iit.akshay at gmail.com>
Date: Wed, 11 Feb 2026 19:23:51 -0500
Subject: [PATCH 4/6] [LangRef][ConstantTime] Add documentation for
 llvm.ct.select.* constant-time intrinsics (#181042)

This PR introduces and documents the llvm.ct.select.* constant-time
intrinsics, providing timing-independent selection operations for
security-sensitive code. The LangRef is updated with syntax, semantics,
supported types, and usage guidance.

Additionally, test coverage is extended with a new <8 x float> variant
(llvm.ct.select.v8f32) and corresponding X86 codegen tests to ensure
correct lowering on both x64 and x32 targets.
---
 llvm/docs/LangRef.rst                         | 132 ++++++++++++
 llvm/include/llvm/CodeGen/TargetLowering.h    |  10 +-
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  11 +-
 llvm/test/CodeGen/X86/ctselect.ll             | 188 ++++++++++++++++++
 4 files changed, 331 insertions(+), 10 deletions(-)

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 00a4a00c5bf95..9ce5e0948d9bb 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -15904,6 +15904,138 @@ Example:
         call void @llvm.call.preallocated.teardown(token %cs)
         ret void
 
+Constant-Time Intrinsics
+-------------------------
+
+These intrinsics are provided to support constant-time operations for
+security-sensitive code. Constant-time operations execute in time independent
+of secret data values, preventing timing side-channel leaks.
+
+.. _int_ct_select:
+
+'``llvm.ct.select.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.ct.select`` on any
+integer or floating-point type, pointer type, or vector type.
+
+::
+
+      declare i32 @llvm.ct.select.i32(i1 <cond>, i32 <val1>, i32 <val2>)
+      declare i64 @llvm.ct.select.i64(i1 <cond>, i64 <val1>, i64 <val2>)
+      declare float @llvm.ct.select.f32(i1 <cond>, float <val1>, float <val2>)
+      declare double @llvm.ct.select.f64(i1 <cond>, double <val1>, double <val2>)
+      declare ptr @llvm.ct.select.p0(i1 <cond>, ptr <val1>, ptr <val2>)
+
+      ; 128-bit vectors
+      declare <4 x i32> @llvm.ct.select.v4i32(i1 <cond>, <4 x i32> <val1>, <4 x i32> <val2>)
+      declare <2 x i64> @llvm.ct.select.v2i64(i1 <cond>, <2 x i64> <val1>, <2 x i64> <val2>)
+      declare <4 x float> @llvm.ct.select.v4f32(i1 <cond>, <4 x float> <val1>, <4 x float> <val2>)
+      declare <2 x double> @llvm.ct.select.v2f64(i1 <cond>, <2 x double> <val1>, <2 x double> <val2>)
+
+      ; 256-bit vectors
+      declare <8 x i32> @llvm.ct.select.v8i32(i1 <cond>, <8 x i32> <val1>, <8 x i32> <val2>)
+      declare <8 x float> @llvm.ct.select.v8f32(i1 <cond>, <8 x float> <val1>, <8 x float> <val2>)
+      declare <4 x double> @llvm.ct.select.v4f64(i1 <cond>, <4 x double> <val1>, <4 x double> <val2>)
+
+Overview:
+"""""""""
+
+The '``llvm.ct.select``' family of intrinsic functions selects one of two
+values based on a condition, with the guarantee that the operation executes
+in constant time. Unlike the standard :ref:`select <i_select>` instruction,
+``llvm.ct.select`` ensures that the execution time and observable behavior
+do not depend on the condition value, preventing timing-based side-channel
+leaks.
+
+Arguments:
+""""""""""
+
+The '``llvm.ct.select``' intrinsic requires three arguments:
+
+1. The condition, which must be a scalar value of type 'i1'. Unlike
+   :ref:`select <i_select>` which accepts both scalar 'i1' and vector
+   '<N x i1>' conditions, ``llvm.ct.select`` only accepts a scalar 'i1'
+   condition. Vector conditions are not supported.
+2. The first value argument of any :ref:`first class <t_firstclass>` type.
+   This can be a scalar or vector type.
+3. The second value argument, which must have the same type as the first
+   value argument.
+
+When the value arguments are vectors, the scalar condition is broadcast to
+all vector elements (i.e., all elements are selected from the same source
+vector based on the single condition).
+
+Semantics:
+""""""""""
+
+If the condition evaluates to 1, the intrinsic returns the first value
+argument; otherwise, it returns the second value argument.
+
+The key semantic difference from :ref:`select <i_select>` is the constant-time
+code generation guarantee: the intrinsic must be lowered to machine code that:
+
+* Does not introduce data-dependent control flow based on the condition value
+* Executes the same sequence of instructions regardless of the condition value
+* Computes both value arguments before performing the selection
+
+**Platform Requirements:** The constant-time guarantee is conditional on
+platform support for data-independent timing, such as ARM DIT (Data Independent
+Timing) or x86 DOIT (Data Operand Independent Timing). Without such hardware
+support, the generated code will still be free from data-dependent control flow,
+but microarchitectural timing variations may still occur.
+
+The typical implementation uses bitwise operations to blend the two values
+based on a mask derived from the condition:
+
+::
+
+      mask = sext(cond)  ; sign-extend condition to all 1s or all 0s
+      result = val2 ^ ((val1 ^ val2) & mask)
+
+Targets with native constant-time select support use target-specific
+instructions to generate optimized bitwise operations with stronger guarantees.
+Targets without native support lower the intrinsic to a sequence of generic
+bitwise operations as shown above, structured to resist pattern recognition
+and preserve the constant-time property through optimization passes.
+
+Optimizations must preserve the constant-time code generation semantics.
+Transforms that would introduce data-dependent control flow are not permitted.
+This includes converting to conditional branches, using predicated instructions
+with data-dependent timing, or optimizing away either value argument before the
+selection completes (both paths must be computed).
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      ; Constant-time integer selection
+      %x = call i32 @llvm.ct.select.i32(i1 %cond, i32 42, i32 17)
+      %key = call i64 @llvm.ct.select.i64(i1 %cond, i64 %k_a, i64 %k_b)
+
+      ; Constant-time 128-bit integer vector selection (scalar condition broadcast to all lanes)
+      %v4 = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond,
+                                                  <4 x i32> <i32 1, i32 2, i32 3, i32 4>,
+                                                  <4 x i32> <i32 5, i32 6, i32 7, i32 8>)
+
+      ; Constant-time 256-bit integer vector selection
+      %v8 = call <8 x i32> @llvm.ct.select.v8i32(i1 %cond,
+                                                  <8 x i32> %vec_a, <8 x i32> %vec_b)
+
+      ; Constant-time 256-bit float vector selection
+      %v8f = call <8 x float> @llvm.ct.select.v8f32(i1 %cond,
+                                                     <8 x float> %fvec_a, <8 x float> %fvec_b)
+
+      ; Constant-time float vector selection
+      %f = call float @llvm.ct.select.f32(i1 %cond, float 1.0, float 0.0)
+
+      ; Constant-time pointer selection
+      %ptr = call ptr @llvm.ct.select.p0(i1 %cond, ptr %ptr_a, ptr %ptr_b)
+
 Standard C/C++ Library Intrinsics
 ---------------------------------
 
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index b6f445b4a53f1..be9b07c353d29 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -241,11 +241,11 @@ class LLVM_ABI TargetLoweringBase {
 
   /// Enum that describes what type of support for selects the target has.
   enum SelectSupportKind {
-    ScalarValSelect,     // The target supports scalar selects (ex: cmov).
-    ScalarCondVectorVal, // The target supports selects with a scalar condition
-                         // and vector values (ex: cmov).
-    VectorMaskSelect     // The target supports vector selects with a vector
-                         // mask (ex: x86 blends).
+    ScalarValSelect,      // The target supports scalar selects (ex: cmov).
+    ScalarCondVectorVal,  // The target supports selects with a scalar condition
+                          // and vector values (ex: cmov).
+    VectorMaskSelect      // The target supports vector selects with a vector
+                          // mask (ex: x86 blends).
   };
 
   /// Enum that specifies what an atomic load/AtomicRMWInst is expanded
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 8d8fdfa5febbb..dc93b5a6534f2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -486,8 +486,9 @@ namespace {
     SDValue visitCTTZ_ZERO_UNDEF(SDNode *N);
     SDValue visitCTPOP(SDNode *N);
     SDValue visitSELECT(SDNode *N);
-    // visit CTSELECT Node
-    SDValue visitConstantTimeSelect(SDNode *N);
+    // ISD::CTSELECT - Constant-Time SELECT (not related to CT in
+    // CTPOP/CTLZ/CTTZ where CT means "count").
+    SDValue visitCT_SELECT(SDNode *N);
     SDValue visitVSELECT(SDNode *N);
     SDValue visitVP_SELECT(SDNode *N);
     SDValue visitSELECT_CC(SDNode *N);
@@ -1996,7 +1997,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::CTTZ_ZERO_UNDEF:    return visitCTTZ_ZERO_UNDEF(N);
   case ISD::CTPOP:              return visitCTPOP(N);
   case ISD::SELECT:             return visitSELECT(N);
-  case ISD::CTSELECT:           return visitConstantTimeSelect(N);
+  case ISD::CTSELECT:           return visitCT_SELECT(N);
   case ISD::VSELECT:            return visitVSELECT(N);
   case ISD::SELECT_CC:          return visitSELECT_CC(N);
   case ISD::SETCC:              return visitSETCC(N);
@@ -12474,7 +12475,7 @@ static SDValue foldBoolSelectToLogic(SDNode *N, const SDLoc &DL,
                                      SelectionDAG &DAG) {
   assert((N->getOpcode() == ISD::SELECT || N->getOpcode() == ISD::VSELECT ||
           N->getOpcode() == ISD::VP_SELECT) &&
-         "Expected a (v)(vp.)(ct) select");
+         "Expected a (v)(vp.)select");
   SDValue Cond = N->getOperand(0);
   SDValue T = N->getOperand(1), F = N->getOperand(2);
   EVT VT = N->getValueType(0);
@@ -12850,7 +12851,7 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
 //  - i1 CTSELECT nesting merges via AND/OR that keep the result as CTSELECT.
 // Broader rewrites should be done in target-specific lowering when stronger
 // guarantees about legality and constant-time preservation are available.
-SDValue DAGCombiner::visitConstantTimeSelect(SDNode *N) {
+SDValue DAGCombiner::visitCT_SELECT(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   SDValue N2 = N->getOperand(2);
diff --git a/llvm/test/CodeGen/X86/ctselect.ll b/llvm/test/CodeGen/X86/ctselect.ll
index 2b6091c880637..a970fd8933b9b 100644
--- a/llvm/test/CodeGen/X86/ctselect.ll
+++ b/llvm/test/CodeGen/X86/ctselect.ll
@@ -1210,6 +1210,193 @@ define <8 x i32> @test_ctselect_v8i32_avx(i1 %cond, <8 x i32> %a, <8 x i32> %b)
   ret <8 x i32> %result
 }
 
+define <8 x float> @test_ctselect_v8f32(i1 %cond, <8 x float> %a, <8 x float> %b) {
+; X64-LABEL: test_ctselect_v8f32:
+; X64:       # %bb.0:
+; X64-NEXT:    movd %edi, %xmm4
+; X64-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; X64-NEXT:    pslld $31, %xmm4
+; X64-NEXT:    psrad $31, %xmm4
+; X64-NEXT:    movdqa %xmm4, %xmm5
+; X64-NEXT:    pandn %xmm2, %xmm5
+; X64-NEXT:    pand %xmm4, %xmm0
+; X64-NEXT:    por %xmm5, %xmm0
+; X64-NEXT:    pand %xmm4, %xmm1
+; X64-NEXT:    pandn %xmm3, %xmm4
+; X64-NEXT:    por %xmm4, %xmm1
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_v8f32:
+; X32:       # %bb.0:
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    .cfi_def_cfa_offset 16
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 20
+; X32-NEXT:    subl $8, %esp
+; X32-NEXT:    .cfi_def_cfa_offset 28
+; X32-NEXT:    .cfi_offset %esi, -20
+; X32-NEXT:    .cfi_offset %edi, -16
+; X32-NEXT:    .cfi_offset %ebx, -12
+; X32-NEXT:    .cfi_offset %ebp, -8
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    xorl %eax, %ecx
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    andl $1, %edx
+; X32-NEXT:    negl %edx
+; X32-NEXT:    andl %edx, %ecx
+; X32-NEXT:    xorl %eax, %ecx
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    xorl %esi, %eax
+; X32-NEXT:    andl %edx, %eax
+; X32-NEXT:    xorl %esi, %eax
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    xorl %ebx, %esi
+; X32-NEXT:    andl %edx, %esi
+; X32-NEXT:    xorl %ebx, %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT:    xorl %ebp, %ebx
+; X32-NEXT:    andl %edx, %ebx
+; X32-NEXT:    xorl %ebp, %ebx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT:    xorl %edi, %ebp
+; X32-NEXT:    andl %edx, %ebp
+; X32-NEXT:    xorl %edi, %ebp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    xorl %eax, %edi
+; X32-NEXT:    andl %edx, %edi
+; X32-NEXT:    xorl %eax, %edi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    xorl %eax, %ecx
+; X32-NEXT:    andl %edx, %ecx
+; X32-NEXT:    xorl %eax, %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    andl %edx, %eax
+; X32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl %eax, 28(%edx)
+; X32-NEXT:    movl %ecx, 24(%edx)
+; X32-NEXT:    movl %edi, 20(%edx)
+; X32-NEXT:    movl %ebp, 16(%edx)
+; X32-NEXT:    movl %ebx, 12(%edx)
+; X32-NEXT:    movl %esi, 8(%edx)
+; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl %eax, 4(%edx)
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NEXT:    movl %eax, (%edx)
+; X32-NEXT:    movl %edx, %eax
+; X32-NEXT:    addl $8, %esp
+; X32-NEXT:    .cfi_def_cfa_offset 20
+; X32-NEXT:    popl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 16
+; X32-NEXT:    popl %edi
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    popl %ebx
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    retl $4
+;
+; X32-NOCMOV-LABEL: test_ctselect_v8f32:
+; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    pushl %ebp
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    pushl %ebx
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT:    pushl %edi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT:    pushl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 20
+; X32-NOCMOV-NEXT:    subl $8, %esp
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 28
+; X32-NOCMOV-NEXT:    .cfi_offset %esi, -20
+; X32-NOCMOV-NEXT:    .cfi_offset %edi, -16
+; X32-NOCMOV-NEXT:    .cfi_offset %ebx, -12
+; X32-NOCMOV-NEXT:    .cfi_offset %ebp, -8
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    xorl %eax, %ecx
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    andl $1, %edx
+; X32-NOCMOV-NEXT:    negl %edx
+; X32-NOCMOV-NEXT:    andl %edx, %ecx
+; X32-NOCMOV-NEXT:    xorl %eax, %ecx
+; X32-NOCMOV-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    xorl %esi, %eax
+; X32-NOCMOV-NEXT:    andl %edx, %eax
+; X32-NOCMOV-NEXT:    xorl %esi, %eax
+; X32-NOCMOV-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NOCMOV-NEXT:    xorl %ebx, %esi
+; X32-NOCMOV-NEXT:    andl %edx, %esi
+; X32-NOCMOV-NEXT:    xorl %ebx, %esi
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NOCMOV-NEXT:    xorl %ebp, %ebx
+; X32-NOCMOV-NEXT:    andl %edx, %ebx
+; X32-NOCMOV-NEXT:    xorl %ebp, %ebx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X32-NOCMOV-NEXT:    xorl %edi, %ebp
+; X32-NOCMOV-NEXT:    andl %edx, %ebp
+; X32-NOCMOV-NEXT:    xorl %edi, %ebp
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NOCMOV-NEXT:    xorl %eax, %edi
+; X32-NOCMOV-NEXT:    andl %edx, %edi
+; X32-NOCMOV-NEXT:    xorl %eax, %edi
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    xorl %eax, %ecx
+; X32-NOCMOV-NEXT:    andl %edx, %ecx
+; X32-NOCMOV-NEXT:    xorl %eax, %ecx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    andl %edx, %eax
+; X32-NOCMOV-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    movl %eax, 28(%edx)
+; X32-NOCMOV-NEXT:    movl %ecx, 24(%edx)
+; X32-NOCMOV-NEXT:    movl %edi, 20(%edx)
+; X32-NOCMOV-NEXT:    movl %ebp, 16(%edx)
+; X32-NOCMOV-NEXT:    movl %ebx, 12(%edx)
+; X32-NOCMOV-NEXT:    movl %esi, 8(%edx)
+; X32-NOCMOV-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-NOCMOV-NEXT:    movl %eax, 4(%edx)
+; X32-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X32-NOCMOV-NEXT:    movl %eax, (%edx)
+; X32-NOCMOV-NEXT:    movl %edx, %eax
+; X32-NOCMOV-NEXT:    addl $8, %esp
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 20
+; X32-NOCMOV-NEXT:    popl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT:    popl %edi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT:    popl %ebx
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    popl %ebp
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
+; X32-NOCMOV-NEXT:    retl $4
+  %result = call <8 x float> @llvm.ct.select.v8f32(i1 %cond, <8 x float> %a, <8 x float> %b)
+  ret <8 x float> %result
+}
+
 define float @test_ctselect_f32_nan_inf(i1 %cond) {
 ; X64-LABEL: test_ctselect_f32_nan_inf:
 ; X64:       # %bb.0:
@@ -1319,3 +1506,4 @@ declare <16 x i8> @llvm.ct.select.v16i8(i1, <16 x i8>, <16 x i8>)
 declare <4 x float> @llvm.ct.select.v4f32(i1, <4 x float>, <4 x float>)
 declare <2 x double> @llvm.ct.select.v2f64(i1, <2 x double>, <2 x double>)
 declare <8 x i32> @llvm.ct.select.v8i32(i1, <8 x i32>, <8 x i32>)
+declare <8 x float> @llvm.ct.select.v8f32(i1, <8 x float>, <8 x float>)

>From 0cee8dc87639ff5767f324db051bfd8ebc00763f Mon Sep 17 00:00:00 2001
From: wizardengineer <juliuswoosebert at gmail.com>
Date: Thu, 19 Feb 2026 23:13:21 -0500
Subject: [PATCH 5/6] [ConstantTime] Changed CTSELECT instances to CT_SELECT

---
 llvm/include/llvm/CodeGen/ISDOpcodes.h        |  4 +--
 llvm/include/llvm/CodeGen/SelectionDAG.h      |  2 +-
 llvm/include/llvm/CodeGen/TargetLowering.h    |  7 ++---
 .../include/llvm/Target/TargetSelectionDAG.td |  2 +-
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 28 +++++++++----------
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp |  4 +--
 .../SelectionDAG/LegalizeFloatTypes.cpp       | 12 ++++----
 .../SelectionDAG/LegalizeIntegerTypes.cpp     | 14 +++++-----
 llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 10 +++----
 .../SelectionDAG/LegalizeTypesGeneric.cpp     |  4 +--
 .../SelectionDAG/LegalizeVectorTypes.cpp      | 12 ++++----
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  5 ++--
 .../SelectionDAG/SelectionDAGDumper.cpp       |  2 +-
 llvm/lib/CodeGen/TargetLoweringBase.cpp       |  2 +-
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |  2 --
 llvm/lib/Target/ARM/ARMISelLowering.h         |  2 --
 llvm/lib/Target/X86/X86ISelLowering.h         |  2 --
 llvm/test/CodeGen/RISCV/ctselect-fallback.ll  | 18 ++++++------
 llvm/test/CodeGen/X86/ctselect.ll             | 22 +++++++--------
 19 files changed, 74 insertions(+), 80 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 3719f262826ba..d98d75c99b825 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -795,9 +795,9 @@ enum NodeType {
   /// i1 then the high bits must conform to getBooleanContents.
   SELECT,
 
-  /// CTSELECT(Cond, TrueVal, FalseVal). Cond is i1 and the value operands must
+  /// CT_SELECT(Cond, TrueVal, FalseVal). Cond is i1 and the value operands must
   /// have the same type. Used to lower the constant-time select intrinsic.
-  CTSELECT,
+  CT_SELECT,
 
   /// Select with a vector condition (op #0) and two vector operands (ops #1
   /// and #2), returning a vector result.  All vectors have the same length.
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 2a30dff6955c4..bf23f0eca0bc5 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1413,7 +1413,7 @@ class SelectionDAG {
                       SDValue RHS, SDNodeFlags Flags = SDNodeFlags()) {
     assert(LHS.getValueType() == VT && RHS.getValueType() == VT &&
            "Cannot use select on differing types");
-    return getNode(ISD::CTSELECT, DL, VT, Cond, LHS, RHS, Flags);
+    return getNode(ISD::CT_SELECT, DL, VT, Cond, LHS, RHS, Flags);
   }
 
   /// Helper function to make it easier to build SelectCC's if you just have an
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index be9b07c353d29..f6d5578412d1e 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -476,10 +476,9 @@ class LLVM_ABI TargetLoweringBase {
   MachineMemOperand::Flags
   getVPIntrinsicMemOperandFlags(const VPIntrinsic &VPIntrin) const;
 
-  virtual bool isSelectSupported(SelectSupportKind kind) const { return true; }
-
-  /// Return true if the target has custom lowering for constant-time select.
-  virtual bool isCtSelectSupported(EVT VT) const { return false; }
+  virtual bool isSelectSupported(SelectSupportKind /*kind*/) const {
+    return true;
+  }
 
   /// Return true if the @llvm.get.active.lane.mask intrinsic should be expanded
   /// using generic code in SelectionDAGBuilder.
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index e60a663addfbc..b0c85710b8e79 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -734,7 +734,7 @@ def reset_fpmode   : SDNode<"ISD::RESET_FPMODE", SDTNone, [SDNPHasChain]>;
 
 def setcc      : SDNode<"ISD::SETCC"      , SDTSetCC>;
 def select     : SDNode<"ISD::SELECT"     , SDTSelect>;
-def ctselect : SDNode<"ISD::CTSELECT", SDTCtSelect>;
+def ct_select : SDNode<"ISD::CT_SELECT", SDTCtSelect>;
 def vselect    : SDNode<"ISD::VSELECT"    , SDTVSelect>;
 def selectcc   : SDNode<"ISD::SELECT_CC"  , SDTSelectCC>;
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index dc93b5a6534f2..8a549a5cbb5d6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -486,7 +486,7 @@ namespace {
     SDValue visitCTTZ_ZERO_UNDEF(SDNode *N);
     SDValue visitCTPOP(SDNode *N);
     SDValue visitSELECT(SDNode *N);
-    // ISD::CTSELECT - Constant-Time SELECT (not related to CT in
+    // ISD::CT_SELECT - Constant-Time SELECT (not related to CT in
     // CTPOP/CTLZ/CTTZ where CT means "count").
     SDValue visitCT_SELECT(SDNode *N);
     SDValue visitVSELECT(SDNode *N);
@@ -1997,7 +1997,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::CTTZ_ZERO_UNDEF:    return visitCTTZ_ZERO_UNDEF(N);
   case ISD::CTPOP:              return visitCTPOP(N);
   case ISD::SELECT:             return visitSELECT(N);
-  case ISD::CTSELECT:           return visitCT_SELECT(N);
+  case ISD::CT_SELECT:           return visitCT_SELECT(N);
   case ISD::VSELECT:            return visitVSELECT(N);
   case ISD::SELECT_CC:          return visitSELECT_CC(N);
   case ISD::SETCC:              return visitSETCC(N);
@@ -12845,10 +12845,10 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
   return SDValue();
 }
 
-// Keep CTSELECT combines deliberately conservative to preserve constant-time
+// Keep CT_SELECT combines deliberately conservative to preserve constant-time
 // intent across generic DAG combines. We only accept:
 //  - canonicalization of negated conditions (flip true/false operands), and
-//  - i1 CTSELECT nesting merges via AND/OR that keep the result as CTSELECT.
+//  - i1 CT_SELECT nesting merges via AND/OR that keep the result as CT_SELECT.
 // Broader rewrites should be done in target-specific lowering when stronger
 // guarantees about legality and constant-time preservation are available.
 SDValue DAGCombiner::visitCT_SELECT(SDNode *N) {
@@ -12860,52 +12860,52 @@ SDValue DAGCombiner::visitCT_SELECT(SDNode *N) {
   SDLoc DL(N);
   SDNodeFlags Flags = N->getFlags();
 
-  // ctselect (not Cond), N1, N2 -> ctselect Cond, N2, N1
+  // ct_select (not Cond), N1, N2 -> ct_select Cond, N2, N1
   // This is a CT-safe canonicalization: flip negated condition by swapping
   // arms. extractBooleanFlip only matches boolean xor-with-1, so this preserves
   // dataflow semantics and does not introduce data-dependent control flow.
   if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false)) {
-    SDValue SelectOp = DAG.getNode(ISD::CTSELECT, DL, VT, F, N2, N1);
+    SDValue SelectOp = DAG.getNode(ISD::CT_SELECT, DL, VT, F, N2, N1);
     SelectOp->setFlags(Flags);
     return SelectOp;
   }
 
   if (VT0 == MVT::i1) {
-    // Nested CTSELECT merging optimizations for i1 conditions.
+    // Nested CT_SELECT merging optimizations for i1 conditions.
     // These are CT-safe because:
     //   1. AND/OR are bitwise operations that execute in constant time
-    //   2. The optimization combines two sequential CTSELECTs into one,
+    //   2. The optimization combines two sequential CT_SELECTs into one,
     //   reducing the total number of constant-time operations without
     //   changing semantics
     //   3. No data-dependent branches or memory accesses are introduced
     //
-    // ctselect C0, (ctselect C1, X, Y), Y -> ctselect (C0 & C1), X, Y
+    // ct_select C0, (ct_select C1, X, Y), Y -> ct_select (C0 & C1), X, Y
     // Semantic equivalence: If C0 is true, evaluate inner select (C1 ? X :
     // Y). If C0 is false, choose Y. This is equivalent to (C0 && C1) ? X : Y.
-    if (N1->getOpcode() == ISD::CTSELECT && N1->hasOneUse()) {
+    if (N1->getOpcode() == ISD::CT_SELECT && N1->hasOneUse()) {
       SDValue N1_0 = N1->getOperand(0);
       SDValue N1_1 = N1->getOperand(1);
       SDValue N1_2 = N1->getOperand(2);
       if (N1_2 == N2 && N0.getValueType() == N1_0.getValueType()) {
         SDValue And = DAG.getNode(ISD::AND, DL, N0.getValueType(), N0, N1_0);
         SDValue SelectOp =
-            DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), And, N1_1, N2);
+            DAG.getNode(ISD::CT_SELECT, DL, N1.getValueType(), And, N1_1, N2);
         SelectOp->setFlags(Flags);
         return SelectOp;
       }
     }
 
-    // ctselect C0, X, (ctselect C1, X, Y) -> ctselect (C0 | C1), X, Y
+    // ct_select C0, X, (ct_select C1, X, Y) -> ct_select (C0 | C1), X, Y
     // Semantic equivalence: If C0 is true, choose X. If C0 is false, evaluate
     // inner select (C1 ? X : Y). This is equivalent to (C0 || C1) ? X : Y.
-    if (N2->getOpcode() == ISD::CTSELECT && N2->hasOneUse()) {
+    if (N2->getOpcode() == ISD::CT_SELECT && N2->hasOneUse()) {
       SDValue N2_0 = N2->getOperand(0);
       SDValue N2_1 = N2->getOperand(1);
       SDValue N2_2 = N2->getOperand(2);
       if (N2_1 == N1 && N0.getValueType() == N2_0.getValueType()) {
         SDValue Or = DAG.getNode(ISD::OR, DL, N0.getValueType(), N0, N2_0);
         SDValue SelectOp =
-            DAG.getNode(ISD::CTSELECT, DL, N1.getValueType(), Or, N1, N2_2);
+            DAG.getNode(ISD::CT_SELECT, DL, N1.getValueType(), Or, N1, N2_2);
         SelectOp->setFlags(Flags);
         return SelectOp;
       }
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 72a468af986c7..81f992678626c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -4310,7 +4310,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     }
     Results.push_back(Tmp1);
     break;
-  case ISD::CTSELECT: {
+  case ISD::CT_SELECT: {
     Tmp1 = Node->getOperand(0);
     Tmp2 = Node->getOperand(1);
     Tmp3 = Node->getOperand(2);
@@ -5743,7 +5743,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
     break;
   }
   case ISD::SELECT:
-  case ISD::CTSELECT: {
+  case ISD::CT_SELECT: {
     unsigned ExtOp, TruncOp;
     if (Node->getValueType(0).isVector() ||
         Node->getValueType(0).getSizeInBits() == NVT.getSizeInBits()) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index aa2f43d3c4dc0..424051a40594f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -161,7 +161,7 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
     case ISD::ATOMIC_LOAD: R = SoftenFloatRes_ATOMIC_LOAD(N); break;
     case ISD::ATOMIC_SWAP: R = BitcastToInt_ATOMIC_SWAP(N); break;
     case ISD::SELECT:      R = SoftenFloatRes_SELECT(N); break;
-    case ISD::CTSELECT:    R = SoftenFloatRes_CTSELECT(N); break;
+    case ISD::CT_SELECT:    R = SoftenFloatRes_CT_SELECT(N); break;
     case ISD::SELECT_CC:   R = SoftenFloatRes_SELECT_CC(N); break;
     case ISD::FREEZE:      R = SoftenFloatRes_FREEZE(N); break;
     case ISD::STRICT_SINT_TO_FP:
@@ -1061,7 +1061,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT(SDNode *N) {
                        LHS.getValueType(), N->getOperand(0), LHS, RHS);
 }
 
-SDValue DAGTypeLegalizer::SoftenFloatRes_CTSELECT(SDNode *N) {
+SDValue DAGTypeLegalizer::SoftenFloatRes_CT_SELECT(SDNode *N) {
   SDValue LHS = GetSoftenedFloat(N->getOperand(1));
   SDValue RHS = GetSoftenedFloat(N->getOperand(2));
   return DAG.getCTSelect(SDLoc(N), LHS.getValueType(), N->getOperand(0), LHS,
@@ -1588,7 +1588,7 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) {
   case ISD::POISON:
   case ISD::UNDEF:        SplitRes_UNDEF(N, Lo, Hi); break;
   case ISD::SELECT:       SplitRes_Select(N, Lo, Hi); break;
-  case ISD::CTSELECT:     SplitRes_CTSELECT(N, Lo, Hi); break;
+  case ISD::CT_SELECT:     SplitRes_CT_SELECT(N, Lo, Hi); break;
   case ISD::SELECT_CC:    SplitRes_SELECT_CC(N, Lo, Hi); break;
 
   case ISD::MERGE_VALUES:       ExpandRes_MERGE_VALUES(N, ResNo, Lo, Hi); break;
@@ -2767,8 +2767,8 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) {
     R = SoftPromoteHalfRes_ATOMIC_LOAD(N);
     break;
   case ISD::SELECT:      R = SoftPromoteHalfRes_SELECT(N); break;
-  case ISD::CTSELECT:
-    R = SoftPromoteHalfRes_CTSELECT(N);
+  case ISD::CT_SELECT:
+    R = SoftPromoteHalfRes_CT_SELECT(N);
     break;
   case ISD::SELECT_CC:   R = SoftPromoteHalfRes_SELECT_CC(N); break;
   case ISD::STRICT_SINT_TO_FP:
@@ -3034,7 +3034,7 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfRes_SELECT(SDNode *N) {
                        Op2);
 }
 
-SDValue DAGTypeLegalizer::SoftPromoteHalfRes_CTSELECT(SDNode *N) {
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_CT_SELECT(SDNode *N) {
   SDValue Op1 = GetSoftPromotedHalf(N->getOperand(1));
   SDValue Op2 = GetSoftPromotedHalf(N->getOperand(2));
   return DAG.getCTSelect(SDLoc(N), Op1.getValueType(), N->getOperand(0), Op1,
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index eadd8520c0164..7516946258c3c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -96,7 +96,7 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
     Res = PromoteIntRes_VECTOR_COMPRESS(N);
     break;
   case ISD::SELECT:
-  case ISD::CTSELECT:
+  case ISD::CT_SELECT:
   case ISD::VSELECT:
   case ISD::VP_SELECT:
   case ISD::VP_MERGE:
@@ -2047,8 +2047,8 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
     break;
   case ISD::VSELECT:
   case ISD::SELECT:       Res = PromoteIntOp_SELECT(N, OpNo); break;
-  case ISD::CTSELECT:
-    Res = PromoteIntOp_CTSELECT(N, OpNo);
+  case ISD::CT_SELECT:
+    Res = PromoteIntOp_CT_SELECT(N, OpNo);
     break;
   case ISD::SELECT_CC:    Res = PromoteIntOp_SELECT_CC(N, OpNo); break;
   case ISD::VP_SETCC:
@@ -2448,13 +2448,13 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SELECT(SDNode *N, unsigned OpNo) {
                                         N->getOperand(2)), 0);
 }
 
-SDValue DAGTypeLegalizer::PromoteIntOp_CTSELECT(SDNode *N, unsigned OpNo) {
+SDValue DAGTypeLegalizer::PromoteIntOp_CT_SELECT(SDNode *N, unsigned OpNo) {
   assert(OpNo == 0 && "Only know how to promote the condition!");
   SDValue Cond = N->getOperand(0);
   EVT OpTy = N->getOperand(1).getValueType();
 
   // Promote all the way up to the canonical SetCC type.
-  EVT OpVT = N->getOpcode() == ISD::CTSELECT ? OpTy.getScalarType() : OpTy;
+  EVT OpVT = N->getOpcode() == ISD::CT_SELECT ? OpTy.getScalarType() : OpTy;
   Cond = PromoteTargetBoolean(Cond, OpVT);
 
   return SDValue(
@@ -3059,8 +3059,8 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
   case ISD::ARITH_FENCE:  SplitRes_ARITH_FENCE(N, Lo, Hi); break;
   case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break;
   case ISD::SELECT:       SplitRes_Select(N, Lo, Hi); break;
-  case ISD::CTSELECT:
-    SplitRes_CTSELECT(N, Lo, Hi);
+  case ISD::CT_SELECT:
+    SplitRes_CT_SELECT(N, Lo, Hi);
     break;
   case ISD::SELECT_CC:    SplitRes_SELECT_CC(N, Lo, Hi); break;
   case ISD::POISON:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 98f95f19e94cc..966ba558dc934 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -384,7 +384,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue PromoteIntOp_CONCAT_VECTORS(SDNode *N);
   SDValue PromoteIntOp_ScalarOp(SDNode *N);
   SDValue PromoteIntOp_SELECT(SDNode *N, unsigned OpNo);
-  SDValue PromoteIntOp_CTSELECT(SDNode *N, unsigned OpNo);
+  SDValue PromoteIntOp_CT_SELECT(SDNode *N, unsigned OpNo);
   SDValue PromoteIntOp_SELECT_CC(SDNode *N, unsigned OpNo);
   SDValue PromoteIntOp_SETCC(SDNode *N, unsigned OpNo);
   SDValue PromoteIntOp_Shift(SDNode *N);
@@ -619,7 +619,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue SoftenFloatRes_LOAD(SDNode *N);
   SDValue SoftenFloatRes_ATOMIC_LOAD(SDNode *N);
   SDValue SoftenFloatRes_SELECT(SDNode *N);
-  SDValue SoftenFloatRes_CTSELECT(SDNode *N);
+  SDValue SoftenFloatRes_CT_SELECT(SDNode *N);
   SDValue SoftenFloatRes_SELECT_CC(SDNode *N);
   SDValue SoftenFloatRes_UNDEF(SDNode *N);
   SDValue SoftenFloatRes_VAARG(SDNode *N);
@@ -782,7 +782,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue SoftPromoteHalfRes_LOAD(SDNode *N);
   SDValue SoftPromoteHalfRes_ATOMIC_LOAD(SDNode *N);
   SDValue SoftPromoteHalfRes_SELECT(SDNode *N);
-  SDValue SoftPromoteHalfRes_CTSELECT(SDNode *N);
+  SDValue SoftPromoteHalfRes_CT_SELECT(SDNode *N);
   SDValue SoftPromoteHalfRes_SELECT_CC(SDNode *N);
   SDValue SoftPromoteHalfRes_UnaryOp(SDNode *N);
   SDValue SoftPromoteHalfRes_FABS(SDNode *N);
@@ -847,7 +847,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N);
   SDValue ScalarizeVecRes_VSELECT(SDNode *N);
   SDValue ScalarizeVecRes_SELECT(SDNode *N);
-  SDValue ScalarizeVecRes_CTSELECT(SDNode *N);
+  SDValue ScalarizeVecRes_CT_SELECT(SDNode *N);
   SDValue ScalarizeVecRes_SELECT_CC(SDNode *N);
   SDValue ScalarizeVecRes_SETCC(SDNode *N);
   SDValue ScalarizeVecRes_UNDEF(SDNode *N);
@@ -1180,7 +1180,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   void SplitVecRes_AssertSext(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitRes_ARITH_FENCE (SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitRes_Select(SDNode *N, SDValue &Lo, SDValue &Hi);
-  void SplitRes_CTSELECT(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitRes_CT_SELECT(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitRes_SELECT_CC   (SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitRes_UNDEF       (SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitRes_FREEZE      (SDNode *N, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
index 00090cf5c57ee..8a6f9d5c197ca 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -569,9 +569,9 @@ void DAGTypeLegalizer::SplitRes_Select(SDNode *N, SDValue &Lo, SDValue &Hi) {
   Hi = DAG.getNode(Opcode, dl, LH.getValueType(), CH, LH, RH, EVLHi);
 }
 
-void DAGTypeLegalizer::SplitRes_CTSELECT(SDNode *N, SDValue &Lo, SDValue &Hi) {
+void DAGTypeLegalizer::SplitRes_CT_SELECT(SDNode *N, SDValue &Lo, SDValue &Hi) {
   // Reuse generic select splitting to support scalar and vector conditions.
-  // SplitRes_Select rebuilds with N->getOpcode(), so CTSELECT is preserved.
+  // SplitRes_Select rebuilds with N->getOpcode(), so CT_SELECT is preserved.
   SplitRes_Select(N, Lo, Hi);
 }
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 9c00c2fc2fc38..7cd3f87197032 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -77,8 +77,8 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::SIGN_EXTEND_INREG: R = ScalarizeVecRes_InregOp(N); break;
   case ISD::VSELECT:           R = ScalarizeVecRes_VSELECT(N); break;
   case ISD::SELECT:            R = ScalarizeVecRes_SELECT(N); break;
-  case ISD::CTSELECT:
-    R = ScalarizeVecRes_CTSELECT(N);
+  case ISD::CT_SELECT:
+    R = ScalarizeVecRes_CT_SELECT(N);
     break;
   case ISD::SELECT_CC:         R = ScalarizeVecRes_SELECT_CC(N); break;
   case ISD::SETCC:             R = ScalarizeVecRes_SETCC(N); break;
@@ -690,7 +690,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_SELECT(SDNode *N) {
                        GetScalarizedVector(N->getOperand(2)));
 }
 
-SDValue DAGTypeLegalizer::ScalarizeVecRes_CTSELECT(SDNode *N) {
+SDValue DAGTypeLegalizer::ScalarizeVecRes_CT_SELECT(SDNode *N) {
   SDValue LHS = GetScalarizedVector(N->getOperand(1));
   return DAG.getCTSelect(SDLoc(N), LHS.getValueType(), N->getOperand(0), LHS,
                          GetScalarizedVector(N->getOperand(2)));
@@ -1246,8 +1246,8 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::SELECT:
   case ISD::VP_MERGE:
   case ISD::VP_SELECT:    SplitRes_Select(N, Lo, Hi); break;
-  case ISD::CTSELECT:
-    SplitRes_CTSELECT(N, Lo, Hi);
+  case ISD::CT_SELECT:
+    SplitRes_CT_SELECT(N, Lo, Hi);
     break;
   case ISD::SELECT_CC:    SplitRes_SELECT_CC(N, Lo, Hi); break;
   case ISD::POISON:
@@ -4962,7 +4962,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::SIGN_EXTEND_INREG: Res = WidenVecRes_InregOp(N); break;
   case ISD::VSELECT:
   case ISD::SELECT:
-  case ISD::CTSELECT:
+  case ISD::CT_SELECT:
   case ISD::VP_SELECT:
   case ISD::VP_MERGE:
     Res = WidenVecRes_Select(N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 2a9ec7b5d26ae..156d82e96b2ae 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6873,8 +6873,9 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     assert(!CondVT.isVector() && "Vector type cond not supported yet");
 
     // Handle scalar types
-    if (TLI.isCtSelectSupported(VT) && !CondVT.isVector()) {
-      SDValue Result = DAG.getNode(ISD::CTSELECT, DL, VT, Cond, A, B);
+    if (TLI.isOperationLegalOrCustom(ISD::CT_SELECT, VT) &&
+        !CondVT.isVector()) {
+      SDValue Result = DAG.getNode(ISD::CT_SELECT, DL, VT, Cond, A, B);
       setValue(&I, Result);
       return;
     }
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 81aa3867d7b3a..1242b5c05dc0e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -340,7 +340,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::FPOWI:                      return "fpowi";
   case ISD::STRICT_FPOWI:               return "strict_fpowi";
   case ISD::SETCC:                      return "setcc";
-  case ISD::CTSELECT:                   return "ctselect";
+  case ISD::CT_SELECT:                   return "ct_select";
   case ISD::SETCCCARRY:                 return "setcccarry";
   case ISD::STRICT_FSETCC:              return "strict_fsetcc";
   case ISD::STRICT_FSETCCS:             return "strict_fsetccs";
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 5fb5e11e111bb..22bbc5297c709 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -1108,7 +1108,7 @@ void TargetLoweringBase::initActions() {
                         ISD::FASIN,          ISD::FATAN,
                         ISD::FCOSH,          ISD::FSINH,
                         ISD::FTANH,          ISD::FATAN2,
-                        ISD::FMULADD},
+                        ISD::FMULADD,        ISD::CT_SELECT},
                        VT, Expand);
 
     // Overflow operations default to expand
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 3b6ab468bae4e..7f4f28ebf1ade 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -159,8 +159,6 @@ class AArch64TargetLowering : public TargetLowering {
   EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
                          EVT VT) const override;
 
-  bool isCtSelectSupported(EVT VT) const override { return false; }
-
   SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const;
 
   MachineBasicBlock *EmitF128CSEL(MachineInstr &MI,
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 53471bde78c43..e58d872c548e4 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -119,8 +119,6 @@ class VectorType;
       return (Kind != ScalarCondVectorVal);
     }
 
-    bool isCtSelectSupported(EVT VT) const override { return false; }
-
     bool isReadOnly(const GlobalValue *GV) const;
 
     /// getSetCCResultType - Return the value type to use for ISD::SETCC.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 036cc8c5d3b43..fc16053caa705 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1108,8 +1108,6 @@ namespace llvm {
     unsigned getJumpTableEncoding() const override;
     bool useSoftFloat() const override;
 
-    bool isCtSelectSupported(EVT VT) const override { return false; }
-
     void markLibCallAttributes(MachineFunction *MF, unsigned CC,
                                ArgListTy &Args) const override;
 
diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback.ll
index c624c17d7e33e..d4617c7e75da7 100644
--- a/llvm/test/CodeGen/RISCV/ctselect-fallback.ll
+++ b/llvm/test/CodeGen/RISCV/ctselect-fallback.ll
@@ -203,8 +203,8 @@ define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) {
   ret i32 %result
 }
 
-; Test nested CTSELECT pattern with AND merging on i1 values
-; Pattern: ctselect C0, (ctselect C1, X, Y), Y -> ctselect (C0 & C1), X, Y
+; Test nested CT_SELECT pattern with AND merging on i1 values
+; Pattern: ct_select C0, (ct_select C1, X, Y), Y -> ct_select (C0 & C1), X, Y
 define i32 @test_ctselect_nested_and_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
 ; RV64-LABEL: test_ctselect_nested_and_i1_to_i32:
 ; RV64:       # %bb.0:
@@ -231,8 +231,8 @@ define i32 @test_ctselect_nested_and_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
   ret i32 %result
 }
 
-; Test nested CTSELECT pattern with OR merging on i1 values
-; Pattern: ctselect C0, X, (ctselect C1, X, Y) -> ctselect (C0 | C1), X, Y
+; Test nested CT_SELECT pattern with OR merging on i1 values
+; Pattern: ct_select C0, X, (ct_select C1, X, Y) -> ct_select (C0 | C1), X, Y
 define i32 @test_ctselect_nested_or_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
 ; RV64-LABEL: test_ctselect_nested_or_i1_to_i32:
 ; RV64:       # %bb.0:
@@ -259,9 +259,9 @@ define i32 @test_ctselect_nested_or_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
   ret i32 %result
 }
 
-; Test double nested CTSELECT with recursive AND merging
-; Pattern: ctselect C0, (ctselect C1, (ctselect C2, X, Y), Y), Y
-;   -> ctselect (C0 & C1 & C2), X, Y
+; Test double nested CT_SELECT with recursive AND merging
+; Pattern: ct_select C0, (ct_select C1, (ct_select C2, X, Y), Y), Y
+;   -> ct_select (C0 & C1 & C2), X, Y
 define i32 @test_ctselect_double_nested_and_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x, i32 %y) {
 ; RV64-LABEL: test_ctselect_double_nested_and_i1:
 ; RV64:       # %bb.0:
@@ -291,7 +291,7 @@ define i32 @test_ctselect_double_nested_and_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x, i
   ret i32 %result
 }
 
-; Test double nested CTSELECT with mixed AND/OR patterns
+; Test double nested CT_SELECT with mixed AND/OR patterns
 define i32 @test_ctselect_double_nested_mixed_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x, i32 %y, i32 %z) {
 ; RV64-LABEL: test_ctselect_double_nested_mixed_i1:
 ; RV64:       # %bb.0:
@@ -329,7 +329,7 @@ define i32 @test_ctselect_double_nested_mixed_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x,
   ret i32 %result
 }
 
-; Test nested ctselect calls
+; Test nested ct_select calls
 define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) {
 ; RV64-LABEL: test_ctselect_nested:
 ; RV64:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/ctselect.ll b/llvm/test/CodeGen/X86/ctselect.ll
index a970fd8933b9b..bf65e04721df1 100644
--- a/llvm/test/CodeGen/X86/ctselect.ll
+++ b/llvm/test/CodeGen/X86/ctselect.ll
@@ -552,7 +552,7 @@ define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) {
   ret i32 %result
 }
 
-; Test nested ctselect calls
+; Test nested ct_select calls
 define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) {
 ; X64-LABEL: test_ctselect_nested:
 ; X64:       # %bb.0:
@@ -631,8 +631,8 @@ define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) {
   ret i32 %result
 }
 
-; Test nested CTSELECT pattern with AND merging on i1 values
-; Pattern: ctselect C0, (ctselect C1, X, Y), Y -> ctselect (C0 & C1), X, Y
+; Test nested CT_SELECT pattern with AND merging on i1 values
+; Pattern: ct_select C0, (ct_select C1, X, Y), Y -> ct_select (C0 & C1), X, Y
 ; This optimization only applies when selecting between i1 values (boolean logic)
 define i32 @test_ctselect_nested_and_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
 ; X64-LABEL: test_ctselect_nested_and_i1_to_i32:
@@ -679,8 +679,8 @@ define i32 @test_ctselect_nested_and_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
   ret i32 %result
 }
 
-; Test nested CTSELECT pattern with OR merging on i1 values
-; Pattern: ctselect C0, X, (ctselect C1, X, Y) -> ctselect (C0 | C1), X, Y
+; Test nested CT_SELECT pattern with OR merging on i1 values
+; Pattern: ct_select C0, X, (ct_select C1, X, Y) -> ct_select (C0 | C1), X, Y
 ; This optimization only applies when selecting between i1 values (boolean logic)
 define i32 @test_ctselect_nested_or_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
 ; X64-LABEL: test_ctselect_nested_or_i1_to_i32:
@@ -727,10 +727,10 @@ define i32 @test_ctselect_nested_or_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
   ret i32 %result
 }
 
-; Test double nested CTSELECT with recursive AND merging
-; Pattern: ctselect C0, (ctselect C1, (ctselect C2, X, Y), Y), Y
-;   -> ctselect C0, (ctselect (C1 & C2), X, Y), Y
-;   -> ctselect (C0 & (C1 & C2)), X, Y
+; Test double nested CT_SELECT with recursive AND merging
+; Pattern: ct_select C0, (ct_select C1, (ct_select C2, X, Y), Y), Y
+;   -> ct_select C0, (ct_select (C1 & C2), X, Y), Y
+;   -> ct_select (C0 & (C1 & C2)), X, Y
 ; This tests that the optimization can be applied recursively
 define i32 @test_ctselect_double_nested_and_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x, i32 %y) {
 ; X64-LABEL: test_ctselect_double_nested_and_i1:
@@ -781,10 +781,10 @@ define i32 @test_ctselect_double_nested_and_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x, i
   ret i32 %result
 }
 
-; Vector CTSELECT Tests
+; Vector CT_SELECT Tests
 ; ============================================================================
 
-; Test vector CTSELECT with v4i32 (128-bit vector with single i1 mask)
+; Test vector CT_SELECT with v4i32 (128-bit vector with single i1 mask)
 ; NOW CONSTANT-TIME: Uses bitwise XOR/AND operations instead of branches!
 define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) {
 ; X64-LABEL: test_ctselect_v4i32:

>From cea17086440ee6deb04d8b2461812dc86423434b Mon Sep 17 00:00:00 2001
From: wizardengineer <juliuswoosebert at gmail.com>
Date: Sat, 7 Mar 2026 15:36:09 -0500
Subject: [PATCH 6/6] [ConstantTime] Fix CT_SELECT expansion to preserve
 constant-time guarantees

Create CT_SELECT nodes for scalar types regardless of target support, so
they survive DAGCombiner (visitCT_SELECT is conservative). Expand to
AND/OR/XOR during operation legalization after SETCC is lowered, preventing
the sext(setcc)->select fold chain that converts constant-time patterns
into data-dependent conditional moves (e.g. movn/movz on MIPS).

The mask uses SUB(0, AND(Cond, 1)) instead of SIGN_EXTEND because type
legalization already promoted i1 to the SetCC result type, making
SIGN_EXTEND a no-op for same-width types.
---
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp |  20 +-
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  16 +-
 llvm/test/CodeGen/RISCV/ctselect-fallback.ll  |  22 +-
 llvm/test/CodeGen/X86/ctselect.ll             | 259 ++++++++++--------
 4 files changed, 179 insertions(+), 138 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 81f992678626c..74446c8742e62 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -4369,14 +4369,18 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
                                                 Node->getFlags()));
     } else {
       assert(VT.isInteger());
-      EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
-      auto [Tmp2Lo, Tmp2Hi] = DAG.SplitScalar(Tmp2, dl, HalfVT, HalfVT);
-      auto [Tmp3Lo, Tmp3Hi] = DAG.SplitScalar(Tmp3, dl, HalfVT, HalfVT);
-      SDValue ResLo =
-          DAG.getCTSelect(dl, HalfVT, Tmp1, Tmp2Lo, Tmp3Lo, Node->getFlags());
-      SDValue ResHi =
-          DAG.getCTSelect(dl, HalfVT, Tmp1, Tmp2Hi, Tmp3Hi, Node->getFlags());
-      Tmp1 = DAG.getNode(ISD::BUILD_PAIR, dl, VT, ResLo, ResHi);
+      // Expand: Result = F ^ ((T ^ F) & Mask), Mask = 0 - (Cond & 1).
+      // SUB+AND creates the mask because i1 is already type-promoted;
+      // SIGN_EXTEND(i32, i32) would be a no-op leaving mask as 0/1.
+      SDValue Cond = Tmp1;
+      if (Cond.getValueType() != VT)
+        Cond = DAG.getNode(ISD::ANY_EXTEND, dl, VT, Cond);
+      SDValue Mask = DAG.getNode(
+          ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT),
+          DAG.getNode(ISD::AND, dl, VT, Cond, DAG.getConstant(1, dl, VT)));
+      SDValue Diff = DAG.getNode(ISD::XOR, dl, VT, Tmp2, Tmp3);
+      Tmp1 = DAG.getNode(ISD::XOR, dl, VT, Tmp3,
+                          DAG.getNode(ISD::AND, dl, VT, Diff, Mask));
       Tmp1->setFlags(Node->getFlags());
     }
     Results.push_back(Tmp1);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 156d82e96b2ae..d9ceacc7c98f6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6872,9 +6872,19 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     // assert if Cond type is Vector
     assert(!CondVT.isVector() && "Vector type cond not supported yet");
 
-    // Handle scalar types
-    if (TLI.isOperationLegalOrCustom(ISD::CT_SELECT, VT) &&
-        !CondVT.isVector()) {
+    // Create a CT_SELECT node for scalar types so it survives DAGCombiner
+    // (visitCT_SELECT is conservative) and expands to AND/OR/XOR during
+    // operation legalization, after SETCC is lowered. Unsupported vectors
+    // and floats with illegal integer equivalents (e.g. f64 on i386) use
+    // the inline fallback which runs before type legalization.
+    bool CreateNode =
+        TLI.isOperationLegalOrCustom(ISD::CT_SELECT, VT) ||
+        (!VT.isVector() &&
+         (!VT.isFloatingPoint() ||
+          TLI.isTypeLegal(
+              EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()))));
+
+    if (CreateNode) {
       SDValue Result = DAG.getNode(ISD::CT_SELECT, DL, VT, Cond, A, B);
       setValue(&I, Result);
       return;
diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback.ll
index d4617c7e75da7..ee8072703ee31 100644
--- a/llvm/test/CodeGen/RISCV/ctselect-fallback.ll
+++ b/llvm/test/CodeGen/RISCV/ctselect-fallback.ll
@@ -101,8 +101,6 @@ define i32 @test_ctselect_const_true(i32 %a, i32 %b) {
 ;
 ; RV32-LABEL: test_ctselect_const_true:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    xor a0, a0, a1
-; RV32-NEXT:    xor a0, a1, a0
 ; RV32-NEXT:    ret
   %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b)
   ret i32 %result
@@ -208,7 +206,7 @@ define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) {
 define i32 @test_ctselect_nested_and_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
 ; RV64-LABEL: test_ctselect_nested_and_i1_to_i32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    and a0, a1, a0
+; RV64-NEXT:    and a0, a0, a1
 ; RV64-NEXT:    xor a2, a2, a3
 ; RV64-NEXT:    slli a0, a0, 63
 ; RV64-NEXT:    srai a0, a0, 63
@@ -218,7 +216,7 @@ define i32 @test_ctselect_nested_and_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
 ;
 ; RV32-LABEL: test_ctselect_nested_and_i1_to_i32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    and a0, a1, a0
+; RV32-NEXT:    and a0, a0, a1
 ; RV32-NEXT:    xor a2, a2, a3
 ; RV32-NEXT:    slli a0, a0, 31
 ; RV32-NEXT:    srai a0, a0, 31
@@ -265,8 +263,8 @@ define i32 @test_ctselect_nested_or_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
 define i32 @test_ctselect_double_nested_and_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x, i32 %y) {
 ; RV64-LABEL: test_ctselect_double_nested_and_i1:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    and a1, a2, a1
-; RV64-NEXT:    and a0, a1, a0
+; RV64-NEXT:    and a0, a0, a1
+; RV64-NEXT:    and a0, a0, a2
 ; RV64-NEXT:    xor a3, a3, a4
 ; RV64-NEXT:    slli a0, a0, 63
 ; RV64-NEXT:    srai a0, a0, 63
@@ -276,8 +274,8 @@ define i32 @test_ctselect_double_nested_and_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x, i
 ;
 ; RV32-LABEL: test_ctselect_double_nested_and_i1:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    and a1, a2, a1
-; RV32-NEXT:    and a0, a1, a0
+; RV32-NEXT:    and a0, a0, a1
+; RV32-NEXT:    and a0, a0, a2
 ; RV32-NEXT:    xor a3, a3, a4
 ; RV32-NEXT:    slli a0, a0, 31
 ; RV32-NEXT:    srai a0, a0, 31
@@ -295,7 +293,7 @@ define i32 @test_ctselect_double_nested_and_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x, i
 define i32 @test_ctselect_double_nested_mixed_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x, i32 %y, i32 %z) {
 ; RV64-LABEL: test_ctselect_double_nested_mixed_i1:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    and a0, a1, a0
+; RV64-NEXT:    and a0, a0, a1
 ; RV64-NEXT:    xor a3, a3, a4
 ; RV64-NEXT:    or a0, a0, a2
 ; RV64-NEXT:    slli a0, a0, 63
@@ -309,7 +307,7 @@ define i32 @test_ctselect_double_nested_mixed_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x,
 ;
 ; RV32-LABEL: test_ctselect_double_nested_mixed_i1:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    and a0, a1, a0
+; RV32-NEXT:    and a0, a0, a1
 ; RV32-NEXT:    xor a3, a3, a4
 ; RV32-NEXT:    or a0, a0, a2
 ; RV32-NEXT:    slli a0, a0, 31
@@ -382,7 +380,7 @@ define float @test_ctselect_f32_nan_inf(i1 %cond) {
 ; RV32-NEXT:    srai a0, a0, 31
 ; RV32-NEXT:    and a0, a0, a1
 ; RV32-NEXT:    lui a1, 522240
-; RV32-NEXT:    xor a0, a0, a1
+; RV32-NEXT:    or a0, a0, a1
 ; RV32-NEXT:    ret
   %result = call float @llvm.ct.select.f32(i1 %cond, float 0x7FF8000000000000, float 0x7FF0000000000000)
   ret float %result
@@ -398,7 +396,7 @@ define double @test_ctselect_f64_nan_inf(i1 %cond) {
 ; RV64-NEXT:    and a0, a0, a1
 ; RV64-NEXT:    li a1, 2047
 ; RV64-NEXT:    slli a1, a1, 52
-; RV64-NEXT:    xor a0, a0, a1
+; RV64-NEXT:    or a0, a0, a1
 ; RV64-NEXT:    ret
 ;
 ; RV32-LABEL: test_ctselect_f64_nan_inf:
diff --git a/llvm/test/CodeGen/X86/ctselect.ll b/llvm/test/CodeGen/X86/ctselect.ll
index bf65e04721df1..e1abae80cef4f 100644
--- a/llvm/test/CodeGen/X86/ctselect.ll
+++ b/llvm/test/CodeGen/X86/ctselect.ll
@@ -9,8 +9,8 @@ define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) {
 ; X64-LABEL: test_ctselect_i8:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    xorl %edx, %esi
 ; X64-NEXT:    andb $1, %al
+; X64-NEXT:    xorl %edx, %esi
 ; X64-NEXT:    negb %al
 ; X64-NEXT:    andb %sil, %al
 ; X64-NEXT:    xorb %dl, %al
@@ -20,10 +20,10 @@ define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) {
 ; X32-LABEL: test_ctselect_i8:
 ; X32:       # %bb.0:
 ; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    andb $1, %al
 ; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X32-NEXT:    xorb %cl, %dl
-; X32-NEXT:    andb $1, %al
 ; X32-NEXT:    negb %al
 ; X32-NEXT:    andb %dl, %al
 ; X32-NEXT:    xorb %cl, %al
@@ -32,10 +32,10 @@ define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) {
 ; X32-NOCMOV-LABEL: test_ctselect_i8:
 ; X32-NOCMOV:       # %bb.0:
 ; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    andb $1, %al
 ; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X32-NOCMOV-NEXT:    xorb %cl, %dl
-; X32-NOCMOV-NEXT:    andb $1, %al
 ; X32-NOCMOV-NEXT:    negb %al
 ; X32-NOCMOV-NEXT:    andb %dl, %al
 ; X32-NOCMOV-NEXT:    xorb %cl, %al
@@ -58,10 +58,11 @@ define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) {
 ; X32-LABEL: test_ctselect_i32:
 ; X32:       # %bb.0:
 ; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    andb $1, %al
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-NEXT:    xorl %ecx, %edx
-; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    movzbl %al, %eax
 ; X32-NEXT:    negl %eax
 ; X32-NEXT:    andl %edx, %eax
 ; X32-NEXT:    xorl %ecx, %eax
@@ -70,10 +71,11 @@ define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) {
 ; X32-NOCMOV-LABEL: test_ctselect_i32:
 ; X32-NOCMOV:       # %bb.0:
 ; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    andb $1, %al
 ; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-NOCMOV-NEXT:    xorl %ecx, %edx
-; X32-NOCMOV-NEXT:    andl $1, %eax
+; X32-NOCMOV-NEXT:    movzbl %al, %eax
 ; X32-NOCMOV-NEXT:    negl %eax
 ; X32-NOCMOV-NEXT:    andl %edx, %eax
 ; X32-NOCMOV-NEXT:    xorl %ecx, %eax
@@ -95,45 +97,57 @@ define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) {
 ;
 ; X32-LABEL: test_ctselect_i64:
 ; X32:       # %bb.0:
-; X32-NEXT:    pushl %esi
+; X32-NEXT:    pushl %edi
 ; X32-NEXT:    .cfi_def_cfa_offset 8
-; X32-NEXT:    .cfi_offset %esi, -8
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    .cfi_offset %esi, -12
+; X32-NEXT:    .cfi_offset %edi, -8
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    andb $1, %dl
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    xorl %edx, %eax
-; X32-NEXT:    andl $1, %esi
-; X32-NEXT:    negl %esi
-; X32-NEXT:    andl %esi, %eax
-; X32-NEXT:    xorl %edx, %eax
+; X32-NEXT:    xorl %esi, %eax
+; X32-NEXT:    movzbl %dl, %edi
+; X32-NEXT:    negl %edi
+; X32-NEXT:    andl %edi, %eax
+; X32-NEXT:    xorl %esi, %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-NEXT:    xorl %ecx, %edx
-; X32-NEXT:    andl %esi, %edx
+; X32-NEXT:    andl %edi, %edx
 ; X32-NEXT:    xorl %ecx, %edx
 ; X32-NEXT:    popl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    popl %edi
 ; X32-NEXT:    .cfi_def_cfa_offset 4
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_i64:
 ; X32-NOCMOV:       # %bb.0:
-; X32-NOCMOV-NEXT:    pushl %esi
+; X32-NOCMOV-NEXT:    pushl %edi
 ; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
-; X32-NOCMOV-NEXT:    .cfi_offset %esi, -8
-; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %esi
-; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    pushl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT:    .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT:    .cfi_offset %edi, -8
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    andb $1, %dl
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    xorl %edx, %eax
-; X32-NOCMOV-NEXT:    andl $1, %esi
-; X32-NOCMOV-NEXT:    negl %esi
-; X32-NOCMOV-NEXT:    andl %esi, %eax
-; X32-NOCMOV-NEXT:    xorl %edx, %eax
+; X32-NOCMOV-NEXT:    xorl %esi, %eax
+; X32-NOCMOV-NEXT:    movzbl %dl, %edi
+; X32-NOCMOV-NEXT:    negl %edi
+; X32-NOCMOV-NEXT:    andl %edi, %eax
+; X32-NOCMOV-NEXT:    xorl %esi, %eax
 ; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-NOCMOV-NEXT:    xorl %ecx, %edx
-; X32-NOCMOV-NEXT:    andl %esi, %edx
+; X32-NOCMOV-NEXT:    andl %edi, %edx
 ; X32-NOCMOV-NEXT:    xorl %ecx, %edx
 ; X32-NOCMOV-NEXT:    popl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    popl %edi
 ; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
 ; X32-NOCMOV-NEXT:    retl
   %result = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b)
@@ -155,37 +169,47 @@ define float @test_ctselect_f32(i1 %cond, float %a, float %b) {
 ;
 ; X32-LABEL: test_ctselect_f32:
 ; X32:       # %bb.0:
-; X32-NEXT:    pushl %eax
-; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    subl $12, %esp
+; X32-NEXT:    .cfi_def_cfa_offset 16
+; X32-NEXT:    flds {{[0-9]+}}(%esp)
+; X32-NEXT:    fstps {{[0-9]+}}(%esp)
+; X32-NEXT:    flds {{[0-9]+}}(%esp)
+; X32-NEXT:    fstps (%esp)
 ; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    andb $1, %al
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    xorl %ecx, %edx
-; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    movzbl %al, %eax
 ; X32-NEXT:    negl %eax
-; X32-NEXT:    andl %edx, %eax
-; X32-NEXT:    xorl %ecx, %eax
-; X32-NEXT:    movl %eax, (%esp)
-; X32-NEXT:    flds (%esp)
-; X32-NEXT:    popl %eax
+; X32-NEXT:    movl (%esp), %edx
+; X32-NEXT:    xorl %ecx, %edx
+; X32-NEXT:    andl %eax, %edx
+; X32-NEXT:    xorl %ecx, %edx
+; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NEXT:    flds {{[0-9]+}}(%esp)
+; X32-NEXT:    addl $12, %esp
 ; X32-NEXT:    .cfi_def_cfa_offset 4
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_f32:
 ; X32-NOCMOV:       # %bb.0:
-; X32-NOCMOV-NEXT:    pushl %eax
-; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    subl $12, %esp
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT:    fstps {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT:    fstps (%esp)
 ; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    andb $1, %al
 ; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT:    xorl %ecx, %edx
-; X32-NOCMOV-NEXT:    andl $1, %eax
+; X32-NOCMOV-NEXT:    movzbl %al, %eax
 ; X32-NOCMOV-NEXT:    negl %eax
-; X32-NOCMOV-NEXT:    andl %edx, %eax
-; X32-NOCMOV-NEXT:    xorl %ecx, %eax
-; X32-NOCMOV-NEXT:    movl %eax, (%esp)
-; X32-NOCMOV-NEXT:    flds (%esp)
-; X32-NOCMOV-NEXT:    popl %eax
+; X32-NOCMOV-NEXT:    movl (%esp), %edx
+; X32-NOCMOV-NEXT:    xorl %ecx, %edx
+; X32-NOCMOV-NEXT:    andl %eax, %edx
+; X32-NOCMOV-NEXT:    xorl %ecx, %edx
+; X32-NOCMOV-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT:    addl $12, %esp
 ; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
 ; X32-NOCMOV-NEXT:    retl
   %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b)
@@ -281,10 +305,11 @@ define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) {
 ; X32-LABEL: test_ctselect_ptr:
 ; X32:       # %bb.0:
 ; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    andb $1, %al
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-NEXT:    xorl %ecx, %edx
-; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    movzbl %al, %eax
 ; X32-NEXT:    negl %eax
 ; X32-NEXT:    andl %edx, %eax
 ; X32-NEXT:    xorl %ecx, %eax
@@ -293,10 +318,11 @@ define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) {
 ; X32-NOCMOV-LABEL: test_ctselect_ptr:
 ; X32-NOCMOV:       # %bb.0:
 ; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    andb $1, %al
 ; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-NOCMOV-NEXT:    xorl %ecx, %edx
-; X32-NOCMOV-NEXT:    andl $1, %eax
+; X32-NOCMOV-NEXT:    movzbl %al, %eax
 ; X32-NOCMOV-NEXT:    negl %eax
 ; X32-NOCMOV-NEXT:    andl %edx, %eax
 ; X32-NOCMOV-NEXT:    xorl %ecx, %eax
@@ -310,24 +336,16 @@ define i32 @test_ctselect_const_true(i32 %a, i32 %b) {
 ; X64-LABEL: test_ctselect_const_true:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    xorl %esi, %eax
-; X64-NEXT:    xorl %esi, %eax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: test_ctselect_const_true:
 ; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    xorl %ecx, %eax
-; X32-NEXT:    xorl %ecx, %eax
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_const_true:
 ; X32-NOCMOV:       # %bb.0:
-; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    xorl %ecx, %eax
-; X32-NOCMOV-NEXT:    xorl %ecx, %eax
 ; X32-NOCMOV-NEXT:    retl
   %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b)
   ret i32 %result
@@ -341,14 +359,12 @@ define i32 @test_ctselect_const_false(i32 %a, i32 %b) {
 ;
 ; X32-LABEL: test_ctselect_const_false:
 ; X32:       # %bb.0:
-; X32-NEXT:    xorl %eax, %eax
-; X32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_const_false:
 ; X32-NOCMOV:       # %bb.0:
-; X32-NOCMOV-NEXT:    xorl %eax, %eax
-; X32-NOCMOV-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NOCMOV-NEXT:    retl
   %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b)
   ret i32 %result
@@ -443,19 +459,20 @@ define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) {
 define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) {
 ; X64-LABEL: test_ctselect_fcmp_oeq:
 ; X64:       # %bb.0:
-; X64-NEXT:    movd %xmm3, %eax
 ; X64-NEXT:    cmpeqss %xmm1, %xmm0
-; X64-NEXT:    pxor %xmm3, %xmm2
-; X64-NEXT:    pand %xmm0, %xmm2
-; X64-NEXT:    movd %xmm2, %ecx
-; X64-NEXT:    xorl %eax, %ecx
-; X64-NEXT:    movd %ecx, %xmm0
+; X64-NEXT:    xorps %xmm3, %xmm2
+; X64-NEXT:    andps %xmm2, %xmm0
+; X64-NEXT:    xorps %xmm3, %xmm0
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: test_ctselect_fcmp_oeq:
 ; X32:       # %bb.0:
-; X32-NEXT:    pushl %eax
-; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    subl $12, %esp
+; X32-NEXT:    .cfi_def_cfa_offset 16
+; X32-NEXT:    flds {{[0-9]+}}(%esp)
+; X32-NEXT:    fstps {{[0-9]+}}(%esp)
+; X32-NEXT:    flds {{[0-9]+}}(%esp)
+; X32-NEXT:    fstps (%esp)
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    flds {{[0-9]+}}(%esp)
 ; X32-NEXT:    flds {{[0-9]+}}(%esp)
@@ -466,20 +483,24 @@ define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) {
 ; X32-NEXT:    andb %cl, %dl
 ; X32-NEXT:    movzbl %dl, %ecx
 ; X32-NEXT:    negl %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl (%esp), %edx
 ; X32-NEXT:    xorl %eax, %edx
 ; X32-NEXT:    andl %ecx, %edx
 ; X32-NEXT:    xorl %eax, %edx
-; X32-NEXT:    movl %edx, (%esp)
-; X32-NEXT:    flds (%esp)
-; X32-NEXT:    popl %eax
+; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NEXT:    flds {{[0-9]+}}(%esp)
+; X32-NEXT:    addl $12, %esp
 ; X32-NEXT:    .cfi_def_cfa_offset 4
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_fcmp_oeq:
 ; X32-NOCMOV:       # %bb.0:
-; X32-NOCMOV-NEXT:    pushl %eax
-; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    subl $12, %esp
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT:    fstps {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT:    fstps (%esp)
 ; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X32-NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
@@ -492,13 +513,13 @@ define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) {
 ; X32-NOCMOV-NEXT:    andb %al, %dl
 ; X32-NOCMOV-NEXT:    movzbl %dl, %eax
 ; X32-NOCMOV-NEXT:    negl %eax
-; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    movl (%esp), %edx
 ; X32-NOCMOV-NEXT:    xorl %ecx, %edx
 ; X32-NOCMOV-NEXT:    andl %eax, %edx
 ; X32-NOCMOV-NEXT:    xorl %ecx, %edx
-; X32-NOCMOV-NEXT:    movl %edx, (%esp)
-; X32-NOCMOV-NEXT:    flds (%esp)
-; X32-NOCMOV-NEXT:    popl %eax
+; X32-NOCMOV-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT:    addl $12, %esp
 ; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
 ; X32-NOCMOV-NEXT:    retl
   %cond = fcmp oeq float %x, %y
@@ -522,12 +543,13 @@ define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) {
 ; X32-LABEL: test_ctselect_load:
 ; X32:       # %bb.0:
 ; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    andb $1, %al
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-NEXT:    movl (%edx), %edx
 ; X32-NEXT:    movl (%ecx), %ecx
 ; X32-NEXT:    xorl %edx, %ecx
-; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    movzbl %al, %eax
 ; X32-NEXT:    negl %eax
 ; X32-NEXT:    andl %ecx, %eax
 ; X32-NEXT:    xorl %edx, %eax
@@ -536,12 +558,13 @@ define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) {
 ; X32-NOCMOV-LABEL: test_ctselect_load:
 ; X32-NOCMOV:       # %bb.0:
 ; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    andb $1, %al
 ; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-NOCMOV-NEXT:    movl (%edx), %edx
 ; X32-NOCMOV-NEXT:    movl (%ecx), %ecx
 ; X32-NOCMOV-NEXT:    xorl %edx, %ecx
-; X32-NOCMOV-NEXT:    andl $1, %eax
+; X32-NOCMOV-NEXT:    movzbl %al, %eax
 ; X32-NOCMOV-NEXT:    negl %eax
 ; X32-NOCMOV-NEXT:    andl %ecx, %eax
 ; X32-NOCMOV-NEXT:    xorl %edx, %eax
@@ -578,17 +601,19 @@ define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) {
 ; X32-NEXT:    .cfi_offset %esi, -12
 ; X32-NEXT:    .cfi_offset %edi, -8
 ; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    andb $1, %al
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %ah
+; X32-NEXT:    andb $1, %ah
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %esi
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT:    xorl %edx, %edi
-; X32-NEXT:    andl $1, %esi
-; X32-NEXT:    negl %esi
-; X32-NEXT:    andl %edi, %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    xorl %edx, %esi
+; X32-NEXT:    movzbl %ah, %edi
+; X32-NEXT:    negl %edi
+; X32-NEXT:    andl %esi, %edi
 ; X32-NEXT:    xorl %ecx, %edx
-; X32-NEXT:    xorl %esi, %edx
-; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    xorl %edi, %edx
+; X32-NEXT:    movzbl %al, %eax
 ; X32-NEXT:    negl %eax
 ; X32-NEXT:    andl %edx, %eax
 ; X32-NEXT:    xorl %ecx, %eax
@@ -607,17 +632,19 @@ define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) {
 ; X32-NOCMOV-NEXT:    .cfi_offset %esi, -12
 ; X32-NOCMOV-NEXT:    .cfi_offset %edi, -8
 ; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    andb $1, %al
+; X32-NOCMOV-NEXT:    movb {{[0-9]+}}(%esp), %ah
+; X32-NOCMOV-NEXT:    andb $1, %ah
 ; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %esi
 ; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X32-NOCMOV-NEXT:    xorl %edx, %edi
-; X32-NOCMOV-NEXT:    andl $1, %esi
-; X32-NOCMOV-NEXT:    negl %esi
-; X32-NOCMOV-NEXT:    andl %edi, %esi
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NOCMOV-NEXT:    xorl %edx, %esi
+; X32-NOCMOV-NEXT:    movzbl %ah, %edi
+; X32-NOCMOV-NEXT:    negl %edi
+; X32-NOCMOV-NEXT:    andl %esi, %edi
 ; X32-NOCMOV-NEXT:    xorl %ecx, %edx
-; X32-NOCMOV-NEXT:    xorl %esi, %edx
-; X32-NOCMOV-NEXT:    andl $1, %eax
+; X32-NOCMOV-NEXT:    xorl %edi, %edx
+; X32-NOCMOV-NEXT:    movzbl %al, %eax
 ; X32-NOCMOV-NEXT:    negl %eax
 ; X32-NOCMOV-NEXT:    andl %edx, %eax
 ; X32-NOCMOV-NEXT:    xorl %ecx, %eax
@@ -651,10 +678,10 @@ define i32 @test_ctselect_nested_and_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    andb {{[0-9]+}}(%esp), %al
-; X32-NEXT:    movzbl %al, %eax
+; X32-NEXT:    andb $1, %al
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-NEXT:    xorl %ecx, %edx
-; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    movzbl %al, %eax
 ; X32-NEXT:    negl %eax
 ; X32-NEXT:    andl %edx, %eax
 ; X32-NEXT:    xorl %ecx, %eax
@@ -665,10 +692,10 @@ define i32 @test_ctselect_nested_and_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
 ; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X32-NOCMOV-NEXT:    andb {{[0-9]+}}(%esp), %al
-; X32-NOCMOV-NEXT:    movzbl %al, %eax
+; X32-NOCMOV-NEXT:    andb $1, %al
 ; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-NOCMOV-NEXT:    xorl %ecx, %edx
-; X32-NOCMOV-NEXT:    andl $1, %eax
+; X32-NOCMOV-NEXT:    movzbl %al, %eax
 ; X32-NOCMOV-NEXT:    negl %eax
 ; X32-NOCMOV-NEXT:    andl %edx, %eax
 ; X32-NOCMOV-NEXT:    xorl %ecx, %eax
@@ -699,10 +726,10 @@ define i32 @test_ctselect_nested_or_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    orb {{[0-9]+}}(%esp), %al
-; X32-NEXT:    movzbl %al, %eax
+; X32-NEXT:    andb $1, %al
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-NEXT:    xorl %ecx, %edx
-; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    movzbl %al, %eax
 ; X32-NEXT:    negl %eax
 ; X32-NEXT:    andl %edx, %eax
 ; X32-NEXT:    xorl %ecx, %eax
@@ -713,10 +740,10 @@ define i32 @test_ctselect_nested_or_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
 ; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X32-NOCMOV-NEXT:    orb {{[0-9]+}}(%esp), %al
-; X32-NOCMOV-NEXT:    movzbl %al, %eax
+; X32-NOCMOV-NEXT:    andb $1, %al
 ; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-NOCMOV-NEXT:    xorl %ecx, %edx
-; X32-NOCMOV-NEXT:    andl $1, %eax
+; X32-NOCMOV-NEXT:    movzbl %al, %eax
 ; X32-NOCMOV-NEXT:    negl %eax
 ; X32-NOCMOV-NEXT:    andl %edx, %eax
 ; X32-NOCMOV-NEXT:    xorl %ecx, %eax
@@ -735,9 +762,9 @@ define i32 @test_ctselect_nested_or_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
 define i32 @test_ctselect_double_nested_and_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x, i32 %y) {
 ; X64-LABEL: test_ctselect_double_nested_and_i1:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andl %esi, %eax
 ; X64-NEXT:    andl %edx, %eax
-; X64-NEXT:    andl %edi, %eax
 ; X64-NEXT:    xorl %r8d, %ecx
 ; X64-NEXT:    andl $1, %eax
 ; X64-NEXT:    negl %eax
@@ -751,10 +778,10 @@ define i32 @test_ctselect_double_nested_and_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x, i
 ; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    andb {{[0-9]+}}(%esp), %al
 ; X32-NEXT:    andb {{[0-9]+}}(%esp), %al
-; X32-NEXT:    movzbl %al, %eax
+; X32-NEXT:    andb $1, %al
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-NEXT:    xorl %ecx, %edx
-; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    movzbl %al, %eax
 ; X32-NEXT:    negl %eax
 ; X32-NEXT:    andl %edx, %eax
 ; X32-NEXT:    xorl %ecx, %eax
@@ -766,10 +793,10 @@ define i32 @test_ctselect_double_nested_and_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x, i
 ; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X32-NOCMOV-NEXT:    andb {{[0-9]+}}(%esp), %al
 ; X32-NOCMOV-NEXT:    andb {{[0-9]+}}(%esp), %al
-; X32-NOCMOV-NEXT:    movzbl %al, %eax
+; X32-NOCMOV-NEXT:    andb $1, %al
 ; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-NOCMOV-NEXT:    xorl %ecx, %edx
-; X32-NOCMOV-NEXT:    andl $1, %eax
+; X32-NOCMOV-NEXT:    movzbl %al, %eax
 ; X32-NOCMOV-NEXT:    negl %eax
 ; X32-NOCMOV-NEXT:    andl %edx, %eax
 ; X32-NOCMOV-NEXT:    xorl %ecx, %eax
@@ -1403,7 +1430,7 @@ define float @test_ctselect_f32_nan_inf(i1 %cond) {
 ; X64-NEXT:    andl $1, %edi
 ; X64-NEXT:    negl %edi
 ; X64-NEXT:    andl $4194304, %edi # imm = 0x400000
-; X64-NEXT:    xorl $2139095040, %edi # imm = 0x7F800000
+; X64-NEXT:    orl $2139095040, %edi # imm = 0x7F800000
 ; X64-NEXT:    movd %edi, %xmm0
 ; X64-NEXT:    retq
 ;
@@ -1412,10 +1439,11 @@ define float @test_ctselect_f32_nan_inf(i1 %cond) {
 ; X32-NEXT:    pushl %eax
 ; X32-NEXT:    .cfi_def_cfa_offset 8
 ; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    andb $1, %al
+; X32-NEXT:    movzbl %al, %eax
 ; X32-NEXT:    negl %eax
 ; X32-NEXT:    andl $4194304, %eax # imm = 0x400000
-; X32-NEXT:    xorl $2139095040, %eax # imm = 0x7F800000
+; X32-NEXT:    orl $2139095040, %eax # imm = 0x7F800000
 ; X32-NEXT:    movl %eax, (%esp)
 ; X32-NEXT:    flds (%esp)
 ; X32-NEXT:    popl %eax
@@ -1427,10 +1455,11 @@ define float @test_ctselect_f32_nan_inf(i1 %cond) {
 ; X32-NOCMOV-NEXT:    pushl %eax
 ; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
 ; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    andl $1, %eax
+; X32-NOCMOV-NEXT:    andb $1, %al
+; X32-NOCMOV-NEXT:    movzbl %al, %eax
 ; X32-NOCMOV-NEXT:    negl %eax
 ; X32-NOCMOV-NEXT:    andl $4194304, %eax # imm = 0x400000
-; X32-NOCMOV-NEXT:    xorl $2139095040, %eax # imm = 0x7F800000
+; X32-NOCMOV-NEXT:    orl $2139095040, %eax # imm = 0x7F800000
 ; X32-NOCMOV-NEXT:    movl %eax, (%esp)
 ; X32-NOCMOV-NEXT:    flds (%esp)
 ; X32-NOCMOV-NEXT:    popl %eax
@@ -1449,7 +1478,7 @@ define double @test_ctselect_f64_nan_inf(i1 %cond) {
 ; X64-NEXT:    movabsq $2251799813685248, %rax # imm = 0x8000000000000
 ; X64-NEXT:    andq %rdi, %rax
 ; X64-NEXT:    movabsq $9218868437227405312, %rcx # imm = 0x7FF0000000000000
-; X64-NEXT:    xorq %rax, %rcx
+; X64-NEXT:    orq %rax, %rcx
 ; X64-NEXT:    movq %rcx, %xmm0
 ; X64-NEXT:    retq
 ;



More information about the llvm-commits mailing list