[llvm-branch-commits] [llvm] [ConstantTime] Native ct.select support for X86 and i386 (PR #166704)
Julius Alexandre via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Sat Mar 7 13:34:29 PST 2026
https://github.com/wizardengineer updated https://github.com/llvm/llvm-project/pull/166704
>From cca6da444d2edc19dc58cd4376db2c82dce6ccc8 Mon Sep 17 00:00:00 2001
From: wizardengineer <juliuswoosebert at gmail.com>
Date: Wed, 5 Nov 2025 17:09:23 -0500
Subject: [PATCH 1/2] [LLVM][X86] Add native ct.select support for X86 and i386
Add native X86 implementation with CMOV instructions and comprehensive tests:
- X86 ISelLowering with CMOV for x86_64 and i386
- Fallback bitwise operations for i386 targets without CMOV
- Post-RA expansion for pseudo-instructions
- Comprehensive test coverage:
  - Edge cases (zero conditions, large integers)
  - i386-specific tests (FP, MMX, non-CMOV fallback)
  - Vector operations
  - Optimization patterns
The basic test demonstrating fallback is in the core infrastructure PR.
---
llvm/lib/Target/X86/X86.td | 8 +-
llvm/lib/Target/X86/X86ISelLowering.cpp | 791 ++++-
llvm/lib/Target/X86/X86ISelLowering.h | 7 +
llvm/lib/Target/X86/X86InstrCMovSetCC.td | 205 ++
llvm/lib/Target/X86/X86InstrCompiler.td | 81 +
llvm/lib/Target/X86/X86InstrFragments.td | 5 +
llvm/lib/Target/X86/X86InstrInfo.cpp | 609 +++-
llvm/lib/Target/X86/X86InstrInfo.h | 6 +
llvm/lib/Target/X86/X86InstrPredicates.td | 5 +
llvm/lib/Target/X86/X86TargetMachine.cpp | 5 +-
llvm/test/CodeGen/X86/ctselect-edge-cases.ll | 409 +++
llvm/test/CodeGen/X86/ctselect-i386-fp.ll | 722 ++++
llvm/test/CodeGen/X86/ctselect-i386-mmx.ll | 428 +++
llvm/test/CodeGen/X86/ctselect-i386.ll | 267 ++
.../test/CodeGen/X86/ctselect-optimization.ll | 304 ++
llvm/test/CodeGen/X86/ctselect-vector.ll | 1274 +++++++
llvm/test/CodeGen/X86/ctselect.ll | 1825 ++++------
nasty-fix-constant.patch | 2994 +++++++++++++++++
18 files changed, 8700 insertions(+), 1245 deletions(-)
create mode 100644 llvm/test/CodeGen/X86/ctselect-edge-cases.ll
create mode 100644 llvm/test/CodeGen/X86/ctselect-i386-fp.ll
create mode 100644 llvm/test/CodeGen/X86/ctselect-i386-mmx.ll
create mode 100644 llvm/test/CodeGen/X86/ctselect-i386.ll
create mode 100644 llvm/test/CodeGen/X86/ctselect-optimization.ll
create mode 100644 llvm/test/CodeGen/X86/ctselect-vector.ll
create mode 100644 nasty-fix-constant.patch
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index eca763735c315..755a7070d84d3 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -841,9 +841,10 @@ include "X86SchedSapphireRapids.td"
def ProcessorFeatures {
// x86-64 micro-architecture levels: x86-64 and x86-64-v[234]
- list<SubtargetFeature> X86_64V1Features = [
- FeatureX87, FeatureCX8, FeatureCMOV, FeatureMMX, FeatureSSE2,
- FeatureFXSR, FeatureNOPL, FeatureX86_64,
+ list<SubtargetFeature> X86_64V1Features = [FeatureX87, FeatureCX8,
+ FeatureCMOV, FeatureMMX,
+ FeatureSSE2, FeatureFXSR,
+ FeatureNOPL, FeatureX86_64,
];
list<SubtargetFeature> X86_64V1Tuning = [
TuningMacroFusion,
@@ -1179,6 +1180,7 @@ def ProcessorFeatures {
FeatureAVXNECONVERT,
FeatureAVXVNNIINT8,
FeatureAVXVNNIINT16,
+ FeatureUSERMSR,
FeatureSHA512,
FeatureSM3,
FeatureEGPR,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1ebfd5defdc40..401c1953323f4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "X86ISelLowering.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
#include "MCTargetDesc/X86ShuffleDecode.h"
#include "X86.h"
#include "X86FrameLowering.h"
@@ -30,6 +31,8 @@
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -49,6 +52,7 @@
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/PatternMatch.h"
@@ -489,6 +493,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// X86 wants to expand cmov itself.
for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::CT_SELECT, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
@@ -497,6 +502,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::CT_SELECT, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
}
@@ -504,6 +510,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// Custom action for SELECT MMX and expand action for SELECT_CC MMX
setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
+ setOperationAction(ISD::CT_SELECT, MVT::x86mmx, Custom);
setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
@@ -633,6 +640,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::BR_CC, VT, Action);
setOperationAction(ISD::SETCC, VT, Action);
setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::CT_SELECT, VT, Custom);
setOperationAction(ISD::SELECT_CC, VT, Action);
setOperationAction(ISD::FROUND, VT, Action);
setOperationAction(ISD::FROUNDEVEN, VT, Action);
@@ -1079,6 +1087,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
+ setOperationAction(ISD::CT_SELECT, MVT::v4f32, Custom);
setOperationAction(ISD::FCANONICALIZE, MVT::v4f32, Custom);
setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
@@ -1247,6 +1256,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SELECT, MVT::v8f16, Custom);
setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
+ setOperationAction(ISD::CT_SELECT, MVT::v2f64, Custom);
+ setOperationAction(ISD::CT_SELECT, MVT::v2i64, Custom);
+ setOperationAction(ISD::CT_SELECT, MVT::v4i32, Custom);
+ setOperationAction(ISD::CT_SELECT, MVT::v8i16, Custom);
+ setOperationAction(ISD::CT_SELECT, MVT::v8f16, Custom);
+ setOperationAction(ISD::CT_SELECT, MVT::v16i8, Custom);
+
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
@@ -1576,6 +1592,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
+ setOperationAction(ISD::CT_SELECT, MVT::v4f64, Custom);
+ setOperationAction(ISD::CT_SELECT, MVT::v4i64, Custom);
+ setOperationAction(ISD::CT_SELECT, MVT::v8i32, Custom);
+ setOperationAction(ISD::CT_SELECT, MVT::v16i16, Custom);
+ setOperationAction(ISD::CT_SELECT, MVT::v16f16, Custom);
+ setOperationAction(ISD::CT_SELECT, MVT::v32i8, Custom);
+ setOperationAction(ISD::CT_SELECT, MVT::v8f32, Custom);
+
for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
@@ -1775,6 +1799,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
+ setOperationAction(ISD::CT_SELECT, MVT::v1i1, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
@@ -1820,6 +1845,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::CT_SELECT, VT, Custom);
setOperationAction(ISD::TRUNCATE, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
@@ -2099,6 +2125,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::CT_SELECT, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
@@ -2295,6 +2322,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::CT_SELECT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
@@ -2361,6 +2389,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::VSELECT, VT, Legal);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::CT_SELECT, VT, Custom);
setOperationAction(ISD::FNEG, VT, Custom);
setOperationAction(ISD::FABS, VT, Custom);
@@ -2630,6 +2659,22 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::x86amx, &X86::TILERegClass);
}
+ // Handle 512-bit vector CT_SELECT without AVX512 by setting them to Expand
+ // This allows type legalization to split them into smaller vectors
+ for (auto VT : {MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64, MVT::v32f16,
+ MVT::v16f32, MVT::v8f64}) {
+ setOperationAction(ISD::CT_SELECT, VT, Expand);
+ }
+
+ // Handle 256-bit vector CT_SELECT without AVX by setting them to Expand
+ // This allows type legalization to split them into 128-bit vectors
+ if (!Subtarget.hasAVX()) {
+ for (auto VT : {MVT::v4f64, MVT::v4i64, MVT::v8i32, MVT::v16i16,
+ MVT::v16f16, MVT::v32i8, MVT::v8f32}) {
+ setOperationAction(ISD::CT_SELECT, VT, Expand);
+ }
+ }
+
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
@@ -2736,6 +2781,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
ISD::BITCAST,
ISD::VSELECT,
ISD::SELECT,
+ ISD::CT_SELECT,
ISD::SHL,
ISD::SRA,
ISD::SRL,
@@ -25962,6 +26008,174 @@ static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl,
return V;
}
+SDValue X86TargetLowering::LowerCT_SELECT(SDValue Op, SelectionDAG &DAG) const {
+ SDValue Cond = Op.getOperand(0); // condition
+ SDValue TrueOp = Op.getOperand(1); // true_value
+ SDValue FalseOp = Op.getOperand(2); // false_value
+ SDLoc DL(Op);
+ MVT VT = TrueOp.getSimpleValueType();
+
+  // Special handling for i386 targets (no CMOV) - route to post-RA expansion
+  // pseudos. Let standard type legalization handle i64 automatically (splits
+  // into EDX:EAX).
+
+ // Handle soft float16 by converting to integer operations
+ if (isSoftF16(VT, Subtarget)) {
+ MVT NVT = VT.changeTypeToInteger();
+ SDValue CtSelect =
+ DAG.getNode(ISD::CT_SELECT, DL, NVT, Cond, DAG.getBitcast(NVT, FalseOp),
+ DAG.getBitcast(NVT, TrueOp));
+ return DAG.getBitcast(VT, CtSelect);
+ }
+
+ // Handle vector types
+ if (VT.isVector()) {
+ // Handle soft float16 vectors
+ if (isSoftF16(VT, Subtarget)) {
+ MVT NVT = VT.changeVectorElementTypeToInteger();
+ SDValue CtSelect = DAG.getNode(ISD::CT_SELECT, DL, NVT, Cond,
+ DAG.getBitcast(NVT, FalseOp),
+ DAG.getBitcast(NVT, TrueOp));
+ return DAG.getBitcast(VT, CtSelect);
+ }
+
+ unsigned VectorWidth = VT.getSizeInBits();
+ MVT EltVT = VT.getVectorElementType();
+
+ // 512-bit vectors without AVX512 are now handled by type legalization
+ // (Expand action) 256-bit vectors without AVX are now handled by type
+ // legalization (Expand action)
+
+ if (VectorWidth == 128 && !Subtarget.hasSSE1())
+ return SDValue();
+
+ // Handle special cases for floating point vectors
+ if (EltVT.isFloatingPoint()) {
+ // For vector floating point with AVX, use VBLENDV-style operations
+ if (Subtarget.hasAVX() && (VectorWidth == 256 || VectorWidth == 128)) {
+ // Convert to bitwise operations using the condition
+ MVT IntVT = VT.changeVectorElementTypeToInteger();
+ SDValue IntOp1 = DAG.getBitcast(IntVT, TrueOp);
+ SDValue IntOp2 = DAG.getBitcast(IntVT, FalseOp);
+
+ // Create the CT_SELECT node with integer types
+ SDValue IntResult =
+ DAG.getNode(X86ISD::CT_SELECT, DL, IntVT, IntOp2, IntOp1,
+ DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8),
+ EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget));
+ return DAG.getBitcast(VT, IntResult);
+ }
+ }
+
+ // For integer vectors or when we don't have advanced SIMD support,
+ // use the generic X86 CT_SELECT node which will be matched by the patterns
+ SDValue CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
+ SDValue EFLAGS = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
+ // Create the X86 CT_SELECT node - note operand order: true, false, cc, flags
+ return DAG.getNode(X86ISD::CT_SELECT, DL, VT, FalseOp, TrueOp, CC, EFLAGS);
+ }
+
+ // Look past (and (setcc_carry (cmp ...)), 1)
+ if (Cond.getOpcode() == ISD::AND &&
+ Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
+ isOneConstant(Cond.getOperand(1)))
+ Cond = Cond.getOperand(0);
+
+ /// Process condition flags and prepare for CT_SELECT node creation
+ auto ProcessConditionFlags =
+ [&](SDValue Cond, MVT VT, SDLoc DL, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) -> std::pair<SDValue, SDValue> {
+ SDValue CC;
+ bool AddTest = true;
+
+ unsigned CondOpcode = Cond.getOpcode();
+ if (CondOpcode == X86ISD::SETCC || CondOpcode == X86ISD::SETCC_CARRY) {
+ CC = Cond.getOperand(0);
+ SDValue Cmp = Cond.getOperand(1);
+
+ if ((isX86LogicalCmp(Cmp)) || Cmp.getOpcode() == X86ISD::BT) {
+ Cond = Cmp;
+ AddTest = false;
+ }
+ } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
+ CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
+ CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
+ SDValue Value;
+ X86::CondCode X86Cond;
+ std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
+ CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
+ AddTest = false;
+ }
+
+ if (AddTest) {
+ // Look past the truncate if the high bits are known zero
+ if (isTruncWithZeroHighBitsInput(Cond, DAG))
+ Cond = Cond.getOperand(0);
+
+ // Try to match AND to BT instruction
+ if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
+ X86::CondCode X86CondCode;
+ if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
+ CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
+ Cond = BT;
+ AddTest = false;
+ }
+ }
+ }
+
+ if (AddTest) {
+ CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
+ Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
+ }
+
+ return {CC, Cond};
+ };
+
+ // Process condition flags and prepare for CT_SELECT
+ auto [CC, ProcessedCond] =
+ ProcessConditionFlags(Cond, VT, DL, DAG, Subtarget);
+
+ // Handle i8 CT_SELECT with truncate optimization
+ if (Op.getValueType() == MVT::i8 && TrueOp.getOpcode() == ISD::TRUNCATE &&
+ FalseOp.getOpcode() == ISD::TRUNCATE) {
+ SDValue T1 = TrueOp.getOperand(0), T2 = FalseOp.getOperand(0);
+ if (T1.getValueType() == T2.getValueType() &&
+ T1.getOpcode() != ISD::CopyFromReg &&
+ T2.getOpcode() != ISD::CopyFromReg) {
+ SDValue CtSelect = DAG.getNode(X86ISD::CT_SELECT, DL, T1.getValueType(),
+ T2, T1, CC, ProcessedCond);
+ return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), CtSelect);
+ }
+ }
+
+ // Promote small integer types to avoid partial register stalls
+ // Exception: For i8 without CMOV, we can generate a shorter instruction
+ // sequence without movzx so keep it as is.
+ if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMOV()) ||
+ (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(TrueOp, Subtarget) &&
+ !X86::mayFoldLoad(FalseOp, Subtarget))) {
+ TrueOp = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, TrueOp);
+ FalseOp = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, FalseOp);
+ SDValue Ops[] = {FalseOp, TrueOp, CC, ProcessedCond};
+ SDValue CtSelect = DAG.getNode(X86ISD::CT_SELECT, DL, MVT::i32, Ops);
+ return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), CtSelect);
+ }
+
+ if (isScalarFPTypeInSSEReg(VT)) {
+ MVT IntVT = (VT == MVT::f32) ? MVT::i32 : MVT::i64;
+ TrueOp = DAG.getBitcast(IntVT, TrueOp);
+ FalseOp = DAG.getBitcast(IntVT, FalseOp);
+ SDValue Ops[] = {FalseOp, TrueOp, CC, ProcessedCond};
+ SDValue CtSelect = DAG.getNode(X86ISD::CT_SELECT, DL, IntVT, Ops);
+ return DAG.getBitcast(VT, CtSelect);
+ }
+
+ // Create final CT_SELECT node
+ SDValue Ops[] = {FalseOp, TrueOp, CC, ProcessedCond};
+ return DAG.getNode(X86ISD::CT_SELECT, DL, Op.getValueType(), Ops,
+ Op->getFlags());
+}
+
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue In = Op->getOperand(0);
@@ -30251,30 +30465,65 @@ static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
const X86Subtarget &Subtarget,
SelectionDAG &DAG,
SDValue *Low = nullptr) {
+ unsigned NumElts = VT.getVectorNumElements();
+
// For vXi8 we will unpack the low and high half of each 128 bit lane to widen
// to a vXi16 type. Do the multiplies, shift the results and pack the half
// lane results back together.
// We'll take different approaches for signed and unsigned.
- // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes to
- // words and use pmullw to calculate the full 16-bit product.
+  // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes to
+  // words and use pmullw to calculate the full 16-bit product.
// For signed we'll use punpcklbw/punpckbw to extend the bytes to words and
// shift them left into the upper byte of each word. This allows us to use
// pmulhw to calculate the full 16-bit product. This trick means we don't
// need to sign extend the bytes to use pmullw.
- MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
+
+ MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
SDValue Zero = DAG.getConstant(0, dl, VT);
- SDValue ALo, AHi, BLo, BHi;
+ SDValue ALo, AHi;
if (IsSigned) {
ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
- BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
- BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
} else {
ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
- BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
+ }
+
+ SDValue BLo, BHi;
+ if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
+ // If the RHS is a constant, manually unpackl/unpackh and extend.
+ SmallVector<SDValue, 16> LoOps, HiOps;
+ for (unsigned i = 0; i != NumElts; i += 16) {
+ for (unsigned j = 0; j != 8; ++j) {
+ SDValue LoOp = B.getOperand(i + j);
+ SDValue HiOp = B.getOperand(i + j + 8);
+
+ if (IsSigned) {
+ LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
+ HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
+ LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
+ DAG.getConstant(8, dl, MVT::i16));
+ HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
+ DAG.getConstant(8, dl, MVT::i16));
+ } else {
+ LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
+ HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
+ }
+
+ LoOps.push_back(LoOp);
+ HiOps.push_back(HiOp);
+ }
+ }
+
+ BLo = DAG.getBuildVector(ExVT, dl, LoOps);
+ BHi = DAG.getBuildVector(ExVT, dl, HiOps);
+ } else if (IsSigned) {
+ BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
+ BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
+ } else {
+ BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
}
@@ -30287,7 +30536,7 @@ static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
if (Low)
*Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
- return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf=*/true);
+ return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
}
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
@@ -34174,6 +34423,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
case ISD::SELECT: return LowerSELECT(Op, DAG);
+ case ISD::CT_SELECT: return LowerCT_SELECT(Op, DAG);
case ISD::COND_LOOP:
case ISD::BRCOND: return LowerConditionalBranch(Op, DAG);
case ISD::JumpTable: return LowerJumpTable(Op, DAG);
@@ -34258,6 +34508,12 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
}
}
+bool X86TargetLowering::isSelectSupported(SelectSupportKind Kind) const {
+ if (Kind == SelectSupportKind::CtSelect) {
+ return true;
+ }
+ return TargetLoweringBase::isSelectSupported(Kind);
+}
/// Replace a node with an illegal result type with a new node built out of
/// custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
@@ -35797,6 +36053,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(STRICT_CMPM)
NODE_NAME_CASE(CMPMM_SAE)
NODE_NAME_CASE(SETCC)
+ NODE_NAME_CASE(CT_SELECT)
NODE_NAME_CASE(SETCC_CARRY)
NODE_NAME_CASE(FSETCC)
NODE_NAME_CASE(FSETCCM)
@@ -38578,6 +38835,480 @@ X86TargetLowering::emitPatchableEventCall(MachineInstr &MI,
return BB;
}
+/// Helper function to emit i386 CT_SELECT with condition materialization.
+/// This converts EFLAGS-based CT_SELECT into a condition byte that can be
+/// shared across multiple operations (critical for i64 type legalization).
+///
+/// Phase 1: Materialize condition byte from EFLAGS using SETCC
+/// Phase 2: Create internal pseudo with condition byte for post-RA expansion
+///
+/// This approach ensures that when i64 is type-legalized into two i32
+/// operations, both operations share the same condition byte rather than
+/// each independently reading (and destroying) EFLAGS.
+static MachineBasicBlock *
+emitCTSelectI386WithConditionMaterialization(MachineInstr &MI,
+ MachineBasicBlock *BB,
+ unsigned InternalPseudoOpcode) {
+ const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
+ const MIMetadata MIMD(MI);
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ // Original pseudo operands: (outs dst), (ins src1, src2, cond)
+ Register Src1Reg = MI.getOperand(1).getReg();
+ Register Src2Reg = MI.getOperand(2).getReg();
+ X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(3).getImm());
+
+ // Get opposite condition (SETCC sets to 1 when condition is TRUE,
+ // but we want to select src1 when condition is FALSE for X86 semantics)
+ X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
+
+ // Step 1: Materialize condition byte from EFLAGS
+ // This is done OUTSIDE the constant-time bundle, before any EFLAGS corruption
+ Register CondByteReg = MRI.createVirtualRegister(&X86::GR8RegClass);
+ BuildMI(*BB, MI, MIMD, TII->get(X86::SETCCr), CondByteReg).addImm(OppCC);
+
+ // Step 2: Create internal pseudo that takes condition byte as input
+ // This pseudo will be expanded post-RA into the actual constant-time bundle
+ // The condition byte can now be safely shared between multiple pseudos
+
+ // Internal pseudo has operands: (outs dst, tmp_byte, tmp_mask), (ins src1,
+ // src2, cond_byte)
+ Register DstReg = MI.getOperand(0).getReg();
+
+ // Create virtual registers for the temporary outputs
+ Register TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass);
+ Register TmpMaskReg;
+
+ // Determine the register class for tmp_mask based on the data type
+ if (InternalPseudoOpcode == X86::CT_SELECT_I386_INT_GR8rr) {
+ TmpMaskReg = MRI.createVirtualRegister(&X86::GR8RegClass);
+ } else if (InternalPseudoOpcode == X86::CT_SELECT_I386_INT_GR16rr) {
+ TmpMaskReg = MRI.createVirtualRegister(&X86::GR16RegClass);
+ } else if (InternalPseudoOpcode == X86::CT_SELECT_I386_INT_GR32rr) {
+ TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+ } else {
+ llvm_unreachable("Unknown internal pseudo opcode");
+ }
+
+ BuildMI(*BB, MI, MIMD, TII->get(InternalPseudoOpcode))
+ .addDef(DstReg) // dst (output)
+ .addDef(TmpByteReg) // tmp_byte (output)
+ .addDef(TmpMaskReg) // tmp_mask (output)
+ .addReg(Src1Reg) // src1 (input)
+ .addReg(Src2Reg) // src2 (input)
+ .addReg(CondByteReg); // pre-materialized condition byte (input)
+
+ MI.eraseFromParent();
+ return BB;
+}
+
+// Helper structure to hold memory operand information for FP loads
+struct FPLoadMemOperands {
+ bool IsValid = false;
+ unsigned BaseReg = 0;
+ int64_t ScaleVal = 1;
+ unsigned IndexReg = 0;
+ int64_t Disp = 0;
+ unsigned SegReg = 0;
+ int FrameIndex = -1;
+ bool IsFrameIndex = false;
+ int ConstantPoolIndex = -1;
+ bool IsConstantPool = false;
+ const GlobalValue *Global = nullptr;
+ int64_t GlobalOffset = 0;
+ bool IsGlobal = false;
+};
+
+// Check if a virtual register is defined by a simple FP load instruction
+// Returns the memory operands if it's a simple load, otherwise returns invalid
+static FPLoadMemOperands getFPLoadMemOperands(Register Reg,
+ MachineRegisterInfo &MRI,
+ unsigned ExpectedLoadOpcode) {
+ FPLoadMemOperands Result;
+
+ if (!Reg.isVirtual())
+ return Result;
+
+ MachineInstr *DefMI = MRI.getVRegDef(Reg);
+ if (!DefMI)
+ return Result;
+
+ // Check if it's the expected load opcode (e.g., LD_Fp32m, LD_Fp64m, LD_Fp80m)
+ if (DefMI->getOpcode() != ExpectedLoadOpcode)
+ return Result;
+
+ // Check that this is a simple load - not volatile, not atomic, etc.
+ // FP loads have hasSideEffects = 0 in their definition for simple loads
+ if (DefMI->hasOrderedMemoryRef())
+ return Result;
+
+ // The load should have a single def (the destination register) and memory operands
+ // Format: %reg = LD_Fpxxm <fi#N>, 1, %noreg, 0, %noreg
+ // or: %reg = LD_Fpxxm %base, scale, %index, disp, %segment
+ if (DefMI->getNumOperands() < 6)
+ return Result;
+
+ // Operand 0 is the destination, operands 1-5 are the memory reference
+ MachineOperand &BaseMO = DefMI->getOperand(1);
+ MachineOperand &ScaleMO = DefMI->getOperand(2);
+ MachineOperand &IndexMO = DefMI->getOperand(3);
+ MachineOperand &DispMO = DefMI->getOperand(4);
+ MachineOperand &SegMO = DefMI->getOperand(5);
+
+ // Check if this is a frame index load
+ if (BaseMO.isFI()) {
+ Result.IsValid = true;
+ Result.IsFrameIndex = true;
+ Result.FrameIndex = BaseMO.getIndex();
+ Result.ScaleVal = ScaleMO.getImm();
+ Result.IndexReg = IndexMO.getReg();
+ Result.Disp = DispMO.getImm();
+ Result.SegReg = SegMO.getReg();
+ return Result;
+ }
+
+ // Check if this is a constant pool load
+ // Format: %reg = LD_Fpxxm $noreg, 1, $noreg, %const.N, $noreg
+ if (BaseMO.isReg() && BaseMO.getReg() == X86::NoRegister &&
+ ScaleMO.isImm() && IndexMO.isReg() &&
+ IndexMO.getReg() == X86::NoRegister &&
+ DispMO.isCPI() && SegMO.isReg()) {
+ Result.IsValid = true;
+ Result.IsConstantPool = true;
+ Result.ConstantPoolIndex = DispMO.getIndex();
+ Result.ScaleVal = ScaleMO.getImm();
+ Result.IndexReg = IndexMO.getReg();
+ Result.Disp = 0;
+ Result.SegReg = SegMO.getReg();
+ return Result;
+ }
+
+ // Check if this is a global variable load
+ // Format: %reg = LD_Fpxxm $noreg, 1, $noreg, @global_name, $noreg
+ if (BaseMO.isReg() && BaseMO.getReg() == X86::NoRegister &&
+ ScaleMO.isImm() && IndexMO.isReg() &&
+ IndexMO.getReg() == X86::NoRegister &&
+ DispMO.isGlobal() && SegMO.isReg()) {
+ Result.IsValid = true;
+ Result.IsGlobal = true;
+ Result.Global = DispMO.getGlobal();
+ Result.GlobalOffset = DispMO.getOffset();
+ Result.ScaleVal = ScaleMO.getImm();
+ Result.IndexReg = IndexMO.getReg();
+ Result.Disp = 0;
+ Result.SegReg = SegMO.getReg();
+ return Result;
+ }
+
+ // Regular memory operands (e.g., pointer loads)
+ if (BaseMO.isReg() && ScaleMO.isImm() && IndexMO.isReg() &&
+ DispMO.isImm() && SegMO.isReg()) {
+ Result.IsValid = true;
+ Result.IsFrameIndex = false;
+ Result.IsConstantPool = false;
+ Result.BaseReg = BaseMO.getReg();
+ Result.ScaleVal = ScaleMO.getImm();
+ Result.IndexReg = IndexMO.getReg();
+ Result.Disp = DispMO.getImm();
+ Result.SegReg = SegMO.getReg();
+ return Result;
+ }
+
+ return Result;
+}
+
+static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI,
+ MachineBasicBlock *BB,
+ unsigned pseudoInstr) {
+ const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
+ const MIMetadata MIMD(MI);
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MachineFrameInfo &MFI = MF->getFrameInfo();
+ unsigned RegSizeInByte = 4;
+
+ // Get operands
+ // MI operands: %result:rfp80 = CT_SELECT_I386 %false:rfp80, %true:rfp80, %cond:i8imm
+ unsigned DestReg = MI.getOperand(0).getReg();
+ unsigned FalseReg = MI.getOperand(1).getReg();
+ unsigned TrueReg = MI.getOperand(2).getReg();
+ X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(3).getImm());
+ X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
+
+ // Materialize condition byte from EFLAGS
+ Register CondByteReg = MRI.createVirtualRegister(&X86::GR8RegClass);
+ BuildMI(*BB, MI, MIMD, TII->get(X86::SETCCr), CondByteReg).addImm(OppCC);
+
+ auto storeFpToSlot = [&](unsigned Opcode, int Slot, Register Reg) {
+ addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(Opcode)), Slot)
+ .addReg(Reg, RegState::Kill);
+ };
+
+ // Helper to load integer from memory operands
+ auto loadIntFromMemOperands = [&](const FPLoadMemOperands &MemOps,
+ unsigned Offset) -> unsigned {
+ unsigned IntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), IntReg);
+
+ if (MemOps.IsFrameIndex) {
+ // Frame index: addFrameIndex + scale + index + disp + segment
+ MIB.addFrameIndex(MemOps.FrameIndex)
+ .addImm(MemOps.ScaleVal)
+ .addReg(MemOps.IndexReg)
+ .addImm(MemOps.Disp + Offset)
+ .addReg(MemOps.SegReg);
+ } else if (MemOps.IsConstantPool) {
+ // Constant pool: base_reg + scale + index + CP_index + segment
+ // MOV32rm format: base, scale, index, displacement, segment
+ MIB.addReg(X86::NoRegister) // Base register
+ .addImm(MemOps.ScaleVal) // Scale
+ .addReg(MemOps.IndexReg) // Index register
+ .addConstantPoolIndex(MemOps.ConstantPoolIndex, Offset) // Displacement (CP index)
+ .addReg(MemOps.SegReg); // Segment
+ } else if (MemOps.IsGlobal) {
+ // Global variable: base_reg + scale + index + global + segment
+ // MOV32rm format: base, scale, index, displacement, segment
+ MIB.addReg(X86::NoRegister) // Base register
+ .addImm(MemOps.ScaleVal) // Scale
+ .addReg(MemOps.IndexReg) // Index register
+ .addGlobalAddress(MemOps.Global, MemOps.GlobalOffset + Offset) // Displacement (global address)
+ .addReg(MemOps.SegReg); // Segment
+ } else {
+ // Regular memory: base_reg + scale + index + disp + segment
+ MIB.addReg(MemOps.BaseReg)
+ .addImm(MemOps.ScaleVal)
+ .addReg(MemOps.IndexReg)
+ .addImm(MemOps.Disp + Offset)
+ .addReg(MemOps.SegReg);
+ }
+
+ return IntReg;
+ };
+
+ // Optimized path: load integers directly from memory when both operands are
+ // memory loads, avoiding FP register round-trip
+ auto emitCtSelectFromMemory = [&](unsigned NumValues,
+ const FPLoadMemOperands &TrueMemOps,
+ const FPLoadMemOperands &FalseMemOps,
+ int ResultSlot) {
+ for (unsigned Val = 0; Val < NumValues; ++Val) {
+ unsigned Offset = Val * RegSizeInByte;
+
+ // Load true and false values directly from their memory locations as integers
+ unsigned TrueIntReg = loadIntFromMemOperands(TrueMemOps, Offset);
+ unsigned FalseIntReg = loadIntFromMemOperands(FalseMemOps, Offset);
+
+ // Use CT_SELECT_I386_INT_GR32 pseudo instruction for constant-time selection
+ unsigned ResultIntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+ unsigned TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass);
+ unsigned TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+
+ BuildMI(*BB, MI, MIMD, TII->get(X86::CT_SELECT_I386_INT_GR32rr))
+ .addDef(ResultIntReg) // dst (output)
+ .addDef(TmpByteReg) // tmp_byte (output)
+ .addDef(TmpMaskReg) // tmp_mask (output)
+ .addReg(FalseIntReg) // src1 (input) - false value
+ .addReg(TrueIntReg) // src2 (input) - true value
+ .addReg(CondByteReg); // pre-materialized condition byte (input)
+
+ // Store result back to result slot
+ BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32mr))
+ .addFrameIndex(ResultSlot)
+ .addImm(1)
+ .addReg(0)
+ .addImm(Offset)
+ .addReg(0)
+ .addReg(ResultIntReg, RegState::Kill);
+ }
+ };
+
+ auto emitCtSelectWithPseudo = [&](unsigned NumValues, int TrueSlot, int FalseSlot, int ResultSlot) {
+ for (unsigned Val = 0; Val < NumValues; ++Val) {
+ unsigned Offset = Val * RegSizeInByte;
+
+ // Load true and false values from stack as 32-bit integers
+ unsigned TrueIntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+ BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), TrueIntReg)
+ .addFrameIndex(TrueSlot)
+ .addImm(1)
+ .addReg(0)
+ .addImm(Offset)
+ .addReg(0);
+
+ unsigned FalseIntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+ BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), FalseIntReg)
+ .addFrameIndex(FalseSlot)
+ .addImm(1)
+ .addReg(0)
+ .addImm(Offset)
+ .addReg(0);
+
+ // Use CT_SELECT_I386_INT_GR32 pseudo instruction for constant-time selection
+ unsigned ResultIntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+ unsigned TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass);
+ unsigned TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+
+ BuildMI(*BB, MI, MIMD, TII->get(X86::CT_SELECT_I386_INT_GR32rr))
+ .addDef(ResultIntReg) // dst (output)
+ .addDef(TmpByteReg) // tmp_byte (output)
+ .addDef(TmpMaskReg) // tmp_mask (output)
+ .addReg(FalseIntReg) // src1 (input) - false value
+ .addReg(TrueIntReg) // src2 (input) - true value
+ .addReg(CondByteReg); // pre-materialized condition byte (input)
+
+ // Store result back to result slot
+ BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32mr))
+ .addFrameIndex(ResultSlot)
+ .addImm(1)
+ .addReg(0)
+ .addImm(Offset)
+ .addReg(0)
+ .addReg(ResultIntReg, RegState::Kill);
+ }
+ };
+
+ switch (pseudoInstr) {
+ case X86::CT_SELECT_I386_FP32rr: {
+ // Check if both operands are simple memory loads
+ FPLoadMemOperands TrueMemOps =
+ getFPLoadMemOperands(TrueReg, MRI, X86::LD_Fp32m);
+ FPLoadMemOperands FalseMemOps =
+ getFPLoadMemOperands(FalseReg, MRI, X86::LD_Fp32m);
+
+ int ResultSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false);
+
+ if (TrueMemOps.IsValid && FalseMemOps.IsValid) {
+ // Optimized path: load directly from memory as integers
+ // Works for both frame index loads (stack parameters) and
+ // constant pool loads (constants)
+ emitCtSelectFromMemory(1, TrueMemOps, FalseMemOps, ResultSlot);
+
+ // Erase the original FP load instructions since we're not using them
+ // and have loaded the data directly as integers instead
+ if (MRI.hasOneUse(TrueReg)) {
+ if (MachineInstr *TrueDefMI = MRI.getVRegDef(TrueReg))
+ TrueDefMI->eraseFromParent();
+ }
+ if (MRI.hasOneUse(FalseReg)) {
+ if (MachineInstr *FalseDefMI = MRI.getVRegDef(FalseReg))
+ FalseDefMI->eraseFromParent();
+ }
+ } else {
+ // General path: spill FP registers to stack first
+ int TrueSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false);
+ int FalseSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false);
+
+ storeFpToSlot(X86::ST_Fp32m, TrueSlot, TrueReg);
+ storeFpToSlot(X86::ST_Fp32m, FalseSlot, FalseReg);
+
+ emitCtSelectWithPseudo(1, TrueSlot, FalseSlot, ResultSlot);
+ }
+
+ // Load result back as f32
+ addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp32m), DestReg),
+ ResultSlot);
+ break;
+ }
+ case X86::CT_SELECT_I386_FP64rr: {
+ unsigned StackSlotSize = 8;
+
+ // Check if both operands are simple memory loads
+ FPLoadMemOperands TrueMemOps =
+ getFPLoadMemOperands(TrueReg, MRI, X86::LD_Fp64m);
+ FPLoadMemOperands FalseMemOps =
+ getFPLoadMemOperands(FalseReg, MRI, X86::LD_Fp64m);
+
+ int ResultSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false);
+
+ if (TrueMemOps.IsValid && FalseMemOps.IsValid) {
+ // Optimized path: load directly from memory as integers
+ // Works for both frame index loads (stack parameters) and
+ // constant pool loads (constants)
+ emitCtSelectFromMemory(StackSlotSize / RegSizeInByte, TrueMemOps,
+ FalseMemOps, ResultSlot);
+
+ // Erase the original FP load instructions since we're not using them
+ if (MRI.hasOneUse(TrueReg)) {
+ if (MachineInstr *TrueDefMI = MRI.getVRegDef(TrueReg))
+ TrueDefMI->eraseFromParent();
+ }
+ if (MRI.hasOneUse(FalseReg)) {
+ if (MachineInstr *FalseDefMI = MRI.getVRegDef(FalseReg))
+ FalseDefMI->eraseFromParent();
+ }
+ } else {
+ // General path: spill FP registers to stack first
+ int TrueSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false);
+ int FalseSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false);
+
+ storeFpToSlot(X86::ST_Fp64m, TrueSlot, TrueReg);
+ storeFpToSlot(X86::ST_Fp64m, FalseSlot, FalseReg);
+
+ emitCtSelectWithPseudo(StackSlotSize / RegSizeInByte, TrueSlot, FalseSlot,
+ ResultSlot);
+ }
+
+ // Load result back as f64
+ addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp64m), DestReg),
+ ResultSlot);
+ break;
+ }
+ case X86::CT_SELECT_I386_FP80rr: {
+ // f80 is 80 bits (10 bytes), but stored with 12-byte alignment
+ unsigned StackObjectSize = 12;
+
+ // Check if both operands are simple memory loads
+ FPLoadMemOperands TrueMemOps =
+ getFPLoadMemOperands(TrueReg, MRI, X86::LD_Fp80m);
+ FPLoadMemOperands FalseMemOps =
+ getFPLoadMemOperands(FalseReg, MRI, X86::LD_Fp80m);
+
+ int ResultSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false);
+
+ if (TrueMemOps.IsValid && FalseMemOps.IsValid) {
+ // Optimized path: load directly from memory as integers
+ // Works for both frame index loads (stack parameters) and
+ // constant pool loads (constants)
+ emitCtSelectFromMemory(StackObjectSize / RegSizeInByte, TrueMemOps,
+ FalseMemOps, ResultSlot);
+
+ // Erase the original FP load instructions since we're not using them
+ if (MRI.hasOneUse(TrueReg)) {
+ if (MachineInstr *TrueDefMI = MRI.getVRegDef(TrueReg))
+ TrueDefMI->eraseFromParent();
+ }
+ if (MRI.hasOneUse(FalseReg)) {
+ if (MachineInstr *FalseDefMI = MRI.getVRegDef(FalseReg))
+ FalseDefMI->eraseFromParent();
+ }
+ } else {
+ // General path: spill FP registers to stack first
+ int TrueSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false);
+ int FalseSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false);
+
+ storeFpToSlot(X86::ST_FpP80m, TrueSlot, TrueReg);
+ storeFpToSlot(X86::ST_FpP80m, FalseSlot, FalseReg);
+
+ emitCtSelectWithPseudo(StackObjectSize / RegSizeInByte, TrueSlot,
+ FalseSlot, ResultSlot);
+ }
+
+ // Load result back as f80
+ addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp80m), DestReg),
+ ResultSlot);
+ break;
+ }
+ default:
+ llvm_unreachable("Invalid CT_SELECT opcode");
+ }
+
+ MI.eraseFromParent();
+
+ return BB;
+}
+
MachineBasicBlock *
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
@@ -38635,6 +39366,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case X86::CMOV_VK64:
return EmitLoweredSelect(MI, BB);
+ case X86::CT_SELECT_I386_GR8rr:
+ return emitCTSelectI386WithConditionMaterialization(
+ MI, BB, X86::CT_SELECT_I386_INT_GR8rr);
+
+ case X86::CT_SELECT_I386_GR16rr:
+ return emitCTSelectI386WithConditionMaterialization(
+ MI, BB, X86::CT_SELECT_I386_INT_GR16rr);
+
+ case X86::CT_SELECT_I386_GR32rr:
+ return emitCTSelectI386WithConditionMaterialization(
+ MI, BB, X86::CT_SELECT_I386_INT_GR32rr);
+
+ case X86::CT_SELECT_I386_FP32rr:
+ return emitCTSelectI386WithFpType(MI, BB, X86::CT_SELECT_I386_FP32rr);
+ case X86::CT_SELECT_I386_FP64rr:
+ return emitCTSelectI386WithFpType(MI, BB, X86::CT_SELECT_I386_FP64rr);
+ case X86::CT_SELECT_I386_FP80rr:
+ return emitCTSelectI386WithFpType(MI, BB, X86::CT_SELECT_I386_FP80rr);
+
case X86::FP80_ADDr:
case X86::FP80_ADDm32: {
// Change the floating point control register to use double extended
@@ -42653,7 +43403,7 @@ static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
return SDValue();
- Imm = llvm::rotl<uint8_t>(Imm, 4);
+ Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
DAG.getTargetConstant(Imm, DL, MVT::i8));
};
@@ -45699,16 +46449,10 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
}
case X86ISD::PCMPGT:
// icmp sgt(0, R) == ashr(R, BitWidth-1).
- if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode())) {
- // iff we only need the signbit then we can use R directly.
- if (OriginalDemandedBits.isSignMask())
- return TLO.CombineTo(Op, Op.getOperand(1));
- // otherwise we just need R's signbit for the comparison.
- APInt SignMask = APInt::getSignMask(BitWidth);
- if (SimplifyDemandedBits(Op.getOperand(1), SignMask, OriginalDemandedElts,
- Known, TLO, Depth + 1))
- return true;
- }
+ // iff we only need the sign bit then we can use R directly.
+ if (OriginalDemandedBits.isSignMask() &&
+ ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
+ return TLO.CombineTo(Op, Op.getOperand(1));
break;
case X86ISD::MOVMSK: {
SDValue Src = Op.getOperand(0);
@@ -48657,15 +49401,6 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
DL, DAG, Subtarget))
return V;
- // If the sign bit is known then BLENDV can be folded away.
- if (N->getOpcode() == X86ISD::BLENDV) {
- KnownBits KnownCond = DAG.computeKnownBits(Cond);
- if (KnownCond.isNegative())
- return LHS;
- if (KnownCond.isNonNegative())
- return RHS;
- }
-
if (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV) {
SmallVector<int, 64> CondMask;
if (createShuffleMaskFromVSELECT(CondMask, Cond,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index fc16053caa705..c8d8f19e5cced 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -118,6 +118,10 @@ namespace llvm {
/// X86 Select
SELECTS,
+ /// X86 Constant-time Select, implemented with CMOV instruction. This is
+ /// used to implement constant-time select.
+ CT_SELECT,
+
// Same as SETCC except it's materialized with a sbb and the value is all
// one's or all zero's.
SETCC_CARRY, // R = carry_bit ? ~0 : 0
@@ -1173,6 +1177,8 @@ namespace llvm {
///
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+ bool isSelectSupported(SelectSupportKind Kind) const override;
+
/// Replace the results of node with an illegal result
/// type with new values built out of custom code.
///
@@ -1803,6 +1809,7 @@ namespace llvm {
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerCT_SELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerConditionalBranch(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/llvm/lib/Target/X86/X86InstrCMovSetCC.td
index 77a9c7a1f585f..6081be4a30e26 100644
--- a/llvm/lib/Target/X86/X86InstrCMovSetCC.td
+++ b/llvm/lib/Target/X86/X86InstrCMovSetCC.td
@@ -106,6 +106,211 @@ let Predicates = [HasCMOV, HasNDD] in {
def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, timm:$cond, EFLAGS),
(CMOV64rm_ND GR64:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>;
}
+
+// Create pseudo instruction and do the pattern matching to them.
+// We use a machine pass to lower these pseudos into cmov, in order
+// to avoid backend optimizations
+let Uses = [EFLAGS], isNotDuplicable = 1, isPseudo = 1 in {
+
+ // Generates the rr and rm CT_SELECT pseudos for one GPR width. Both forms
+ // are gated on HasNativeCMOV: a real CMOV is required for this encoding.
+ multiclass CT_SELECT<X86TypeInfo t> {
+ // register-only
+ let isCommutable = 0, SchedRW = [WriteCMOV], Predicates = [HasNativeCMOV],
+ AsmString = "ctselect\\t$dst, $src1, $src2, $cond" in {
+ def rr : PseudoI<(outs t.RegClass:$dst),
+ (ins t.RegClass:$src1, t.RegClass:$src2, i8imm:$cond),
+ [(set t.RegClass:$dst, (X86ct_select t.RegClass:$src1, t.RegClass:$src2, timm:$cond, EFLAGS))]>;
+ }
+
+ // register-memory
+ let SchedRW = [WriteCMOV.Folded, WriteCMOV.ReadAfterFold], Predicates = [HasNativeCMOV],
+ AsmString = "ctselect\\t$dst, $src1, $src2, $cond" in {
+ def rm : PseudoI<(outs t.RegClass:$dst),
+ (ins t.RegClass:$src1, t.MemOperand:$src2, i8imm:$cond),
+ [(set t.RegClass:$dst, (X86ct_select t.RegClass:$src1, (t.LoadNode addr:$src2), timm:$cond, EFLAGS))]>;
+ }
+ }
+}
+
+let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in {
+ // Instantiate the 16/32/64-bit GPR pseudos; $dst is tied to $src1,
+ // matching the destructive two-operand form of CMOV.
+ let Constraints = "$dst = $src1" in {
+ defm CT_SELECT16 : CT_SELECT<Xi16>;
+ defm CT_SELECT32 : CT_SELECT<Xi32>;
+ defm CT_SELECT64 : CT_SELECT<Xi64>;
+ }
+}
+
+// CT_SELECT_VEC base class
+// Vector CT_SELECT pseudo: besides the result $dst it defines two scratch
+// outputs — $tmpx (vector mask temp) and $tmpg (scalar GPR mask temp) —
+// consumed by the expansion in X86InstrInfo::expandCtSelectVector.
+class CT_SELECT_VEC<RegisterClass VRc, RegisterClass GRc>
+ : PseudoI<
+ (outs VRc:$dst, VRc:$tmpx, GRc:$tmpg),
+ (ins VRc:$t, VRc:$f, i8imm:$cond),
+ []
+ > {
+ let Uses = [EFLAGS];
+ let isPseudo = 1;
+ let isNotDuplicable = 1;
+ let hasSideEffects = 1;
+ let AsmString = "ctselect\t$dst, $f, $t, $cond";
+ let SchedRW = [];
+}
+
+// Width-specific class aliases
+// Bind the vector register class per width; the scalar mask temp is always
+// a GR32.
+class CT_SELECT_VEC128 : CT_SELECT_VEC<VR128, GR32>;
+class CT_SELECT_VEC128X : CT_SELECT_VEC<VR128X, GR32>;
+class CT_SELECT_VEC256 : CT_SELECT_VEC<VR256, GR32>;
+class CT_SELECT_VEC512 : CT_SELECT_VEC<VR512, GR32>;
+
+
+//===----------------------------------------------------------------------===//
+// 128-bit pseudos (SSE2 baseline; we use PXOR/PAND/MOVD/PSHUFD in the expander)
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasSSE1] in {
+
+  // All outs are earlyclobber so the register allocator never assigns an
+  // output to a register that still holds one of the inputs.
+  // (Repaired list-archive mangling of "," + "@" into ", at ".)
+  def CT_SELECT_V4F32 : CT_SELECT_VEC128 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+}
+
+let Predicates = [HasSSE2] in {
+
+  // 128-bit pseudos. All outs are earlyclobber so the register allocator
+  // never assigns an output to a register still holding an input.
+  // (Repaired list-archive mangling of "," + "@" into ", at ".)
+  def CT_SELECT_V2F64 : CT_SELECT_VEC128 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CT_SELECT_V4I32 : CT_SELECT_VEC128 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CT_SELECT_V2I64 : CT_SELECT_VEC128 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CT_SELECT_V8I16 : CT_SELECT_VEC128 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CT_SELECT_V16I8 : CT_SELECT_VEC128 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+
+  // v8f16: only selected on builds where the type is legal.
+  def CT_SELECT_V8F16 : CT_SELECT_VEC128 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+}
+
+let Predicates = [HasAVX] in {
+
+  // AVX (VR128X) variants. All outs are earlyclobber so the register
+  // allocator never assigns an output to a register still holding an input.
+  // (Repaired list-archive mangling of "," + "@" into ", at ".)
+  def CT_SELECT_V4F32X : CT_SELECT_VEC128X {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CT_SELECT_V2F64X : CT_SELECT_VEC128X {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CT_SELECT_V4I32X : CT_SELECT_VEC128X {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CT_SELECT_V2I64X : CT_SELECT_VEC128X {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CT_SELECT_V8I16X : CT_SELECT_VEC128X {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CT_SELECT_V16I8X : CT_SELECT_VEC128X {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+
+  // v8f16: only selected on builds where the type is legal.
+  def CT_SELECT_V8F16X : CT_SELECT_VEC128X {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// 256-bit pseudos
+//===----------------------------------------------------------------------===//
+let Predicates = [HasAVX] in {
+
+  // 256-bit pseudos. All outs are earlyclobber so the register allocator
+  // never assigns an output to a register still holding an input.
+  // (Repaired list-archive mangling of "," + "@" into ", at ".)
+  def CT_SELECT_V8F32 : CT_SELECT_VEC256 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CT_SELECT_V4F64 : CT_SELECT_VEC256 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CT_SELECT_V8I32 : CT_SELECT_VEC256 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CT_SELECT_V4I64 : CT_SELECT_VEC256 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CT_SELECT_V16I16 : CT_SELECT_VEC256 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CT_SELECT_V32I8 : CT_SELECT_VEC256 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+
+  // v16f16: only selected on builds where the type is legal.
+  def CT_SELECT_V16F16 : CT_SELECT_VEC256 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Selection patterns: X86ct_select(...), EFLAGS -> CT_SELECT_V*
+//
+// NOTE:
+// * The SDNode carries Glue from CMP/TEST (due to SDNPInGlue).
+// * We list EFLAGS explicitly in the pattern (X86 style) to model the arch read.
+// * Temps (tmpx, tmpg) are not in the pattern; they're outs allocated by RA.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasSSE1] in {
+
+ // 128-bit float (bitwise-equivalent ops in expander)
+ // v4f32 is the only vector type reachable with bare SSE1.
+ def : Pat<(v4f32 (X86ct_select VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
+ (CT_SELECT_V4F32 VR128:$t, VR128:$f, timm:$cc)>;
+}
+
+let Predicates = [HasSSE2] in {
+
+ // 128-bit integer
+ // The condition code immediate is forwarded unchanged to the pseudo; the
+ // EFLAGS operand models the architectural flags read.
+ def : Pat<(v4i32 (X86ct_select VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
+ (CT_SELECT_V4I32 VR128:$t, VR128:$f, timm:$cc)>;
+ def : Pat<(v2i64 (X86ct_select VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
+ (CT_SELECT_V2I64 VR128:$t, VR128:$f, timm:$cc)>;
+ def : Pat<(v8i16 (X86ct_select VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
+ (CT_SELECT_V8I16 VR128:$t, VR128:$f, timm:$cc)>;
+ def : Pat<(v16i8 (X86ct_select VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
+ (CT_SELECT_V16I8 VR128:$t, VR128:$f, timm:$cc)>;
+ def : Pat<(v2f64 (X86ct_select VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
+ (CT_SELECT_V2F64 VR128:$t, VR128:$f, timm:$cc)>;
+
+ // 128-bit f16 (optional)
+ def : Pat<(v8f16 (X86ct_select VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
+ (CT_SELECT_V8F16 VR128:$t, VR128:$f, timm:$cc)>;
+}
+
+let Predicates = [HasAVX] in {
+
+ // 256-bit integer
+ // Same shape as the 128-bit patterns, routed to the VR256 pseudos.
+ def : Pat<(v8i32 (X86ct_select VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
+ (CT_SELECT_V8I32 VR256:$t, VR256:$f, timm:$cc)>;
+ def : Pat<(v4i64 (X86ct_select VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
+ (CT_SELECT_V4I64 VR256:$t, VR256:$f, timm:$cc)>;
+ def : Pat<(v16i16 (X86ct_select VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
+ (CT_SELECT_V16I16 VR256:$t, VR256:$f, timm:$cc)>;
+ def : Pat<(v32i8 (X86ct_select VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
+ (CT_SELECT_V32I8 VR256:$t, VR256:$f, timm:$cc)>;
+
+ // 256-bit float (bitwise-equivalent ops in expander)
+ def : Pat<(v8f32 (X86ct_select VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
+ (CT_SELECT_V8F32 VR256:$t, VR256:$f, timm:$cc)>;
+ def : Pat<(v4f64 (X86ct_select VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
+ (CT_SELECT_V4F64 VR256:$t, VR256:$f, timm:$cc)>;
+
+ // 256-bit f16 (optional)
+ def : Pat<(v16f16 (X86ct_select VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
+ (CT_SELECT_V16F16 VR256:$t, VR256:$f, timm:$cc)>;
+}
+
let Predicates = [HasCMOV, HasCF] in {
def : Pat<(X86cmov GR16:$src1, 0, timm:$cond, EFLAGS),
(CFCMOV16rr GR16:$src1, (inv_cond_XFORM timm:$cond))>;
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index f6fdc1cf59340..8b63c59720fcc 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -699,6 +699,87 @@ def : Pat<(v16f32 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
def : Pat<(v8f64 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
(CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>;
+// CT_SELECT
+// Enhanced CT_SELECT pseudos for i386 with temporary register allocation
+// These use a two-phase approach:
+// 1. Custom inserter materializes condition byte from EFLAGS
+// 2. Post-RA expansion generates constant-time instruction bundles
+
+let isPseudo = 1, isNotDuplicable = 1 in {
+  // Phase 1: Initial pseudos that consume EFLAGS (via custom inserter)
+  // These are matched by patterns and convert EFLAGS to condition byte
+  class CT_SELECT_I386_INITIAL<RegisterClass RC, ValueType VT>
+      : PseudoI<(outs RC:$dst),
+                (ins RC:$src1, RC:$src2, i8imm:$cond),
+                [(set RC:$dst, (VT(X86ct_select RC:$src1, RC:$src2, timm:$cond,
+                    EFLAGS)))]> {
+    let Uses = [EFLAGS];
+    let Defs = [EFLAGS];
+    let usesCustomInserter = 1;
+    let hasNoSchedulingInfo = 1;
+  }
+
+  // Phase 2: Internal pseudos with pre-materialized condition byte (post-RA
+  // expansion). These generate the actual constant-time instruction bundles.
+  // All outs are earlyclobber so RA never overlaps them with the inputs.
+  // (Repaired list-archive mangling of "," + "@" into ", at ".)
+  class CT_SELECT_I386_INTERNAL<RegisterClass RC, RegisterClass ByteRC>
+      : PseudoI<(outs RC:$dst, ByteRC:$tmp_byte, RC:$tmp_mask),
+                (ins RC:$src1, RC:$src2, ByteRC:$cond_byte), []> {
+    let hasNoSchedulingInfo = 1;
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_byte,@earlyclobber $tmp_mask";
+    let Defs = [EFLAGS]; // NEG instruction in post-RA expansion clobbers EFLAGS
+  }
+}
+
+// Phase 1 pseudos for non-CMOV targets (custom inserter materializes condition)
+let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in {
+ let Predicates = [NoNativeCMOV] in {
+ // One def per GPR width; i64 is covered by type legalization (see the
+ // pattern block below).
+ def CT_SELECT_I386_GR8rr : CT_SELECT_I386_INITIAL<GR8, i8>;
+ def CT_SELECT_I386_GR16rr : CT_SELECT_I386_INITIAL<GR16, i16>;
+ def CT_SELECT_I386_GR32rr : CT_SELECT_I386_INITIAL<GR32, i32>;
+ }
+}
+
+// Phase 2 pseudos (post-RA expansion with pre-materialized condition byte)
+let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in {
+ let Predicates = [NoNativeCMOV] in {
+ // The GR8 $cond_byte input is produced by the phase-1 custom inserter.
+ def CT_SELECT_I386_INT_GR8rr :
+ CT_SELECT_I386_INTERNAL<GR8, GR8>;
+ def CT_SELECT_I386_INT_GR16rr :
+ CT_SELECT_I386_INTERNAL<GR16, GR8>;
+ def CT_SELECT_I386_INT_GR32rr :
+ CT_SELECT_I386_INTERNAL<GR32, GR8>;
+ }
+}
+
+let hasSideEffects = 1,
+ ForceDisassemble = 1,
+ Constraints = "$dst = $src1" in {
+
+ // x87 FP variants, expanded by the custom inserter
+ // (emitCTSelectI386WithFpType), which spills to stack slots and selects
+ // with the GR32 integer pseudo.
+ let Predicates = [FPStackf32] in
+ def CT_SELECT_I386_FP32rr : CT_SELECT_I386_INITIAL<RFP32, f32>;
+
+ let Predicates = [FPStackf64] in
+ def CT_SELECT_I386_FP64rr : CT_SELECT_I386_INITIAL<RFP64, f64>;
+
+ // NOTE(review): f80 has no FPStack predicate — presumably legal on all
+ // x87 targets; confirm.
+ def CT_SELECT_I386_FP80rr : CT_SELECT_I386_INITIAL<RFP80, f80>;
+}
+
+// Pattern matching for non-native-CMOV CT_SELECT (routes to custom inserter for condition materialization)
+// NoNativeCMOV ensures these patterns are used when actual CMOV instruction is not available
+// even if canUseCMOV() is true (e.g., i386 with SSE which can emulate CMOV)
+let Predicates = [NoNativeCMOV] in {
+ // Each pattern feeds the corresponding phase-1 pseudo
+ // (CT_SELECT_I386_INITIAL) defined above.
+ def : Pat<(i8(X86ct_select GR8:$src1, GR8:$src2, timm:$cond, EFLAGS)),
+ (CT_SELECT_I386_GR8rr GR8:$src1, GR8:$src2, timm:$cond)>;
+
+ def : Pat<(i16(X86ct_select GR16:$src1, GR16:$src2, timm:$cond, EFLAGS)),
+ (CT_SELECT_I386_GR16rr GR16:$src1, GR16:$src2, timm:$cond)>;
+
+ def : Pat<(i32(X86ct_select GR32:$src1, GR32:$src2, timm:$cond, EFLAGS)),
+ (CT_SELECT_I386_GR32rr GR32:$src1, GR32:$src2, timm:$cond)>;
+
+ // i64 patterns handled automatically by type legalization
+}
+
//===----------------------------------------------------------------------===//
// Normal-Instructions-With-Lock-Prefix Pseudo Instructions
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86InstrFragments.td b/llvm/lib/Target/X86/X86InstrFragments.td
index adbb8b821700a..7ad92e3849c9c 100644
--- a/llvm/lib/Target/X86/X86InstrFragments.td
+++ b/llvm/lib/Target/X86/X86InstrFragments.td
@@ -28,6 +28,10 @@ def SDTX86Cmov : SDTypeProfile<1, 4,
[SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
+def SDTX86CtSelect : SDTypeProfile<1, 4,
+ [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
+ SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
+
// Unary and binary operator instructions that set EFLAGS as a side-effect.
def SDTUnaryArithWithFlags : SDTypeProfile<2, 1,
[SDTCisSameAs<0, 2>,
@@ -154,6 +158,7 @@ def X86ctest : SDNode<"X86ISD::CTEST", SDTX86Ccmp>;
def X86cload : SDNode<"X86ISD::CLOAD", SDTX86Cload, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def X86cstore : SDNode<"X86ISD::CSTORE", SDTX86Cstore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def X86ct_select: SDNode<"X86ISD::CT_SELECT", SDTX86CtSelect, [SDNPInGlue]>;
def X86cmov : SDNode<"X86ISD::CMOV", SDTX86Cmov>;
def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond,
[SDNPHasChain]>;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 2479a8dccfb00..d4a46048a1d20 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -474,6 +474,556 @@ bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op,
return false;
}
+// Opcode bundle describing how one CT_SELECT vector pseudo is expanded.
+// Filled in by getCtSelectInstructions() based on opcode + subtarget ISA.
+struct CtSelectInstructions {
+ unsigned PAndOpc; // vector AND opcode used by the expansion
+ unsigned PAndnOpc; // vector AND-NOT opcode
+ unsigned POrOpc; // vector OR opcode
+ unsigned BroadcastOpc; // shuffle opcode that splats the scalar mask lane
+ unsigned IntMoveOpc; // GPR -> vector register move for the mask
+ unsigned MoveOpc; // register-to-register vector copy
+ bool Use256; // expansion operates on 256-bit (YMM) registers
+ bool UseBlendInstr; // expander takes the blend path (re-checked vs SSE4.1)
+};
+
+// Pick the opcodes used to expand a CT_SELECT vector pseudo into a
+// constant-time AND/ANDN/OR (or blend) sequence, keyed on the pseudo's
+// opcode and the subtarget's ISA level. llvm_unreachable fires when the
+// pseudo was somehow selected on a subtarget that cannot implement it.
+static CtSelectInstructions
+getCtSelectInstructions(unsigned Opcode, const X86Subtarget &Subtarget) {
+  CtSelectInstructions Instructions = {};
+
+  switch (Opcode) {
+  case X86::CT_SELECT_V2F64:
+    if (!Subtarget.hasSSE2())
+      llvm_unreachable("Double precision vectors require SSE2");
+    Instructions.PAndOpc = X86::PANDrr;
+    Instructions.PAndnOpc = X86::PANDNrr;
+    Instructions.POrOpc = X86::PORrr;
+    Instructions.BroadcastOpc = X86::PSHUFDri;
+    Instructions.IntMoveOpc = X86::MOVDI2PDIrr;
+    Instructions.MoveOpc = X86::MOVAPDrr;
+    // The blend path is only taken by the expander when SSE4.1 is also
+    // present (it re-checks hasSSE41()).
+    Instructions.UseBlendInstr = true;
+    break;
+  case X86::CT_SELECT_V4F32:
+    if (Subtarget.hasSSE2()) {
+      // SSE4.1 and SSE2 previously had duplicate blocks differing only in
+      // UseBlendInstr; fold them into one.
+      Instructions.PAndOpc = X86::PANDrr;
+      Instructions.PAndnOpc = X86::PANDNrr;
+      Instructions.POrOpc = X86::PORrr;
+      Instructions.BroadcastOpc = X86::PSHUFDri;
+      Instructions.IntMoveOpc = X86::MOVDI2PDIrr;
+      Instructions.MoveOpc = X86::MOVAPSrr;
+      Instructions.UseBlendInstr = Subtarget.hasSSE41();
+    } else {
+      // SSE1 fallback: only float-typed bitwise ops and SHUFPS exist.
+      Instructions.PAndOpc = X86::ANDPSrr;
+      Instructions.PAndnOpc = X86::ANDNPSrr;
+      Instructions.POrOpc = X86::ORPSrr;
+      Instructions.BroadcastOpc = X86::SHUFPSrri;
+      Instructions.IntMoveOpc = X86::MOVSS2DIrr;
+      Instructions.MoveOpc = X86::MOVAPSrr;
+    }
+    break;
+  case X86::CT_SELECT_V4I32:
+  case X86::CT_SELECT_V2I64:
+  case X86::CT_SELECT_V8I16:
+  case X86::CT_SELECT_V16I8:
+  case X86::CT_SELECT_V8F16:
+    // The integer and f16 128-bit cases used identical opcode sets; merged.
+    if (!Subtarget.hasSSE2())
+      llvm_unreachable("128-bit integer/f16 vector CT_SELECT requires SSE2");
+    Instructions.PAndOpc = X86::PANDrr;
+    Instructions.PAndnOpc = X86::PANDNrr;
+    Instructions.POrOpc = X86::PORrr;
+    Instructions.BroadcastOpc = X86::PSHUFDri;
+    Instructions.IntMoveOpc = X86::MOVDI2PDIrr;
+    Instructions.MoveOpc = X86::MOVDQArr;
+    break;
+  case X86::CT_SELECT_V4F32X:
+  case X86::CT_SELECT_V4I32X:
+  case X86::CT_SELECT_V2F64X:
+  case X86::CT_SELECT_V2I64X:
+  case X86::CT_SELECT_V8I16X:
+  case X86::CT_SELECT_V16I8X:
+  case X86::CT_SELECT_V8F16X:
+    if (!Subtarget.hasAVX())
+      llvm_unreachable("AVX variants require AVX support");
+    Instructions.PAndOpc = X86::VPANDrr;
+    Instructions.PAndnOpc = X86::VPANDNrr;
+    Instructions.POrOpc = X86::VPORrr;
+    Instructions.BroadcastOpc = X86::VPSHUFDri;
+    Instructions.IntMoveOpc = X86::VMOVDI2PDIrr;
+    // Keep the move type-consistent with the element type.
+    Instructions.MoveOpc = (Opcode == X86::CT_SELECT_V4F32X) ? X86::VMOVAPSrr
+                           : (Opcode == X86::CT_SELECT_V2F64X)
+                               ? X86::VMOVAPDrr
+                               : X86::VMOVDQArr;
+    break;
+  case X86::CT_SELECT_V8F32:
+  case X86::CT_SELECT_V8I32:
+    if (!Subtarget.hasAVX())
+      llvm_unreachable("256-bit vectors require AVX");
+    Instructions.PAndOpc = X86::VPANDYrr;
+    Instructions.PAndnOpc = X86::VPANDNYrr;
+    Instructions.POrOpc = X86::VPORYrr;
+    Instructions.BroadcastOpc = X86::VPERMILPSYri;
+    Instructions.IntMoveOpc = X86::VMOVDI2PDIrr;
+    Instructions.MoveOpc =
+        (Opcode == X86::CT_SELECT_V8F32) ? X86::VMOVAPSYrr : X86::VMOVDQAYrr;
+    Instructions.Use256 = true;
+    break;
+  case X86::CT_SELECT_V4F64:
+  case X86::CT_SELECT_V4I64:
+    if (!Subtarget.hasAVX())
+      llvm_unreachable("256-bit vectors require AVX");
+    Instructions.PAndOpc = X86::VPANDYrr;
+    Instructions.PAndnOpc = X86::VPANDNYrr;
+    Instructions.POrOpc = X86::VPORYrr;
+    Instructions.BroadcastOpc = X86::VPERMILPDYri;
+    Instructions.IntMoveOpc = X86::VMOVDI2PDIrr;
+    Instructions.MoveOpc =
+        (Opcode == X86::CT_SELECT_V4F64) ? X86::VMOVAPDYrr : X86::VMOVDQAYrr;
+    Instructions.Use256 = true;
+    break;
+  case X86::CT_SELECT_V16I16:
+  case X86::CT_SELECT_V32I8:
+  case X86::CT_SELECT_V16F16:
+    if (!Subtarget.hasAVX())
+      llvm_unreachable("256-bit integer vectors require AVX");
+    // The previous hasAVX2()/hasAVX() branches were byte-identical; merged.
+    // NOTE(review): VPAND/VPANDN/VPOR on YMM require AVX2; an AVX1-only
+    // subtarget may need the VANDPS/VANDNPS/VORPS Y forms instead — confirm.
+    Instructions.PAndOpc = X86::VPANDYrr;
+    Instructions.PAndnOpc = X86::VPANDNYrr;
+    Instructions.POrOpc = X86::VPORYrr;
+    Instructions.BroadcastOpc = X86::VPERMILPSYri;
+    Instructions.IntMoveOpc = X86::VMOVDI2PDIrr;
+    Instructions.MoveOpc = X86::VMOVDQAYrr;
+    Instructions.Use256 = true;
+    break;
+  default:
+    llvm_unreachable("Unexpected CT_SELECT opcode");
+  }
+
+  return Instructions;
+}
+
+bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const {
+ unsigned Opcode = MI.getOpcode();
+ const DebugLoc &DL = MI.getDebugLoc();
+ auto Instruction = getCtSelectInstructions(Opcode, Subtarget);
+
+ MachineBasicBlock *MBB = MI.getParent();
+
+ // Operand layout matches the TableGen definition:
+ // (outs VR128:$dst, VR128:$tmpx, GR32:$tmpg),
+ // (ins VR128:$t, VR128:$f, i8imm:$cond)
+ Register Dst = MI.getOperand(0).getReg();
+ Register MaskReg = MI.getOperand(1).getReg(); // vector mask temp
+ Register TmpGPR = MI.getOperand(2).getReg(); // scalar mask temp (GPR32)
+ Register FalseVal = MI.getOperand(3).getReg(); // $t per ins list — NOTE(review): variable name says "false" but operand 3 is $t (true value); confirm intended swap
+ Register TrueVal = MI.getOperand(4).getReg(); // $f per ins list — NOTE(review): mirrors the swap above
+ X86::CondCode CC = X86::CondCode(MI.getOperand(5).getImm()); // condition
+
+ MachineInstr *FirstInstr = nullptr;
+ MachineInstr *LastInstr = nullptr;
+ auto recordInstr = [&](MachineInstrBuilder MIB) {
+ MachineInstr *NewMI = MIB.getInstr();
+ LastInstr = NewMI;
+ if (!FirstInstr)
+ FirstInstr = NewMI;
+ };
+
+ // Create scalar mask in tempGPR and broadcast to vector mask
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::MOV32ri), TmpGPR)
+ .addImm(0)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ auto SubReg = TRI->getSubReg(TmpGPR, X86::sub_8bit);
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::SETCCr))
+ .addReg(SubReg)
+ .addImm(CC)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ // Zero-extend byte to 32-bit register (movzbl %al, %eax)
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::MOVZX32rr8), TmpGPR)
+ .addReg(SubReg)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ if (Instruction.UseBlendInstr && Subtarget.hasSSE41()) {
+ // Shift left 31 bits to convert 1 -> 0x80000000, 0 -> 0x00000000 (shll $31,
+ // %eax)
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::SHL32ri), TmpGPR)
+ .addReg(TmpGPR)
+ .addImm(31));
+ } else {
+ // Negate to convert 1 -> 0xFFFFFFFF, 0 -> 0x00000000 (negl %eax)
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::NEG32r), TmpGPR)
+ .addReg(TmpGPR));
+ }
+
+ // Broadcast to TmpX (vector mask)
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::PXORrr), MaskReg)
+ .addReg(MaskReg)
+ .addReg(MaskReg)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ // Move scalar mask to vector register
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.IntMoveOpc), MaskReg)
+ .addReg(TmpGPR)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ if (Instruction.Use256) {
+ // Broadcast to 256-bit vector register
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.BroadcastOpc), MaskReg)
+ .addReg(MaskReg)
+ .addImm(0)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+ } else {
+ if (Subtarget.hasSSE2() || Subtarget.hasAVX()) {
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.BroadcastOpc), MaskReg)
+ .addReg(MaskReg)
+ .addImm(0x00)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+ } else {
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.BroadcastOpc), MaskReg)
+ .addReg(MaskReg)
+ .addReg(MaskReg)
+ .addImm(0x00)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+ }
+ }
+
+ if (Instruction.UseBlendInstr && Subtarget.hasSSE41()) {
+ // Use dedicated blend instructions for SSE4.1+
+ unsigned BlendOpc;
+ switch (Opcode) {
+ case X86::CT_SELECT_V4F32:
+ BlendOpc = X86::BLENDVPSrr0;
+ break;
+ case X86::CT_SELECT_V2F64:
+ BlendOpc = X86::BLENDVPDrr0;
+ break;
+ default:
+ // alias for pblendvb that takes xmm0 as implicit mask register
+ BlendOpc = X86::PBLENDVBrr0;
+ break;
+ }
+
+ // Check if XMM0 is used as one of source registers, if yes then save it
+ // in Dst register and update FalseVal and TrueVal to Dst register
+ bool DidSaveXMM0 = false;
+ Register SavedXMM0 = X86::XMM0;
+ if (FalseVal == X86::XMM0 || TrueVal == X86::XMM0) {
+ Register SrcXMM0 = (FalseVal == X86::XMM0) ? FalseVal : TrueVal;
+
+ // if XMM0 is one of the source registers, it will not match with Dst
+ // registers, so we need to move it to Dst register
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+ .addReg(SrcXMM0)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ // update FalseVal and TrueVal to Dst register
+ if (FalseVal == X86::XMM0)
+ FalseVal = Dst;
+ if (TrueVal == X86::XMM0)
+ TrueVal = Dst;
+
+ // update SavedXMM0 to Dst register
+ SavedXMM0 = Dst;
+
+ // set DidSaveXMM0 to true to indicate that we saved XMM0 into Dst
+ // register
+ DidSaveXMM0 = true;
+ } else if (MaskReg != X86::XMM0 && Dst != X86::XMM0) {
+
+      // if XMM0 is not allocated for any of the registers, we still need to
+      // save and restore it after using it as the mask register
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+ .addReg(X86::XMM0)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+ SavedXMM0 = Dst;
+ DidSaveXMM0 = true;
+ }
+
+ if (MaskReg != X86::XMM0) {
+ // BLENDV uses XMM0 as implicit mask register
+ // https://www.felixcloutier.com/x86/pblendvb
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), X86::XMM0)
+ .addReg(MaskReg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge));
+
+ // move FalseVal to mask (use MaskReg as the dst of the blend)
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::MOVAPSrr), MaskReg)
+ .addReg(FalseVal)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ // MaskReg := blend(MaskReg /*false*/, TrueVal /*true*/) ; mask in
+ // xmm0
+ recordInstr(BuildMI(*MBB, MI, DL, get(BlendOpc), MaskReg)
+ .addReg(MaskReg)
+ .addReg(TrueVal)
+ .addReg(X86::XMM0)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ // restore XMM0 from SavedXMM0 if we saved it into Dst
+ if (DidSaveXMM0) {
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), X86::XMM0)
+ .addReg(SavedXMM0)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+ }
+ // dst = result (now in MaskReg)
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+ .addReg(MaskReg)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+ } else {
+ // move FalseVal to Dst register since MaskReg is XMM0 and Dst is not
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+ .addReg(FalseVal)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ // Dst := blend(Dst /*false*/, TrueVal /*true*/) ; mask in
+ // xmm0
+ recordInstr(BuildMI(*MBB, MI, DL, get(BlendOpc), Dst)
+ .addReg(Dst)
+ .addReg(TrueVal)
+ .addReg(X86::XMM0)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+ }
+ } else {
+
+ // dst = mask
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+ .addReg(MaskReg)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ // mask &= true_val
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::PANDrr), MaskReg)
+ .addReg(MaskReg)
+ .addReg(TrueVal)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ // dst = ~mask & false_val
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::PANDNrr), Dst)
+ .addReg(Dst)
+ .addReg(FalseVal)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ // dst |= mask; (mask & t) | (~mask & f)
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::PORrr), Dst)
+ .addReg(Dst)
+ .addReg(MaskReg)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+ }
+
+ assert(FirstInstr && LastInstr && "Expected at least one expanded instruction");
+ auto BundleEnd = LastInstr->getIterator();
+ finalizeBundle(*MBB, FirstInstr->getIterator(), std::next(BundleEnd));
+
+ MI.eraseFromParent();
+
+ return true;
+}
+
+bool X86InstrInfo::expandCtSelectWithCMOV(MachineInstr &MI) const {
+ MachineBasicBlock *MBB = MI.getParent();
+ DebugLoc DL = MI.getDebugLoc();
+
+ // CT_SELECT pseudo has: (outs dst), (ins true_val, false_val, cond)
+ MachineOperand &OperandRes = MI.getOperand(0); // destination register
+ MachineOperand &OperandTrue = MI.getOperand(1); // true value
+ MachineOperand &OperandCond = MI.getOperand(3); // condition code
+
+ assert(OperandTrue.isReg() && OperandRes.isReg() && OperandCond.isImm() &&
+ "Invalid operand types");
+ assert(OperandTrue.getReg() == OperandRes.getReg() &&
+ "Result register different from True register");
+
+ assert(Subtarget.hasCMOV() && "target does not support CMOV instructions");
+
+ unsigned Opcode = 0;
+
+ switch (MI.getOpcode()) {
+ case X86::CT_SELECT16rr:
+ Opcode = X86::CMOV16rr;
+ break;
+ case X86::CT_SELECT32rr:
+ Opcode = X86::CMOV32rr;
+ break;
+ case X86::CT_SELECT64rr:
+ Opcode = X86::CMOV64rr;
+ break;
+ case X86::CT_SELECT16rm:
+ Opcode = X86::CMOV16rm;
+ break;
+ case X86::CT_SELECT32rm:
+ Opcode = X86::CMOV32rm;
+ break;
+ case X86::CT_SELECT64rm:
+ Opcode = X86::CMOV64rm;
+ break;
+ default:
+ llvm_unreachable("Invalid CT_SELECT opcode");
+ }
+
+ if (!Subtarget.hasCMOV()) {
+ llvm_unreachable("target does not support cmov");
+ }
+
+  // Build the CMOV instruction: copy all of CT_SELECT's operands (dst, true,
+  // false or memory reference, and condition code) onto the new instruction.
+ MachineInstrBuilder CmovBuilder = BuildMI(*MBB, MI, DL, get(Opcode));
+ for (unsigned i = 0u; i < MI.getNumOperands(); ++i) { // Copy
+ CmovBuilder.add(MI.getOperand(i));
+ }
+
+ // Remove the original CT_SELECT instruction
+ MI.eraseFromParent();
+ return true;
+}
+
+/// Expand i386-specific CT_SELECT pseudo instructions (post-RA, constant-time)
+/// These internal pseudos receive a pre-materialized condition byte from the
+/// custom inserter, avoiding EFLAGS corruption issues during i64 type legalization.
+bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const {
+ MachineBasicBlock *MBB = MI.getParent();
+ DebugLoc DL = MI.getDebugLoc();
+
+ // CT_SELECT_I386_INT_GRxxrr has operands: (outs dst, tmp_byte, tmp_mask),
+ // (ins src1, src2, cond_byte)
+ // Note: cond_byte is pre-materialized by custom inserter, not EFLAGS-dependent
+ Register DstReg = MI.getOperand(0).getReg();
+ Register TmpByteReg = MI.getOperand(1).getReg();
+ Register TmpMaskReg = MI.getOperand(2).getReg();
+ Register Src1Reg = MI.getOperand(3).getReg();
+ Register Src2Reg = MI.getOperand(4).getReg();
+ Register CondByteReg = MI.getOperand(5).getReg(); // Pre-materialized condition byte
+
+ // Determine instruction opcodes based on register width
+ unsigned MovZXOp, NegOp, MovOp, AndOp, NotOp, OrOp;
+ if (MI.getOpcode() == X86::CT_SELECT_I386_INT_GR8rr) {
+ MovZXOp = 0; // No zero-extend needed for GR8
+ NegOp = X86::NEG8r;
+ MovOp = X86::MOV8rr;
+ AndOp = X86::AND8rr;
+ NotOp = X86::NOT8r;
+ OrOp = X86::OR8rr;
+ } else if (MI.getOpcode() == X86::CT_SELECT_I386_INT_GR16rr) {
+ MovZXOp = X86::MOVZX16rr8;
+ NegOp = X86::NEG16r;
+ MovOp = X86::MOV16rr;
+ AndOp = X86::AND16rr;
+ NotOp = X86::NOT16r;
+ OrOp = X86::OR16rr;
+ } else { // X86::CT_SELECT_I386_INT_GR32rr
+ MovZXOp = X86::MOVZX32rr8;
+ NegOp = X86::NEG32r;
+ MovOp = X86::MOV32rr;
+ AndOp = X86::AND32rr;
+ NotOp = X86::NOT32r;
+ OrOp = X86::OR32rr;
+ }
+
+ // 7-instruction constant-time selection bundle (no SETCC inside):
+ // result = (true_val & mask) | (false_val & ~mask)
+ // The condition byte is already materialized, avoiding EFLAGS dependency
+
+ // Step 1: Copy pre-materialized condition byte to TmpByteReg
+ // This allows the bundle to work with allocated temporaries
+ auto I1 = BuildMI(*MBB, MI, DL, get(X86::MOV8rr), TmpByteReg)
+ .addReg(CondByteReg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
+ auto BundleStart = I1->getIterator();
+
+ // Step 2: Zero-extend condition byte to register width (0 or 1)
+ if (MI.getOpcode() != X86::CT_SELECT_I386_INT_GR8rr) {
+ BuildMI(*MBB, MI, DL, get(MovZXOp), TmpMaskReg)
+ .addReg(TmpByteReg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
+ }
+
+ // Step 3: Convert condition to bitmask (NEG: 1 -> 0xFFFF..., 0 -> 0x0000...)
+ Register MaskReg = (MI.getOpcode() == X86::CT_SELECT_I386_INT_GR8rr) ? TmpByteReg : TmpMaskReg;
+ BuildMI(*MBB, MI, DL, get(NegOp), MaskReg)
+ .addReg(MaskReg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+ // Step 4,5: Apply mask to true value - copy src1 to dest, then AND with mask
+ BuildMI(*MBB, MI, DL, get(MovOp), DstReg)
+ .addReg(Src1Reg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+ BuildMI(*MBB, MI, DL, get(AndOp), DstReg)
+ .addReg(DstReg)
+ .addReg(MaskReg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+ // Step 6: Create inverted mask inline (~mask)
+ BuildMI(*MBB, MI, DL, get(NotOp), MaskReg)
+ .addReg(MaskReg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+ // Step 7: Apply inverted mask to false value - reuse mask register directly
+ BuildMI(*MBB, MI, DL, get(AndOp), MaskReg)
+ .addReg(MaskReg)
+ .addReg(Src2Reg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+ // Step 8: Final result: (src1 & mask) | (src2 & ~mask)
+ auto LI = BuildMI(*MBB, MI, DL, get(OrOp), DstReg)
+ .addReg(DstReg)
+ .addReg(MaskReg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+ // Bundle all generated instructions for atomic execution before removing MI
+ auto BundleEnd = std::next(LI->getIterator());
+ if (BundleStart != BundleEnd) {
+ // Only bundle if we have multiple instructions
+ finalizeBundle(*MBB, BundleStart, BundleEnd);
+ }
+
+ // TODO: Optimization opportunity - The register allocator may choose callee-saved
+ // registers (e.g., %ebx, %esi) for TmpByteReg/TmpMaskReg, causing unnecessary
+ // save/restore overhead. Consider constraining these to caller-saved register
+ // classes (e.g., GR8_AL, GR32_CallSaved) in the TableGen definitions to improve
+ // constant-time performance by eliminating prologue/epilogue instructions.
+
+ // Remove the original pseudo instruction
+ MI.eraseFromParent();
+ return true;
+}
+
static bool isFrameLoadOpcode(int Opcode, TypeSize &MemBytes) {
switch (Opcode) {
default:
@@ -6426,6 +6976,43 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case X86::ADD64ri32_DB:
MIB->setDesc(get(X86::OR64ri32));
break;
+
+ case X86::CT_SELECT64rr:
+ case X86::CT_SELECT32rr:
+ case X86::CT_SELECT16rr:
+ case X86::CT_SELECT64rm:
+ case X86::CT_SELECT32rm:
+ case X86::CT_SELECT16rm:
+ // These CT_SELECT pseudos are only selected when CMOV is available
+ // Pattern matching ensures we use CT_SELECT_I386 when CMOV is not available
+ return expandCtSelectWithCMOV(MI);
+
+ // non-cmov CT_SELECT expansion (post-RA, constant-time)
+ // These are the internal pseudos with pre-materialized condition byte
+ case X86::CT_SELECT_I386_INT_GR8rr:
+ case X86::CT_SELECT_I386_INT_GR16rr:
+ case X86::CT_SELECT_I386_INT_GR32rr:
+ return expandCtSelectIntWithoutCMOV(MI);
+
+ case X86::CT_SELECT_V2F64:
+ case X86::CT_SELECT_V4F32:
+ case X86::CT_SELECT_V2I64:
+ case X86::CT_SELECT_V4I32:
+ case X86::CT_SELECT_V8I16:
+ case X86::CT_SELECT_V16I8:
+ case X86::CT_SELECT_V2F64X:
+ case X86::CT_SELECT_V4F32X:
+ case X86::CT_SELECT_V2I64X:
+ case X86::CT_SELECT_V4I32X:
+ case X86::CT_SELECT_V8I16X:
+ case X86::CT_SELECT_V16I8X:
+ case X86::CT_SELECT_V4I64:
+ case X86::CT_SELECT_V8I32:
+ case X86::CT_SELECT_V16I16:
+ case X86::CT_SELECT_V32I8:
+ case X86::CT_SELECT_V4F64:
+ case X86::CT_SELECT_V8F32:
+ return expandCtSelectVector(MI);
}
return false;
}
@@ -10612,27 +11199,39 @@ void X86InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
if (!ST.hasSSE1())
return;
- BuildMI(MBB, Iter, DL, get(X86::V_SET0), Reg);
+ // PXOR is safe to use because it doesn't affect flags.
+ BuildMI(MBB, Iter, DL, get(X86::PXORrr), Reg)
+ .addReg(Reg, RegState::Undef)
+ .addReg(Reg, RegState::Undef);
} else if (X86::VR256RegClass.contains(Reg)) {
// YMM#
if (!ST.hasAVX())
return;
- BuildMI(MBB, Iter, DL, get(X86::AVX_SET0), Reg);
+ // VPXOR is safe to use because it doesn't affect flags.
+ BuildMI(MBB, Iter, DL, get(X86::VPXORrr), Reg)
+ .addReg(Reg, RegState::Undef)
+ .addReg(Reg, RegState::Undef);
} else if (X86::VR512RegClass.contains(Reg)) {
// ZMM#
if (!ST.hasAVX512())
return;
- BuildMI(MBB, Iter, DL, get(X86::AVX512_512_SET0), Reg);
+ // VPXORY is safe to use because it doesn't affect flags.
+ BuildMI(MBB, Iter, DL, get(X86::VPXORYrr), Reg)
+ .addReg(Reg, RegState::Undef)
+ .addReg(Reg, RegState::Undef);
} else if (X86::VK1RegClass.contains(Reg) || X86::VK2RegClass.contains(Reg) ||
X86::VK4RegClass.contains(Reg) || X86::VK8RegClass.contains(Reg) ||
X86::VK16RegClass.contains(Reg)) {
if (!ST.hasVLX())
return;
- unsigned Op = ST.hasBWI() ? X86::KSET0Q : X86::KSET0W;
- BuildMI(MBB, Iter, DL, get(Op), Reg);
+ // KXOR is safe to use because it doesn't affect flags.
+ unsigned Op = ST.hasBWI() ? X86::KXORQkk : X86::KXORWkk;
+ BuildMI(MBB, Iter, DL, get(Op), Reg)
+ .addReg(Reg, RegState::Undef)
+ .addReg(Reg, RegState::Undef);
}
}
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index de8ccb44578a3..76f18803c2e3d 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -721,6 +721,12 @@ class X86InstrInfo final : public X86GenInstrInfo {
bool isFrameOperand(const MachineInstr &MI, unsigned int Op,
int &FrameIndex) const;
+ /// Expand the CT_SELECT pseudo-instructions.
+ bool expandCtSelectWithCMOV(MachineInstr &MI) const;
+ bool expandCtSelectIntWithoutCMOV(MachineInstr &MI) const;
+
+ bool expandCtSelectVector(MachineInstr &MI) const;
+
/// Returns true iff the routine could find two commutable operands in the
/// given machine instruction with 3 vector inputs.
/// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index 21e6bacbacee2..5fa0665668e43 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -53,6 +53,11 @@ def PreferNoLegacySetCC : Predicate<"Subtarget->hasZU() && "
def HasCF : Predicate<"Subtarget->hasCF()">;
def HasCMOV : Predicate<"Subtarget->canUseCMOV()">;
def NoCMOV : Predicate<"!Subtarget->canUseCMOV()">;
+// Predicates for native CMOV instruction (checks hasCMOV(), not canUseCMOV())
+// HasCMOV may be true even without native CMOV (e.g., via SSE emulation)
+// Use HasNativeCMOV/NoNativeCMOV for constant-time code that requires actual CMOV
+def HasNativeCMOV : Predicate<"Subtarget->hasCMOV()">;
+def NoNativeCMOV : Predicate<"!Subtarget->hasCMOV()">;
def HasNOPL : Predicate<"Subtarget->hasNOPL()">;
def HasMMX : Predicate<"Subtarget->hasMMX()">;
def HasSSE1 : Predicate<"Subtarget->hasSSE1()">;
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 5305b39cffefd..48bcdb41ba6e2 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -630,10 +630,10 @@ void X86PassConfig::addPreEmitPass2() {
// ObjC runtime functions present in the module.
const Function &F = MF.getFunction();
const Module *M = F.getParent();
- return M->getModuleFlag("kcfi") ||
+ return M->getModuleFlag("kcfi") || F.hasFnAttribute("ct-select") ||
(TT.isOSDarwin() &&
(M->getFunction("objc_retainAutoreleasedReturnValue") ||
- M->getFunction("objc_unsafeClaimAutoreleasedReturnValue")));
+ M->getFunction("objc_unsafeClaimAutoreleasedReturnValue")));
}));
// Analyzes and emits pseudos to support Win x64 Unwind V2. This pass must run
diff --git a/llvm/test/CodeGen/X86/ctselect-edge-cases.ll b/llvm/test/CodeGen/X86/ctselect-edge-cases.ll
new file mode 100644
index 0000000000000..0797265972a1f
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ctselect-edge-cases.ll
@@ -0,0 +1,409 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=X32
+
+; Test ct.select edge cases and corner cases
+
+; Test with very large integers
+define i128 @test_ctselect_i128(i1 %cond, i128 %a, i128 %b) {
+; X64-LABEL: test_ctselect_i128:
+; X64: # %bb.0:
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovneq %rsi, %rax
+; X64-NEXT: cmovneq %rdx, %r8
+; X64-NEXT: movq %r8, %rdx
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_i128:
+; X32: # %bb.0:
+; X32-NEXT: pushl %edi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: pushl %eax
+; X32-NEXT: .cfi_def_cfa_offset 16
+; X32-NEXT: .cfi_offset %esi, -12
+; X32-NEXT: .cfi_offset %edi, -8
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %esi
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %edi
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %edx
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %ecx, 12(%eax)
+; X32-NEXT: movl %edx, 8(%eax)
+; X32-NEXT: movl %edi, 4(%eax)
+; X32-NEXT: movl %esi, (%eax)
+; X32-NEXT: addl $4, %esp
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: popl %edi
+; X32-NEXT: .cfi_def_cfa_offset 4
+; X32-NEXT: retl $4
+ %result = call i128 @llvm.ct.select.i128(i1 %cond, i128 %a, i128 %b)
+ ret i128 %result
+}
+
+; Test with small integer types
+define i1 @test_ctselect_i1(i1 %cond, i1 %a, i1 %b) {
+; X64-LABEL: test_ctselect_i1:
+; X64: # %bb.0:
+; X64-NEXT: movl %edx, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %esi, %eax
+; X64-NEXT: # kill: def $al killed $al killed $eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_i1:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; X32-NEXT: # kill: def $al killed $al killed $eax
+; X32-NEXT: retl
+ %result = call i1 @llvm.ct.select.i1(i1 %cond, i1 %a, i1 %b)
+ ret i1 %result
+}
+
+; Test with extremal values
+define i32 @test_ctselect_extremal_values(i1 %cond) {
+; X64-LABEL: test_ctselect_extremal_values:
+; X64: # %bb.0:
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X64-NEXT: movl $-2147483648, %eax # imm = 0x80000000
+; X64-NEXT: cmovnel %ecx, %eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_extremal_values:
+; X32: # %bb.0:
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X32-NEXT: movl $-2147483648, %eax # imm = 0x80000000
+; X32-NEXT: cmovnel %ecx, %eax
+; X32-NEXT: retl
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 2147483647, i32 -2147483648)
+ ret i32 %result
+}
+
+; Test with floating point special values
+define float @test_ctselect_f32_special_values(i1 %cond) {
+; X64-LABEL: test_ctselect_f32_special_values:
+; X64: # %bb.0:
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: movl $2143289344, %eax # imm = 0x7FC00000
+; X64-NEXT: movl $2139095040, %ecx # imm = 0x7F800000
+; X64-NEXT: cmovnel %eax, %ecx
+; X64-NEXT: movd %ecx, %xmm0
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_f32_special_values:
+; X32: # %bb.0:
+; X32-NEXT: pushl %edi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: pushl %eax
+; X32-NEXT: .cfi_def_cfa_offset 16
+; X32-NEXT: .cfi_offset %esi, -12
+; X32-NEXT: .cfi_offset %edi, -8
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: sete %al
+; X32-NEXT: movl {{\.?LCPI[0-9]+_[0-9]+}}, %ecx
+; X32-NEXT: movl {{\.?LCPI[0-9]+_[0-9]+}}, %edx
+; X32-NEXT: movb %al, %ah
+; X32-NEXT: movzbl %ah, %edi
+; X32-NEXT: negl %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: andl %edi, %esi
+; X32-NEXT: notl %edi
+; X32-NEXT: andl %ecx, %edi
+; X32-NEXT: orl %edi, %esi
+; X32-NEXT: movl %esi, (%esp)
+; X32-NEXT: flds (%esp)
+; X32-NEXT: addl $4, %esp
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: popl %edi
+; X32-NEXT: .cfi_def_cfa_offset 4
+; X32-NEXT: retl
+ %result = call float @llvm.ct.select.f32(i1 %cond, float 0x7FF8000000000000, float 0x7FF0000000000000)
+ ret float %result
+}
+
+define double @test_ctselect_f64_special_values(i1 %cond) {
+; X64-LABEL: test_ctselect_f64_special_values:
+; X64: # %bb.0:
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; X64-NEXT: movabsq $9218868437227405312, %rcx # imm = 0x7FF0000000000000
+; X64-NEXT: cmovneq %rax, %rcx
+; X64-NEXT: movq %rcx, %xmm0
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_f64_special_values:
+; X32: # %bb.0:
+; X32-NEXT: pushl %edi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: subl $24, %esp
+; X32-NEXT: .cfi_def_cfa_offset 36
+; X32-NEXT: .cfi_offset %esi, -12
+; X32-NEXT: .cfi_offset %edi, -8
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}}
+; X32-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}}
+; X32-NEXT: sete %al
+; X32-NEXT: fxch %st(1)
+; X32-NEXT: fstpl {{[0-9]+}}(%esp)
+; X32-NEXT: fstpl (%esp)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl (%esp), %edx
+; X32-NEXT: movb %al, %ah
+; X32-NEXT: movzbl %ah, %edi
+; X32-NEXT: negl %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: andl %edi, %esi
+; X32-NEXT: notl %edi
+; X32-NEXT: andl %ecx, %edi
+; X32-NEXT: orl %edi, %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movb %al, %ah
+; X32-NEXT: movzbl %ah, %edi
+; X32-NEXT: negl %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: andl %edi, %esi
+; X32-NEXT: notl %edi
+; X32-NEXT: andl %ecx, %edi
+; X32-NEXT: orl %edi, %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X32-NEXT: fldl {{[0-9]+}}(%esp)
+; X32-NEXT: addl $24, %esp
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: popl %edi
+; X32-NEXT: .cfi_def_cfa_offset 4
+; X32-NEXT: retl
+ %result = call double @llvm.ct.select.f64(i1 %cond, double 0x7FF8000000000000, double 0x7FF0000000000000)
+ ret double %result
+}
+
+; Test with null pointers
+define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) {
+; X64-LABEL: test_ctselect_null_ptr:
+; X64: # %bb.0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovneq %rsi, %rax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_null_ptr:
+; X32: # %bb.0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+ %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %ptr, ptr null)
+ ret ptr %result
+}
+
+; Test with function pointers
+define ptr @test_ctselect_function_ptr(i1 %cond, ptr %func1, ptr %func2) {
+; X64-LABEL: test_ctselect_function_ptr:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovneq %rsi, %rax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_function_ptr:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+ %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %func1, ptr %func2)
+ ret ptr %result
+}
+
+; Test with volatile loads
+define i32 @test_ctselect_volatile_load(i1 %cond, ptr %p1, ptr %p2) {
+; X64-LABEL: test_ctselect_volatile_load:
+; X64: # %bb.0:
+; X64-NEXT: movl (%rsi), %ecx
+; X64-NEXT: movl (%rdx), %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %ecx, %eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_volatile_load:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl (%ecx), %ecx
+; X32-NEXT: movl (%eax), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel %ecx, %eax
+; X32-NEXT: retl
+ %a = load volatile i32, ptr %p1
+ %b = load volatile i32, ptr %p2
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+ ret i32 %result
+}
+
+; Test with atomic loads
+define i32 @test_ctselect_atomic_load(i1 %cond, ptr %p1, ptr %p2) {
+; X64-LABEL: test_ctselect_atomic_load:
+; X64: # %bb.0:
+; X64-NEXT: movl (%rsi), %ecx
+; X64-NEXT: movl (%rdx), %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %ecx, %eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_atomic_load:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl (%ecx), %ecx
+; X32-NEXT: movl (%eax), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel %ecx, %eax
+; X32-NEXT: retl
+ %a = load atomic i32, ptr %p1 acquire, align 4
+ %b = load atomic i32, ptr %p2 acquire, align 4
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+ ret i32 %result
+}
+
+; Test with condition from icmp on pointers
+define ptr @test_ctselect_ptr_cmp(ptr %p1, ptr %p2, ptr %a, ptr %b) {
+; X64-LABEL: test_ctselect_ptr_cmp:
+; X64: # %bb.0:
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: cmpq %rsi, %rdi
+; X64-NEXT: sete %cl
+; X64-NEXT: testb %cl, %cl
+; X64-NEXT: cmovneq %rdx, %rax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_ptr_cmp:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: sete %cl
+; X32-NEXT: testb %cl, %cl
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+ %cmp = icmp eq ptr %p1, %p2
+ %result = call ptr @llvm.ct.select.p0(i1 %cmp, ptr %a, ptr %b)
+ ret ptr %result
+}
+
+; Test with struct pointer types (struct types themselves may not be directly supported)
+%struct.pair = type { i32, i32 }
+
+define ptr @test_ctselect_struct_ptr(i1 %cond, ptr %a, ptr %b) {
+; X64-LABEL: test_ctselect_struct_ptr:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovneq %rsi, %rax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_struct_ptr:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+ %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b)
+ ret ptr %result
+}
+
+; Test with deeply nested conditions (stress test for instruction selection)
+define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
+; X64-LABEL: test_ctselect_deeply_nested:
+; X64: # %bb.0:
+; X64-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movl {{[0-9]+}}(%rsp), %r10d
+; X64-NEXT: movl {{[0-9]+}}(%rsp), %r11d
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %r8d, %r9d
+; X64-NEXT: testb $1, %sil
+; X64-NEXT: cmovnel %r9d, %r11d
+; X64-NEXT: testb $1, %dl
+; X64-NEXT: cmovnel %r11d, %r10d
+; X64-NEXT: testb $1, %cl
+; X64-NEXT: cmovnel %r10d, %eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_deeply_nested:
+; X32: # %bb.0:
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: .cfi_offset %esi, -8
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %esi
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel %esi, %edx
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel %edx, %ecx
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel %ecx, %eax
+; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 4
+; X32-NEXT: retl
+ %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b)
+ %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c)
+ %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d)
+ %sel4 = call i32 @llvm.ct.select.i32(i1 %c4, i32 %sel3, i32 %e)
+ ret i32 %sel4
+}
+
+; Test with misaligned loads
+define i32 @test_ctselect_misaligned_load(i1 %cond, ptr %p1, ptr %p2) {
+; X64-LABEL: test_ctselect_misaligned_load:
+; X64: # %bb.0:
+; X64-NEXT: movl (%rdx), %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel (%rsi), %eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_misaligned_load:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl (%eax), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel (%ecx), %eax
+; X32-NEXT: retl
+ %a = load i32, ptr %p1, align 1
+ %b = load i32, ptr %p2, align 1
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+ ret i32 %result
+}
+
+; Declare the intrinsics
+declare i1 @llvm.ct.select.i1(i1, i1, i1)
+declare i128 @llvm.ct.select.i128(i1, i128, i128)
+declare i32 @llvm.ct.select.i32(i1, i32, i32)
+declare float @llvm.ct.select.f32(i1, float, float)
+declare double @llvm.ct.select.f64(i1, double, double)
+declare ptr @llvm.ct.select.p0(i1, ptr, ptr)
diff --git a/llvm/test/CodeGen/X86/ctselect-i386-fp.ll b/llvm/test/CodeGen/X86/ctselect-i386-fp.ll
new file mode 100644
index 0000000000000..b88ec72a37925
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ctselect-i386-fp.ll
@@ -0,0 +1,722 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov | FileCheck %s --check-prefix=I386-NOCMOV
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=I386-CMOV
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov -verify-machineinstrs | FileCheck %s --check-prefix=I386-NOCMOV
+
+; Comprehensive CT_SELECT tests for i386 targets with floating-point types
+; - Without CMOV: constant-time implementation using bitwise mask selection on the FP bit patterns + existing post-RA CT_SELECT
+; - With CMOV: CMOV-based implementation
+; - Verifies security properties: no conditional branches, constant execution time
+; Strategy: FP values spilled to memory, their raw bit patterns selected with integer mask operations, result reloaded as FP
+
+; Test basic f32 functionality
+define float @test_ctselect_f32_basic(i1 %cond, float %a, float %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f32_basic:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: pushl %eax
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, (%esp)
+; I386-NOCMOV-NEXT: flds (%esp)
+; I386-NOCMOV-NEXT: addl $4, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f32_basic:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: pushl %eax
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, (%esp)
+; I386-CMOV-NEXT: flds (%esp)
+; I386-CMOV-NEXT: addl $4, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b)
+ ret float %result
+}
+
+; Test f32 with different condition codes
+define float @test_ctselect_f32_eq(float %x, float %y, float %a, float %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f32_eq:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: pushl %eax
+; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fucompp
+; I386-NOCMOV-NEXT: fnstsw %ax
+; I386-NOCMOV-NEXT: # kill: def $ah killed $ah killed $ax
+; I386-NOCMOV-NEXT: sahf
+; I386-NOCMOV-NEXT: setnp %al
+; I386-NOCMOV-NEXT: sete %cl
+; I386-NOCMOV-NEXT: testb %al, %cl
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, (%esp)
+; I386-NOCMOV-NEXT: flds (%esp)
+; I386-NOCMOV-NEXT: addl $4, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f32_eq:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: pushl %eax
+; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fucompi %st(1), %st
+; I386-CMOV-NEXT: fstp %st(0)
+; I386-CMOV-NEXT: setnp %al
+; I386-CMOV-NEXT: sete %cl
+; I386-CMOV-NEXT: testb %al, %cl
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, (%esp)
+; I386-CMOV-NEXT: flds (%esp)
+; I386-CMOV-NEXT: addl $4, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %cmp = fcmp oeq float %x, %y
+ %result = call float @llvm.ct.select.f32(i1 %cmp, float %a, float %b)
+ ret float %result
+}
+
+; Test basic f64 functionality
+define double @test_ctselect_f64_basic(i1 %cond, double %a, double %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f64_basic:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: subl $8, %esp
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, (%esp)
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fldl (%esp)
+; I386-NOCMOV-NEXT: addl $8, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f64_basic:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: subl $8, %esp
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, (%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fldl (%esp)
+; I386-CMOV-NEXT: addl $8, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b)
+ ret double %result
+}
+
+; Test basic x86_fp80 functionality
+define x86_fp80 @test_ctselect_f80_basic(i1 %cond, x86_fp80 %a, x86_fp80 %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f80_basic:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: subl $12, %esp
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, (%esp)
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fldt (%esp)
+; I386-NOCMOV-NEXT: addl $12, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f80_basic:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: subl $12, %esp
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, (%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fldt (%esp)
+; I386-CMOV-NEXT: addl $12, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %result = call x86_fp80 @llvm.ct.select.f80(i1 %cond, x86_fp80 %a, x86_fp80 %b)
+ ret x86_fp80 %result
+}
+
+; Test f32 with complex conditions
+define float @test_ctselect_f32_gt(float %x, float %y, float %a, float %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f32_gt:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: pushl %eax
+; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fucompp
+; I386-NOCMOV-NEXT: fnstsw %ax
+; I386-NOCMOV-NEXT: # kill: def $ah killed $ah killed $ax
+; I386-NOCMOV-NEXT: sahf
+; I386-NOCMOV-NEXT: seta %al
+; I386-NOCMOV-NEXT: testb %al, %al
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, (%esp)
+; I386-NOCMOV-NEXT: flds (%esp)
+; I386-NOCMOV-NEXT: addl $4, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f32_gt:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: pushl %eax
+; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fucompi %st(1), %st
+; I386-CMOV-NEXT: fstp %st(0)
+; I386-CMOV-NEXT: seta %al
+; I386-CMOV-NEXT: testb %al, %al
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, (%esp)
+; I386-CMOV-NEXT: flds (%esp)
+; I386-CMOV-NEXT: addl $4, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %cmp = fcmp ogt float %x, %y
+ %result = call float @llvm.ct.select.f32(i1 %cmp, float %a, float %b)
+ ret float %result
+}
+
+; Test constant-time properties: verify no branches in generated code
+define float @test_ctselect_f32_no_branches(i1 %cond, float %a, float %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f32_no_branches:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: pushl %eax
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, (%esp)
+; I386-NOCMOV-NEXT: flds (%esp)
+; I386-NOCMOV-NEXT: addl $4, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f32_no_branches:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: pushl %eax
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, (%esp)
+; I386-CMOV-NEXT: flds (%esp)
+; I386-CMOV-NEXT: addl $4, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b)
+ ret float %result
+}
+
+; Test that BUNDLE directives are present for constant-time guarantees
+define float @test_ctselect_f32_bundled(i1 %cond, float %a, float %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f32_bundled:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: pushl %eax
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, (%esp)
+; I386-NOCMOV-NEXT: flds (%esp)
+; I386-NOCMOV-NEXT: addl $4, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f32_bundled:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: pushl %eax
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, (%esp)
+; I386-CMOV-NEXT: flds (%esp)
+; I386-CMOV-NEXT: addl $4, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b)
+ ret float %result
+}
+
+; Test edge case: special FP bit patterns (+inf vs zero; 0x7F800000 is +inf, not a NaN)
+define float @test_ctselect_f32_nan(i1 %cond) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f32_nan:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: subl $12, %esp
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}}
+; I386-NOCMOV-NEXT: fldz
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: fxch %st(1)
+; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fstps (%esp)
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl (%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: addl $12, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f32_nan:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: subl $12, %esp
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}}
+; I386-CMOV-NEXT: fldz
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: fxch %st(1)
+; I386-CMOV-NEXT: fstps {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fstps (%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl (%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: addl $12, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %nan = bitcast i32 2139095040 to float ; 0x7F800000 = +inf (not a NaN, despite the variable name)
+ %zero = bitcast i32 0 to float
+ %result = call float @llvm.ct.select.f32(i1 %cond, float %nan, float %zero)
+ ret float %result
+}
+
+; Test memory alignment for f80
+define x86_fp80 @test_ctselect_f80_alignment(i1 %cond, x86_fp80 %a, x86_fp80 %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f80_alignment:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: subl $12, %esp
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, (%esp)
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fldt (%esp)
+; I386-NOCMOV-NEXT: addl $12, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f80_alignment:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: subl $12, %esp
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, (%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fldt (%esp)
+; I386-CMOV-NEXT: addl $12, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %result = call x86_fp80 @llvm.ct.select.f80(i1 %cond, x86_fp80 %a, x86_fp80 %b)
+ ret x86_fp80 %result
+}
+
+; Stress test: multiple CT_SELECT operations
+define float @test_ctselect_f32_multiple(i1 %cond1, i1 %cond2, float %a, float %b, float %c, float %d) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f32_multiple:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: subl $8, %esp
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, (%esp)
+; I386-NOCMOV-NEXT: flds (%esp)
+; I386-NOCMOV-NEXT: addl $8, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f32_multiple:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: subl $8, %esp
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, (%esp)
+; I386-CMOV-NEXT: flds (%esp)
+; I386-CMOV-NEXT: addl $8, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %sel1 = call float @llvm.ct.select.f32(i1 %cond1, float %a, float %b)
+ %sel2 = call float @llvm.ct.select.f32(i1 %cond2, float %sel1, float %c)
+ ret float %sel2
+}
+
+; Declare intrinsics
+declare float @llvm.ct.select.f32(i1, float, float)
+declare double @llvm.ct.select.f64(i1, double, double)
+declare x86_fp80 @llvm.ct.select.f80(i1, x86_fp80, x86_fp80)
diff --git a/llvm/test/CodeGen/X86/ctselect-i386-mmx.ll b/llvm/test/CodeGen/X86/ctselect-i386-mmx.ll
new file mode 100644
index 0000000000000..6851c5babeb2d
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ctselect-i386-mmx.ll
@@ -0,0 +1,428 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=-cmov,+mmx < %s | FileCheck %s --check-prefix=I386-NOCMOV
+; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+cmov,+mmx < %s | FileCheck %s --check-prefix=I386-CMOV
+; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=-cmov,+mmx -verify-machineinstrs < %s | FileCheck %s --check-prefix=I386-NOCMOV
+
+; Test constant-time selection with MMX intrinsics to exercise the VR64 CT_SELECT path.
+; Each test applies an MMX intrinsic to the selected <1 x i64> value so that it is allocated to a VR64 register.
+
+; Test MMX ct.select using paddd intrinsic to force VR64 allocation
+define <1 x i64> @test_mmx_ctselect_with_paddd(i32 %cond, i64 %a, i64 %b) {
+; I386-NOCMOV-LABEL: test_mmx_ctselect_with_paddd:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT: subl $20, %esp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 40
+; I386-NOCMOV-NEXT: .cfi_offset %esi, -20
+; I386-NOCMOV-NEXT: .cfi_offset %edi, -16
+; I386-NOCMOV-NEXT: .cfi_offset %ebx, -12
+; I386-NOCMOV-NEXT: .cfi_offset %ebp, -8
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
+; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: setne %bl
+; I386-NOCMOV-NEXT: testb %bl, %bl
+; I386-NOCMOV-NEXT: sete %bh
+; I386-NOCMOV-NEXT: movb %bh, %al
+; I386-NOCMOV-NEXT: movzbl %al, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %esi, %edi
+; I386-NOCMOV-NEXT: andl %ebp, %edi
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %ecx, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %edi
+; I386-NOCMOV-NEXT: testb %bl, %bl
+; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %esi
+; I386-NOCMOV-NEXT: negl %esi
+; I386-NOCMOV-NEXT: movl %edx, %ecx
+; I386-NOCMOV-NEXT: andl %esi, %ecx
+; I386-NOCMOV-NEXT: notl %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: orl %esi, %ecx
+; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0
+; I386-NOCMOV-NEXT: paddd %mm0, %mm0
+; I386-NOCMOV-NEXT: movq %mm0, (%esp)
+; I386-NOCMOV-NEXT: movl (%esp), %eax
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: addl $20, %esp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT: popl %ebp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 4
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_mmx_ctselect_with_paddd:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: subl $20, %esp
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 24
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: setne %dl
+; I386-CMOV-NEXT: testb %dl, %dl
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0
+; I386-CMOV-NEXT: paddd %mm0, %mm0
+; I386-CMOV-NEXT: movq %mm0, (%esp)
+; I386-CMOV-NEXT: movl (%esp), %eax
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: addl $20, %esp
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 4
+; I386-CMOV-NEXT: retl
+ %mmx_a = bitcast i64 %a to <1 x i64>
+ %mmx_b = bitcast i64 %b to <1 x i64>
+ %cmp = icmp ne i32 %cond, 0
+ %sel = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp, <1 x i64> %mmx_a, <1 x i64> %mmx_b)
+ %result = call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %sel, <1 x i64> %sel)
+ ret <1 x i64> %result
+}
+
+; Test MMX ct.select using psllw intrinsic
+define <1 x i64> @test_mmx_ctselect_with_psllw(i32 %cond, i64 %a, i64 %b) {
+; I386-NOCMOV-LABEL: test_mmx_ctselect_with_psllw:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT: subl $20, %esp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 40
+; I386-NOCMOV-NEXT: .cfi_offset %esi, -20
+; I386-NOCMOV-NEXT: .cfi_offset %edi, -16
+; I386-NOCMOV-NEXT: .cfi_offset %ebx, -12
+; I386-NOCMOV-NEXT: .cfi_offset %ebp, -8
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
+; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: setne %bl
+; I386-NOCMOV-NEXT: testb %bl, %bl
+; I386-NOCMOV-NEXT: sete %bh
+; I386-NOCMOV-NEXT: movb %bh, %al
+; I386-NOCMOV-NEXT: movzbl %al, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %esi, %edi
+; I386-NOCMOV-NEXT: andl %ebp, %edi
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %ecx, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %edi
+; I386-NOCMOV-NEXT: testb %bl, %bl
+; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %esi
+; I386-NOCMOV-NEXT: negl %esi
+; I386-NOCMOV-NEXT: movl %edx, %ecx
+; I386-NOCMOV-NEXT: andl %esi, %ecx
+; I386-NOCMOV-NEXT: notl %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: orl %esi, %ecx
+; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0
+; I386-NOCMOV-NEXT: psllw %mm0, %mm0
+; I386-NOCMOV-NEXT: movq %mm0, (%esp)
+; I386-NOCMOV-NEXT: movl (%esp), %eax
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: addl $20, %esp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT: popl %ebp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 4
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_mmx_ctselect_with_psllw:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: subl $20, %esp
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 24
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: setne %dl
+; I386-CMOV-NEXT: testb %dl, %dl
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0
+; I386-CMOV-NEXT: psllw %mm0, %mm0
+; I386-CMOV-NEXT: movq %mm0, (%esp)
+; I386-CMOV-NEXT: movl (%esp), %eax
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: addl $20, %esp
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 4
+; I386-CMOV-NEXT: retl
+ %mmx_a = bitcast i64 %a to <1 x i64>
+ %mmx_b = bitcast i64 %b to <1 x i64>
+ %cmp = icmp ne i32 %cond, 0
+ %sel = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp, <1 x i64> %mmx_a, <1 x i64> %mmx_b)
+ %result = call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> %sel, <1 x i64> %sel)
+ ret <1 x i64> %result
+}
+
+; Test nested MMX ct.selects with pand intrinsic
+define <1 x i64> @test_mmx_nested_ctselect_with_pand(i32 %cond1, i32 %cond2, i64 %a, i64 %b, i64 %c) {
+; I386-NOCMOV-LABEL: test_mmx_nested_ctselect_with_pand:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT: subl $20, %esp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 40
+; I386-NOCMOV-NEXT: .cfi_offset %esi, -20
+; I386-NOCMOV-NEXT: .cfi_offset %edi, -16
+; I386-NOCMOV-NEXT: .cfi_offset %ebx, -12
+; I386-NOCMOV-NEXT: .cfi_offset %ebp, -8
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
+; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: setne %bl
+; I386-NOCMOV-NEXT: testb %bl, %bl
+; I386-NOCMOV-NEXT: sete %bh
+; I386-NOCMOV-NEXT: movb %bh, %cl
+; I386-NOCMOV-NEXT: movzbl %cl, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %edx, %edi
+; I386-NOCMOV-NEXT: andl %ebp, %edi
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %eax, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %edi
+; I386-NOCMOV-NEXT: testb %bl, %bl
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: sete %dl
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-NOCMOV-NEXT: movb %dl, %dh
+; I386-NOCMOV-NEXT: movzbl %dh, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %esi, %ebx
+; I386-NOCMOV-NEXT: andl %ebp, %ebx
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %eax, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %ebx
+; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: setne %dl
+; I386-NOCMOV-NEXT: testb %dl, %dl
+; I386-NOCMOV-NEXT: sete %dh
+; I386-NOCMOV-NEXT: movb %dh, %al
+; I386-NOCMOV-NEXT: movzbl %al, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %ecx, %esi
+; I386-NOCMOV-NEXT: andl %ebp, %esi
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %ebx, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %esi
+; I386-NOCMOV-NEXT: testb %dl, %dl
+; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; I386-NOCMOV-NEXT: movb %al, %dl
+; I386-NOCMOV-NEXT: movzbl %dl, %esi
+; I386-NOCMOV-NEXT: negl %esi
+; I386-NOCMOV-NEXT: movl %ebx, %ecx
+; I386-NOCMOV-NEXT: andl %esi, %ecx
+; I386-NOCMOV-NEXT: notl %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: orl %esi, %ecx
+; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0
+; I386-NOCMOV-NEXT: pand %mm0, %mm0
+; I386-NOCMOV-NEXT: movq %mm0, (%esp)
+; I386-NOCMOV-NEXT: movl (%esp), %eax
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: addl $20, %esp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT: popl %ebp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 4
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_mmx_nested_ctselect_with_pand:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %ebx
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-CMOV-NEXT: subl $20, %esp
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 32
+; I386-CMOV-NEXT: .cfi_offset %esi, -12
+; I386-CMOV-NEXT: .cfi_offset %ebx, -8
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
+; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: setne %bl
+; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: setne %bh
+; I386-CMOV-NEXT: testb %bh, %bh
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %esi
+; I386-CMOV-NEXT: testb %bl, %bl
+; I386-CMOV-NEXT: cmovnel %esi, %edx
+; I386-CMOV-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel %ecx, %eax
+; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0
+; I386-CMOV-NEXT: pand %mm0, %mm0
+; I386-CMOV-NEXT: movq %mm0, (%esp)
+; I386-CMOV-NEXT: movl (%esp), %eax
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: addl $20, %esp
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-CMOV-NEXT: popl %ebx
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 4
+; I386-CMOV-NEXT: retl
+ %mmx_a = bitcast i64 %a to <1 x i64>
+ %mmx_b = bitcast i64 %b to <1 x i64>
+ %mmx_c = bitcast i64 %c to <1 x i64>
+ %cmp1 = icmp ne i32 %cond1, 0
+ %cmp2 = icmp ne i32 %cond2, 0
+ %sel1 = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp2, <1 x i64> %mmx_a, <1 x i64> %mmx_b)
+ %sel2 = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp1, <1 x i64> %sel1, <1 x i64> %mmx_c)
+ %result = call <1 x i64> @llvm.x86.mmx.pand(<1 x i64> %sel2, <1 x i64> %sel2)
+ ret <1 x i64> %result
+}
+
+; Test MMX ct.select with por intrinsic
+define <1 x i64> @test_mmx_ctselect_with_por(i32 %cond, i64 %a, i64 %b) {
+; I386-NOCMOV-LABEL: test_mmx_ctselect_with_por:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT: subl $20, %esp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 40
+; I386-NOCMOV-NEXT: .cfi_offset %esi, -20
+; I386-NOCMOV-NEXT: .cfi_offset %edi, -16
+; I386-NOCMOV-NEXT: .cfi_offset %ebx, -12
+; I386-NOCMOV-NEXT: .cfi_offset %ebp, -8
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
+; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: setne %bl
+; I386-NOCMOV-NEXT: testb %bl, %bl
+; I386-NOCMOV-NEXT: sete %bh
+; I386-NOCMOV-NEXT: movb %bh, %al
+; I386-NOCMOV-NEXT: movzbl %al, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %esi, %edi
+; I386-NOCMOV-NEXT: andl %ebp, %edi
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %ecx, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %edi
+; I386-NOCMOV-NEXT: testb %bl, %bl
+; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %esi
+; I386-NOCMOV-NEXT: negl %esi
+; I386-NOCMOV-NEXT: movl %edx, %ecx
+; I386-NOCMOV-NEXT: andl %esi, %ecx
+; I386-NOCMOV-NEXT: notl %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: orl %esi, %ecx
+; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0
+; I386-NOCMOV-NEXT: por %mm0, %mm0
+; I386-NOCMOV-NEXT: movq %mm0, (%esp)
+; I386-NOCMOV-NEXT: movl (%esp), %eax
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: addl $20, %esp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT: popl %ebp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 4
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_mmx_ctselect_with_por:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: subl $20, %esp
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 24
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: setne %dl
+; I386-CMOV-NEXT: testb %dl, %dl
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0
+; I386-CMOV-NEXT: por %mm0, %mm0
+; I386-CMOV-NEXT: movq %mm0, (%esp)
+; I386-CMOV-NEXT: movl (%esp), %eax
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: addl $20, %esp
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 4
+; I386-CMOV-NEXT: retl
+ %mmx_a = bitcast i64 %a to <1 x i64>
+ %mmx_b = bitcast i64 %b to <1 x i64>
+ %cmp = icmp ne i32 %cond, 0
+ %sel = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp, <1 x i64> %mmx_a, <1 x i64> %mmx_b)
+ %result = call <1 x i64> @llvm.x86.mmx.por(<1 x i64> %sel, <1 x i64> %sel)
+ ret <1 x i64> %result
+}
+
+; Declare MMX intrinsics
+declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.pand(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.por(<1 x i64>, <1 x i64>)
+
+; Declare constant-time selection intrinsic
+declare <1 x i64> @llvm.ct.select.v1i64(i1, <1 x i64>, <1 x i64>)
diff --git a/llvm/test/CodeGen/X86/ctselect-i386.ll b/llvm/test/CodeGen/X86/ctselect-i386.ll
new file mode 100644
index 0000000000000..d1cc559f0c1c1
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ctselect-i386.ll
@@ -0,0 +1,267 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov | FileCheck %s --check-prefix=I386-NOCMOV
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=I386-CMOV
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov -verify-machineinstrs | FileCheck %s --check-prefix=I386-NOCMOV
+
+; Comprehensive CT_SELECT tests for i386 targets with scalar integer types (i8/i16/i32)
+; - Without CMOV: constant-time mask-based implementation using post-RA expansion with bundled instructions
+; - With CMOV: CMOV-based implementation
+; - Verifies the security property: no conditional branches, so execution time is independent of the condition
+; All expansion happens post-RA for better optimization control and constant-time guarantees
+
+; Test basic i32 functionality
+define i32 @test_ctselect_i32_basic(i1 %cond, i32 %a, i32 %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_i32_basic:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %bl
+; I386-NOCMOV-NEXT: movb %bl, %bh
+; I386-NOCMOV-NEXT: movzbl %bh, %esi
+; I386-NOCMOV-NEXT: negl %esi
+; I386-NOCMOV-NEXT: movl %edx, %eax
+; I386-NOCMOV-NEXT: andl %esi, %eax
+; I386-NOCMOV-NEXT: notl %esi
+; I386-NOCMOV-NEXT: andl %ecx, %esi
+; I386-NOCMOV-NEXT: orl %esi, %eax
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_i32_basic:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: retl
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+ ret i32 %result
+}
+
+; Test i16 functionality
+define i16 @test_ctselect_i16_basic(i1 %cond, i16 %a, i16 %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_i16_basic:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %bl
+; I386-NOCMOV-NEXT: movb %bl, %bh
+; I386-NOCMOV-NEXT: movzbw %bh, %si
+; I386-NOCMOV-NEXT: negw %si
+; I386-NOCMOV-NEXT: movw %dx, %ax
+; I386-NOCMOV-NEXT: andw %si, %ax
+; I386-NOCMOV-NEXT: notw %si
+; I386-NOCMOV-NEXT: andw %cx, %si
+; I386-NOCMOV-NEXT: orw %si, %ax
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_i16_basic:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnew {{[0-9]+}}(%esp), %ax
+; I386-CMOV-NEXT: retl
+ %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b)
+ ret i16 %result
+}
+
+; Test i8 functionality
+define i8 @test_ctselect_i8_basic(i1 %cond, i8 %a, i8 %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_i8_basic:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %ah
+; I386-NOCMOV-NEXT: movb %ah, %ch
+; I386-NOCMOV-NEXT: negb %ch
+; I386-NOCMOV-NEXT: movb %dl, %al
+; I386-NOCMOV-NEXT: andb %ch, %al
+; I386-NOCMOV-NEXT: notb %ch
+; I386-NOCMOV-NEXT: andb %cl, %ch
+; I386-NOCMOV-NEXT: orb %ch, %al
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_i8_basic:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: # kill: def $al killed $al killed $eax
+; I386-CMOV-NEXT: retl
+ %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b)
+ ret i8 %result
+}
+
+; Test security property: constant-time execution for a cryptographic use case
+define i32 @test_crypto_key_select(i32 %secret_bit, i32 %key1, i32 %key2) nounwind {
+; I386-NOCMOV-LABEL: test_crypto_key_select:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: setne %al
+; I386-NOCMOV-NEXT: testb %al, %al
+; I386-NOCMOV-NEXT: sete %bl
+; I386-NOCMOV-NEXT: movb %bl, %bh
+; I386-NOCMOV-NEXT: movzbl %bh, %esi
+; I386-NOCMOV-NEXT: negl %esi
+; I386-NOCMOV-NEXT: movl %edx, %eax
+; I386-NOCMOV-NEXT: andl %esi, %eax
+; I386-NOCMOV-NEXT: notl %esi
+; I386-NOCMOV-NEXT: andl %ecx, %esi
+; I386-NOCMOV-NEXT: orl %esi, %eax
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_crypto_key_select:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: setne %cl
+; I386-CMOV-NEXT: testb %cl, %cl
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: retl
+ %cond = icmp ne i32 %secret_bit, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %key1, i32 %key2)
+ ret i32 %result
+}
+
+; Test that no conditional branches appear in the constant-time path
+define i32 @test_no_conditional_branches(i32 %secret, i32 %val1, i32 %val2) nounwind {
+; I386-NOCMOV-LABEL: test_no_conditional_branches:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: setne %al
+; I386-NOCMOV-NEXT: testb %al, %al
+; I386-NOCMOV-NEXT: sete %bl
+; I386-NOCMOV-NEXT: movb %bl, %bh
+; I386-NOCMOV-NEXT: movzbl %bh, %esi
+; I386-NOCMOV-NEXT: negl %esi
+; I386-NOCMOV-NEXT: movl %edx, %eax
+; I386-NOCMOV-NEXT: andl %esi, %eax
+; I386-NOCMOV-NEXT: notl %esi
+; I386-NOCMOV-NEXT: andl %ecx, %esi
+; I386-NOCMOV-NEXT: orl %esi, %eax
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_no_conditional_branches:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: setne %cl
+; I386-CMOV-NEXT: testb %cl, %cl
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: retl
+ %cond = icmp ne i32 %secret, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %val1, i32 %val2)
+ ret i32 %result
+}
+
+; Test with comparison condition
+define i32 @test_ctselect_i32_cmp(i32 %a, i32 %b, i32 %c) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_i32_cmp:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: cmpl %edx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: testb %al, %al
+; I386-NOCMOV-NEXT: sete %bl
+; I386-NOCMOV-NEXT: movb %bl, %bh
+; I386-NOCMOV-NEXT: movzbl %bh, %esi
+; I386-NOCMOV-NEXT: negl %esi
+; I386-NOCMOV-NEXT: movl %edx, %eax
+; I386-NOCMOV-NEXT: andl %esi, %eax
+; I386-NOCMOV-NEXT: notl %esi
+; I386-NOCMOV-NEXT: andl %ecx, %esi
+; I386-NOCMOV-NEXT: orl %esi, %eax
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_i32_cmp:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: cmpl %eax, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: sete %cl
+; I386-CMOV-NEXT: testb %cl, %cl
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: retl
+ %cond = icmp eq i32 %a, %c
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %b, i32 %c)
+ ret i32 %result
+}
+
+; Test nested selects
+define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_nested:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %bl
+; I386-NOCMOV-NEXT: movb %bl, %bh
+; I386-NOCMOV-NEXT: movzbl %bh, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %eax, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %dl
+; I386-NOCMOV-NEXT: movb %dl, %dh
+; I386-NOCMOV-NEXT: movzbl %dh, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %ecx, %eax
+; I386-NOCMOV-NEXT: andl %edi, %eax
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %esi, %edi
+; I386-NOCMOV-NEXT: orl %edi, %eax
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_nested:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel %ecx, %eax
+; I386-CMOV-NEXT: retl
+ %sel1 = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b)
+ %sel2 = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %sel1, i32 %c)
+ ret i32 %sel2
+}
+
+; Declare ct.select intrinsics
+declare i8 @llvm.ct.select.i8(i1, i8, i8)
+declare i16 @llvm.ct.select.i16(i1, i16, i16)
+declare i32 @llvm.ct.select.i32(i1, i32, i32)
diff --git a/llvm/test/CodeGen/X86/ctselect-optimization.ll b/llvm/test/CodeGen/X86/ctselect-optimization.ll
new file mode 100644
index 0000000000000..481d49971a937
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ctselect-optimization.ll
@@ -0,0 +1,304 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov | FileCheck %s
+
+; Test ct.select optimization patterns
+
+; Test smin(x, 0) pattern optimization
+define i32 @test_ctselect_smin_zero(i32 %x) {
+; CHECK-LABEL: test_ctselect_smin_zero:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: sets %cl
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %edi, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp slt i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0)
+ ret i32 %result
+}
+
+; Test smax(x, 0) pattern optimization
+define i32 @test_ctselect_smax_zero(i32 %x) {
+; CHECK-LABEL: test_ctselect_smax_zero:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: setg %cl
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %edi, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp sgt i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0)
+ ret i32 %result
+}
+
+; Test generic smin pattern
+define i32 @test_ctselect_smin_generic(i32 %x, i32 %y) {
+; CHECK-LABEL: test_ctselect_smin_generic:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: cmpl %esi, %edi
+; CHECK-NEXT: setl %cl
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %edi, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp slt i32 %x, %y
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y)
+ ret i32 %result
+}
+
+; Test generic smax pattern
+define i32 @test_ctselect_smax_generic(i32 %x, i32 %y) {
+; CHECK-LABEL: test_ctselect_smax_generic:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: cmpl %esi, %edi
+; CHECK-NEXT: setg %cl
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %edi, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp sgt i32 %x, %y
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y)
+ ret i32 %result
+}
+
+; Test umin pattern
+define i32 @test_ctselect_umin_generic(i32 %x, i32 %y) {
+; CHECK-LABEL: test_ctselect_umin_generic:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: cmpl %esi, %edi
+; CHECK-NEXT: setb %cl
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %edi, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp ult i32 %x, %y
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y)
+ ret i32 %result
+}
+
+; Test umax pattern
+define i32 @test_ctselect_umax_generic(i32 %x, i32 %y) {
+; CHECK-LABEL: test_ctselect_umax_generic:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: cmpl %esi, %edi
+; CHECK-NEXT: seta %cl
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %edi, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp ugt i32 %x, %y
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y)
+ ret i32 %result
+}
+
+; Test abs pattern
+define i32 @test_ctselect_abs(i32 %x) {
+; CHECK-LABEL: test_ctselect_abs:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: movl %edi, %ecx
+; CHECK-NEXT: negl %ecx
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: sets %dl
+; CHECK-NEXT: testb %dl, %dl
+; CHECK-NEXT: cmovnel %ecx, %eax
+; CHECK-NEXT: retq
+ %neg = sub i32 0, %x
+ %cmp = icmp slt i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %neg, i32 %x)
+ ret i32 %result
+}
+
+; Test nabs pattern (negative abs)
+define i32 @test_ctselect_nabs(i32 %x) {
+; CHECK-LABEL: test_ctselect_nabs:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: negl %eax
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: sets %cl
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %edi, %eax
+; CHECK-NEXT: retq
+ %neg = sub i32 0, %x
+ %cmp = icmp slt i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %neg)
+ ret i32 %result
+}
+
+; Test sign-mask pattern (x < 0 ? -1 : 0)
+define i32 @test_ctselect_sign_extend(i32 %x) {
+; CHECK-LABEL: test_ctselect_sign_extend:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: sets %cl
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: movl $-1, %ecx
+; CHECK-NEXT: cmovnel %ecx, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp slt i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 -1, i32 0)
+ ret i32 %result
+}
+
+; Test zero extension pattern (x != 0 ? 1 : 0)
+define i32 @test_ctselect_zero_extend(i32 %x) {
+; CHECK-LABEL: test_ctselect_zero_extend:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: setne %cl
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: movl $1, %ecx
+; CHECK-NEXT: cmovnel %ecx, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp ne i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 1, i32 0)
+ ret i32 %result
+}
+
+; Test mask generation pattern (identical IR to the sign-mask test above; kept for coverage)
+define i32 @test_ctselect_mask_generation(i32 %x) {
+; CHECK-LABEL: test_ctselect_mask_generation:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: sets %cl
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: movl $-1, %ecx
+; CHECK-NEXT: cmovnel %ecx, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp slt i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 -1, i32 0)
+ ret i32 %result
+}
+
+; Test constant folding with known condition
+define i32 @test_ctselect_constant_folding_true(i32 %a, i32 %b) {
+; CHECK-LABEL: test_ctselect_constant_folding_true:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: movb $1, %cl
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %edi, %eax
+; CHECK-NEXT: retq
+ %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b)
+ ret i32 %result
+}
+
+define i32 @test_ctselect_constant_folding_false(i32 %a, i32 %b) {
+; CHECK-LABEL: test_ctselect_constant_folding_false:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %edi, %eax
+; CHECK-NEXT: retq
+ %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b)
+ ret i32 %result
+}
+
+; Test with identical operands
+define i32 @test_ctselect_identical_operands(i1 %cond, i32 %x) {
+; CHECK-LABEL: test_ctselect_identical_operands:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: testb $1, %dil
+; CHECK-NEXT: cmovnel %esi, %eax
+; CHECK-NEXT: retq
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %x)
+ ret i32 %result
+}
+
+; Test with inverted condition
+define i32 @test_ctselect_inverted_condition(i32 %x, i32 %y, i32 %a, i32 %b) {
+; CHECK-LABEL: test_ctselect_inverted_condition:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: cmpl %esi, %edi
+; CHECK-NEXT: sete %dl
+; CHECK-NEXT: testb %dl, %dl
+; CHECK-NEXT: cmovnel %ecx, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp eq i32 %x, %y
+ %not_cmp = xor i1 %cmp, true
+ %result = call i32 @llvm.ct.select.i32(i1 %not_cmp, i32 %a, i32 %b)
+ ret i32 %result
+}
+
+; Test for 64-bit specific optimizations
+define i64 @test_ctselect_i64_smin_zero(i64 %x) {
+; CHECK-LABEL: test_ctselect_i64_smin_zero:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testq %rdi, %rdi
+; CHECK-NEXT: sets %cl
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovneq %rdi, %rax
+; CHECK-NEXT: retq
+ %cmp = icmp slt i64 %x, 0
+ %result = call i64 @llvm.ct.select.i64(i1 %cmp, i64 %x, i64 0)
+ ret i64 %result
+}
+
+; Test for floating point optimizations
+define float @test_ctselect_f32_zero_positive(float %x) {
+; CHECK-LABEL: test_ctselect_f32_zero_positive:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: ucomiss %xmm1, %xmm0
+; CHECK-NEXT: seta %cl
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %eax, %edx
+; CHECK-NEXT: movd %edx, %xmm0
+; CHECK-NEXT: retq
+ %cmp = fcmp ogt float %x, 0.0
+ %result = call float @llvm.ct.select.f32(i1 %cmp, float %x, float 0.0)
+ ret float %result
+}
+
+define double @test_ctselect_f64_zero_positive(double %x) {
+; CHECK-LABEL: test_ctselect_f64_zero_positive:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %xmm0, %rax
+; CHECK-NEXT: xorpd %xmm1, %xmm1
+; CHECK-NEXT: ucomisd %xmm1, %xmm0
+; CHECK-NEXT: seta %cl
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovneq %rax, %rdx
+; CHECK-NEXT: movq %rdx, %xmm0
+; CHECK-NEXT: retq
+ %cmp = fcmp ogt double %x, 0.0
+ %result = call double @llvm.ct.select.f64(i1 %cmp, double %x, double 0.0)
+ ret double %result
+}
+
+; Test chain of ct.select operations
+define i32 @test_ctselect_chain(i1 %c1, i1 %c2, i1 %c3, i32 %a, i32 %b, i32 %c, i32 %d) {
+; CHECK-LABEL: test_ctselect_chain:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: testb $1, %dil
+; CHECK-NEXT: cmovnel %ecx, %r8d
+; CHECK-NEXT: testb $1, %sil
+; CHECK-NEXT: cmovnel %r8d, %r9d
+; CHECK-NEXT: testb $1, %dl
+; CHECK-NEXT: cmovnel %r9d, %eax
+; CHECK-NEXT: retq
+ %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b)
+ %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c)
+ %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d)
+ ret i32 %sel3
+}
+
+; Declare the intrinsics
+declare i32 @llvm.ct.select.i32(i1, i32, i32)
+declare i64 @llvm.ct.select.i64(i1, i64, i64)
+declare float @llvm.ct.select.f32(i1, float, float)
+declare double @llvm.ct.select.f64(i1, double, double)
diff --git a/llvm/test/CodeGen/X86/ctselect-vector.ll b/llvm/test/CodeGen/X86/ctselect-vector.ll
new file mode 100644
index 0000000000000..2206e32cd6d34
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ctselect-vector.ll
@@ -0,0 +1,1274 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+
+; Test ct.select functionality for vector types
+
+; 128-bit vectors
+define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test_ctselect_v4i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v4i32:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %xmm3, %xmm3
+; AVX-NEXT: movd %eax, %xmm3
+; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX-NEXT: movdqa %xmm3, %xmm2
+; AVX-NEXT: pand %xmm0, %xmm3
+; AVX-NEXT: pandn %xmm1, %xmm2
+; AVX-NEXT: por %xmm3, %xmm2
+; AVX-NEXT: vmovaps %xmm2, %xmm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %xmm3, %xmm3
+; AVX2-NEXT: movd %eax, %xmm3
+; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX2-NEXT: movdqa %xmm3, %xmm2
+; AVX2-NEXT: pand %xmm0, %xmm3
+; AVX2-NEXT: pandn %xmm1, %xmm2
+; AVX2-NEXT: por %xmm3, %xmm2
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v4i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB0_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %xmm0, %xmm1
+; AVX512-NEXT: .LBB0_2:
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-NEXT: retq
+ %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b)
+ ret <4 x i32> %result
+}
+
+define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: test_ctselect_v4f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT: movaps %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v4f32:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %xmm3, %xmm3
+; AVX-NEXT: movd %eax, %xmm3
+; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX-NEXT: movdqa %xmm3, %xmm2
+; AVX-NEXT: pand %xmm0, %xmm3
+; AVX-NEXT: pandn %xmm1, %xmm2
+; AVX-NEXT: por %xmm3, %xmm2
+; AVX-NEXT: vmovaps %xmm2, %xmm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v4f32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %xmm3, %xmm3
+; AVX2-NEXT: movd %eax, %xmm3
+; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX2-NEXT: movdqa %xmm3, %xmm2
+; AVX2-NEXT: pand %xmm0, %xmm3
+; AVX2-NEXT: pandn %xmm1, %xmm2
+; AVX2-NEXT: por %xmm3, %xmm2
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v4f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB1_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %xmm0, %xmm1
+; AVX512-NEXT: .LBB1_2:
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-NEXT: retq
+ %result = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, <4 x float> %b)
+ ret <4 x float> %result
+}
+
+define <2 x i64> @test_ctselect_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) {
+; SSE2-LABEL: test_ctselect_v2i64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v2i64:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %xmm3, %xmm3
+; AVX-NEXT: movd %eax, %xmm3
+; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX-NEXT: movdqa %xmm3, %xmm2
+; AVX-NEXT: pand %xmm0, %xmm3
+; AVX-NEXT: pandn %xmm1, %xmm2
+; AVX-NEXT: por %xmm3, %xmm2
+; AVX-NEXT: vmovaps %xmm2, %xmm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %xmm3, %xmm3
+; AVX2-NEXT: movd %eax, %xmm3
+; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX2-NEXT: movdqa %xmm3, %xmm2
+; AVX2-NEXT: pand %xmm0, %xmm3
+; AVX2-NEXT: pandn %xmm1, %xmm2
+; AVX2-NEXT: por %xmm3, %xmm2
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v2i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB2_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %xmm0, %xmm1
+; AVX512-NEXT: .LBB2_2:
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-NEXT: retq
+ %result = call <2 x i64> @llvm.ct.select.v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b)
+ ret <2 x i64> %result
+}
+
+define <2 x double> @test_ctselect_v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) {
+; SSE2-LABEL: test_ctselect_v2f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT: movapd %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v2f64:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %xmm3, %xmm3
+; AVX-NEXT: movd %eax, %xmm3
+; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX-NEXT: movdqa %xmm3, %xmm2
+; AVX-NEXT: pand %xmm0, %xmm3
+; AVX-NEXT: pandn %xmm1, %xmm2
+; AVX-NEXT: por %xmm3, %xmm2
+; AVX-NEXT: vmovaps %xmm2, %xmm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v2f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %xmm3, %xmm3
+; AVX2-NEXT: movd %eax, %xmm3
+; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX2-NEXT: movdqa %xmm3, %xmm2
+; AVX2-NEXT: pand %xmm0, %xmm3
+; AVX2-NEXT: pandn %xmm1, %xmm2
+; AVX2-NEXT: por %xmm3, %xmm2
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v2f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB3_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovapd %xmm0, %xmm1
+; AVX512-NEXT: .LBB3_2:
+; AVX512-NEXT: vmovapd %xmm1, %xmm0
+; AVX512-NEXT: retq
+ %result = call <2 x double> @llvm.ct.select.v2f64(i1 %cond, <2 x double> %a, <2 x double> %b)
+ ret <2 x double> %result
+}
+
+; 256-bit vectors
+define <8 x i32> @test_ctselect_v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b) {
+; SSE2-LABEL: test_ctselect_v8i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: movd %eax, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: pand %xmm0, %xmm5
+; SSE2-NEXT: pandn %xmm2, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: movaps %xmm4, %xmm0
+; SSE2-NEXT: movaps %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v8i32:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm3, %ymm3
+; AVX-NEXT: vmovd %eax, %ymm3
+; AVX-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX-NEXT: vmovdqa %ymm3, %ymm2
+; AVX-NEXT: pand %ymm0, %ymm3
+; AVX-NEXT: pandn %ymm1, %ymm2
+; AVX-NEXT: por %ymm3, %ymm2
+; AVX-NEXT: vmovaps %ymm2, %ymm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v8i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm3, %ymm3
+; AVX2-NEXT: vmovd %eax, %ymm3
+; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vmovdqa %ymm3, %ymm2
+; AVX2-NEXT: pand %ymm0, %ymm3
+; AVX2-NEXT: pandn %ymm1, %ymm2
+; AVX2-NEXT: por %ymm3, %ymm2
+; AVX2-NEXT: vmovaps %ymm2, %ymm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v8i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB4_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %ymm0, %ymm1
+; AVX512-NEXT: .LBB4_2:
+; AVX512-NEXT: vmovaps %ymm1, %ymm0
+; AVX512-NEXT: retq
+ %result = call <8 x i32> @llvm.ct.select.v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b)
+ ret <8 x i32> %result
+}
+
+define <8 x float> @test_ctselect_v8f32(i1 %cond, <8 x float> %a, <8 x float> %b) {
+; SSE2-LABEL: test_ctselect_v8f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: movd %eax, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0]
+; SSE2-NEXT: movaps %xmm5, %xmm4
+; SSE2-NEXT: pand %xmm0, %xmm5
+; SSE2-NEXT: pandn %xmm2, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: movaps %xmm4, %xmm0
+; SSE2-NEXT: movaps %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v8f32:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm3, %ymm3
+; AVX-NEXT: vmovd %eax, %ymm3
+; AVX-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX-NEXT: vmovdqa %ymm3, %ymm2
+; AVX-NEXT: pand %ymm0, %ymm3
+; AVX-NEXT: pandn %ymm1, %ymm2
+; AVX-NEXT: por %ymm3, %ymm2
+; AVX-NEXT: vmovaps %ymm2, %ymm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v8f32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm3, %ymm3
+; AVX2-NEXT: vmovd %eax, %ymm3
+; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vmovdqa %ymm3, %ymm2
+; AVX2-NEXT: pand %ymm0, %ymm3
+; AVX2-NEXT: pandn %ymm1, %ymm2
+; AVX2-NEXT: por %ymm3, %ymm2
+; AVX2-NEXT: vmovaps %ymm2, %ymm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v8f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB5_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %ymm0, %ymm1
+; AVX512-NEXT: .LBB5_2:
+; AVX512-NEXT: vmovaps %ymm1, %ymm0
+; AVX512-NEXT: retq
+ %result = call <8 x float> @llvm.ct.select.v8f32(i1 %cond, <8 x float> %a, <8 x float> %b)
+ ret <8 x float> %result
+}
+
+define <4 x i64> @test_ctselect_v4i64(i1 %cond, <4 x i64> %a, <4 x i64> %b) {
+; SSE2-LABEL: test_ctselect_v4i64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: movd %eax, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: pand %xmm0, %xmm5
+; SSE2-NEXT: pandn %xmm2, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: movaps %xmm4, %xmm0
+; SSE2-NEXT: movaps %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v4i64:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm3, %ymm3
+; AVX-NEXT: vmovd %eax, %ymm3
+; AVX-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2]
+; AVX-NEXT: vmovdqa %ymm3, %ymm2
+; AVX-NEXT: pand %ymm0, %ymm3
+; AVX-NEXT: pandn %ymm1, %ymm2
+; AVX-NEXT: por %ymm3, %ymm2
+; AVX-NEXT: vmovaps %ymm2, %ymm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v4i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm3, %ymm3
+; AVX2-NEXT: vmovd %eax, %ymm3
+; AVX2-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2]
+; AVX2-NEXT: vmovdqa %ymm3, %ymm2
+; AVX2-NEXT: pand %ymm0, %ymm3
+; AVX2-NEXT: pandn %ymm1, %ymm2
+; AVX2-NEXT: por %ymm3, %ymm2
+; AVX2-NEXT: vmovaps %ymm2, %ymm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v4i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB6_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %ymm0, %ymm1
+; AVX512-NEXT: .LBB6_2:
+; AVX512-NEXT: vmovaps %ymm1, %ymm0
+; AVX512-NEXT: retq
+ %result = call <4 x i64> @llvm.ct.select.v4i64(i1 %cond, <4 x i64> %a, <4 x i64> %b)
+ ret <4 x i64> %result
+}
+
+define <4 x double> @test_ctselect_v4f64(i1 %cond, <4 x double> %a, <4 x double> %b) {
+; SSE2-LABEL: test_ctselect_v4f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: movd %eax, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0]
+; SSE2-NEXT: movapd %xmm5, %xmm4
+; SSE2-NEXT: pand %xmm0, %xmm5
+; SSE2-NEXT: pandn %xmm2, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movapd %xmm0, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: movaps %xmm4, %xmm0
+; SSE2-NEXT: movaps %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v4f64:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm3, %ymm3
+; AVX-NEXT: vmovd %eax, %ymm3
+; AVX-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2]
+; AVX-NEXT: vmovdqa %ymm3, %ymm2
+; AVX-NEXT: pand %ymm0, %ymm3
+; AVX-NEXT: pandn %ymm1, %ymm2
+; AVX-NEXT: por %ymm3, %ymm2
+; AVX-NEXT: vmovaps %ymm2, %ymm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v4f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm3, %ymm3
+; AVX2-NEXT: vmovd %eax, %ymm3
+; AVX2-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2]
+; AVX2-NEXT: vmovdqa %ymm3, %ymm2
+; AVX2-NEXT: pand %ymm0, %ymm3
+; AVX2-NEXT: pandn %ymm1, %ymm2
+; AVX2-NEXT: por %ymm3, %ymm2
+; AVX2-NEXT: vmovaps %ymm2, %ymm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v4f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB7_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovapd %ymm0, %ymm1
+; AVX512-NEXT: .LBB7_2:
+; AVX512-NEXT: vmovapd %ymm1, %ymm0
+; AVX512-NEXT: retq
+ %result = call <4 x double> @llvm.ct.select.v4f64(i1 %cond, <4 x double> %a, <4 x double> %b)
+ ret <4 x double> %result
+}
+
+; 512-bit vectors (no AVX512 RUN line; split into 128/256-bit ops on SSE2/AVX/AVX2)
+define <16 x i32> @test_ctselect_v16i32(i1 %cond, <16 x i32> %a, <16 x i32> %b) {
+; SSE2-LABEL: test_ctselect_v16i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: movd %eax, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm9, %xmm8
+; SSE2-NEXT: pand %xmm0, %xmm9
+; SSE2-NEXT: pandn %xmm4, %xmm8
+; SSE2-NEXT: por %xmm9, %xmm8
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm5, %xmm4
+; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm6, %xmm5
+; SSE2-NEXT: por %xmm0, %xmm5
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm7, %xmm6
+; SSE2-NEXT: por %xmm0, %xmm6
+; SSE2-NEXT: movaps %xmm8, %xmm0
+; SSE2-NEXT: movaps %xmm4, %xmm1
+; SSE2-NEXT: movaps %xmm5, %xmm2
+; SSE2-NEXT: movaps %xmm6, %xmm3
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v16i32:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm5, %ymm5
+; AVX-NEXT: vmovd %eax, %ymm5
+; AVX-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
+; AVX-NEXT: vmovdqa %ymm5, %ymm4
+; AVX-NEXT: pand %ymm0, %ymm5
+; AVX-NEXT: pandn %ymm2, %ymm4
+; AVX-NEXT: por %ymm5, %ymm4
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm0, %ymm0
+; AVX-NEXT: vmovd %eax, %ymm0
+; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX-NEXT: vmovdqa %ymm0, %ymm2
+; AVX-NEXT: pand %ymm1, %ymm0
+; AVX-NEXT: pandn %ymm3, %ymm2
+; AVX-NEXT: por %ymm0, %ymm2
+; AVX-NEXT: vmovaps %ymm4, %ymm0
+; AVX-NEXT: vmovaps %ymm2, %ymm1
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v16i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm5, %ymm5
+; AVX2-NEXT: vmovd %eax, %ymm5
+; AVX2-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vmovdqa %ymm5, %ymm4
+; AVX2-NEXT: pand %ymm0, %ymm5
+; AVX2-NEXT: pandn %ymm2, %ymm4
+; AVX2-NEXT: por %ymm5, %ymm4
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm0, %ymm0
+; AVX2-NEXT: vmovd %eax, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vmovdqa %ymm0, %ymm2
+; AVX2-NEXT: pand %ymm1, %ymm0
+; AVX2-NEXT: pandn %ymm3, %ymm2
+; AVX2-NEXT: por %ymm0, %ymm2
+; AVX2-NEXT: vmovaps %ymm4, %ymm0
+; AVX2-NEXT: vmovaps %ymm2, %ymm1
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v16i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB8_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %zmm0, %zmm1
+; AVX512-NEXT: .LBB8_2:
+; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: retq
+ %result = call <16 x i32> @llvm.ct.select.v16i32(i1 %cond, <16 x i32> %a, <16 x i32> %b)
+ ret <16 x i32> %result
+}
+
+define <16 x float> @test_ctselect_v16f32(i1 %cond, <16 x float> %a, <16 x float> %b) {
+; SSE2-LABEL: test_ctselect_v16f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: movd %eax, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0]
+; SSE2-NEXT: movaps %xmm9, %xmm8
+; SSE2-NEXT: pand %xmm0, %xmm9
+; SSE2-NEXT: pandn %xmm4, %xmm8
+; SSE2-NEXT: por %xmm9, %xmm8
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movaps %xmm0, %xmm4
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm5, %xmm4
+; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movaps %xmm0, %xmm5
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm6, %xmm5
+; SSE2-NEXT: por %xmm0, %xmm5
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movaps %xmm0, %xmm6
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm7, %xmm6
+; SSE2-NEXT: por %xmm0, %xmm6
+; SSE2-NEXT: movaps %xmm8, %xmm0
+; SSE2-NEXT: movaps %xmm4, %xmm1
+; SSE2-NEXT: movaps %xmm5, %xmm2
+; SSE2-NEXT: movaps %xmm6, %xmm3
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v16f32:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm5, %ymm5
+; AVX-NEXT: vmovd %eax, %ymm5
+; AVX-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
+; AVX-NEXT: vmovdqa %ymm5, %ymm4
+; AVX-NEXT: pand %ymm0, %ymm5
+; AVX-NEXT: pandn %ymm2, %ymm4
+; AVX-NEXT: por %ymm5, %ymm4
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm0, %ymm0
+; AVX-NEXT: vmovd %eax, %ymm0
+; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX-NEXT: vmovdqa %ymm0, %ymm2
+; AVX-NEXT: pand %ymm1, %ymm0
+; AVX-NEXT: pandn %ymm3, %ymm2
+; AVX-NEXT: por %ymm0, %ymm2
+; AVX-NEXT: vmovaps %ymm4, %ymm0
+; AVX-NEXT: vmovaps %ymm2, %ymm1
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v16f32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm5, %ymm5
+; AVX2-NEXT: vmovd %eax, %ymm5
+; AVX2-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vmovdqa %ymm5, %ymm4
+; AVX2-NEXT: pand %ymm0, %ymm5
+; AVX2-NEXT: pandn %ymm2, %ymm4
+; AVX2-NEXT: por %ymm5, %ymm4
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm0, %ymm0
+; AVX2-NEXT: vmovd %eax, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vmovdqa %ymm0, %ymm2
+; AVX2-NEXT: pand %ymm1, %ymm0
+; AVX2-NEXT: pandn %ymm3, %ymm2
+; AVX2-NEXT: por %ymm0, %ymm2
+; AVX2-NEXT: vmovaps %ymm4, %ymm0
+; AVX2-NEXT: vmovaps %ymm2, %ymm1
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v16f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB9_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %zmm0, %zmm1
+; AVX512-NEXT: .LBB9_2:
+; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: retq
+ %result = call <16 x float> @llvm.ct.select.v16f32(i1 %cond, <16 x float> %a, <16 x float> %b)
+ ret <16 x float> %result
+}
+
+define <8 x i64> @test_ctselect_v8i64(i1 %cond, <8 x i64> %a, <8 x i64> %b) {
+; SSE2-LABEL: test_ctselect_v8i64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: movd %eax, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm9, %xmm8
+; SSE2-NEXT: pand %xmm0, %xmm9
+; SSE2-NEXT: pandn %xmm4, %xmm8
+; SSE2-NEXT: por %xmm9, %xmm8
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm5, %xmm4
+; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm6, %xmm5
+; SSE2-NEXT: por %xmm0, %xmm5
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm7, %xmm6
+; SSE2-NEXT: por %xmm0, %xmm6
+; SSE2-NEXT: movaps %xmm8, %xmm0
+; SSE2-NEXT: movaps %xmm4, %xmm1
+; SSE2-NEXT: movaps %xmm5, %xmm2
+; SSE2-NEXT: movaps %xmm6, %xmm3
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v8i64:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm5, %ymm5
+; AVX-NEXT: vmovd %eax, %ymm5
+; AVX-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[0,0,2,2]
+; AVX-NEXT: vmovdqa %ymm5, %ymm4
+; AVX-NEXT: pand %ymm0, %ymm5
+; AVX-NEXT: pandn %ymm2, %ymm4
+; AVX-NEXT: por %ymm5, %ymm4
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm0, %ymm0
+; AVX-NEXT: vmovd %eax, %ymm0
+; AVX-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX-NEXT: vmovdqa %ymm0, %ymm2
+; AVX-NEXT: pand %ymm1, %ymm0
+; AVX-NEXT: pandn %ymm3, %ymm2
+; AVX-NEXT: por %ymm0, %ymm2
+; AVX-NEXT: vmovaps %ymm4, %ymm0
+; AVX-NEXT: vmovaps %ymm2, %ymm1
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v8i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm5, %ymm5
+; AVX2-NEXT: vmovd %eax, %ymm5
+; AVX2-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[0,0,2,2]
+; AVX2-NEXT: vmovdqa %ymm5, %ymm4
+; AVX2-NEXT: pand %ymm0, %ymm5
+; AVX2-NEXT: pandn %ymm2, %ymm4
+; AVX2-NEXT: por %ymm5, %ymm4
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm0, %ymm0
+; AVX2-NEXT: vmovd %eax, %ymm0
+; AVX2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX2-NEXT: vmovdqa %ymm0, %ymm2
+; AVX2-NEXT: pand %ymm1, %ymm0
+; AVX2-NEXT: pandn %ymm3, %ymm2
+; AVX2-NEXT: por %ymm0, %ymm2
+; AVX2-NEXT: vmovaps %ymm4, %ymm0
+; AVX2-NEXT: vmovaps %ymm2, %ymm1
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v8i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB10_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %zmm0, %zmm1
+; AVX512-NEXT: .LBB10_2:
+; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: retq
+ %result = call <8 x i64> @llvm.ct.select.v8i64(i1 %cond, <8 x i64> %a, <8 x i64> %b)
+ ret <8 x i64> %result
+}
+
+define <8 x double> @test_ctselect_v8f64(i1 %cond, <8 x double> %a, <8 x double> %b) {
+; SSE2-LABEL: test_ctselect_v8f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: movd %eax, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0]
+; SSE2-NEXT: movapd %xmm9, %xmm8
+; SSE2-NEXT: pand %xmm0, %xmm9
+; SSE2-NEXT: pandn %xmm4, %xmm8
+; SSE2-NEXT: por %xmm9, %xmm8
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movapd %xmm0, %xmm4
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm5, %xmm4
+; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movapd %xmm0, %xmm5
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm6, %xmm5
+; SSE2-NEXT: por %xmm0, %xmm5
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movapd %xmm0, %xmm6
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm7, %xmm6
+; SSE2-NEXT: por %xmm0, %xmm6
+; SSE2-NEXT: movaps %xmm8, %xmm0
+; SSE2-NEXT: movaps %xmm4, %xmm1
+; SSE2-NEXT: movaps %xmm5, %xmm2
+; SSE2-NEXT: movaps %xmm6, %xmm3
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v8f64:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm5, %ymm5
+; AVX-NEXT: vmovd %eax, %ymm5
+; AVX-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[0,0,2,2]
+; AVX-NEXT: vmovdqa %ymm5, %ymm4
+; AVX-NEXT: pand %ymm0, %ymm5
+; AVX-NEXT: pandn %ymm2, %ymm4
+; AVX-NEXT: por %ymm5, %ymm4
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm0, %ymm0
+; AVX-NEXT: vmovd %eax, %ymm0
+; AVX-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX-NEXT: vmovdqa %ymm0, %ymm2
+; AVX-NEXT: pand %ymm1, %ymm0
+; AVX-NEXT: pandn %ymm3, %ymm2
+; AVX-NEXT: por %ymm0, %ymm2
+; AVX-NEXT: vmovaps %ymm4, %ymm0
+; AVX-NEXT: vmovaps %ymm2, %ymm1
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v8f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm5, %ymm5
+; AVX2-NEXT: vmovd %eax, %ymm5
+; AVX2-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[0,0,2,2]
+; AVX2-NEXT: vmovdqa %ymm5, %ymm4
+; AVX2-NEXT: pand %ymm0, %ymm5
+; AVX2-NEXT: pandn %ymm2, %ymm4
+; AVX2-NEXT: por %ymm5, %ymm4
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm0, %ymm0
+; AVX2-NEXT: vmovd %eax, %ymm0
+; AVX2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX2-NEXT: vmovdqa %ymm0, %ymm2
+; AVX2-NEXT: pand %ymm1, %ymm0
+; AVX2-NEXT: pandn %ymm3, %ymm2
+; AVX2-NEXT: por %ymm0, %ymm2
+; AVX2-NEXT: vmovaps %ymm4, %ymm0
+; AVX2-NEXT: vmovaps %ymm2, %ymm1
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v8f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB11_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovapd %zmm0, %zmm1
+; AVX512-NEXT: .LBB11_2:
+; AVX512-NEXT: vmovapd %zmm1, %zmm0
+; AVX512-NEXT: retq
+ %result = call <8 x double> @llvm.ct.select.v8f64(i1 %cond, <8 x double> %a, <8 x double> %b)
+ ret <8 x double> %result
+}
+
+; Test with constant conditions for vector types
+define <4 x i32> @test_ctselect_v4i32_const_true(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test_ctselect_v4i32_const_true:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movb $1, %al
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v4i32_const_true:
+; AVX: # %bb.0:
+; AVX-NEXT: movb $1, %al
+; AVX-NEXT: testb %al, %al
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %xmm3, %xmm3
+; AVX-NEXT: movd %eax, %xmm3
+; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX-NEXT: movdqa %xmm3, %xmm2
+; AVX-NEXT: pand %xmm0, %xmm3
+; AVX-NEXT: pandn %xmm1, %xmm2
+; AVX-NEXT: por %xmm3, %xmm2
+; AVX-NEXT: vmovaps %xmm2, %xmm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v4i32_const_true:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movb $1, %al
+; AVX2-NEXT: testb %al, %al
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %xmm3, %xmm3
+; AVX2-NEXT: movd %eax, %xmm3
+; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX2-NEXT: movdqa %xmm3, %xmm2
+; AVX2-NEXT: pand %xmm0, %xmm3
+; AVX2-NEXT: pandn %xmm1, %xmm2
+; AVX2-NEXT: por %xmm3, %xmm2
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v4i32_const_true:
+; AVX512: # %bb.0:
+; AVX512-NEXT: retq
+ %result = call <4 x i32> @llvm.ct.select.v4i32(i1 true, <4 x i32> %a, <4 x i32> %b)
+ ret <4 x i32> %result
+}
+
+define <4 x i32> @test_ctselect_v4i32_const_false(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test_ctselect_v4i32_const_false:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v4i32_const_false:
+; AVX: # %bb.0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: testb %al, %al
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %xmm3, %xmm3
+; AVX-NEXT: movd %eax, %xmm3
+; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX-NEXT: movdqa %xmm3, %xmm2
+; AVX-NEXT: pand %xmm0, %xmm3
+; AVX-NEXT: pandn %xmm1, %xmm2
+; AVX-NEXT: por %xmm3, %xmm2
+; AVX-NEXT: vmovaps %xmm2, %xmm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v4i32_const_false:
+; AVX2: # %bb.0:
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: testb %al, %al
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %xmm3, %xmm3
+; AVX2-NEXT: movd %eax, %xmm3
+; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX2-NEXT: movdqa %xmm3, %xmm2
+; AVX2-NEXT: pand %xmm0, %xmm3
+; AVX2-NEXT: pandn %xmm1, %xmm2
+; AVX2-NEXT: por %xmm3, %xmm2
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v4i32_const_false:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-NEXT: retq
+ %result = call <4 x i32> @llvm.ct.select.v4i32(i1 false, <4 x i32> %a, <4 x i32> %b)
+ ret <4 x i32> %result
+}
+
+; Test with comparison conditions for vector types
+define <4 x i32> @test_ctselect_v4i32_icmp(i32 %x, i32 %y, <4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test_ctselect_v4i32_icmp:
+; SSE2: # %bb.0:
+; SSE2-NEXT: cmpl %esi, %edi
+; SSE2-NEXT: sete %al
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v4i32_icmp:
+; AVX: # %bb.0:
+; AVX-NEXT: cmpl %esi, %edi
+; AVX-NEXT: sete %al
+; AVX-NEXT: testb %al, %al
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %xmm3, %xmm3
+; AVX-NEXT: movd %eax, %xmm3
+; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX-NEXT: movdqa %xmm3, %xmm2
+; AVX-NEXT: pand %xmm0, %xmm3
+; AVX-NEXT: pandn %xmm1, %xmm2
+; AVX-NEXT: por %xmm3, %xmm2
+; AVX-NEXT: vmovaps %xmm2, %xmm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v4i32_icmp:
+; AVX2: # %bb.0:
+; AVX2-NEXT: cmpl %esi, %edi
+; AVX2-NEXT: sete %al
+; AVX2-NEXT: testb %al, %al
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %xmm3, %xmm3
+; AVX2-NEXT: movd %eax, %xmm3
+; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX2-NEXT: movdqa %xmm3, %xmm2
+; AVX2-NEXT: pand %xmm0, %xmm3
+; AVX2-NEXT: pandn %xmm1, %xmm2
+; AVX2-NEXT: por %xmm3, %xmm2
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v4i32_icmp:
+; AVX512: # %bb.0:
+; AVX512-NEXT: cmpl %esi, %edi
+; AVX512-NEXT: je .LBB14_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-NEXT: .LBB14_2:
+; AVX512-NEXT: retq
+ %cond = icmp eq i32 %x, %y
+ %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b)
+ ret <4 x i32> %result
+}
+
+; Declare the intrinsics
+declare <4 x i32> @llvm.ct.select.v4i32(i1, <4 x i32>, <4 x i32>)
+declare <4 x float> @llvm.ct.select.v4f32(i1, <4 x float>, <4 x float>)
+declare <2 x i64> @llvm.ct.select.v2i64(i1, <2 x i64>, <2 x i64>)
+declare <2 x double> @llvm.ct.select.v2f64(i1, <2 x double>, <2 x double>)
+declare <8 x i32> @llvm.ct.select.v8i32(i1, <8 x i32>, <8 x i32>)
+declare <8 x float> @llvm.ct.select.v8f32(i1, <8 x float>, <8 x float>)
+declare <4 x i64> @llvm.ct.select.v4i64(i1, <4 x i64>, <4 x i64>)
+declare <4 x double> @llvm.ct.select.v4f64(i1, <4 x double>, <4 x double>)
+declare <16 x i32> @llvm.ct.select.v16i32(i1, <16 x i32>, <16 x i32>)
+declare <16 x float> @llvm.ct.select.v16f32(i1, <16 x float>, <16 x float>)
+declare <8 x i64> @llvm.ct.select.v8i64(i1, <8 x i64>, <8 x i64>)
+declare <8 x double> @llvm.ct.select.v8f64(i1, <8 x double>, <8 x double>)
diff --git a/llvm/test/CodeGen/X86/ctselect.ll b/llvm/test/CodeGen/X86/ctselect.ll
index e1abae80cef4f..d76ae0365f28c 100644
--- a/llvm/test/CodeGen/X86/ctselect.ll
+++ b/llvm/test/CodeGen/X86/ctselect.ll
@@ -8,77 +8,122 @@
define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) {
; X64-LABEL: test_ctselect_i8:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: andb $1, %al
-; X64-NEXT: xorl %edx, %esi
-; X64-NEXT: negb %al
-; X64-NEXT: andb %sil, %al
-; X64-NEXT: xorb %dl, %al
+; X64-NEXT: movl %edx, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %esi, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_i8:
; X32: # %bb.0:
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: andb $1, %al
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: xorb %cl, %dl
-; X32-NEXT: negb %al
-; X32-NEXT: andb %dl, %al
-; X32-NEXT: xorb %cl, %al
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; X32-NEXT: # kill: def $al killed $al killed $eax
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_i8:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: andb $1, %al
; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: xorb %cl, %dl
-; X32-NOCMOV-NEXT: negb %al
-; X32-NOCMOV-NEXT: andb %dl, %al
-; X32-NOCMOV-NEXT: xorb %cl, %al
+; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: sete %ah
+; X32-NOCMOV-NEXT: movb %ah, %ch
+; X32-NOCMOV-NEXT: negb %ch
+; X32-NOCMOV-NEXT: movb %dl, %al
+; X32-NOCMOV-NEXT: andb %ch, %al
+; X32-NOCMOV-NEXT: notb %ch
+; X32-NOCMOV-NEXT: andb %cl, %ch
+; X32-NOCMOV-NEXT: orb %ch, %al
; X32-NOCMOV-NEXT: retl
%result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b)
ret i8 %result
}
+define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) {
+; X64-LABEL: test_ctselect_i16:
+; X64: # %bb.0:
+; X64-NEXT: movl %edx, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %esi, %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_i16:
+; X32: # %bb.0:
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnew {{[0-9]+}}(%esp), %ax
+; X32-NEXT: retl
+;
+; X32-NOCMOV-LABEL: test_ctselect_i16:
+; X32-NOCMOV: # %bb.0:
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
+; X32-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbw %bh, %si
+; X32-NOCMOV-NEXT: negw %si
+; X32-NOCMOV-NEXT: movw %dx, %ax
+; X32-NOCMOV-NEXT: andw %si, %ax
+; X32-NOCMOV-NEXT: notw %si
+; X32-NOCMOV-NEXT: andw %cx, %si
+; X32-NOCMOV-NEXT: orw %si, %ax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
+; X32-NOCMOV-NEXT: retl
+ %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b)
+ ret i16 %result
+}
+
define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) {
; X64-LABEL: test_ctselect_i32:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: xorl %edx, %esi
-; X64-NEXT: andl $1, %eax
-; X64-NEXT: negl %eax
-; X64-NEXT: andl %esi, %eax
-; X64-NEXT: xorl %edx, %eax
+; X64-NEXT: movl %edx, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %esi, %eax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_i32:
; X32: # %bb.0:
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: andb $1, %al
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: xorl %ecx, %edx
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: negl %eax
-; X32-NEXT: andl %edx, %eax
-; X32-NEXT: xorl %ecx, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_i32:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: andb $1, %al
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: xorl %ecx, %edx
-; X32-NOCMOV-NEXT: movzbl %al, %eax
-; X32-NOCMOV-NEXT: negl %eax
-; X32-NOCMOV-NEXT: andl %edx, %eax
-; X32-NOCMOV-NEXT: xorl %ecx, %eax
+; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edx, %eax
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ecx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
ret i32 %result
@@ -87,67 +132,66 @@ define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) {
define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) {
; X64-LABEL: test_ctselect_i64:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: xorq %rdx, %rsi
-; X64-NEXT: andl $1, %eax
-; X64-NEXT: negq %rax
-; X64-NEXT: andq %rsi, %rax
-; X64-NEXT: xorq %rdx, %rax
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovneq %rsi, %rax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_i64:
; X32: # %bb.0:
-; X32-NEXT: pushl %edi
-; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: pushl %esi
-; X32-NEXT: .cfi_def_cfa_offset 12
-; X32-NEXT: .cfi_offset %esi, -12
-; X32-NEXT: .cfi_offset %edi, -8
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: andb $1, %dl
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: xorl %esi, %eax
-; X32-NEXT: movzbl %dl, %edi
-; X32-NEXT: negl %edi
-; X32-NEXT: andl %edi, %eax
-; X32-NEXT: xorl %esi, %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: xorl %ecx, %edx
-; X32-NEXT: andl %edi, %edx
-; X32-NEXT: xorl %ecx, %edx
-; X32-NEXT: popl %esi
-; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: popl %edi
-; X32-NEXT: .cfi_def_cfa_offset 4
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %edx
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_i64:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: pushl %edi
+; X32-NOCMOV-NEXT: pushl %ebp
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
-; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: pushl %ebx
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
-; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
-; X32-NOCMOV-NEXT: .cfi_offset %edi, -8
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: andb $1, %dl
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: xorl %esi, %eax
-; X32-NOCMOV-NEXT: movzbl %dl, %edi
-; X32-NOCMOV-NEXT: negl %edi
-; X32-NOCMOV-NEXT: andl %edi, %eax
-; X32-NOCMOV-NEXT: xorl %esi, %eax
+; X32-NOCMOV-NEXT: pushl %edi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -20
+; X32-NOCMOV-NEXT: .cfi_offset %edi, -16
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebp, -8
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: xorl %ecx, %edx
-; X32-NOCMOV-NEXT: andl %edi, %edx
-; X32-NOCMOV-NEXT: xorl %ecx, %edx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
+; X32-NOCMOV-NEXT: testb $1, %bl
+; X32-NOCMOV-NEXT: sete %bh
+; X32-NOCMOV-NEXT: movb %bh, %cl
+; X32-NOCMOV-NEXT: movzbl %cl, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edx, %eax
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ebp, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
+; X32-NOCMOV-NEXT: testb $1, %bl
+; X32-NOCMOV-NEXT: sete %cl
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X32-NOCMOV-NEXT: movb %cl, %ch
+; X32-NOCMOV-NEXT: movzbl %ch, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edi, %edx
+; X32-NOCMOV-NEXT: andl %esi, %edx
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ebx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %edx
; X32-NOCMOV-NEXT: popl %esi
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
; X32-NOCMOV-NEXT: popl %edi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebp
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%result = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b)
@@ -157,59 +201,74 @@ define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) {
define float @test_ctselect_f32(i1 %cond, float %a, float %b) {
; X64-LABEL: test_ctselect_f32:
; X64: # %bb.0:
-; X64-NEXT: movd %xmm1, %eax
-; X64-NEXT: pxor %xmm1, %xmm0
-; X64-NEXT: movd %xmm0, %ecx
-; X64-NEXT: andl $1, %edi
-; X64-NEXT: negl %edi
-; X64-NEXT: andl %ecx, %edi
-; X64-NEXT: xorl %eax, %edi
-; X64-NEXT: movd %edi, %xmm0
+; X64-NEXT: movd %xmm0, %eax
+; X64-NEXT: movd %xmm1, %ecx
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %eax, %ecx
+; X64-NEXT: movd %ecx, %xmm0
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_f32:
; X32: # %bb.0:
-; X32-NEXT: subl $12, %esp
+; X32-NEXT: pushl %edi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: pushl %eax
; X32-NEXT: .cfi_def_cfa_offset 16
-; X32-NEXT: flds {{[0-9]+}}(%esp)
-; X32-NEXT: fstps {{[0-9]+}}(%esp)
-; X32-NEXT: flds {{[0-9]+}}(%esp)
-; X32-NEXT: fstps (%esp)
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: andb $1, %al
+; X32-NEXT: .cfi_offset %esi, -12
+; X32-NEXT: .cfi_offset %edi, -8
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: sete %al
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: negl %eax
-; X32-NEXT: movl (%esp), %edx
-; X32-NEXT: xorl %ecx, %edx
-; X32-NEXT: andl %eax, %edx
-; X32-NEXT: xorl %ecx, %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-NEXT: flds {{[0-9]+}}(%esp)
-; X32-NEXT: addl $12, %esp
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movb %al, %ah
+; X32-NEXT: movzbl %ah, %edi
+; X32-NEXT: negl %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: andl %edi, %esi
+; X32-NEXT: notl %edi
+; X32-NEXT: andl %ecx, %edi
+; X32-NEXT: orl %edi, %esi
+; X32-NEXT: movl %esi, (%esp)
+; X32-NEXT: flds (%esp)
+; X32-NEXT: addl $4, %esp
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: popl %edi
; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_f32:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: subl $12, %esp
+; X32-NOCMOV-NEXT: pushl %edi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: pushl %eax
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
-; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
-; X32-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp)
-; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
-; X32-NOCMOV-NEXT: fstps (%esp)
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: andb $1, %al
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %edi, -8
+; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: sete %al
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: movzbl %al, %eax
-; X32-NOCMOV-NEXT: negl %eax
-; X32-NOCMOV-NEXT: movl (%esp), %edx
-; X32-NOCMOV-NEXT: xorl %ecx, %edx
-; X32-NOCMOV-NEXT: andl %eax, %edx
-; X32-NOCMOV-NEXT: xorl %ecx, %edx
-; X32-NOCMOV-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
-; X32-NOCMOV-NEXT: addl $12, %esp
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT: movb %al, %ah
+; X32-NOCMOV-NEXT: movzbl %ah, %edi
+; X32-NOCMOV-NEXT: negl %edi
+; X32-NOCMOV-NEXT: movl %edx, %esi
+; X32-NOCMOV-NEXT: andl %edi, %esi
+; X32-NOCMOV-NEXT: notl %edi
+; X32-NOCMOV-NEXT: andl %ecx, %edi
+; X32-NOCMOV-NEXT: orl %edi, %esi
+; X32-NOCMOV-NEXT: movl %esi, (%esp)
+; X32-NOCMOV-NEXT: flds (%esp)
+; X32-NOCMOV-NEXT: addl $4, %esp
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %edi
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b)
@@ -219,72 +278,96 @@ define float @test_ctselect_f32(i1 %cond, float %a, float %b) {
define double @test_ctselect_f64(i1 %cond, double %a, double %b) {
; X64-LABEL: test_ctselect_f64:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-NEXT: movq %xmm1, %rax
-; X64-NEXT: pxor %xmm1, %xmm0
-; X64-NEXT: movq %xmm0, %rcx
-; X64-NEXT: andl $1, %edi
-; X64-NEXT: negq %rdi
-; X64-NEXT: andq %rcx, %rdi
-; X64-NEXT: xorq %rax, %rdi
-; X64-NEXT: movq %rdi, %xmm0
+; X64-NEXT: movq %xmm0, %rax
+; X64-NEXT: movq %xmm1, %rcx
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovneq %rax, %rcx
+; X64-NEXT: movq %rcx, %xmm0
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_f64:
; X32: # %bb.0:
-; X32-NEXT: pushl %esi
+; X32-NEXT: pushl %edi
; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 12
; X32-NEXT: subl $8, %esp
-; X32-NEXT: .cfi_def_cfa_offset 16
-; X32-NEXT: .cfi_offset %esi, -8
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: .cfi_def_cfa_offset 20
+; X32-NEXT: .cfi_offset %esi, -12
+; X32-NEXT: .cfi_offset %edi, -8
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: sete %al
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: xorl %edx, %esi
-; X32-NEXT: andl $1, %ecx
-; X32-NEXT: negl %ecx
-; X32-NEXT: andl %ecx, %esi
-; X32-NEXT: xorl %edx, %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X32-NEXT: movb %al, %ah
+; X32-NEXT: movzbl %ah, %edi
+; X32-NEXT: negl %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: andl %edi, %esi
+; X32-NEXT: notl %edi
+; X32-NEXT: andl %ecx, %edi
+; X32-NEXT: orl %edi, %esi
+; X32-NEXT: movl %esi, (%esp)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: xorl %eax, %edx
-; X32-NEXT: andl %ecx, %edx
-; X32-NEXT: xorl %eax, %edx
-; X32-NEXT: movl %edx, (%esp)
+; X32-NEXT: movb %al, %ah
+; X32-NEXT: movzbl %ah, %edi
+; X32-NEXT: negl %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: andl %edi, %esi
+; X32-NEXT: notl %edi
+; X32-NEXT: andl %ecx, %edi
+; X32-NEXT: orl %edi, %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X32-NEXT: fldl (%esp)
; X32-NEXT: addl $8, %esp
-; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: .cfi_def_cfa_offset 12
; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: popl %edi
; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_f64:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: pushl %edi
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
; X32-NOCMOV-NEXT: subl $8, %esp
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
-; X32-NOCMOV-NEXT: .cfi_offset %esi, -8
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %edi, -8
+; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: sete %al
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NOCMOV-NEXT: xorl %edx, %esi
-; X32-NOCMOV-NEXT: andl $1, %ecx
-; X32-NOCMOV-NEXT: negl %ecx
-; X32-NOCMOV-NEXT: andl %ecx, %esi
-; X32-NOCMOV-NEXT: xorl %edx, %esi
-; X32-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: movb %al, %ah
+; X32-NOCMOV-NEXT: movzbl %ah, %edi
+; X32-NOCMOV-NEXT: negl %edi
+; X32-NOCMOV-NEXT: movl %edx, %esi
+; X32-NOCMOV-NEXT: andl %edi, %esi
+; X32-NOCMOV-NEXT: notl %edi
+; X32-NOCMOV-NEXT: andl %ecx, %edi
+; X32-NOCMOV-NEXT: orl %edi, %esi
+; X32-NOCMOV-NEXT: movl %esi, (%esp)
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: xorl %eax, %edx
-; X32-NOCMOV-NEXT: andl %ecx, %edx
-; X32-NOCMOV-NEXT: xorl %eax, %edx
-; X32-NOCMOV-NEXT: movl %edx, (%esp)
+; X32-NOCMOV-NEXT: movb %al, %ah
+; X32-NOCMOV-NEXT: movzbl %ah, %edi
+; X32-NOCMOV-NEXT: negl %edi
+; X32-NOCMOV-NEXT: movl %edx, %esi
+; X32-NOCMOV-NEXT: andl %edi, %esi
+; X32-NOCMOV-NEXT: notl %edi
+; X32-NOCMOV-NEXT: andl %ecx, %edi
+; X32-NOCMOV-NEXT: orl %edi, %esi
+; X32-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X32-NOCMOV-NEXT: fldl (%esp)
; X32-NOCMOV-NEXT: addl $8, %esp
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %edi
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b)
@@ -294,38 +377,42 @@ define double @test_ctselect_f64(i1 %cond, double %a, double %b) {
define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) {
; X64-LABEL: test_ctselect_ptr:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: xorq %rdx, %rsi
-; X64-NEXT: andl $1, %eax
-; X64-NEXT: negq %rax
-; X64-NEXT: andq %rsi, %rax
-; X64-NEXT: xorq %rdx, %rax
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovneq %rsi, %rax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_ptr:
; X32: # %bb.0:
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: andb $1, %al
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: xorl %ecx, %edx
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: negl %eax
-; X32-NEXT: andl %edx, %eax
-; X32-NEXT: xorl %ecx, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_ptr:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: andb $1, %al
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: xorl %ecx, %edx
-; X32-NOCMOV-NEXT: movzbl %al, %eax
-; X32-NOCMOV-NEXT: negl %eax
-; X32-NOCMOV-NEXT: andl %edx, %eax
-; X32-NOCMOV-NEXT: xorl %ecx, %eax
+; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edx, %eax
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ecx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b)
ret ptr %result
@@ -335,17 +422,45 @@ define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) {
define i32 @test_ctselect_const_true(i32 %a, i32 %b) {
; X64-LABEL: test_ctselect_const_true:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: movb $1, %cl
+; X64-NEXT: testb %cl, %cl
+; X64-NEXT: cmovnel %edi, %eax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_const_true:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movb $1, %cl
+; X32-NEXT: testb %cl, %cl
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_const_true:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT: movb $1, %al
+; X32-NOCMOV-NEXT: testb %al, %al
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edx, %eax
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ecx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b)
ret i32 %result
@@ -355,16 +470,44 @@ define i32 @test_ctselect_const_false(i32 %a, i32 %b) {
; X64-LABEL: test_ctselect_const_false:
; X64: # %bb.0:
; X64-NEXT: movl %esi, %eax
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: testb %cl, %cl
+; X64-NEXT: cmovnel %edi, %eax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_const_false:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: xorl %ecx, %ecx
+; X32-NEXT: testb %cl, %cl
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_const_false:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT: xorl %eax, %eax
+; X32-NOCMOV-NEXT: testb %al, %al
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edx, %eax
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ecx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b)
ret i32 %result
@@ -374,1151 +517,429 @@ define i32 @test_ctselect_const_false(i32 %a, i32 %b) {
define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) {
; X64-LABEL: test_ctselect_icmp_eq:
; X64: # %bb.0:
-; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: movl %ecx, %eax
; X64-NEXT: cmpl %esi, %edi
-; X64-NEXT: sete %al
-; X64-NEXT: xorl %ecx, %edx
-; X64-NEXT: negl %eax
-; X64-NEXT: andl %edx, %eax
-; X64-NEXT: xorl %ecx, %eax
+; X64-NEXT: sete %cl
+; X64-NEXT: testb %cl, %cl
+; X64-NEXT: cmovnel %edx, %eax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_icmp_eq:
; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: xorl %eax, %eax
-; X32-NEXT: cmpl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: sete %al
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: xorl %ecx, %edx
-; X32-NEXT: negl %eax
-; X32-NEXT: andl %edx, %eax
-; X32-NEXT: xorl %ecx, %eax
+; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: sete %cl
+; X32-NEXT: testb %cl, %cl
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_icmp_eq:
; X32-NOCMOV: # %bb.0:
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: xorl %eax, %eax
-; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %eax
; X32-NOCMOV-NEXT: sete %al
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: xorl %ecx, %edx
-; X32-NOCMOV-NEXT: negl %eax
-; X32-NOCMOV-NEXT: andl %edx, %eax
-; X32-NOCMOV-NEXT: xorl %ecx, %eax
+; X32-NOCMOV-NEXT: testb %al, %al
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edx, %eax
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ecx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%cond = icmp eq i32 %x, %y
%result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
ret i32 %result
}
-define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) {
-; X64-LABEL: test_ctselect_icmp_ult:
+define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) {
+; X64-LABEL: test_ctselect_icmp_ne:
; X64: # %bb.0:
-; X64-NEXT: xorl %ecx, %edx
-; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: movl %ecx, %eax
; X64-NEXT: cmpl %esi, %edi
-; X64-NEXT: sbbl %eax, %eax
-; X64-NEXT: andl %edx, %eax
-; X64-NEXT: xorl %ecx, %eax
+; X64-NEXT: setne %cl
+; X64-NEXT: testb %cl, %cl
+; X64-NEXT: cmovnel %edx, %eax
; X64-NEXT: retq
;
-; X32-LABEL: test_ctselect_icmp_ult:
+; X32-LABEL: test_ctselect_icmp_ne:
; X32: # %bb.0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: xorl %edx, %edx
-; X32-NEXT: cmpl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: sbbl %edx, %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: xorl %ecx, %eax
-; X32-NEXT: andl %edx, %eax
-; X32-NEXT: xorl %ecx, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: setne %cl
+; X32-NEXT: testb %cl, %cl
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
-; X32-NOCMOV-LABEL: test_ctselect_icmp_ult:
+; X32-NOCMOV-LABEL: test_ctselect_icmp_ne:
; X32-NOCMOV: # %bb.0:
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: xorl %edx, %edx
; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: sbbl %edx, %edx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: xorl %ecx, %eax
-; X32-NOCMOV-NEXT: andl %edx, %eax
-; X32-NOCMOV-NEXT: xorl %ecx, %eax
+; X32-NOCMOV-NEXT: setne %al
+; X32-NOCMOV-NEXT: testb %al, %al
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edx, %eax
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ecx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
- %cond = icmp ult i32 %x, %y
+ %cond = icmp ne i32 %x, %y
%result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
ret i32 %result
}
-define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) {
-; X64-LABEL: test_ctselect_fcmp_oeq:
+define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) {
+; X64-LABEL: test_ctselect_icmp_slt:
; X64: # %bb.0:
-; X64-NEXT: cmpeqss %xmm1, %xmm0
-; X64-NEXT: xorps %xmm3, %xmm2
-; X64-NEXT: andps %xmm2, %xmm0
-; X64-NEXT: xorps %xmm3, %xmm0
+; X64-NEXT: movl %ecx, %eax
+; X64-NEXT: cmpl %esi, %edi
+; X64-NEXT: setl %cl
+; X64-NEXT: testb %cl, %cl
+; X64-NEXT: cmovnel %edx, %eax
; X64-NEXT: retq
;
-; X32-LABEL: test_ctselect_fcmp_oeq:
+; X32-LABEL: test_ctselect_icmp_slt:
; X32: # %bb.0:
-; X32-NEXT: subl $12, %esp
-; X32-NEXT: .cfi_def_cfa_offset 16
-; X32-NEXT: flds {{[0-9]+}}(%esp)
-; X32-NEXT: fstps {{[0-9]+}}(%esp)
-; X32-NEXT: flds {{[0-9]+}}(%esp)
-; X32-NEXT: fstps (%esp)
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: flds {{[0-9]+}}(%esp)
-; X32-NEXT: flds {{[0-9]+}}(%esp)
-; X32-NEXT: fucompi %st(1), %st
-; X32-NEXT: fstp %st(0)
-; X32-NEXT: setnp %cl
-; X32-NEXT: sete %dl
-; X32-NEXT: andb %cl, %dl
-; X32-NEXT: movzbl %dl, %ecx
-; X32-NEXT: negl %ecx
-; X32-NEXT: movl (%esp), %edx
-; X32-NEXT: xorl %eax, %edx
-; X32-NEXT: andl %ecx, %edx
-; X32-NEXT: xorl %eax, %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-NEXT: flds {{[0-9]+}}(%esp)
-; X32-NEXT: addl $12, %esp
-; X32-NEXT: .cfi_def_cfa_offset 4
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: setl %cl
+; X32-NEXT: testb %cl, %cl
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
-; X32-NOCMOV-LABEL: test_ctselect_fcmp_oeq:
+; X32-NOCMOV-LABEL: test_ctselect_icmp_slt:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: subl $12, %esp
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
-; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
-; X32-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp)
-; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
-; X32-NOCMOV-NEXT: fstps (%esp)
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
-; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
-; X32-NOCMOV-NEXT: fucompp
-; X32-NOCMOV-NEXT: fnstsw %ax
-; X32-NOCMOV-NEXT: # kill: def $ah killed $ah killed $ax
-; X32-NOCMOV-NEXT: sahf
-; X32-NOCMOV-NEXT: setnp %al
-; X32-NOCMOV-NEXT: sete %dl
-; X32-NOCMOV-NEXT: andb %al, %dl
-; X32-NOCMOV-NEXT: movzbl %dl, %eax
-; X32-NOCMOV-NEXT: negl %eax
-; X32-NOCMOV-NEXT: movl (%esp), %edx
-; X32-NOCMOV-NEXT: xorl %ecx, %edx
-; X32-NOCMOV-NEXT: andl %eax, %edx
-; X32-NOCMOV-NEXT: xorl %ecx, %edx
-; X32-NOCMOV-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
-; X32-NOCMOV-NEXT: addl $12, %esp
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT: setl %al
+; X32-NOCMOV-NEXT: testb %al, %al
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edx, %eax
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ecx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
- %cond = fcmp oeq float %x, %y
- %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b)
- ret float %result
+ %cond = icmp slt i32 %x, %y
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+ ret i32 %result
}
-; Test with memory operands
-define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) {
-; X64-LABEL: test_ctselect_load:
+define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) {
+; X64-LABEL: test_ctselect_icmp_ult:
; X64: # %bb.0:
-; X64-NEXT: movl (%rdx), %ecx
-; X64-NEXT: movl (%rsi), %eax
-; X64-NEXT: xorl %ecx, %eax
-; X64-NEXT: andl $1, %edi
-; X64-NEXT: negl %edi
-; X64-NEXT: andl %edi, %eax
-; X64-NEXT: xorl %ecx, %eax
+; X64-NEXT: movl %ecx, %eax
+; X64-NEXT: cmpl %esi, %edi
+; X64-NEXT: setb %cl
+; X64-NEXT: testb %cl, %cl
+; X64-NEXT: cmovnel %edx, %eax
; X64-NEXT: retq
;
-; X32-LABEL: test_ctselect_load:
+; X32-LABEL: test_ctselect_icmp_ult:
; X32: # %bb.0:
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: andb $1, %al
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl (%edx), %edx
-; X32-NEXT: movl (%ecx), %ecx
-; X32-NEXT: xorl %edx, %ecx
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: negl %eax
-; X32-NEXT: andl %ecx, %eax
-; X32-NEXT: xorl %edx, %eax
+; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: setb %cl
+; X32-NEXT: testb %cl, %cl
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
-; X32-NOCMOV-LABEL: test_ctselect_load:
+; X32-NOCMOV-LABEL: test_ctselect_icmp_ult:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: andb $1, %al
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: movl (%edx), %edx
-; X32-NOCMOV-NEXT: movl (%ecx), %ecx
-; X32-NOCMOV-NEXT: xorl %edx, %ecx
-; X32-NOCMOV-NEXT: movzbl %al, %eax
-; X32-NOCMOV-NEXT: negl %eax
-; X32-NOCMOV-NEXT: andl %ecx, %eax
-; X32-NOCMOV-NEXT: xorl %edx, %eax
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT: setb %al
+; X32-NOCMOV-NEXT: testb %al, %al
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edx, %eax
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ecx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
- %a = load i32, ptr %p1
- %b = load i32, ptr %p2
+ %cond = icmp ult i32 %x, %y
%result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
ret i32 %result
}
-; Test nested ct_select calls
-define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) {
-; X64-LABEL: test_ctselect_nested:
+define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) {
+; X64-LABEL: test_ctselect_fcmp_oeq:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: xorl %ecx, %edx
-; X64-NEXT: andl $1, %esi
-; X64-NEXT: negl %esi
-; X64-NEXT: andl %edx, %esi
-; X64-NEXT: xorl %r8d, %ecx
-; X64-NEXT: xorl %esi, %ecx
-; X64-NEXT: andl $1, %eax
-; X64-NEXT: negl %eax
-; X64-NEXT: andl %ecx, %eax
-; X64-NEXT: xorl %r8d, %eax
+; X64-NEXT: movd %xmm2, %eax
+; X64-NEXT: movd %xmm3, %ecx
+; X64-NEXT: ucomiss %xmm1, %xmm0
+; X64-NEXT: setnp %dl
+; X64-NEXT: sete %sil
+; X64-NEXT: testb %dl, %sil
+; X64-NEXT: cmovnel %eax, %ecx
+; X64-NEXT: movd %ecx, %xmm0
; X64-NEXT: retq
;
-; X32-LABEL: test_ctselect_nested:
+; X32-LABEL: test_ctselect_fcmp_oeq:
; X32: # %bb.0:
; X32-NEXT: pushl %edi
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: pushl %esi
; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: pushl %eax
+; X32-NEXT: .cfi_def_cfa_offset 16
; X32-NEXT: .cfi_offset %esi, -12
; X32-NEXT: .cfi_offset %edi, -8
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: andb $1, %al
-; X32-NEXT: movb {{[0-9]+}}(%esp), %ah
-; X32-NEXT: andb $1, %ah
+; X32-NEXT: flds {{[0-9]+}}(%esp)
+; X32-NEXT: flds {{[0-9]+}}(%esp)
+; X32-NEXT: fucompi %st(1), %st
+; X32-NEXT: fstp %st(0)
+; X32-NEXT: setnp %al
+; X32-NEXT: sete %cl
+; X32-NEXT: testb %al, %cl
+; X32-NEXT: sete %al
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: xorl %edx, %esi
+; X32-NEXT: movb %al, %ah
; X32-NEXT: movzbl %ah, %edi
; X32-NEXT: negl %edi
-; X32-NEXT: andl %esi, %edi
-; X32-NEXT: xorl %ecx, %edx
-; X32-NEXT: xorl %edi, %edx
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: negl %eax
-; X32-NEXT: andl %edx, %eax
-; X32-NEXT: xorl %ecx, %eax
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: andl %edi, %esi
+; X32-NEXT: notl %edi
+; X32-NEXT: andl %ecx, %edi
+; X32-NEXT: orl %edi, %esi
+; X32-NEXT: movl %esi, (%esp)
+; X32-NEXT: flds (%esp)
+; X32-NEXT: addl $4, %esp
+; X32-NEXT: .cfi_def_cfa_offset 12
; X32-NEXT: popl %esi
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: popl %edi
; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
-; X32-NOCMOV-LABEL: test_ctselect_nested:
+; X32-NOCMOV-LABEL: test_ctselect_fcmp_oeq:
; X32-NOCMOV: # %bb.0:
; X32-NOCMOV-NEXT: pushl %edi
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
; X32-NOCMOV-NEXT: pushl %esi
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: pushl %eax
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
; X32-NOCMOV-NEXT: .cfi_offset %edi, -8
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: andb $1, %al
-; X32-NOCMOV-NEXT: movb {{[0-9]+}}(%esp), %ah
-; X32-NOCMOV-NEXT: andb $1, %ah
+; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: fucompp
+; X32-NOCMOV-NEXT: fnstsw %ax
+; X32-NOCMOV-NEXT: # kill: def $ah killed $ah killed $ax
+; X32-NOCMOV-NEXT: sahf
+; X32-NOCMOV-NEXT: setnp %al
+; X32-NOCMOV-NEXT: sete %cl
+; X32-NOCMOV-NEXT: testb %al, %cl
+; X32-NOCMOV-NEXT: sete %al
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NOCMOV-NEXT: xorl %edx, %esi
+; X32-NOCMOV-NEXT: movb %al, %ah
; X32-NOCMOV-NEXT: movzbl %ah, %edi
; X32-NOCMOV-NEXT: negl %edi
-; X32-NOCMOV-NEXT: andl %esi, %edi
-; X32-NOCMOV-NEXT: xorl %ecx, %edx
-; X32-NOCMOV-NEXT: xorl %edi, %edx
-; X32-NOCMOV-NEXT: movzbl %al, %eax
-; X32-NOCMOV-NEXT: negl %eax
-; X32-NOCMOV-NEXT: andl %edx, %eax
-; X32-NOCMOV-NEXT: xorl %ecx, %eax
+; X32-NOCMOV-NEXT: movl %edx, %esi
+; X32-NOCMOV-NEXT: andl %edi, %esi
+; X32-NOCMOV-NEXT: notl %edi
+; X32-NOCMOV-NEXT: andl %ecx, %edi
+; X32-NOCMOV-NEXT: orl %edi, %esi
+; X32-NOCMOV-NEXT: movl %esi, (%esp)
+; X32-NOCMOV-NEXT: flds (%esp)
+; X32-NOCMOV-NEXT: addl $4, %esp
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
; X32-NOCMOV-NEXT: popl %esi
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
; X32-NOCMOV-NEXT: popl %edi
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
- %inner = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b)
- %result = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %inner, i32 %c)
- ret i32 %result
-}
-
-; Test nested CT_SELECT pattern with AND merging on i1 values
-; Pattern: ct_select C0, (ct_select C1, X, Y), Y -> ct_select (C0 & C1), X, Y
-; This optimization only applies when selecting between i1 values (boolean logic)
-define i32 @test_ctselect_nested_and_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
-; X64-LABEL: test_ctselect_nested_and_i1_to_i32:
-; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: andl %esi, %eax
-; X64-NEXT: xorl %ecx, %edx
-; X64-NEXT: andl $1, %eax
-; X64-NEXT: negl %eax
-; X64-NEXT: andl %edx, %eax
-; X64-NEXT: xorl %ecx, %eax
-; X64-NEXT: retq
-;
-; X32-LABEL: test_ctselect_nested_and_i1_to_i32:
-; X32: # %bb.0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: andb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $1, %al
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: xorl %ecx, %edx
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: negl %eax
-; X32-NEXT: andl %edx, %eax
-; X32-NEXT: xorl %ecx, %eax
-; X32-NEXT: retl
-;
-; X32-NOCMOV-LABEL: test_ctselect_nested_and_i1_to_i32:
-; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: andb {{[0-9]+}}(%esp), %al
-; X32-NOCMOV-NEXT: andb $1, %al
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: xorl %ecx, %edx
-; X32-NOCMOV-NEXT: movzbl %al, %eax
-; X32-NOCMOV-NEXT: negl %eax
-; X32-NOCMOV-NEXT: andl %edx, %eax
-; X32-NOCMOV-NEXT: xorl %ecx, %eax
-; X32-NOCMOV-NEXT: retl
- %inner = call i1 @llvm.ct.select.i1(i1 %c1, i1 true, i1 false)
- %cond = call i1 @llvm.ct.select.i1(i1 %c0, i1 %inner, i1 false)
- %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %y)
- ret i32 %result
-}
-
-; Test nested CT_SELECT pattern with OR merging on i1 values
-; Pattern: ct_select C0, X, (ct_select C1, X, Y) -> ct_select (C0 | C1), X, Y
-; This optimization only applies when selecting between i1 values (boolean logic)
-define i32 @test_ctselect_nested_or_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
-; X64-LABEL: test_ctselect_nested_or_i1_to_i32:
-; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: orl %esi, %eax
-; X64-NEXT: xorl %ecx, %edx
-; X64-NEXT: andl $1, %eax
-; X64-NEXT: negl %eax
-; X64-NEXT: andl %edx, %eax
-; X64-NEXT: xorl %ecx, %eax
-; X64-NEXT: retq
-;
-; X32-LABEL: test_ctselect_nested_or_i1_to_i32:
-; X32: # %bb.0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: orb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $1, %al
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: xorl %ecx, %edx
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: negl %eax
-; X32-NEXT: andl %edx, %eax
-; X32-NEXT: xorl %ecx, %eax
-; X32-NEXT: retl
-;
-; X32-NOCMOV-LABEL: test_ctselect_nested_or_i1_to_i32:
-; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: orb {{[0-9]+}}(%esp), %al
-; X32-NOCMOV-NEXT: andb $1, %al
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: xorl %ecx, %edx
-; X32-NOCMOV-NEXT: movzbl %al, %eax
-; X32-NOCMOV-NEXT: negl %eax
-; X32-NOCMOV-NEXT: andl %edx, %eax
-; X32-NOCMOV-NEXT: xorl %ecx, %eax
-; X32-NOCMOV-NEXT: retl
- %inner = call i1 @llvm.ct.select.i1(i1 %c1, i1 true, i1 false)
- %cond = call i1 @llvm.ct.select.i1(i1 %c0, i1 true, i1 %inner)
- %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %y)
- ret i32 %result
+ %cond = fcmp oeq float %x, %y
+ %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b)
+ ret float %result
}
-; Test double nested CT_SELECT with recursive AND merging
-; Pattern: ct_select C0, (ct_select C1, (ct_select C2, X, Y), Y), Y
-; -> ct_select C0, (ct_select (C1 & C2), X, Y), Y
-; -> ct_select (C0 & (C1 & C2)), X, Y
-; This tests that the optimization can be applied recursively
-define i32 @test_ctselect_double_nested_and_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x, i32 %y) {
-; X64-LABEL: test_ctselect_double_nested_and_i1:
+; Test with memory operands
+define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) {
+; X64-LABEL: test_ctselect_load:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: andl %esi, %eax
-; X64-NEXT: andl %edx, %eax
-; X64-NEXT: xorl %r8d, %ecx
-; X64-NEXT: andl $1, %eax
-; X64-NEXT: negl %eax
-; X64-NEXT: andl %ecx, %eax
-; X64-NEXT: xorl %r8d, %eax
+; X64-NEXT: movl (%rdx), %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel (%rsi), %eax
; X64-NEXT: retq
;
-; X32-LABEL: test_ctselect_double_nested_and_i1:
+; X32-LABEL: test_ctselect_load:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: andb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $1, %al
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: xorl %ecx, %edx
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: negl %eax
-; X32-NEXT: andl %edx, %eax
-; X32-NEXT: xorl %ecx, %eax
-; X32-NEXT: retl
-;
-; X32-NOCMOV-LABEL: test_ctselect_double_nested_and_i1:
-; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: andb {{[0-9]+}}(%esp), %al
-; X32-NOCMOV-NEXT: andb {{[0-9]+}}(%esp), %al
-; X32-NOCMOV-NEXT: andb $1, %al
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: xorl %ecx, %edx
-; X32-NOCMOV-NEXT: movzbl %al, %eax
-; X32-NOCMOV-NEXT: negl %eax
-; X32-NOCMOV-NEXT: andl %edx, %eax
-; X32-NOCMOV-NEXT: xorl %ecx, %eax
-; X32-NOCMOV-NEXT: retl
- %inner2 = call i1 @llvm.ct.select.i1(i1 %c2, i1 true, i1 false)
- %inner1 = call i1 @llvm.ct.select.i1(i1 %c1, i1 %inner2, i1 false)
- %cond = call i1 @llvm.ct.select.i1(i1 %c0, i1 %inner1, i1 false)
- %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %y)
- ret i32 %result
-}
-
-; Vector CT_SELECT Tests
-; ============================================================================
-
-; Test vector CT_SELECT with v4i32 (128-bit vector with single i1 mask)
-; NOW CONSTANT-TIME: Uses bitwise XOR/AND operations instead of branches!
-define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) {
-; X64-LABEL: test_ctselect_v4i32:
-; X64: # %bb.0:
-; X64-NEXT: pxor %xmm1, %xmm0
-; X64-NEXT: movd %edi, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; X64-NEXT: pslld $31, %xmm2
-; X64-NEXT: psrad $31, %xmm2
-; X64-NEXT: pand %xmm2, %xmm0
-; X64-NEXT: pxor %xmm1, %xmm0
-; X64-NEXT: retq
-;
-; X32-LABEL: test_ctselect_v4i32:
-; X32: # %bb.0:
-; X32-NEXT: pushl %ebp
-; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: pushl %ebx
-; X32-NEXT: .cfi_def_cfa_offset 12
-; X32-NEXT: pushl %edi
-; X32-NEXT: .cfi_def_cfa_offset 16
-; X32-NEXT: pushl %esi
-; X32-NEXT: .cfi_def_cfa_offset 20
-; X32-NEXT: .cfi_offset %esi, -20
-; X32-NEXT: .cfi_offset %edi, -16
-; X32-NEXT: .cfi_offset %ebx, -12
-; X32-NEXT: .cfi_offset %ebp, -8
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: xorl %ebx, %edx
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: andl $1, %edi
-; X32-NEXT: negl %edi
-; X32-NEXT: andl %edi, %edx
-; X32-NEXT: xorl %ebx, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: xorl %ebp, %ebx
-; X32-NEXT: andl %edi, %ebx
-; X32-NEXT: xorl %ebp, %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X32-NEXT: xorl %esi, %ebp
-; X32-NEXT: andl %edi, %ebp
-; X32-NEXT: xorl %esi, %ebp
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: xorl %ecx, %esi
-; X32-NEXT: andl %edi, %esi
-; X32-NEXT: xorl %ecx, %esi
-; X32-NEXT: movl %esi, 12(%eax)
-; X32-NEXT: movl %ebp, 8(%eax)
-; X32-NEXT: movl %ebx, 4(%eax)
-; X32-NEXT: movl %edx, (%eax)
-; X32-NEXT: popl %esi
-; X32-NEXT: .cfi_def_cfa_offset 16
-; X32-NEXT: popl %edi
-; X32-NEXT: .cfi_def_cfa_offset 12
-; X32-NEXT: popl %ebx
-; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: popl %ebp
-; X32-NEXT: .cfi_def_cfa_offset 4
-; X32-NEXT: retl $4
+; X32-NEXT: movl (%eax), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel (%ecx), %eax
+; X32-NEXT: retl
;
-; X32-NOCMOV-LABEL: test_ctselect_v4i32:
+; X32-NOCMOV-LABEL: test_ctselect_load:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: pushl %ebp
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
; X32-NOCMOV-NEXT: pushl %ebx
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
-; X32-NOCMOV-NEXT: pushl %edi
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
-; X32-NOCMOV-NEXT: pushl %esi
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 20
-; X32-NOCMOV-NEXT: .cfi_offset %esi, -20
-; X32-NOCMOV-NEXT: .cfi_offset %edi, -16
-; X32-NOCMOV-NEXT: .cfi_offset %ebx, -12
-; X32-NOCMOV-NEXT: .cfi_offset %ebp, -8
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: xorl %ebx, %edx
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %edi
-; X32-NOCMOV-NEXT: andl $1, %edi
-; X32-NOCMOV-NEXT: negl %edi
-; X32-NOCMOV-NEXT: andl %edi, %edx
-; X32-NOCMOV-NEXT: xorl %ebx, %edx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NOCMOV-NEXT: xorl %ebp, %ebx
-; X32-NOCMOV-NEXT: andl %edi, %ebx
-; X32-NOCMOV-NEXT: xorl %ebp, %ebx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X32-NOCMOV-NEXT: xorl %esi, %ebp
-; X32-NOCMOV-NEXT: andl %edi, %ebp
-; X32-NOCMOV-NEXT: xorl %esi, %ebp
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NOCMOV-NEXT: xorl %ecx, %esi
-; X32-NOCMOV-NEXT: andl %edi, %esi
-; X32-NOCMOV-NEXT: xorl %ecx, %esi
-; X32-NOCMOV-NEXT: movl %esi, 12(%eax)
-; X32-NOCMOV-NEXT: movl %ebp, 8(%eax)
-; X32-NOCMOV-NEXT: movl %ebx, 4(%eax)
-; X32-NOCMOV-NEXT: movl %edx, (%eax)
-; X32-NOCMOV-NEXT: popl %esi
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
-; X32-NOCMOV-NEXT: popl %edi
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
-; X32-NOCMOV-NEXT: popl %ebx
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
-; X32-NOCMOV-NEXT: popl %ebp
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
-; X32-NOCMOV-NEXT: retl $4
- %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b)
- ret <4 x i32> %result
-}
-define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) {
-; X64-LABEL: test_ctselect_v4f32:
-; X64: # %bb.0:
-; X64-NEXT: pxor %xmm1, %xmm0
-; X64-NEXT: movd %edi, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; X64-NEXT: pslld $31, %xmm2
-; X64-NEXT: psrad $31, %xmm2
-; X64-NEXT: pand %xmm2, %xmm0
-; X64-NEXT: pxor %xmm1, %xmm0
-; X64-NEXT: retq
-;
-; X32-LABEL: test_ctselect_v4f32:
-; X32: # %bb.0:
-; X32-NEXT: pushl %ebp
-; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: pushl %ebx
-; X32-NEXT: .cfi_def_cfa_offset 12
-; X32-NEXT: pushl %edi
-; X32-NEXT: .cfi_def_cfa_offset 16
-; X32-NEXT: pushl %esi
-; X32-NEXT: .cfi_def_cfa_offset 20
-; X32-NEXT: .cfi_offset %esi, -20
-; X32-NEXT: .cfi_offset %edi, -16
-; X32-NEXT: .cfi_offset %ebx, -12
-; X32-NEXT: .cfi_offset %ebp, -8
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: xorl %ebx, %edx
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: andl $1, %edi
-; X32-NEXT: negl %edi
-; X32-NEXT: andl %edi, %edx
-; X32-NEXT: xorl %ebx, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: xorl %ebp, %ebx
-; X32-NEXT: andl %edi, %ebx
-; X32-NEXT: xorl %ebp, %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X32-NEXT: xorl %esi, %ebp
-; X32-NEXT: andl %edi, %ebp
-; X32-NEXT: xorl %esi, %ebp
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: xorl %ecx, %esi
-; X32-NEXT: andl %edi, %esi
-; X32-NEXT: xorl %ecx, %esi
-; X32-NEXT: movl %esi, 12(%eax)
-; X32-NEXT: movl %ebp, 8(%eax)
-; X32-NEXT: movl %ebx, 4(%eax)
-; X32-NEXT: movl %edx, (%eax)
-; X32-NEXT: popl %esi
-; X32-NEXT: .cfi_def_cfa_offset 16
-; X32-NEXT: popl %edi
-; X32-NEXT: .cfi_def_cfa_offset 12
-; X32-NEXT: popl %ebx
-; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: popl %ebp
-; X32-NEXT: .cfi_def_cfa_offset 4
-; X32-NEXT: retl $4
-;
-; X32-NOCMOV-LABEL: test_ctselect_v4f32:
-; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: pushl %ebp
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
-; X32-NOCMOV-NEXT: pushl %ebx
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
-; X32-NOCMOV-NEXT: pushl %edi
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
; X32-NOCMOV-NEXT: pushl %esi
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 20
-; X32-NOCMOV-NEXT: .cfi_offset %esi, -20
-; X32-NOCMOV-NEXT: .cfi_offset %edi, -16
-; X32-NOCMOV-NEXT: .cfi_offset %ebx, -12
-; X32-NOCMOV-NEXT: .cfi_offset %ebp, -8
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: xorl %ebx, %edx
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %edi
-; X32-NOCMOV-NEXT: andl $1, %edi
-; X32-NOCMOV-NEXT: negl %edi
-; X32-NOCMOV-NEXT: andl %edi, %edx
-; X32-NOCMOV-NEXT: xorl %ebx, %edx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NOCMOV-NEXT: xorl %ebp, %ebx
-; X32-NOCMOV-NEXT: andl %edi, %ebx
-; X32-NOCMOV-NEXT: xorl %ebp, %ebx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X32-NOCMOV-NEXT: xorl %esi, %ebp
-; X32-NOCMOV-NEXT: andl %edi, %ebp
-; X32-NOCMOV-NEXT: xorl %esi, %ebp
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NOCMOV-NEXT: xorl %ecx, %esi
-; X32-NOCMOV-NEXT: andl %edi, %esi
-; X32-NOCMOV-NEXT: xorl %ecx, %esi
-; X32-NOCMOV-NEXT: movl %esi, 12(%eax)
-; X32-NOCMOV-NEXT: movl %ebp, 8(%eax)
-; X32-NOCMOV-NEXT: movl %ebx, 4(%eax)
-; X32-NOCMOV-NEXT: movl %edx, (%eax)
-; X32-NOCMOV-NEXT: popl %esi
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
-; X32-NOCMOV-NEXT: popl %edi
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
-; X32-NOCMOV-NEXT: popl %ebx
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
-; X32-NOCMOV-NEXT: popl %ebp
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
-; X32-NOCMOV-NEXT: retl $4
- %result = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, <4 x float> %b)
- ret <4 x float> %result
-}
-
-define <8 x i32> @test_ctselect_v8i32_avx(i1 %cond, <8 x i32> %a, <8 x i32> %b) {
-; X64-LABEL: test_ctselect_v8i32_avx:
-; X64: # %bb.0:
-; X64-NEXT: movd %edi, %xmm4
-; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
-; X64-NEXT: pslld $31, %xmm4
-; X64-NEXT: psrad $31, %xmm4
-; X64-NEXT: movdqa %xmm4, %xmm5
-; X64-NEXT: pandn %xmm2, %xmm5
-; X64-NEXT: pand %xmm4, %xmm0
-; X64-NEXT: por %xmm5, %xmm0
-; X64-NEXT: pand %xmm4, %xmm1
-; X64-NEXT: pandn %xmm3, %xmm4
-; X64-NEXT: por %xmm4, %xmm1
-; X64-NEXT: retq
-;
-; X32-LABEL: test_ctselect_v8i32_avx:
-; X32: # %bb.0:
-; X32-NEXT: pushl %ebp
-; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: pushl %ebx
-; X32-NEXT: .cfi_def_cfa_offset 12
-; X32-NEXT: pushl %edi
-; X32-NEXT: .cfi_def_cfa_offset 16
-; X32-NEXT: pushl %esi
-; X32-NEXT: .cfi_def_cfa_offset 20
-; X32-NEXT: subl $8, %esp
-; X32-NEXT: .cfi_def_cfa_offset 28
-; X32-NEXT: .cfi_offset %esi, -20
-; X32-NEXT: .cfi_offset %edi, -16
-; X32-NEXT: .cfi_offset %ebx, -12
-; X32-NEXT: .cfi_offset %ebp, -8
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: xorl %eax, %ecx
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: andl $1, %edx
-; X32-NEXT: negl %edx
-; X32-NEXT: andl %edx, %ecx
-; X32-NEXT: xorl %eax, %ecx
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: xorl %esi, %eax
-; X32-NEXT: andl %edx, %eax
-; X32-NEXT: xorl %esi, %eax
-; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: xorl %ebx, %esi
-; X32-NEXT: andl %edx, %esi
-; X32-NEXT: xorl %ebx, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: xorl %ebp, %ebx
-; X32-NEXT: andl %edx, %ebx
-; X32-NEXT: xorl %ebp, %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X32-NEXT: xorl %edi, %ebp
-; X32-NEXT: andl %edx, %ebp
-; X32-NEXT: xorl %edi, %ebp
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: xorl %eax, %edi
-; X32-NEXT: andl %edx, %edi
-; X32-NEXT: xorl %eax, %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: xorl %eax, %ecx
-; X32-NEXT: andl %edx, %ecx
-; X32-NEXT: xorl %eax, %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: xorl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: andl %edx, %eax
-; X32-NEXT: xorl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %eax, 28(%edx)
-; X32-NEXT: movl %ecx, 24(%edx)
-; X32-NEXT: movl %edi, 20(%edx)
-; X32-NEXT: movl %ebp, 16(%edx)
-; X32-NEXT: movl %ebx, 12(%edx)
-; X32-NEXT: movl %esi, 8(%edx)
-; X32-NEXT: movl (%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 4(%edx)
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, (%edx)
-; X32-NEXT: movl %edx, %eax
-; X32-NEXT: addl $8, %esp
-; X32-NEXT: .cfi_def_cfa_offset 20
-; X32-NEXT: popl %esi
-; X32-NEXT: .cfi_def_cfa_offset 16
-; X32-NEXT: popl %edi
-; X32-NEXT: .cfi_def_cfa_offset 12
-; X32-NEXT: popl %ebx
-; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: popl %ebp
-; X32-NEXT: .cfi_def_cfa_offset 4
-; X32-NEXT: retl $4
-;
-; X32-NOCMOV-LABEL: test_ctselect_v8i32_avx:
-; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: pushl %ebp
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
-; X32-NOCMOV-NEXT: pushl %ebx
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
-; X32-NOCMOV-NEXT: pushl %edi
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
-; X32-NOCMOV-NEXT: pushl %esi
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 20
-; X32-NOCMOV-NEXT: subl $8, %esp
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 28
-; X32-NOCMOV-NEXT: .cfi_offset %esi, -20
-; X32-NOCMOV-NEXT: .cfi_offset %edi, -16
-; X32-NOCMOV-NEXT: .cfi_offset %ebx, -12
-; X32-NOCMOV-NEXT: .cfi_offset %ebp, -8
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: xorl %eax, %ecx
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: andl $1, %edx
-; X32-NOCMOV-NEXT: negl %edx
-; X32-NOCMOV-NEXT: andl %edx, %ecx
-; X32-NOCMOV-NEXT: xorl %eax, %ecx
-; X32-NOCMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: xorl %esi, %eax
-; X32-NOCMOV-NEXT: andl %edx, %eax
-; X32-NOCMOV-NEXT: xorl %esi, %eax
-; X32-NOCMOV-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NOCMOV-NEXT: xorl %ebx, %esi
-; X32-NOCMOV-NEXT: andl %edx, %esi
-; X32-NOCMOV-NEXT: xorl %ebx, %esi
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NOCMOV-NEXT: xorl %ebp, %ebx
-; X32-NOCMOV-NEXT: andl %edx, %ebx
-; X32-NOCMOV-NEXT: xorl %ebp, %ebx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X32-NOCMOV-NEXT: xorl %edi, %ebp
-; X32-NOCMOV-NEXT: andl %edx, %ebp
-; X32-NOCMOV-NEXT: xorl %edi, %ebp
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NOCMOV-NEXT: xorl %eax, %edi
-; X32-NOCMOV-NEXT: andl %edx, %edi
-; X32-NOCMOV-NEXT: xorl %eax, %edi
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: xorl %eax, %ecx
-; X32-NOCMOV-NEXT: andl %edx, %ecx
-; X32-NOCMOV-NEXT: xorl %eax, %ecx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: xorl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: andl %edx, %eax
-; X32-NOCMOV-NEXT: xorl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: movl %eax, 28(%edx)
-; X32-NOCMOV-NEXT: movl %ecx, 24(%edx)
-; X32-NOCMOV-NEXT: movl %edi, 20(%edx)
-; X32-NOCMOV-NEXT: movl %ebp, 16(%edx)
-; X32-NOCMOV-NEXT: movl %ebx, 12(%edx)
-; X32-NOCMOV-NEXT: movl %esi, 8(%edx)
-; X32-NOCMOV-NEXT: movl (%esp), %eax # 4-byte Reload
-; X32-NOCMOV-NEXT: movl %eax, 4(%edx)
-; X32-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NOCMOV-NEXT: movl %eax, (%edx)
+; X32-NOCMOV-NEXT: movl (%ecx), %ecx
+; X32-NOCMOV-NEXT: movl (%eax), %edx
+; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %esi
+; X32-NOCMOV-NEXT: negl %esi
; X32-NOCMOV-NEXT: movl %edx, %eax
-; X32-NOCMOV-NEXT: addl $8, %esp
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ecx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
; X32-NOCMOV-NEXT: popl %esi
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
-; X32-NOCMOV-NEXT: popl %edi
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
-; X32-NOCMOV-NEXT: popl %ebx
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
-; X32-NOCMOV-NEXT: popl %ebp
+; X32-NOCMOV-NEXT: popl %ebx
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
-; X32-NOCMOV-NEXT: retl $4
- %result = call <8 x i32> @llvm.ct.select.v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b)
- ret <8 x i32> %result
+; X32-NOCMOV-NEXT: retl
+ %a = load i32, ptr %p1
+ %b = load i32, ptr %p2
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+ ret i32 %result
}
-define <8 x float> @test_ctselect_v8f32(i1 %cond, <8 x float> %a, <8 x float> %b) {
-; X64-LABEL: test_ctselect_v8f32:
+; Test nested ctselect calls
+define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) {
+; X64-LABEL: test_ctselect_nested:
; X64: # %bb.0:
-; X64-NEXT: movd %edi, %xmm4
-; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
-; X64-NEXT: pslld $31, %xmm4
-; X64-NEXT: psrad $31, %xmm4
-; X64-NEXT: movdqa %xmm4, %xmm5
-; X64-NEXT: pandn %xmm2, %xmm5
-; X64-NEXT: pand %xmm4, %xmm0
-; X64-NEXT: por %xmm5, %xmm0
-; X64-NEXT: pand %xmm4, %xmm1
-; X64-NEXT: pandn %xmm3, %xmm4
-; X64-NEXT: por %xmm4, %xmm1
+; X64-NEXT: movl %r8d, %eax
+; X64-NEXT: testb $1, %sil
+; X64-NEXT: cmovnel %edx, %ecx
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %ecx, %eax
; X64-NEXT: retq
;
-; X32-LABEL: test_ctselect_v8f32:
+; X32-LABEL: test_ctselect_nested:
; X32: # %bb.0:
-; X32-NEXT: pushl %ebp
-; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: pushl %ebx
-; X32-NEXT: .cfi_def_cfa_offset 12
-; X32-NEXT: pushl %edi
-; X32-NEXT: .cfi_def_cfa_offset 16
-; X32-NEXT: pushl %esi
-; X32-NEXT: .cfi_def_cfa_offset 20
-; X32-NEXT: subl $8, %esp
-; X32-NEXT: .cfi_def_cfa_offset 28
-; X32-NEXT: .cfi_offset %esi, -20
-; X32-NEXT: .cfi_offset %edi, -16
-; X32-NEXT: .cfi_offset %ebx, -12
-; X32-NEXT: .cfi_offset %ebp, -8
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: xorl %eax, %ecx
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: andl $1, %edx
-; X32-NEXT: negl %edx
-; X32-NEXT: andl %edx, %ecx
-; X32-NEXT: xorl %eax, %ecx
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: xorl %esi, %eax
-; X32-NEXT: andl %edx, %eax
-; X32-NEXT: xorl %esi, %eax
-; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: xorl %ebx, %esi
-; X32-NEXT: andl %edx, %esi
-; X32-NEXT: xorl %ebx, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: xorl %ebp, %ebx
-; X32-NEXT: andl %edx, %ebx
-; X32-NEXT: xorl %ebp, %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X32-NEXT: xorl %edi, %ebp
-; X32-NEXT: andl %edx, %ebp
-; X32-NEXT: xorl %edi, %ebp
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: xorl %eax, %edi
-; X32-NEXT: andl %edx, %edi
-; X32-NEXT: xorl %eax, %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: xorl %eax, %ecx
-; X32-NEXT: andl %edx, %ecx
-; X32-NEXT: xorl %eax, %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: xorl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: andl %edx, %eax
-; X32-NEXT: xorl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %eax, 28(%edx)
-; X32-NEXT: movl %ecx, 24(%edx)
-; X32-NEXT: movl %edi, 20(%edx)
-; X32-NEXT: movl %ebp, 16(%edx)
-; X32-NEXT: movl %ebx, 12(%edx)
-; X32-NEXT: movl %esi, 8(%edx)
-; X32-NEXT: movl (%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 4(%edx)
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, (%edx)
-; X32-NEXT: movl %edx, %eax
-; X32-NEXT: addl $8, %esp
-; X32-NEXT: .cfi_def_cfa_offset 20
-; X32-NEXT: popl %esi
-; X32-NEXT: .cfi_def_cfa_offset 16
-; X32-NEXT: popl %edi
-; X32-NEXT: .cfi_def_cfa_offset 12
-; X32-NEXT: popl %ebx
-; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: popl %ebp
-; X32-NEXT: .cfi_def_cfa_offset 4
-; X32-NEXT: retl $4
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel %ecx, %eax
+; X32-NEXT: retl
;
-; X32-NOCMOV-LABEL: test_ctselect_v8f32:
+; X32-NOCMOV-LABEL: test_ctselect_nested:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: pushl %ebp
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
; X32-NOCMOV-NEXT: pushl %ebx
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
; X32-NOCMOV-NEXT: pushl %edi
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
; X32-NOCMOV-NEXT: pushl %esi
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 20
-; X32-NOCMOV-NEXT: subl $8, %esp
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 28
-; X32-NOCMOV-NEXT: .cfi_offset %esi, -20
-; X32-NOCMOV-NEXT: .cfi_offset %edi, -16
-; X32-NOCMOV-NEXT: .cfi_offset %ebx, -12
-; X32-NOCMOV-NEXT: .cfi_offset %ebp, -8
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: xorl %eax, %ecx
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: andl $1, %edx
-; X32-NOCMOV-NEXT: negl %edx
-; X32-NOCMOV-NEXT: andl %edx, %ecx
-; X32-NOCMOV-NEXT: xorl %eax, %ecx
-; X32-NOCMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: xorl %esi, %eax
-; X32-NOCMOV-NEXT: andl %edx, %eax
-; X32-NOCMOV-NEXT: xorl %esi, %eax
-; X32-NOCMOV-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NOCMOV-NEXT: xorl %ebx, %esi
-; X32-NOCMOV-NEXT: andl %edx, %esi
-; X32-NOCMOV-NEXT: xorl %ebx, %esi
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NOCMOV-NEXT: xorl %ebp, %ebx
-; X32-NOCMOV-NEXT: andl %edx, %ebx
-; X32-NOCMOV-NEXT: xorl %ebp, %ebx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X32-NOCMOV-NEXT: xorl %edi, %ebp
-; X32-NOCMOV-NEXT: andl %edx, %ebp
-; X32-NOCMOV-NEXT: xorl %edi, %ebp
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NOCMOV-NEXT: xorl %eax, %edi
-; X32-NOCMOV-NEXT: andl %edx, %edi
-; X32-NOCMOV-NEXT: xorl %eax, %edi
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -16
+; X32-NOCMOV-NEXT: .cfi_offset %edi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: xorl %eax, %ecx
-; X32-NOCMOV-NEXT: andl %edx, %ecx
-; X32-NOCMOV-NEXT: xorl %eax, %ecx
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: xorl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: andl %edx, %eax
-; X32-NOCMOV-NEXT: xorl {{[0-9]+}}(%esp), %eax
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: movl %eax, 28(%edx)
-; X32-NOCMOV-NEXT: movl %ecx, 24(%edx)
-; X32-NOCMOV-NEXT: movl %edi, 20(%edx)
-; X32-NOCMOV-NEXT: movl %ebp, 16(%edx)
-; X32-NOCMOV-NEXT: movl %ebx, 12(%edx)
-; X32-NOCMOV-NEXT: movl %esi, 8(%edx)
-; X32-NOCMOV-NEXT: movl (%esp), %eax # 4-byte Reload
-; X32-NOCMOV-NEXT: movl %eax, 4(%edx)
-; X32-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NOCMOV-NEXT: movl %eax, (%edx)
-; X32-NOCMOV-NEXT: movl %edx, %eax
-; X32-NOCMOV-NEXT: addl $8, %esp
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %edi
+; X32-NOCMOV-NEXT: negl %edi
+; X32-NOCMOV-NEXT: movl %edx, %esi
+; X32-NOCMOV-NEXT: andl %edi, %esi
+; X32-NOCMOV-NEXT: notl %edi
+; X32-NOCMOV-NEXT: andl %eax, %edi
+; X32-NOCMOV-NEXT: orl %edi, %esi
+; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: sete %dl
+; X32-NOCMOV-NEXT: movb %dl, %dh
+; X32-NOCMOV-NEXT: movzbl %dh, %edi
+; X32-NOCMOV-NEXT: negl %edi
+; X32-NOCMOV-NEXT: movl %ecx, %eax
+; X32-NOCMOV-NEXT: andl %edi, %eax
+; X32-NOCMOV-NEXT: notl %edi
+; X32-NOCMOV-NEXT: andl %esi, %edi
+; X32-NOCMOV-NEXT: orl %edi, %eax
; X32-NOCMOV-NEXT: popl %esi
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
-; X32-NOCMOV-NEXT: popl %edi
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
-; X32-NOCMOV-NEXT: popl %ebx
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
-; X32-NOCMOV-NEXT: popl %ebp
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
-; X32-NOCMOV-NEXT: retl $4
- %result = call <8 x float> @llvm.ct.select.v8f32(i1 %cond, <8 x float> %a, <8 x float> %b)
- ret <8 x float> %result
-}
-
-define float @test_ctselect_f32_nan_inf(i1 %cond) {
-; X64-LABEL: test_ctselect_f32_nan_inf:
-; X64: # %bb.0:
-; X64-NEXT: andl $1, %edi
-; X64-NEXT: negl %edi
-; X64-NEXT: andl $4194304, %edi # imm = 0x400000
-; X64-NEXT: orl $2139095040, %edi # imm = 0x7F800000
-; X64-NEXT: movd %edi, %xmm0
-; X64-NEXT: retq
-;
-; X32-LABEL: test_ctselect_f32_nan_inf:
-; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: andb $1, %al
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: negl %eax
-; X32-NEXT: andl $4194304, %eax # imm = 0x400000
-; X32-NEXT: orl $2139095040, %eax # imm = 0x7F800000
-; X32-NEXT: movl %eax, (%esp)
-; X32-NEXT: flds (%esp)
-; X32-NEXT: popl %eax
-; X32-NEXT: .cfi_def_cfa_offset 4
-; X32-NEXT: retl
-;
-; X32-NOCMOV-LABEL: test_ctselect_f32_nan_inf:
-; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: pushl %eax
+; X32-NOCMOV-NEXT: popl %edi
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: andb $1, %al
-; X32-NOCMOV-NEXT: movzbl %al, %eax
-; X32-NOCMOV-NEXT: negl %eax
-; X32-NOCMOV-NEXT: andl $4194304, %eax # imm = 0x400000
-; X32-NOCMOV-NEXT: orl $2139095040, %eax # imm = 0x7F800000
-; X32-NOCMOV-NEXT: movl %eax, (%esp)
-; X32-NOCMOV-NEXT: flds (%esp)
-; X32-NOCMOV-NEXT: popl %eax
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
-; X32-NOCMOV-NEXT: retl
- %result = call float @llvm.ct.select.f32(i1 %cond, float 0x7FF8000000000000, float 0x7FF0000000000000)
- ret float %result
-}
-
-define double @test_ctselect_f64_nan_inf(i1 %cond) {
-; X64-LABEL: test_ctselect_f64_nan_inf:
-; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-NEXT: andl $1, %edi
-; X64-NEXT: negq %rdi
-; X64-NEXT: movabsq $2251799813685248, %rax # imm = 0x8000000000000
-; X64-NEXT: andq %rdi, %rax
-; X64-NEXT: movabsq $9218868437227405312, %rcx # imm = 0x7FF0000000000000
-; X64-NEXT: orq %rax, %rcx
-; X64-NEXT: movq %rcx, %xmm0
-; X64-NEXT: retq
-;
-; X32-LABEL: test_ctselect_f64_nan_inf:
-; X32: # %bb.0:
-; X32-NEXT: subl $12, %esp
-; X32-NEXT: .cfi_def_cfa_offset 16
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: andl $1, %eax
-; X32-NEXT: negl %eax
-; X32-NEXT: andl $524288, %eax # imm = 0x80000
-; X32-NEXT: orl $2146435072, %eax # imm = 0x7FF00000
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X32-NEXT: movl $0, (%esp)
-; X32-NEXT: fldl (%esp)
-; X32-NEXT: addl $12, %esp
-; X32-NEXT: .cfi_def_cfa_offset 4
-; X32-NEXT: retl
-;
-; X32-NOCMOV-LABEL: test_ctselect_f64_nan_inf:
-; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: subl $12, %esp
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: andl $1, %eax
-; X32-NOCMOV-NEXT: negl %eax
-; X32-NOCMOV-NEXT: andl $524288, %eax # imm = 0x80000
-; X32-NOCMOV-NEXT: orl $2146435072, %eax # imm = 0x7FF00000
-; X32-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X32-NOCMOV-NEXT: movl $0, (%esp)
-; X32-NOCMOV-NEXT: fldl (%esp)
-; X32-NOCMOV-NEXT: addl $12, %esp
+; X32-NOCMOV-NEXT: popl %ebx
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
- %result = call double @llvm.ct.select.f64(i1 %cond, double 0x7FF8000000000000, double 0x7FF0000000000000)
- ret double %result
+ %inner = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b)
+ %result = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %inner, i32 %c)
+ ret i32 %result
}
; Declare the intrinsics
-declare i1 @llvm.ct.select.i1(i1, i1, i1)
declare i8 @llvm.ct.select.i8(i1, i8, i8)
declare i16 @llvm.ct.select.i16(i1, i16, i16)
declare i32 @llvm.ct.select.i32(i1, i32, i32)
@@ -1526,13 +947,3 @@ declare i64 @llvm.ct.select.i64(i1, i64, i64)
declare float @llvm.ct.select.f32(i1, float, float)
declare double @llvm.ct.select.f64(i1, double, double)
declare ptr @llvm.ct.select.p0(i1, ptr, ptr)
-
-; Vector intrinsics
-declare <4 x i32> @llvm.ct.select.v4i32(i1, <4 x i32>, <4 x i32>)
-declare <2 x i64> @llvm.ct.select.v2i64(i1, <2 x i64>, <2 x i64>)
-declare <8 x i16> @llvm.ct.select.v8i16(i1, <8 x i16>, <8 x i16>)
-declare <16 x i8> @llvm.ct.select.v16i8(i1, <16 x i8>, <16 x i8>)
-declare <4 x float> @llvm.ct.select.v4f32(i1, <4 x float>, <4 x float>)
-declare <2 x double> @llvm.ct.select.v2f64(i1, <2 x double>, <2 x double>)
-declare <8 x i32> @llvm.ct.select.v8i32(i1, <8 x i32>, <8 x i32>)
-declare <8 x float> @llvm.ct.select.v8f32(i1, <8 x float>, <8 x float>)
diff --git a/nasty-fix-constant.patch b/nasty-fix-constant.patch
new file mode 100644
index 0000000000000..07314e7f6985e
--- /dev/null
+++ b/nasty-fix-constant.patch
@@ -0,0 +1,2994 @@
+diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+index 81f992678626..fb25ab82a452 100644
+--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
++++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+@@ -4369,14 +4369,39 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
+ Node->getFlags()));
+ } else {
+ assert(VT.isInteger());
+- EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
+- auto [Tmp2Lo, Tmp2Hi] = DAG.SplitScalar(Tmp2, dl, HalfVT, HalfVT);
+- auto [Tmp3Lo, Tmp3Hi] = DAG.SplitScalar(Tmp3, dl, HalfVT, HalfVT);
+- SDValue ResLo =
+- DAG.getCTSelect(dl, HalfVT, Tmp1, Tmp2Lo, Tmp3Lo, Node->getFlags());
+- SDValue ResHi =
+- DAG.getCTSelect(dl, HalfVT, Tmp1, Tmp2Hi, Tmp3Hi, Node->getFlags());
+- Tmp1 = DAG.getNode(ISD::BUILD_PAIR, dl, VT, ResLo, ResHi);
++ // Expand scalar integer CT_SELECT to constant-time bitwise operations:
++ // Mask = 0 - (Cond & 1) // all-ones or all-zeros
++ // Result = F ^ ((T ^ F) & Mask)
++ //
++ // By expanding here (during legalization) rather than in
++ // SelectionDAGBuilder, the SETCC feeding the condition has already been
++ // legalized. This prevents visitSIGN_EXTEND in the post-legalization
++ // DAGCombiner from matching sext(setcc) -> select(setcc, -1, 0), which
++ // would convert the constant-time pattern back into a data-dependent
++ // conditional move.
++ //
++ // Note: We cannot use SIGN_EXTEND here because type legalization has
++ // already promoted the i1 condition to the target's SetCC type (e.g.
++ // i32 on MIPS). SIGN_EXTEND(i32, i32) would be a no-op, leaving the
++ // mask as 0/1 instead of 0/-1. Instead, we isolate the low bit and
++ // negate to create a proper all-bits mask. This handles all boolean
++ // content types (ZeroOrOne, ZeroOrNegativeOne, Undefined).
++ SDValue T = Tmp2;
++ SDValue F = Tmp3;
++ // Widen the condition to match VT if needed. Type legalization may
++ // promote the i1 condition to a narrower type than VT (e.g. i32
++ // SetCC result with i64 operands on MIPS64). ANY_EXTEND is safe
++ // because we immediately mask to the low bit.
++ SDValue Cond = Tmp1;
++ if (Cond.getValueType() != VT)
++ Cond = DAG.getNode(ISD::ANY_EXTEND, dl, VT, Cond);
++ SDValue One = DAG.getConstant(1, dl, VT);
++ SDValue Bit = DAG.getNode(ISD::AND, dl, VT, Cond, One);
++ SDValue Zero = DAG.getConstant(0, dl, VT);
++ SDValue Mask = DAG.getNode(ISD::SUB, dl, VT, Zero, Bit);
++ SDValue XorTF = DAG.getNode(ISD::XOR, dl, VT, T, F);
++ SDValue MaskedDiff = DAG.getNode(ISD::AND, dl, VT, XorTF, Mask);
++ Tmp1 = DAG.getNode(ISD::XOR, dl, VT, F, MaskedDiff);
+ Tmp1->setFlags(Node->getFlags());
+ }
+ Results.push_back(Tmp1);
+diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+index 156d82e96b2a..1c68822563ed 100644
+--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
++++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+@@ -6872,9 +6872,41 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
+ // assert if Cond type is Vector
+ assert(!CondVT.isVector() && "Vector type cond not supported yet");
+
+- // Handle scalar types
+- if (TLI.isOperationLegalOrCustom(ISD::CT_SELECT, VT) &&
+- !CondVT.isVector()) {
++ // Decide whether to create a CT_SELECT DAG node or use the inline
++ // fallback expansion. CT_SELECT nodes are protected by visitCT_SELECT
++ // in DAGCombiner from unsafe folds (e.g. sext(setcc) -> select) that
++ // break constant-time guarantees.
++ //
++ // We create CT_SELECT when:
++ // 1. Target has Legal/Custom support for this type.
++ // 2. Scalar integer types — type legalization splits wide types (e.g.
++ // i64 on 32-bit targets) before operation legalization expands to
++ // AND/OR/XOR.
++ // 3. Scalar float types where the integer equivalent is legal — the
++ // expansion bitcasts to integer for bitwise ops.
++ //
++ // We use the inline fallback when:
++ // - Vector types without target support (Expand) — the legalization
++ // expansion uses getSplatBuildVector + SIGN_EXTEND of vector i1
++ // which not all targets support.
++ // - Float types where the integer equivalent is illegal (e.g. f64 on
++ // i386 maps to i64, which is illegal). The expansion creates new
++ // nodes during operation legalization that can't be further
++ // type-legalized. The inline fallback runs before type legalization
++ // so the i64 ops get properly split.
++ bool CreateNode;
++ if (TLI.isOperationLegalOrCustom(ISD::CT_SELECT, VT)) {
++ CreateNode = true;
++ } else if (VT.isVector()) {
++ CreateNode = false;
++ } else if (VT.isFloatingPoint()) {
++ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
++ CreateNode = TLI.isTypeLegal(IntVT);
++ } else {
++ CreateNode = true; // Scalar integer — always safe
++ }
++
++ if (CreateNode) {
+ SDValue Result = DAG.getNode(ISD::CT_SELECT, DL, VT, Cond, A, B);
+ setValue(&I, Result);
+ return;
+diff --git a/llvm/test/CodeGen/Mips/ctselect-fallback-edge-cases.ll b/llvm/test/CodeGen/Mips/ctselect-fallback-edge-cases.ll
+index f1831a625d4a..401a742c27ea 100644
+--- a/llvm/test/CodeGen/Mips/ctselect-fallback-edge-cases.ll
++++ b/llvm/test/CodeGen/Mips/ctselect-fallback-edge-cases.ll
+@@ -8,22 +8,24 @@
+ define i1 @test_ctselect_i1(i1 %cond, i1 %a, i1 %b) {
+ ; M32-LABEL: test_ctselect_i1:
+ ; M32: # %bb.0:
+-; M32-NEXT: xori $2, $4, 1
+-; M32-NEXT: and $1, $4, $5
+-; M32-NEXT: and $2, $2, $6
++; M32-NEXT: andi $2, $4, 1
++; M32-NEXT: xor $1, $5, $6
++; M32-NEXT: negu $2, $2
++; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $1, $2
++; M32-NEXT: xor $2, $6, $1
+ ;
+ ; M64-LABEL: test_ctselect_i1:
+ ; M64: # %bb.0:
+-; M64-NEXT: sll $2, $4, 0
+-; M64-NEXT: sll $1, $6, 0
+-; M64-NEXT: xori $2, $2, 1
+-; M64-NEXT: and $1, $2, $1
+-; M64-NEXT: and $2, $4, $5
++; M64-NEXT: sll $1, $4, 0
++; M64-NEXT: xor $2, $5, $6
++; M64-NEXT: andi $1, $1, 1
+ ; M64-NEXT: sll $2, $2, 0
++; M64-NEXT: negu $1, $1
++; M64-NEXT: and $1, $2, $1
++; M64-NEXT: sll $2, $6, 0
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $2, $1
++; M64-NEXT: xor $2, $2, $1
+ %result = call i1 @llvm.ct.select.i1(i1 %cond, i1 %a, i1 %b)
+ ret i1 %result
+ }
+@@ -32,30 +34,18 @@ define i1 @test_ctselect_i1(i1 %cond, i1 %a, i1 %b) {
+ define i32 @test_ctselect_extremal_values(i1 %cond) {
+ ; M32-LABEL: test_ctselect_extremal_values:
+ ; M32: # %bb.0:
+-; M32-NEXT: lui $3, 32767
+ ; M32-NEXT: andi $1, $4, 1
+-; M32-NEXT: negu $2, $1
+-; M32-NEXT: ori $3, $3, 65535
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: and $2, $2, $3
+-; M32-NEXT: lui $3, 32768
+-; M32-NEXT: and $1, $1, $3
++; M32-NEXT: lui $2, 32768
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: subu $2, $2, $1
+ ;
+ ; M64-LABEL: test_ctselect_extremal_values:
+ ; M64: # %bb.0:
+ ; M64-NEXT: sll $1, $4, 0
+-; M64-NEXT: lui $3, 32767
++; M64-NEXT: lui $2, 32768
+ ; M64-NEXT: andi $1, $1, 1
+-; M64-NEXT: ori $3, $3, 65535
+-; M64-NEXT: negu $2, $1
+-; M64-NEXT: addiu $1, $1, -1
+-; M64-NEXT: and $2, $2, $3
+-; M64-NEXT: lui $3, 32768
+-; M64-NEXT: and $1, $1, $3
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $2, $1
++; M64-NEXT: subu $2, $2, $1
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 2147483647, i32 -2147483648)
+ ret i32 %result
+ }
+@@ -67,14 +57,14 @@ define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) {
+ ; M32-NEXT: andi $1, $4, 1
+ ; M32-NEXT: negu $1, $1
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: and $2, $1, $5
++; M32-NEXT: and $2, $5, $1
+ ;
+ ; M64-LABEL: test_ctselect_null_ptr:
+ ; M64: # %bb.0:
+ ; M64-NEXT: andi $1, $4, 1
+ ; M64-NEXT: dnegu $1, $1
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: and $2, $1, $5
++; M64-NEXT: and $2, $5, $1
+ %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %ptr, ptr null)
+ ret ptr %result
+ }
+@@ -83,23 +73,21 @@ define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) {
+ define ptr @test_ctselect_function_ptr(i1 %cond, ptr %func1, ptr %func2) {
+ ; M32-LABEL: test_ctselect_function_ptr:
+ ; M32: # %bb.0:
+-; M32-NEXT: andi $1, $4, 1
+-; M32-NEXT: negu $2, $1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: and $2, $2, $5
+-; M32-NEXT: and $1, $1, $6
++; M32-NEXT: andi $2, $4, 1
++; M32-NEXT: xor $1, $5, $6
++; M32-NEXT: negu $2, $2
++; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: xor $2, $6, $1
+ ;
+ ; M64-LABEL: test_ctselect_function_ptr:
+ ; M64: # %bb.0:
+-; M64-NEXT: andi $1, $4, 1
+-; M64-NEXT: dnegu $2, $1
+-; M64-NEXT: daddiu $1, $1, -1
+-; M64-NEXT: and $2, $2, $5
+-; M64-NEXT: and $1, $1, $6
++; M64-NEXT: andi $2, $4, 1
++; M64-NEXT: xor $1, $5, $6
++; M64-NEXT: dnegu $2, $2
++; M64-NEXT: and $1, $1, $2
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $2, $1
++; M64-NEXT: xor $2, $6, $1
+ %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %func1, ptr %func2)
+ ret ptr %result
+ }
+@@ -108,26 +96,25 @@ define ptr @test_ctselect_function_ptr(i1 %cond, ptr %func1, ptr %func2) {
+ define ptr @test_ctselect_ptr_cmp(ptr %p1, ptr %p2, ptr %a, ptr %b) {
+ ; M32-LABEL: test_ctselect_ptr_cmp:
+ ; M32: # %bb.0:
+-; M32-NEXT: xor $1, $4, $5
+-; M32-NEXT: sltu $1, $zero, $1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: and $2, $1, $6
+-; M32-NEXT: not $1, $1
+-; M32-NEXT: and $1, $1, $7
++; M32-NEXT: xor $2, $4, $5
++; M32-NEXT: xor $1, $6, $7
++; M32-NEXT: sltiu $2, $2, 1
++; M32-NEXT: negu $2, $2
++; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: xor $2, $7, $1
+ ;
+ ; M64-LABEL: test_ctselect_ptr_cmp:
+ ; M64: # %bb.0:
+-; M64-NEXT: xor $1, $4, $5
+-; M64-NEXT: daddiu $3, $zero, -1
+-; M64-NEXT: daddiu $2, $zero, -1
+-; M64-NEXT: movn $3, $zero, $1
+-; M64-NEXT: xor $2, $3, $2
+-; M64-NEXT: and $1, $3, $6
+-; M64-NEXT: and $2, $2, $7
++; M64-NEXT: xor $2, $4, $5
++; M64-NEXT: xor $1, $6, $7
++; M64-NEXT: sltiu $2, $2, 1
++; M64-NEXT: dsll $2, $2, 32
++; M64-NEXT: dsrl $2, $2, 32
++; M64-NEXT: dnegu $2, $2
++; M64-NEXT: and $1, $1, $2
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $1, $2
++; M64-NEXT: xor $2, $7, $1
+ %cmp = icmp eq ptr %p1, %p2
+ %result = call ptr @llvm.ct.select.p0(i1 %cmp, ptr %a, ptr %b)
+ ret ptr %result
+@@ -139,23 +126,21 @@ define ptr @test_ctselect_ptr_cmp(ptr %p1, ptr %p2, ptr %a, ptr %b) {
+ define ptr @test_ctselect_struct_ptr(i1 %cond, ptr %a, ptr %b) {
+ ; M32-LABEL: test_ctselect_struct_ptr:
+ ; M32: # %bb.0:
+-; M32-NEXT: andi $1, $4, 1
+-; M32-NEXT: negu $2, $1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: and $2, $2, $5
+-; M32-NEXT: and $1, $1, $6
++; M32-NEXT: andi $2, $4, 1
++; M32-NEXT: xor $1, $5, $6
++; M32-NEXT: negu $2, $2
++; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: xor $2, $6, $1
+ ;
+ ; M64-LABEL: test_ctselect_struct_ptr:
+ ; M64: # %bb.0:
+-; M64-NEXT: andi $1, $4, 1
+-; M64-NEXT: dnegu $2, $1
+-; M64-NEXT: daddiu $1, $1, -1
+-; M64-NEXT: and $2, $2, $5
+-; M64-NEXT: and $1, $1, $6
++; M64-NEXT: andi $2, $4, 1
++; M64-NEXT: xor $1, $5, $6
++; M64-NEXT: dnegu $2, $2
++; M64-NEXT: and $1, $1, $2
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $2, $1
++; M64-NEXT: xor $2, $6, $1
+ %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b)
+ ret ptr %result
+ }
+@@ -164,73 +149,65 @@ define ptr @test_ctselect_struct_ptr(i1 %cond, ptr %a, ptr %b) {
+ define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
+ ; M32-LABEL: test_ctselect_deeply_nested:
+ ; M32: # %bb.0:
+-; M32-NEXT: andi $1, $4, 1
+-; M32-NEXT: lw $3, 16($sp)
+-; M32-NEXT: lw $9, 32($sp)
+-; M32-NEXT: lw $8, 28($sp)
+-; M32-NEXT: negu $2, $1
+-; M32-NEXT: addiu $1, $1, -1
++; M32-NEXT: lw $1, 20($sp)
++; M32-NEXT: lw $2, 16($sp)
++; M32-NEXT: andi $3, $4, 1
++; M32-NEXT: andi $4, $6, 1
++; M32-NEXT: lw $6, 28($sp)
++; M32-NEXT: negu $3, $3
++; M32-NEXT: xor $2, $2, $1
+ ; M32-NEXT: and $2, $2, $3
+-; M32-NEXT: lw $3, 20($sp)
+-; M32-NEXT: and $1, $1, $3
+ ; M32-NEXT: andi $3, $5, 1
+-; M32-NEXT: or $1, $2, $1
+-; M32-NEXT: andi $2, $6, 1
+-; M32-NEXT: andi $6, $7, 1
+-; M32-NEXT: negu $4, $3
+-; M32-NEXT: addiu $3, $3, -1
+-; M32-NEXT: addiu $7, $6, -1
+-; M32-NEXT: and $1, $4, $1
+-; M32-NEXT: addiu $5, $2, -1
+-; M32-NEXT: negu $2, $2
+-; M32-NEXT: negu $6, $6
+-; M32-NEXT: and $4, $7, $9
+-; M32-NEXT: lw $7, 24($sp)
+-; M32-NEXT: and $5, $5, $8
+-; M32-NEXT: and $3, $3, $7
+-; M32-NEXT: or $1, $1, $3
+-; M32-NEXT: and $1, $2, $1
+-; M32-NEXT: or $1, $1, $5
+-; M32-NEXT: and $1, $6, $1
++; M32-NEXT: lw $5, 32($sp)
++; M32-NEXT: xor $1, $1, $2
++; M32-NEXT: lw $2, 24($sp)
++; M32-NEXT: negu $3, $3
++; M32-NEXT: xor $1, $1, $2
++; M32-NEXT: and $1, $1, $3
++; M32-NEXT: andi $3, $7, 1
++; M32-NEXT: xor $1, $2, $1
++; M32-NEXT: negu $2, $4
++; M32-NEXT: negu $3, $3
++; M32-NEXT: xor $1, $1, $6
++; M32-NEXT: and $1, $1, $2
++; M32-NEXT: xor $1, $6, $1
++; M32-NEXT: xor $1, $1, $5
++; M32-NEXT: and $1, $1, $3
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $1, $4
++; M32-NEXT: xor $2, $5, $1
+ ;
+ ; M64-LABEL: test_ctselect_deeply_nested:
+ ; M64: # %bb.0:
+ ; M64-NEXT: sll $1, $4, 0
+-; M64-NEXT: sll $3, $8, 0
+-; M64-NEXT: sll $4, $5, 0
+-; M64-NEXT: lw $8, 0($sp)
++; M64-NEXT: xor $2, $8, $9
++; M64-NEXT: sll $5, $5, 0
++; M64-NEXT: sll $3, $6, 0
++; M64-NEXT: sll $6, $11, 0
++; M64-NEXT: sll $4, $7, 0
++; M64-NEXT: lw $7, 0($sp)
+ ; M64-NEXT: andi $1, $1, 1
++; M64-NEXT: sll $2, $2, 0
++; M64-NEXT: andi $5, $5, 1
++; M64-NEXT: andi $3, $3, 1
+ ; M64-NEXT: andi $4, $4, 1
+-; M64-NEXT: negu $2, $1
+-; M64-NEXT: addiu $1, $1, -1
+-; M64-NEXT: negu $5, $4
+-; M64-NEXT: addiu $4, $4, -1
+-; M64-NEXT: and $2, $2, $3
+-; M64-NEXT: sll $3, $9, 0
+-; M64-NEXT: and $1, $1, $3
+-; M64-NEXT: sll $3, $11, 0
+-; M64-NEXT: or $1, $2, $1
+-; M64-NEXT: sll $2, $6, 0
+-; M64-NEXT: sll $6, $7, 0
+-; M64-NEXT: andi $2, $2, 1
+-; M64-NEXT: and $1, $5, $1
+-; M64-NEXT: andi $6, $6, 1
+-; M64-NEXT: addiu $5, $2, -1
+-; M64-NEXT: negu $2, $2
+-; M64-NEXT: addiu $7, $6, -1
+-; M64-NEXT: negu $6, $6
+-; M64-NEXT: and $3, $5, $3
+-; M64-NEXT: sll $5, $10, 0
+-; M64-NEXT: and $7, $7, $8
+-; M64-NEXT: and $4, $4, $5
+-; M64-NEXT: or $1, $1, $4
++; M64-NEXT: negu $1, $1
++; M64-NEXT: negu $5, $5
++; M64-NEXT: negu $4, $4
+ ; M64-NEXT: and $1, $2, $1
+-; M64-NEXT: or $1, $1, $3
+-; M64-NEXT: and $1, $6, $1
++; M64-NEXT: sll $2, $9, 0
++; M64-NEXT: xor $1, $2, $1
++; M64-NEXT: sll $2, $10, 0
++; M64-NEXT: xor $1, $1, $2
++; M64-NEXT: and $1, $1, $5
++; M64-NEXT: xor $1, $2, $1
++; M64-NEXT: negu $2, $3
++; M64-NEXT: xor $1, $1, $6
++; M64-NEXT: and $1, $1, $2
++; M64-NEXT: xor $1, $6, $1
++; M64-NEXT: xor $1, $1, $7
++; M64-NEXT: and $1, $1, $4
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $1, $7
++; M64-NEXT: xor $2, $7, $1
+ %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b)
+ %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c)
+ %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d)
+diff --git a/llvm/test/CodeGen/Mips/ctselect-fallback-patterns.ll b/llvm/test/CodeGen/Mips/ctselect-fallback-patterns.ll
+index 2e65e586ce5f..a1c5d524c693 100644
+--- a/llvm/test/CodeGen/Mips/ctselect-fallback-patterns.ll
++++ b/llvm/test/CodeGen/Mips/ctselect-fallback-patterns.ll
+@@ -6,16 +6,18 @@
+ define i32 @test_ctselect_smin_zero(i32 %x) {
+ ; M32-LABEL: test_ctselect_smin_zero:
+ ; M32: # %bb.0:
+-; M32-NEXT: sra $1, $4, 31
++; M32-NEXT: slti $1, $4, 0
++; M32-NEXT: negu $1, $1
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: and $2, $1, $4
++; M32-NEXT: and $2, $4, $1
+ ;
+ ; M64-LABEL: test_ctselect_smin_zero:
+ ; M64: # %bb.0:
+ ; M64-NEXT: sll $1, $4, 0
+-; M64-NEXT: sra $2, $1, 31
++; M64-NEXT: slti $2, $1, 0
++; M64-NEXT: negu $2, $2
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: and $2, $2, $1
++; M64-NEXT: and $2, $1, $2
+ %cmp = icmp slt i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0)
+ ret i32 %result
+@@ -25,17 +27,18 @@ define i32 @test_ctselect_smin_zero(i32 %x) {
+ define i32 @test_ctselect_smax_zero(i32 %x) {
+ ; M32-LABEL: test_ctselect_smax_zero:
+ ; M32: # %bb.0:
+-; M32-NEXT: slti $1, $4, 1
+-; M32-NEXT: movn $4, $zero, $1
++; M32-NEXT: slt $1, $zero, $4
++; M32-NEXT: negu $1, $1
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: move $2, $4
++; M32-NEXT: and $2, $4, $1
+ ;
+ ; M64-LABEL: test_ctselect_smax_zero:
+ ; M64: # %bb.0:
+-; M64-NEXT: sll $2, $4, 0
+-; M64-NEXT: slti $1, $2, 1
++; M64-NEXT: sll $1, $4, 0
++; M64-NEXT: slt $2, $zero, $1
++; M64-NEXT: negu $2, $2
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: movn $2, $zero, $1
++; M64-NEXT: and $2, $1, $2
+ %cmp = icmp sgt i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0)
+ ret i32 %result
+@@ -45,27 +48,23 @@ define i32 @test_ctselect_smax_zero(i32 %x) {
+ define i32 @test_ctselect_smin_generic(i32 %x, i32 %y) {
+ ; M32-LABEL: test_ctselect_smin_generic:
+ ; M32: # %bb.0:
+-; M32-NEXT: slt $1, $4, $5
+-; M32-NEXT: xori $1, $1, 1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: and $2, $1, $4
+-; M32-NEXT: not $1, $1
+-; M32-NEXT: and $1, $1, $5
++; M32-NEXT: slt $2, $4, $5
++; M32-NEXT: xor $1, $4, $5
++; M32-NEXT: negu $2, $2
++; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: xor $2, $5, $1
+ ;
+ ; M64-LABEL: test_ctselect_smin_generic:
+ ; M64: # %bb.0:
+ ; M64-NEXT: sll $1, $5, 0
+ ; M64-NEXT: sll $2, $4, 0
+-; M64-NEXT: slt $3, $2, $1
+-; M64-NEXT: xori $3, $3, 1
+-; M64-NEXT: addiu $3, $3, -1
++; M64-NEXT: xor $3, $2, $1
++; M64-NEXT: slt $2, $2, $1
++; M64-NEXT: negu $2, $2
+ ; M64-NEXT: and $2, $3, $2
+-; M64-NEXT: not $3, $3
+-; M64-NEXT: and $1, $3, $1
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $2, $1
++; M64-NEXT: xor $2, $1, $2
+ %cmp = icmp slt i32 %x, %y
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y)
+ ret i32 %result
+@@ -75,27 +74,23 @@ define i32 @test_ctselect_smin_generic(i32 %x, i32 %y) {
+ define i32 @test_ctselect_smax_generic(i32 %x, i32 %y) {
+ ; M32-LABEL: test_ctselect_smax_generic:
+ ; M32: # %bb.0:
+-; M32-NEXT: slt $1, $5, $4
+-; M32-NEXT: xori $1, $1, 1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: and $2, $1, $4
+-; M32-NEXT: not $1, $1
+-; M32-NEXT: and $1, $1, $5
++; M32-NEXT: slt $2, $5, $4
++; M32-NEXT: xor $1, $4, $5
++; M32-NEXT: negu $2, $2
++; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: xor $2, $5, $1
+ ;
+ ; M64-LABEL: test_ctselect_smax_generic:
+ ; M64: # %bb.0:
+-; M64-NEXT: sll $1, $4, 0
+-; M64-NEXT: sll $2, $5, 0
+-; M64-NEXT: slt $3, $2, $1
+-; M64-NEXT: xori $3, $3, 1
+-; M64-NEXT: addiu $3, $3, -1
+-; M64-NEXT: and $1, $3, $1
+-; M64-NEXT: not $3, $3
++; M64-NEXT: sll $1, $5, 0
++; M64-NEXT: sll $2, $4, 0
++; M64-NEXT: xor $3, $2, $1
++; M64-NEXT: slt $2, $1, $2
++; M64-NEXT: negu $2, $2
+ ; M64-NEXT: and $2, $3, $2
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $1, $2
++; M64-NEXT: xor $2, $1, $2
+ %cmp = icmp sgt i32 %x, %y
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y)
+ ret i32 %result
+@@ -105,27 +100,23 @@ define i32 @test_ctselect_smax_generic(i32 %x, i32 %y) {
+ define i32 @test_ctselect_umin_generic(i32 %x, i32 %y) {
+ ; M32-LABEL: test_ctselect_umin_generic:
+ ; M32: # %bb.0:
+-; M32-NEXT: sltu $1, $4, $5
+-; M32-NEXT: xori $1, $1, 1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: and $2, $1, $4
+-; M32-NEXT: not $1, $1
+-; M32-NEXT: and $1, $1, $5
++; M32-NEXT: sltu $2, $4, $5
++; M32-NEXT: xor $1, $4, $5
++; M32-NEXT: negu $2, $2
++; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: xor $2, $5, $1
+ ;
+ ; M64-LABEL: test_ctselect_umin_generic:
+ ; M64: # %bb.0:
+ ; M64-NEXT: sll $1, $5, 0
+ ; M64-NEXT: sll $2, $4, 0
+-; M64-NEXT: sltu $3, $2, $1
+-; M64-NEXT: xori $3, $3, 1
+-; M64-NEXT: addiu $3, $3, -1
++; M64-NEXT: xor $3, $2, $1
++; M64-NEXT: sltu $2, $2, $1
++; M64-NEXT: negu $2, $2
+ ; M64-NEXT: and $2, $3, $2
+-; M64-NEXT: not $3, $3
+-; M64-NEXT: and $1, $3, $1
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $2, $1
++; M64-NEXT: xor $2, $1, $2
+ %cmp = icmp ult i32 %x, %y
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y)
+ ret i32 %result
+@@ -135,27 +126,23 @@ define i32 @test_ctselect_umin_generic(i32 %x, i32 %y) {
+ define i32 @test_ctselect_umax_generic(i32 %x, i32 %y) {
+ ; M32-LABEL: test_ctselect_umax_generic:
+ ; M32: # %bb.0:
+-; M32-NEXT: sltu $1, $5, $4
+-; M32-NEXT: xori $1, $1, 1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: and $2, $1, $4
+-; M32-NEXT: not $1, $1
+-; M32-NEXT: and $1, $1, $5
++; M32-NEXT: sltu $2, $5, $4
++; M32-NEXT: xor $1, $4, $5
++; M32-NEXT: negu $2, $2
++; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: xor $2, $5, $1
+ ;
+ ; M64-LABEL: test_ctselect_umax_generic:
+ ; M64: # %bb.0:
+-; M64-NEXT: sll $1, $4, 0
+-; M64-NEXT: sll $2, $5, 0
+-; M64-NEXT: sltu $3, $2, $1
+-; M64-NEXT: xori $3, $3, 1
+-; M64-NEXT: addiu $3, $3, -1
+-; M64-NEXT: and $1, $3, $1
+-; M64-NEXT: not $3, $3
++; M64-NEXT: sll $1, $5, 0
++; M64-NEXT: sll $2, $4, 0
++; M64-NEXT: xor $3, $2, $1
++; M64-NEXT: sltu $2, $1, $2
++; M64-NEXT: negu $2, $2
+ ; M64-NEXT: and $2, $3, $2
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $1, $2
++; M64-NEXT: xor $2, $1, $2
+ %cmp = icmp ugt i32 %x, %y
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y)
+ ret i32 %result
+@@ -165,24 +152,24 @@ define i32 @test_ctselect_umax_generic(i32 %x, i32 %y) {
+ define i32 @test_ctselect_abs(i32 %x) {
+ ; M32-LABEL: test_ctselect_abs:
+ ; M32: # %bb.0:
+-; M32-NEXT: negu $1, $4
+-; M32-NEXT: sra $2, $4, 31
++; M32-NEXT: slti $1, $4, 0
++; M32-NEXT: negu $2, $4
++; M32-NEXT: negu $1, $1
++; M32-NEXT: xor $2, $2, $4
+ ; M32-NEXT: and $1, $2, $1
+-; M32-NEXT: not $2, $2
+-; M32-NEXT: and $2, $2, $4
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $1, $2
++; M32-NEXT: xor $2, $4, $1
+ ;
+ ; M64-LABEL: test_ctselect_abs:
+ ; M64: # %bb.0:
+ ; M64-NEXT: sll $1, $4, 0
+-; M64-NEXT: negu $2, $1
+-; M64-NEXT: sra $3, $1, 31
++; M64-NEXT: slti $2, $1, 0
++; M64-NEXT: negu $3, $1
++; M64-NEXT: negu $2, $2
++; M64-NEXT: xor $3, $3, $1
+ ; M64-NEXT: and $2, $3, $2
+-; M64-NEXT: not $3, $3
+-; M64-NEXT: and $1, $3, $1
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $2, $1
++; M64-NEXT: xor $2, $1, $2
+ %neg = sub i32 0, %x
+ %cmp = icmp slt i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %neg, i32 %x)
+@@ -193,24 +180,24 @@ define i32 @test_ctselect_abs(i32 %x) {
+ define i32 @test_ctselect_nabs(i32 %x) {
+ ; M32-LABEL: test_ctselect_nabs:
+ ; M32: # %bb.0:
+-; M32-NEXT: sra $1, $4, 31
+-; M32-NEXT: negu $3, $4
+-; M32-NEXT: and $2, $1, $4
+-; M32-NEXT: not $1, $1
+-; M32-NEXT: and $1, $1, $3
++; M32-NEXT: slti $1, $4, 0
++; M32-NEXT: negu $2, $4
++; M32-NEXT: negu $1, $1
++; M32-NEXT: xor $3, $4, $2
++; M32-NEXT: and $1, $3, $1
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: xor $2, $2, $1
+ ;
+ ; M64-LABEL: test_ctselect_nabs:
+ ; M64: # %bb.0:
+ ; M64-NEXT: sll $1, $4, 0
+-; M64-NEXT: sra $2, $1, 31
+-; M64-NEXT: and $3, $2, $1
+-; M64-NEXT: negu $1, $1
+-; M64-NEXT: not $2, $2
+-; M64-NEXT: and $1, $2, $1
++; M64-NEXT: slti $2, $1, 0
++; M64-NEXT: negu $3, $1
++; M64-NEXT: negu $2, $2
++; M64-NEXT: xor $1, $1, $3
++; M64-NEXT: and $1, $1, $2
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $3, $1
++; M64-NEXT: xor $2, $3, $1
+ %neg = sub i32 0, %x
+ %cmp = icmp slt i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %neg)
+@@ -221,14 +208,16 @@ define i32 @test_ctselect_nabs(i32 %x) {
+ define i32 @test_ctselect_sign_extend(i32 %x) {
+ ; M32-LABEL: test_ctselect_sign_extend:
+ ; M32: # %bb.0:
++; M32-NEXT: slti $1, $4, 0
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: sra $2, $4, 31
++; M32-NEXT: negu $2, $1
+ ;
+ ; M64-LABEL: test_ctselect_sign_extend:
+ ; M64: # %bb.0:
+ ; M64-NEXT: sll $1, $4, 0
++; M64-NEXT: slti $1, $1, 0
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: sra $2, $1, 31
++; M64-NEXT: negu $2, $1
+ %cmp = icmp slt i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 -1, i32 0)
+ ret i32 %result
+@@ -270,13 +259,12 @@ define i32 @test_ctselect_constant_folding_false(i32 %a, i32 %b) {
+ ; M32-LABEL: test_ctselect_constant_folding_false:
+ ; M32: # %bb.0:
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $zero, $5
++; M32-NEXT: move $2, $5
+ ;
+ ; M64-LABEL: test_ctselect_constant_folding_false:
+ ; M64: # %bb.0:
+-; M64-NEXT: sll $1, $5, 0
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $zero, $1
++; M64-NEXT: sll $2, $5, 0
+ %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b)
+ ret i32 %result
+ }
+@@ -285,25 +273,13 @@ define i32 @test_ctselect_constant_folding_false(i32 %a, i32 %b) {
+ define i32 @test_ctselect_identical_operands(i1 %cond, i32 %x) {
+ ; M32-LABEL: test_ctselect_identical_operands:
+ ; M32: # %bb.0:
+-; M32-NEXT: andi $1, $4, 1
+-; M32-NEXT: negu $2, $1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: and $2, $2, $5
+-; M32-NEXT: and $1, $1, $5
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: move $2, $5
+ ;
+ ; M64-LABEL: test_ctselect_identical_operands:
+ ; M64: # %bb.0:
+-; M64-NEXT: sll $1, $4, 0
+-; M64-NEXT: sll $3, $5, 0
+-; M64-NEXT: andi $1, $1, 1
+-; M64-NEXT: negu $2, $1
+-; M64-NEXT: addiu $1, $1, -1
+-; M64-NEXT: and $2, $2, $3
+-; M64-NEXT: and $1, $1, $3
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $2, $1
++; M64-NEXT: sll $2, $5, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %x)
+ ret i32 %result
+ }
+@@ -312,29 +288,27 @@ define i32 @test_ctselect_identical_operands(i1 %cond, i32 %x) {
+ define i32 @test_ctselect_inverted_condition(i32 %x, i32 %y, i32 %a, i32 %b) {
+ ; M32-LABEL: test_ctselect_inverted_condition:
+ ; M32: # %bb.0:
+-; M32-NEXT: xor $1, $4, $5
+-; M32-NEXT: sltiu $1, $1, 1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: and $2, $1, $6
+-; M32-NEXT: not $1, $1
+-; M32-NEXT: and $1, $1, $7
++; M32-NEXT: xor $2, $4, $5
++; M32-NEXT: xor $1, $7, $6
++; M32-NEXT: sltiu $2, $2, 1
++; M32-NEXT: negu $2, $2
++; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: xor $2, $6, $1
+ ;
+ ; M64-LABEL: test_ctselect_inverted_condition:
+ ; M64: # %bb.0:
+ ; M64-NEXT: sll $1, $5, 0
+ ; M64-NEXT: sll $2, $4, 0
+-; M64-NEXT: sll $3, $7, 0
+ ; M64-NEXT: xor $1, $2, $1
+-; M64-NEXT: sll $2, $6, 0
++; M64-NEXT: xor $2, $7, $6
+ ; M64-NEXT: sltiu $1, $1, 1
+-; M64-NEXT: addiu $1, $1, -1
+-; M64-NEXT: and $2, $1, $2
+-; M64-NEXT: not $1, $1
+-; M64-NEXT: and $1, $1, $3
++; M64-NEXT: sll $2, $2, 0
++; M64-NEXT: negu $1, $1
++; M64-NEXT: and $1, $2, $1
++; M64-NEXT: sll $2, $6, 0
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $2, $1
++; M64-NEXT: xor $2, $2, $1
+ %cmp = icmp eq i32 %x, %y
+ %not_cmp = xor i1 %cmp, true
+ %result = call i32 @llvm.ct.select.i32(i1 %not_cmp, i32 %a, i32 %b)
+@@ -345,57 +319,51 @@ define i32 @test_ctselect_inverted_condition(i32 %x, i32 %y, i32 %a, i32 %b) {
+ define i32 @test_ctselect_chain(i1 %c1, i1 %c2, i1 %c3, i32 %a, i32 %b, i32 %c, i32 %d) {
+ ; M32-LABEL: test_ctselect_chain:
+ ; M32: # %bb.0:
+-; M32-NEXT: andi $1, $4, 1
++; M32-NEXT: lw $1, 16($sp)
++; M32-NEXT: andi $3, $4, 1
++; M32-NEXT: negu $3, $3
++; M32-NEXT: xor $2, $7, $1
++; M32-NEXT: and $2, $2, $3
+ ; M32-NEXT: andi $3, $5, 1
+-; M32-NEXT: lw $5, 16($sp)
+-; M32-NEXT: negu $2, $1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: negu $4, $3
+-; M32-NEXT: addiu $3, $3, -1
+-; M32-NEXT: and $1, $1, $5
+-; M32-NEXT: and $2, $2, $7
+-; M32-NEXT: lw $5, 24($sp)
+-; M32-NEXT: or $1, $2, $1
++; M32-NEXT: xor $1, $1, $2
++; M32-NEXT: lw $2, 20($sp)
++; M32-NEXT: negu $3, $3
++; M32-NEXT: xor $1, $1, $2
++; M32-NEXT: and $1, $1, $3
++; M32-NEXT: lw $3, 24($sp)
++; M32-NEXT: xor $1, $2, $1
+ ; M32-NEXT: andi $2, $6, 1
+-; M32-NEXT: and $1, $4, $1
+-; M32-NEXT: addiu $4, $2, -1
++; M32-NEXT: xor $1, $1, $3
+ ; M32-NEXT: negu $2, $2
+-; M32-NEXT: and $4, $4, $5
+-; M32-NEXT: lw $5, 20($sp)
+-; M32-NEXT: and $3, $3, $5
+-; M32-NEXT: or $1, $1, $3
+-; M32-NEXT: and $1, $2, $1
++; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $1, $4
++; M32-NEXT: xor $2, $3, $1
+ ;
+ ; M64-LABEL: test_ctselect_chain:
+ ; M64: # %bb.0:
+ ; M64-NEXT: sll $1, $4, 0
+-; M64-NEXT: sll $3, $7, 0
+-; M64-NEXT: sll $4, $5, 0
++; M64-NEXT: xor $2, $7, $8
++; M64-NEXT: sll $3, $5, 0
+ ; M64-NEXT: andi $1, $1, 1
+-; M64-NEXT: andi $4, $4, 1
+-; M64-NEXT: negu $2, $1
+-; M64-NEXT: addiu $1, $1, -1
+-; M64-NEXT: negu $5, $4
+-; M64-NEXT: addiu $4, $4, -1
+-; M64-NEXT: and $2, $2, $3
+-; M64-NEXT: sll $3, $8, 0
+-; M64-NEXT: and $1, $1, $3
+-; M64-NEXT: sll $3, $6, 0
+-; M64-NEXT: sll $6, $10, 0
+-; M64-NEXT: or $1, $2, $1
++; M64-NEXT: sll $2, $2, 0
+ ; M64-NEXT: andi $3, $3, 1
+-; M64-NEXT: and $1, $5, $1
+-; M64-NEXT: sll $5, $9, 0
+-; M64-NEXT: addiu $2, $3, -1
++; M64-NEXT: negu $1, $1
+ ; M64-NEXT: negu $3, $3
+-; M64-NEXT: and $4, $4, $5
+-; M64-NEXT: and $2, $2, $6
+-; M64-NEXT: or $1, $1, $4
+-; M64-NEXT: and $1, $3, $1
++; M64-NEXT: and $1, $2, $1
++; M64-NEXT: sll $2, $8, 0
++; M64-NEXT: xor $1, $2, $1
++; M64-NEXT: sll $2, $9, 0
++; M64-NEXT: xor $1, $1, $2
++; M64-NEXT: and $1, $1, $3
++; M64-NEXT: sll $3, $6, 0
++; M64-NEXT: xor $1, $2, $1
++; M64-NEXT: andi $2, $3, 1
++; M64-NEXT: sll $3, $10, 0
++; M64-NEXT: xor $1, $1, $3
++; M64-NEXT: negu $2, $2
++; M64-NEXT: and $1, $1, $2
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $1, $2
++; M64-NEXT: xor $2, $3, $1
+ %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b)
+ %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c)
+ %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d)
+@@ -406,16 +374,17 @@ define i32 @test_ctselect_chain(i1 %c1, i1 %c2, i1 %c3, i32 %a, i32 %b, i32 %c,
+ define i64 @test_ctselect_i64_smin_zero(i64 %x) {
+ ; M32-LABEL: test_ctselect_i64_smin_zero:
+ ; M32: # %bb.0:
+-; M32-NEXT: sra $1, $5, 31
+-; M32-NEXT: and $2, $1, $4
++; M32-NEXT: slti $1, $5, 0
++; M32-NEXT: negu $1, $1
++; M32-NEXT: and $2, $4, $1
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: and $3, $1, $5
++; M32-NEXT: and $3, $5, $1
+ ;
+ ; M64-LABEL: test_ctselect_i64_smin_zero:
+ ; M64: # %bb.0:
+ ; M64-NEXT: dsra $1, $4, 63
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: and $2, $1, $4
++; M64-NEXT: and $2, $4, $1
+ %cmp = icmp slt i64 %x, 0
+ %result = call i64 @llvm.ct.select.i64(i1 %cmp, i64 %x, i64 0)
+ ret i64 %result
+diff --git a/llvm/test/CodeGen/Mips/ctselect-fallback-vector.ll b/llvm/test/CodeGen/Mips/ctselect-fallback-vector.ll
+index 6222f6052e12..302e06b0a733 100644
+--- a/llvm/test/CodeGen/Mips/ctselect-fallback-vector.ll
++++ b/llvm/test/CodeGen/Mips/ctselect-fallback-vector.ll
+@@ -6,21 +6,19 @@
+ define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) {
+ ; MIPS64-MSA-LABEL: test_ctselect_v4i32:
+ ; MIPS64-MSA: # %bb.0:
+-; MIPS64-MSA-NEXT: insert.d $w2[0], $7
++; MIPS64-MSA-NEXT: insert.d $w0[0], $7
++; MIPS64-MSA-NEXT: insert.d $w1[0], $5
+ ; MIPS64-MSA-NEXT: sll $1, $4, 0
+-; MIPS64-MSA-NEXT: ldi.b $w0, -1
+-; MIPS64-MSA-NEXT: fill.w $w1, $1
+-; MIPS64-MSA-NEXT: insert.d $w2[1], $8
+-; MIPS64-MSA-NEXT: slli.w $w1, $w1, 31
+-; MIPS64-MSA-NEXT: srai.w $w1, $w1, 31
+-; MIPS64-MSA-NEXT: shf.w $w2, $w2, 177
+-; MIPS64-MSA-NEXT: xor.v $w0, $w1, $w0
+-; MIPS64-MSA-NEXT: and.v $w0, $w0, $w2
+-; MIPS64-MSA-NEXT: insert.d $w2[0], $5
+-; MIPS64-MSA-NEXT: insert.d $w2[1], $6
+-; MIPS64-MSA-NEXT: shf.w $w2, $w2, 177
++; MIPS64-MSA-NEXT: fill.w $w2, $1
++; MIPS64-MSA-NEXT: insert.d $w0[1], $8
++; MIPS64-MSA-NEXT: insert.d $w1[1], $6
++; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177
++; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177
+ ; MIPS64-MSA-NEXT: and.v $w1, $w1, $w2
+-; MIPS64-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS64-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177
+ ; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0]
+ ; MIPS64-MSA-NEXT: jr $ra
+@@ -30,26 +28,24 @@ define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) {
+ ; MIPS32-MSA: # %bb.0:
+ ; MIPS32-MSA-NEXT: lw $2, 24($sp)
+ ; MIPS32-MSA-NEXT: lw $1, 28($sp)
++; MIPS32-MSA-NEXT: insert.w $w1[0], $6
+ ; MIPS32-MSA-NEXT: fill.w $w2, $4
+-; MIPS32-MSA-NEXT: ldi.b $w1, -1
+ ; MIPS32-MSA-NEXT: insert.w $w0[0], $2
++; MIPS32-MSA-NEXT: insert.w $w1[1], $7
+ ; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31
+ ; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31
+ ; MIPS32-MSA-NEXT: insert.w $w0[1], $1
+ ; MIPS32-MSA-NEXT: lw $1, 32($sp)
+-; MIPS32-MSA-NEXT: xor.v $w1, $w2, $w1
+ ; MIPS32-MSA-NEXT: insert.w $w0[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 36($sp)
+ ; MIPS32-MSA-NEXT: insert.w $w0[3], $1
+ ; MIPS32-MSA-NEXT: lw $1, 16($sp)
+-; MIPS32-MSA-NEXT: and.v $w0, $w1, $w0
+-; MIPS32-MSA-NEXT: insert.w $w1[0], $6
+-; MIPS32-MSA-NEXT: insert.w $w1[1], $7
+ ; MIPS32-MSA-NEXT: insert.w $w1[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 20($sp)
+ ; MIPS32-MSA-NEXT: insert.w $w1[3], $1
+-; MIPS32-MSA-NEXT: and.v $w1, $w2, $w1
+-; MIPS32-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS32-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS32-MSA-NEXT: and.v $w1, $w1, $w2
++; MIPS32-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS32-MSA-NEXT: copy_s.w $2, $w0[0]
+ ; MIPS32-MSA-NEXT: copy_s.w $3, $w0[1]
+ ; MIPS32-MSA-NEXT: copy_s.w $4, $w0[2]
+@@ -63,21 +59,19 @@ define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) {
+ define <8 x i16> @test_ctselect_v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b) {
+ ; MIPS64-MSA-LABEL: test_ctselect_v8i16:
+ ; MIPS64-MSA: # %bb.0:
+-; MIPS64-MSA-NEXT: insert.d $w2[0], $7
++; MIPS64-MSA-NEXT: insert.d $w0[0], $7
++; MIPS64-MSA-NEXT: insert.d $w1[0], $5
+ ; MIPS64-MSA-NEXT: sll $1, $4, 0
+-; MIPS64-MSA-NEXT: ldi.b $w0, -1
+-; MIPS64-MSA-NEXT: fill.h $w1, $1
+-; MIPS64-MSA-NEXT: insert.d $w2[1], $8
+-; MIPS64-MSA-NEXT: slli.h $w1, $w1, 15
+-; MIPS64-MSA-NEXT: srai.h $w1, $w1, 15
+-; MIPS64-MSA-NEXT: shf.h $w2, $w2, 27
+-; MIPS64-MSA-NEXT: xor.v $w0, $w1, $w0
+-; MIPS64-MSA-NEXT: and.v $w0, $w0, $w2
+-; MIPS64-MSA-NEXT: insert.d $w2[0], $5
+-; MIPS64-MSA-NEXT: insert.d $w2[1], $6
+-; MIPS64-MSA-NEXT: shf.h $w2, $w2, 27
++; MIPS64-MSA-NEXT: fill.h $w2, $1
++; MIPS64-MSA-NEXT: insert.d $w0[1], $8
++; MIPS64-MSA-NEXT: insert.d $w1[1], $6
++; MIPS64-MSA-NEXT: slli.h $w2, $w2, 15
++; MIPS64-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS64-MSA-NEXT: srai.h $w2, $w2, 15
++; MIPS64-MSA-NEXT: shf.h $w0, $w0, 27
++; MIPS64-MSA-NEXT: shf.h $w1, $w1, 27
+ ; MIPS64-MSA-NEXT: and.v $w1, $w1, $w2
+-; MIPS64-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS64-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS64-MSA-NEXT: shf.h $w0, $w0, 27
+ ; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0]
+ ; MIPS64-MSA-NEXT: jr $ra
+@@ -87,28 +81,26 @@ define <8 x i16> @test_ctselect_v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b) {
+ ; MIPS32-MSA: # %bb.0:
+ ; MIPS32-MSA-NEXT: lw $2, 24($sp)
+ ; MIPS32-MSA-NEXT: lw $1, 28($sp)
+-; MIPS32-MSA-NEXT: fill.h $w1, $4
+-; MIPS32-MSA-NEXT: ldi.b $w0, -1
+-; MIPS32-MSA-NEXT: insert.w $w2[0], $2
+-; MIPS32-MSA-NEXT: slli.h $w1, $w1, 15
+-; MIPS32-MSA-NEXT: srai.h $w1, $w1, 15
+-; MIPS32-MSA-NEXT: insert.w $w2[1], $1
++; MIPS32-MSA-NEXT: insert.w $w1[0], $6
++; MIPS32-MSA-NEXT: fill.h $w2, $4
++; MIPS32-MSA-NEXT: insert.w $w0[0], $2
++; MIPS32-MSA-NEXT: insert.w $w1[1], $7
++; MIPS32-MSA-NEXT: slli.h $w2, $w2, 15
++; MIPS32-MSA-NEXT: srai.h $w2, $w2, 15
++; MIPS32-MSA-NEXT: insert.w $w0[1], $1
+ ; MIPS32-MSA-NEXT: lw $1, 32($sp)
+-; MIPS32-MSA-NEXT: xor.v $w0, $w1, $w0
+-; MIPS32-MSA-NEXT: insert.w $w2[2], $1
++; MIPS32-MSA-NEXT: insert.w $w0[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 36($sp)
+-; MIPS32-MSA-NEXT: insert.w $w2[3], $1
++; MIPS32-MSA-NEXT: insert.w $w0[3], $1
+ ; MIPS32-MSA-NEXT: lw $1, 16($sp)
+-; MIPS32-MSA-NEXT: shf.h $w2, $w2, 177
+-; MIPS32-MSA-NEXT: and.v $w0, $w0, $w2
+-; MIPS32-MSA-NEXT: insert.w $w2[0], $6
+-; MIPS32-MSA-NEXT: insert.w $w2[1], $7
+-; MIPS32-MSA-NEXT: insert.w $w2[2], $1
++; MIPS32-MSA-NEXT: insert.w $w1[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 20($sp)
+-; MIPS32-MSA-NEXT: insert.w $w2[3], $1
+-; MIPS32-MSA-NEXT: shf.h $w2, $w2, 177
++; MIPS32-MSA-NEXT: insert.w $w1[3], $1
++; MIPS32-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS32-MSA-NEXT: shf.h $w0, $w0, 177
++; MIPS32-MSA-NEXT: shf.h $w1, $w1, 177
+ ; MIPS32-MSA-NEXT: and.v $w1, $w1, $w2
+-; MIPS32-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS32-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS32-MSA-NEXT: shf.h $w0, $w0, 177
+ ; MIPS32-MSA-NEXT: copy_s.w $2, $w0[0]
+ ; MIPS32-MSA-NEXT: copy_s.w $3, $w0[1]
+@@ -123,22 +115,21 @@ define <8 x i16> @test_ctselect_v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b) {
+ define <16 x i8> @test_ctselect_v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) {
+ ; MIPS64-MSA-LABEL: test_ctselect_v16i8:
+ ; MIPS64-MSA: # %bb.0:
+-; MIPS64-MSA-NEXT: insert.d $w0[0], $5
+-; MIPS64-MSA-NEXT: insert.d $w1[0], $7
++; MIPS64-MSA-NEXT: insert.d $w0[0], $7
++; MIPS64-MSA-NEXT: insert.d $w1[0], $5
+ ; MIPS64-MSA-NEXT: sll $1, $4, 0
+ ; MIPS64-MSA-NEXT: fill.b $w2, $1
+-; MIPS64-MSA-NEXT: insert.d $w0[1], $6
+-; MIPS64-MSA-NEXT: insert.d $w1[1], $8
++; MIPS64-MSA-NEXT: insert.d $w0[1], $8
++; MIPS64-MSA-NEXT: insert.d $w1[1], $6
+ ; MIPS64-MSA-NEXT: slli.b $w2, $w2, 7
++; MIPS64-MSA-NEXT: xor.v $w1, $w1, $w0
+ ; MIPS64-MSA-NEXT: shf.b $w0, $w0, 27
+-; MIPS64-MSA-NEXT: shf.b $w1, $w1, 27
+ ; MIPS64-MSA-NEXT: srai.b $w2, $w2, 7
++; MIPS64-MSA-NEXT: shf.b $w1, $w1, 27
+ ; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177
+ ; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177
+-; MIPS64-MSA-NEXT: and.v $w0, $w2, $w0
+-; MIPS64-MSA-NEXT: xori.b $w2, $w2, 255
+-; MIPS64-MSA-NEXT: and.v $w1, $w2, $w1
+-; MIPS64-MSA-NEXT: or.v $w0, $w0, $w1
++; MIPS64-MSA-NEXT: and.v $w1, $w1, $w2
++; MIPS64-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS64-MSA-NEXT: shf.b $w0, $w0, 27
+ ; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177
+ ; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0]
+@@ -147,29 +138,28 @@ define <16 x i8> @test_ctselect_v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) {
+ ;
+ ; MIPS32-MSA-LABEL: test_ctselect_v16i8:
+ ; MIPS32-MSA: # %bb.0:
+-; MIPS32-MSA-NEXT: insert.w $w0[0], $6
+-; MIPS32-MSA-NEXT: lw $1, 16($sp)
+ ; MIPS32-MSA-NEXT: lw $2, 24($sp)
++; MIPS32-MSA-NEXT: lw $1, 28($sp)
++; MIPS32-MSA-NEXT: insert.w $w1[0], $6
+ ; MIPS32-MSA-NEXT: fill.b $w2, $4
+-; MIPS32-MSA-NEXT: insert.w $w0[1], $7
+-; MIPS32-MSA-NEXT: insert.w $w1[0], $2
++; MIPS32-MSA-NEXT: insert.w $w0[0], $2
++; MIPS32-MSA-NEXT: insert.w $w1[1], $7
+ ; MIPS32-MSA-NEXT: slli.b $w2, $w2, 7
+ ; MIPS32-MSA-NEXT: srai.b $w2, $w2, 7
++; MIPS32-MSA-NEXT: insert.w $w0[1], $1
++; MIPS32-MSA-NEXT: lw $1, 32($sp)
+ ; MIPS32-MSA-NEXT: insert.w $w0[2], $1
+-; MIPS32-MSA-NEXT: lw $1, 20($sp)
++; MIPS32-MSA-NEXT: lw $1, 36($sp)
+ ; MIPS32-MSA-NEXT: insert.w $w0[3], $1
+-; MIPS32-MSA-NEXT: lw $1, 28($sp)
+-; MIPS32-MSA-NEXT: insert.w $w1[1], $1
+-; MIPS32-MSA-NEXT: lw $1, 32($sp)
+-; MIPS32-MSA-NEXT: shf.b $w0, $w0, 27
++; MIPS32-MSA-NEXT: lw $1, 16($sp)
+ ; MIPS32-MSA-NEXT: insert.w $w1[2], $1
+-; MIPS32-MSA-NEXT: lw $1, 36($sp)
+-; MIPS32-MSA-NEXT: and.v $w0, $w2, $w0
+-; MIPS32-MSA-NEXT: xori.b $w2, $w2, 255
++; MIPS32-MSA-NEXT: lw $1, 20($sp)
+ ; MIPS32-MSA-NEXT: insert.w $w1[3], $1
++; MIPS32-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS32-MSA-NEXT: shf.b $w0, $w0, 27
+ ; MIPS32-MSA-NEXT: shf.b $w1, $w1, 27
+-; MIPS32-MSA-NEXT: and.v $w1, $w2, $w1
+-; MIPS32-MSA-NEXT: or.v $w0, $w0, $w1
++; MIPS32-MSA-NEXT: and.v $w1, $w1, $w2
++; MIPS32-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS32-MSA-NEXT: shf.b $w0, $w0, 27
+ ; MIPS32-MSA-NEXT: copy_s.w $2, $w0[0]
+ ; MIPS32-MSA-NEXT: copy_s.w $3, $w0[1]
+@@ -184,18 +174,16 @@ define <16 x i8> @test_ctselect_v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) {
+ define <2 x i64> @test_ctselect_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) {
+ ; MIPS64-MSA-LABEL: test_ctselect_v2i64:
+ ; MIPS64-MSA: # %bb.0:
+-; MIPS64-MSA-NEXT: fill.d $w2, $4
+ ; MIPS64-MSA-NEXT: insert.d $w0[0], $7
+-; MIPS64-MSA-NEXT: ldi.b $w1, -1
+-; MIPS64-MSA-NEXT: slli.d $w2, $w2, 63
+-; MIPS64-MSA-NEXT: insert.d $w0[1], $8
+-; MIPS64-MSA-NEXT: srai.d $w2, $w2, 63
+-; MIPS64-MSA-NEXT: xor.v $w1, $w2, $w1
+-; MIPS64-MSA-NEXT: and.v $w0, $w1, $w0
+ ; MIPS64-MSA-NEXT: insert.d $w1[0], $5
++; MIPS64-MSA-NEXT: fill.d $w2, $4
++; MIPS64-MSA-NEXT: insert.d $w0[1], $8
+ ; MIPS64-MSA-NEXT: insert.d $w1[1], $6
+-; MIPS64-MSA-NEXT: and.v $w1, $w2, $w1
+-; MIPS64-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS64-MSA-NEXT: slli.d $w2, $w2, 63
++; MIPS64-MSA-NEXT: srai.d $w2, $w2, 63
++; MIPS64-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS64-MSA-NEXT: and.v $w1, $w1, $w2
++; MIPS64-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0]
+ ; MIPS64-MSA-NEXT: jr $ra
+ ; MIPS64-MSA-NEXT: copy_s.d $3, $w0[1]
+@@ -214,31 +202,28 @@ define <2 x i64> @test_ctselect_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) {
+ ; MIPS32-MSA-NEXT: and $sp, $sp, $1
+ ; MIPS32-MSA-NEXT: lw $2, 56($fp)
+ ; MIPS32-MSA-NEXT: lw $1, 60($fp)
++; MIPS32-MSA-NEXT: insert.w $w1[0], $6
+ ; MIPS32-MSA-NEXT: sw $4, 12($sp)
+ ; MIPS32-MSA-NEXT: sw $4, 4($sp)
+-; MIPS32-MSA-NEXT: ldi.b $w0, -1
+-; MIPS32-MSA-NEXT: ld.d $w1, 0($sp)
+-; MIPS32-MSA-NEXT: shf.w $w0, $w0, 177
+-; MIPS32-MSA-NEXT: insert.w $w2[0], $2
+-; MIPS32-MSA-NEXT: slli.d $w1, $w1, 63
+-; MIPS32-MSA-NEXT: insert.w $w2[1], $1
++; MIPS32-MSA-NEXT: ld.d $w2, 0($sp)
++; MIPS32-MSA-NEXT: insert.w $w0[0], $2
++; MIPS32-MSA-NEXT: insert.w $w1[1], $7
++; MIPS32-MSA-NEXT: slli.d $w2, $w2, 63
++; MIPS32-MSA-NEXT: insert.w $w0[1], $1
+ ; MIPS32-MSA-NEXT: lw $1, 64($fp)
+-; MIPS32-MSA-NEXT: srai.d $w1, $w1, 63
+-; MIPS32-MSA-NEXT: xor.v $w0, $w1, $w0
+-; MIPS32-MSA-NEXT: insert.w $w2[2], $1
++; MIPS32-MSA-NEXT: srai.d $w2, $w2, 63
++; MIPS32-MSA-NEXT: insert.w $w0[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 68($fp)
+-; MIPS32-MSA-NEXT: insert.w $w2[3], $1
++; MIPS32-MSA-NEXT: insert.w $w0[3], $1
+ ; MIPS32-MSA-NEXT: lw $1, 48($fp)
+-; MIPS32-MSA-NEXT: shf.w $w2, $w2, 177
+-; MIPS32-MSA-NEXT: and.v $w0, $w0, $w2
+-; MIPS32-MSA-NEXT: insert.w $w2[0], $6
+-; MIPS32-MSA-NEXT: insert.w $w2[1], $7
+-; MIPS32-MSA-NEXT: insert.w $w2[2], $1
++; MIPS32-MSA-NEXT: insert.w $w1[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 52($fp)
+-; MIPS32-MSA-NEXT: insert.w $w2[3], $1
+-; MIPS32-MSA-NEXT: shf.w $w2, $w2, 177
++; MIPS32-MSA-NEXT: insert.w $w1[3], $1
++; MIPS32-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS32-MSA-NEXT: shf.w $w0, $w0, 177
++; MIPS32-MSA-NEXT: shf.w $w1, $w1, 177
+ ; MIPS32-MSA-NEXT: and.v $w1, $w1, $w2
+-; MIPS32-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS32-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS32-MSA-NEXT: shf.w $w0, $w0, 177
+ ; MIPS32-MSA-NEXT: copy_s.w $2, $w0[0]
+ ; MIPS32-MSA-NEXT: copy_s.w $3, $w0[1]
+@@ -257,21 +242,19 @@ define <2 x i64> @test_ctselect_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) {
+ define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) {
+ ; MIPS64-MSA-LABEL: test_ctselect_v4f32:
+ ; MIPS64-MSA: # %bb.0:
+-; MIPS64-MSA-NEXT: insert.d $w2[0], $7
++; MIPS64-MSA-NEXT: insert.d $w0[0], $7
++; MIPS64-MSA-NEXT: insert.d $w1[0], $5
+ ; MIPS64-MSA-NEXT: sll $1, $4, 0
+-; MIPS64-MSA-NEXT: ldi.b $w0, -1
+-; MIPS64-MSA-NEXT: fill.w $w1, $1
+-; MIPS64-MSA-NEXT: insert.d $w2[1], $8
+-; MIPS64-MSA-NEXT: slli.w $w1, $w1, 31
+-; MIPS64-MSA-NEXT: srai.w $w1, $w1, 31
+-; MIPS64-MSA-NEXT: shf.w $w2, $w2, 177
+-; MIPS64-MSA-NEXT: xor.v $w0, $w1, $w0
+-; MIPS64-MSA-NEXT: and.v $w0, $w0, $w2
+-; MIPS64-MSA-NEXT: insert.d $w2[0], $5
+-; MIPS64-MSA-NEXT: insert.d $w2[1], $6
+-; MIPS64-MSA-NEXT: shf.w $w2, $w2, 177
++; MIPS64-MSA-NEXT: fill.w $w2, $1
++; MIPS64-MSA-NEXT: insert.d $w0[1], $8
++; MIPS64-MSA-NEXT: insert.d $w1[1], $6
++; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177
++; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177
+ ; MIPS64-MSA-NEXT: and.v $w1, $w1, $w2
+-; MIPS64-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS64-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177
+ ; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0]
+ ; MIPS64-MSA-NEXT: jr $ra
+@@ -281,26 +264,24 @@ define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b
+ ; MIPS32-MSA: # %bb.0:
+ ; MIPS32-MSA-NEXT: lw $2, 24($sp)
+ ; MIPS32-MSA-NEXT: lw $1, 28($sp)
++; MIPS32-MSA-NEXT: insert.w $w1[0], $6
+ ; MIPS32-MSA-NEXT: fill.w $w2, $5
+-; MIPS32-MSA-NEXT: ldi.b $w1, -1
+ ; MIPS32-MSA-NEXT: insert.w $w0[0], $2
++; MIPS32-MSA-NEXT: insert.w $w1[1], $7
+ ; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31
+ ; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31
+ ; MIPS32-MSA-NEXT: insert.w $w0[1], $1
+ ; MIPS32-MSA-NEXT: lw $1, 32($sp)
+-; MIPS32-MSA-NEXT: xor.v $w1, $w2, $w1
+ ; MIPS32-MSA-NEXT: insert.w $w0[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 36($sp)
+ ; MIPS32-MSA-NEXT: insert.w $w0[3], $1
+ ; MIPS32-MSA-NEXT: lw $1, 16($sp)
+-; MIPS32-MSA-NEXT: and.v $w0, $w1, $w0
+-; MIPS32-MSA-NEXT: insert.w $w1[0], $6
+-; MIPS32-MSA-NEXT: insert.w $w1[1], $7
+ ; MIPS32-MSA-NEXT: insert.w $w1[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 20($sp)
+ ; MIPS32-MSA-NEXT: insert.w $w1[3], $1
+-; MIPS32-MSA-NEXT: and.v $w1, $w2, $w1
+-; MIPS32-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS32-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS32-MSA-NEXT: and.v $w1, $w1, $w2
++; MIPS32-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS32-MSA-NEXT: jr $ra
+ ; MIPS32-MSA-NEXT: st.w $w0, 0($4)
+ %result = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, <4 x float> %b)
+@@ -311,18 +292,16 @@ define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b
+ define <2 x double> @test_ctselect_v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) {
+ ; MIPS64-MSA-LABEL: test_ctselect_v2f64:
+ ; MIPS64-MSA: # %bb.0:
+-; MIPS64-MSA-NEXT: fill.d $w2, $4
+ ; MIPS64-MSA-NEXT: insert.d $w0[0], $7
+-; MIPS64-MSA-NEXT: ldi.b $w1, -1
+-; MIPS64-MSA-NEXT: slli.d $w2, $w2, 63
+-; MIPS64-MSA-NEXT: insert.d $w0[1], $8
+-; MIPS64-MSA-NEXT: srai.d $w2, $w2, 63
+-; MIPS64-MSA-NEXT: xor.v $w1, $w2, $w1
+-; MIPS64-MSA-NEXT: and.v $w0, $w1, $w0
+ ; MIPS64-MSA-NEXT: insert.d $w1[0], $5
++; MIPS64-MSA-NEXT: fill.d $w2, $4
++; MIPS64-MSA-NEXT: insert.d $w0[1], $8
+ ; MIPS64-MSA-NEXT: insert.d $w1[1], $6
+-; MIPS64-MSA-NEXT: and.v $w1, $w2, $w1
+-; MIPS64-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS64-MSA-NEXT: slli.d $w2, $w2, 63
++; MIPS64-MSA-NEXT: srai.d $w2, $w2, 63
++; MIPS64-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS64-MSA-NEXT: and.v $w1, $w1, $w2
++; MIPS64-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0]
+ ; MIPS64-MSA-NEXT: jr $ra
+ ; MIPS64-MSA-NEXT: copy_s.d $3, $w0[1]
+@@ -341,31 +320,28 @@ define <2 x double> @test_ctselect_v2f64(i1 %cond, <2 x double> %a, <2 x double>
+ ; MIPS32-MSA-NEXT: and $sp, $sp, $1
+ ; MIPS32-MSA-NEXT: lw $2, 56($fp)
+ ; MIPS32-MSA-NEXT: lw $1, 60($fp)
++; MIPS32-MSA-NEXT: insert.w $w1[0], $6
+ ; MIPS32-MSA-NEXT: sw $5, 12($sp)
+ ; MIPS32-MSA-NEXT: sw $5, 4($sp)
+-; MIPS32-MSA-NEXT: ldi.b $w0, -1
+-; MIPS32-MSA-NEXT: ld.d $w1, 0($sp)
+-; MIPS32-MSA-NEXT: shf.w $w0, $w0, 177
+-; MIPS32-MSA-NEXT: insert.w $w2[0], $2
+-; MIPS32-MSA-NEXT: slli.d $w1, $w1, 63
+-; MIPS32-MSA-NEXT: insert.w $w2[1], $1
++; MIPS32-MSA-NEXT: ld.d $w2, 0($sp)
++; MIPS32-MSA-NEXT: insert.w $w0[0], $2
++; MIPS32-MSA-NEXT: insert.w $w1[1], $7
++; MIPS32-MSA-NEXT: slli.d $w2, $w2, 63
++; MIPS32-MSA-NEXT: insert.w $w0[1], $1
+ ; MIPS32-MSA-NEXT: lw $1, 64($fp)
+-; MIPS32-MSA-NEXT: srai.d $w1, $w1, 63
+-; MIPS32-MSA-NEXT: xor.v $w0, $w1, $w0
+-; MIPS32-MSA-NEXT: insert.w $w2[2], $1
++; MIPS32-MSA-NEXT: srai.d $w2, $w2, 63
++; MIPS32-MSA-NEXT: insert.w $w0[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 68($fp)
+-; MIPS32-MSA-NEXT: insert.w $w2[3], $1
++; MIPS32-MSA-NEXT: insert.w $w0[3], $1
+ ; MIPS32-MSA-NEXT: lw $1, 48($fp)
+-; MIPS32-MSA-NEXT: shf.w $w2, $w2, 177
+-; MIPS32-MSA-NEXT: and.v $w0, $w0, $w2
+-; MIPS32-MSA-NEXT: insert.w $w2[0], $6
+-; MIPS32-MSA-NEXT: insert.w $w2[1], $7
+-; MIPS32-MSA-NEXT: insert.w $w2[2], $1
++; MIPS32-MSA-NEXT: insert.w $w1[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 52($fp)
+-; MIPS32-MSA-NEXT: insert.w $w2[3], $1
+-; MIPS32-MSA-NEXT: shf.w $w2, $w2, 177
++; MIPS32-MSA-NEXT: insert.w $w1[3], $1
++; MIPS32-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS32-MSA-NEXT: shf.w $w0, $w0, 177
++; MIPS32-MSA-NEXT: shf.w $w1, $w1, 177
+ ; MIPS32-MSA-NEXT: and.v $w1, $w1, $w2
+-; MIPS32-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS32-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS32-MSA-NEXT: st.d $w0, 0($4)
+ ; MIPS32-MSA-NEXT: move $sp, $fp
+ ; MIPS32-MSA-NEXT: lw $fp, 24($sp) # 4-byte Folded Reload
+@@ -381,16 +357,14 @@ define <4 x i32> @test_ctselect_v4i32_aligned_load(i1 %cond, ptr %p1, ptr %p2) {
+ ; MIPS64-MSA-LABEL: test_ctselect_v4i32_aligned_load:
+ ; MIPS64-MSA: # %bb.0:
+ ; MIPS64-MSA-NEXT: sll $1, $4, 0
++; MIPS64-MSA-NEXT: ld.w $w0, 0($6)
+ ; MIPS64-MSA-NEXT: ld.w $w1, 0($5)
+-; MIPS64-MSA-NEXT: ldi.b $w2, -1
+-; MIPS64-MSA-NEXT: fill.w $w0, $1
+-; MIPS64-MSA-NEXT: slli.w $w0, $w0, 31
+-; MIPS64-MSA-NEXT: srai.w $w0, $w0, 31
+-; MIPS64-MSA-NEXT: and.v $w1, $w0, $w1
+-; MIPS64-MSA-NEXT: xor.v $w0, $w0, $w2
+-; MIPS64-MSA-NEXT: ld.w $w2, 0($6)
+-; MIPS64-MSA-NEXT: and.v $w0, $w0, $w2
+-; MIPS64-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS64-MSA-NEXT: fill.w $w2, $1
++; MIPS64-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: and.v $w1, $w1, $w2
++; MIPS64-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177
+ ; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0]
+ ; MIPS64-MSA-NEXT: jr $ra
+@@ -398,16 +372,14 @@ define <4 x i32> @test_ctselect_v4i32_aligned_load(i1 %cond, ptr %p1, ptr %p2) {
+ ;
+ ; MIPS32-MSA-LABEL: test_ctselect_v4i32_aligned_load:
+ ; MIPS32-MSA: # %bb.0:
+-; MIPS32-MSA-NEXT: fill.w $w0, $4
++; MIPS32-MSA-NEXT: fill.w $w2, $4
++; MIPS32-MSA-NEXT: ld.w $w0, 0($6)
+ ; MIPS32-MSA-NEXT: ld.w $w1, 0($5)
+-; MIPS32-MSA-NEXT: ldi.b $w2, -1
+-; MIPS32-MSA-NEXT: slli.w $w0, $w0, 31
+-; MIPS32-MSA-NEXT: srai.w $w0, $w0, 31
+-; MIPS32-MSA-NEXT: and.v $w1, $w0, $w1
+-; MIPS32-MSA-NEXT: xor.v $w0, $w0, $w2
+-; MIPS32-MSA-NEXT: ld.w $w2, 0($6)
+-; MIPS32-MSA-NEXT: and.v $w0, $w0, $w2
+-; MIPS32-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31
++; MIPS32-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31
++; MIPS32-MSA-NEXT: and.v $w1, $w1, $w2
++; MIPS32-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS32-MSA-NEXT: copy_s.w $2, $w0[0]
+ ; MIPS32-MSA-NEXT: copy_s.w $3, $w0[1]
+ ; MIPS32-MSA-NEXT: copy_s.w $4, $w0[2]
+@@ -424,16 +396,14 @@ define <4 x i32> @test_ctselect_v4i32_unaligned_load(i1 %cond, ptr %p1, ptr %p2)
+ ; MIPS64-MSA-LABEL: test_ctselect_v4i32_unaligned_load:
+ ; MIPS64-MSA: # %bb.0:
+ ; MIPS64-MSA-NEXT: sll $1, $4, 0
++; MIPS64-MSA-NEXT: ld.w $w0, 0($6)
+ ; MIPS64-MSA-NEXT: ld.w $w1, 0($5)
+-; MIPS64-MSA-NEXT: ldi.b $w2, -1
+-; MIPS64-MSA-NEXT: fill.w $w0, $1
+-; MIPS64-MSA-NEXT: slli.w $w0, $w0, 31
+-; MIPS64-MSA-NEXT: srai.w $w0, $w0, 31
+-; MIPS64-MSA-NEXT: and.v $w1, $w0, $w1
+-; MIPS64-MSA-NEXT: xor.v $w0, $w0, $w2
+-; MIPS64-MSA-NEXT: ld.w $w2, 0($6)
+-; MIPS64-MSA-NEXT: and.v $w0, $w0, $w2
+-; MIPS64-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS64-MSA-NEXT: fill.w $w2, $1
++; MIPS64-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: and.v $w1, $w1, $w2
++; MIPS64-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177
+ ; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0]
+ ; MIPS64-MSA-NEXT: jr $ra
+@@ -441,16 +411,14 @@ define <4 x i32> @test_ctselect_v4i32_unaligned_load(i1 %cond, ptr %p1, ptr %p2)
+ ;
+ ; MIPS32-MSA-LABEL: test_ctselect_v4i32_unaligned_load:
+ ; MIPS32-MSA: # %bb.0:
+-; MIPS32-MSA-NEXT: fill.w $w0, $4
++; MIPS32-MSA-NEXT: fill.w $w2, $4
++; MIPS32-MSA-NEXT: ld.w $w0, 0($6)
+ ; MIPS32-MSA-NEXT: ld.w $w1, 0($5)
+-; MIPS32-MSA-NEXT: ldi.b $w2, -1
+-; MIPS32-MSA-NEXT: slli.w $w0, $w0, 31
+-; MIPS32-MSA-NEXT: srai.w $w0, $w0, 31
+-; MIPS32-MSA-NEXT: and.v $w1, $w0, $w1
+-; MIPS32-MSA-NEXT: xor.v $w0, $w0, $w2
+-; MIPS32-MSA-NEXT: ld.w $w2, 0($6)
+-; MIPS32-MSA-NEXT: and.v $w0, $w0, $w2
+-; MIPS32-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31
++; MIPS32-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31
++; MIPS32-MSA-NEXT: and.v $w1, $w1, $w2
++; MIPS32-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS32-MSA-NEXT: copy_s.w $2, $w0[0]
+ ; MIPS32-MSA-NEXT: copy_s.w $3, $w0[1]
+ ; MIPS32-MSA-NEXT: copy_s.w $4, $w0[2]
+@@ -466,21 +434,19 @@ define <4 x i32> @test_ctselect_v4i32_unaligned_load(i1 %cond, ptr %p1, ptr %p2)
+ define void @test_ctselect_v4i32_store(i1 %cond, <4 x i32> %a, <4 x i32> %b, ptr %out) {
+ ; MIPS64-MSA-LABEL: test_ctselect_v4i32_store:
+ ; MIPS64-MSA: # %bb.0:
+-; MIPS64-MSA-NEXT: insert.d $w2[0], $7
++; MIPS64-MSA-NEXT: insert.d $w0[0], $7
++; MIPS64-MSA-NEXT: insert.d $w1[0], $5
+ ; MIPS64-MSA-NEXT: sll $1, $4, 0
+-; MIPS64-MSA-NEXT: ldi.b $w0, -1
+-; MIPS64-MSA-NEXT: fill.w $w1, $1
+-; MIPS64-MSA-NEXT: insert.d $w2[1], $8
+-; MIPS64-MSA-NEXT: slli.w $w1, $w1, 31
+-; MIPS64-MSA-NEXT: srai.w $w1, $w1, 31
+-; MIPS64-MSA-NEXT: shf.w $w2, $w2, 177
+-; MIPS64-MSA-NEXT: xor.v $w0, $w1, $w0
+-; MIPS64-MSA-NEXT: and.v $w0, $w0, $w2
+-; MIPS64-MSA-NEXT: insert.d $w2[0], $5
+-; MIPS64-MSA-NEXT: insert.d $w2[1], $6
+-; MIPS64-MSA-NEXT: shf.w $w2, $w2, 177
++; MIPS64-MSA-NEXT: fill.w $w2, $1
++; MIPS64-MSA-NEXT: insert.d $w0[1], $8
++; MIPS64-MSA-NEXT: insert.d $w1[1], $6
++; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177
++; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177
+ ; MIPS64-MSA-NEXT: and.v $w1, $w1, $w2
+-; MIPS64-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS64-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS64-MSA-NEXT: jr $ra
+ ; MIPS64-MSA-NEXT: st.w $w0, 0($9)
+ ;
+@@ -488,27 +454,25 @@ define void @test_ctselect_v4i32_store(i1 %cond, <4 x i32> %a, <4 x i32> %b, ptr
+ ; MIPS32-MSA: # %bb.0:
+ ; MIPS32-MSA-NEXT: lw $2, 24($sp)
+ ; MIPS32-MSA-NEXT: lw $1, 28($sp)
++; MIPS32-MSA-NEXT: insert.w $w1[0], $6
+ ; MIPS32-MSA-NEXT: fill.w $w2, $4
+-; MIPS32-MSA-NEXT: ldi.b $w1, -1
+ ; MIPS32-MSA-NEXT: insert.w $w0[0], $2
++; MIPS32-MSA-NEXT: insert.w $w1[1], $7
+ ; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31
+ ; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31
+ ; MIPS32-MSA-NEXT: insert.w $w0[1], $1
+ ; MIPS32-MSA-NEXT: lw $1, 32($sp)
+-; MIPS32-MSA-NEXT: xor.v $w1, $w2, $w1
+ ; MIPS32-MSA-NEXT: insert.w $w0[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 36($sp)
+ ; MIPS32-MSA-NEXT: insert.w $w0[3], $1
+ ; MIPS32-MSA-NEXT: lw $1, 16($sp)
+-; MIPS32-MSA-NEXT: and.v $w0, $w1, $w0
+-; MIPS32-MSA-NEXT: insert.w $w1[0], $6
+-; MIPS32-MSA-NEXT: insert.w $w1[1], $7
+ ; MIPS32-MSA-NEXT: insert.w $w1[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 20($sp)
+ ; MIPS32-MSA-NEXT: insert.w $w1[3], $1
+ ; MIPS32-MSA-NEXT: lw $1, 40($sp)
+-; MIPS32-MSA-NEXT: and.v $w1, $w2, $w1
+-; MIPS32-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS32-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS32-MSA-NEXT: and.v $w1, $w1, $w2
++; MIPS32-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS32-MSA-NEXT: jr $ra
+ ; MIPS32-MSA-NEXT: st.w $w0, 0($1)
+ %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b)
+@@ -521,31 +485,28 @@ define <4 x i32> @test_ctselect_v4i32_chain(i1 %cond1, i1 %cond2, <4 x i32> %a,
+ ; MIPS64-MSA-LABEL: test_ctselect_v4i32_chain:
+ ; MIPS64-MSA: # %bb.0:
+ ; MIPS64-MSA-NEXT: insert.d $w0[0], $8
++; MIPS64-MSA-NEXT: insert.d $w1[0], $6
+ ; MIPS64-MSA-NEXT: sll $1, $4, 0
+-; MIPS64-MSA-NEXT: ldi.b $w1, -1
+ ; MIPS64-MSA-NEXT: fill.w $w2, $1
+ ; MIPS64-MSA-NEXT: sll $1, $5, 0
+ ; MIPS64-MSA-NEXT: insert.d $w0[1], $9
++; MIPS64-MSA-NEXT: insert.d $w1[1], $7
+ ; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: xor.v $w1, $w1, $w0
+ ; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31
+ ; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177
+-; MIPS64-MSA-NEXT: xor.v $w3, $w2, $w1
+-; MIPS64-MSA-NEXT: and.v $w0, $w3, $w0
+-; MIPS64-MSA-NEXT: insert.d $w3[0], $6
+-; MIPS64-MSA-NEXT: insert.d $w3[1], $7
+-; MIPS64-MSA-NEXT: shf.w $w3, $w3, 177
+-; MIPS64-MSA-NEXT: and.v $w2, $w2, $w3
+-; MIPS64-MSA-NEXT: or.v $w0, $w2, $w0
++; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177
++; MIPS64-MSA-NEXT: and.v $w1, $w1, $w2
+ ; MIPS64-MSA-NEXT: fill.w $w2, $1
++; MIPS64-MSA-NEXT: xor.v $w0, $w0, $w1
++; MIPS64-MSA-NEXT: insert.d $w1[0], $10
+ ; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: insert.d $w1[1], $11
+ ; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31
+-; MIPS64-MSA-NEXT: and.v $w0, $w2, $w0
+-; MIPS64-MSA-NEXT: xor.v $w1, $w2, $w1
+-; MIPS64-MSA-NEXT: insert.d $w2[0], $10
+-; MIPS64-MSA-NEXT: insert.d $w2[1], $11
+-; MIPS64-MSA-NEXT: shf.w $w2, $w2, 177
+-; MIPS64-MSA-NEXT: and.v $w1, $w1, $w2
+-; MIPS64-MSA-NEXT: or.v $w0, $w0, $w1
++; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177
++; MIPS64-MSA-NEXT: xor.v $w0, $w0, $w1
++; MIPS64-MSA-NEXT: and.v $w0, $w0, $w2
++; MIPS64-MSA-NEXT: xor.v $w0, $w1, $w0
+ ; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177
+ ; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0]
+ ; MIPS64-MSA-NEXT: jr $ra
+@@ -555,41 +516,38 @@ define <4 x i32> @test_ctselect_v4i32_chain(i1 %cond1, i1 %cond2, <4 x i32> %a,
+ ; MIPS32-MSA: # %bb.0:
+ ; MIPS32-MSA-NEXT: lw $2, 24($sp)
+ ; MIPS32-MSA-NEXT: lw $1, 28($sp)
++; MIPS32-MSA-NEXT: insert.w $w1[0], $6
+ ; MIPS32-MSA-NEXT: fill.w $w2, $4
+-; MIPS32-MSA-NEXT: ldi.b $w1, -1
+ ; MIPS32-MSA-NEXT: insert.w $w0[0], $2
++; MIPS32-MSA-NEXT: insert.w $w1[1], $7
+ ; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31
+ ; MIPS32-MSA-NEXT: lw $2, 40($sp)
+ ; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31
+ ; MIPS32-MSA-NEXT: insert.w $w0[1], $1
+ ; MIPS32-MSA-NEXT: lw $1, 32($sp)
+-; MIPS32-MSA-NEXT: xor.v $w3, $w2, $w1
+ ; MIPS32-MSA-NEXT: insert.w $w0[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 36($sp)
+ ; MIPS32-MSA-NEXT: insert.w $w0[3], $1
+ ; MIPS32-MSA-NEXT: lw $1, 16($sp)
+-; MIPS32-MSA-NEXT: and.v $w0, $w3, $w0
+-; MIPS32-MSA-NEXT: insert.w $w3[0], $6
+-; MIPS32-MSA-NEXT: insert.w $w3[1], $7
+-; MIPS32-MSA-NEXT: insert.w $w3[2], $1
++; MIPS32-MSA-NEXT: insert.w $w1[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 20($sp)
+-; MIPS32-MSA-NEXT: insert.w $w3[3], $1
++; MIPS32-MSA-NEXT: insert.w $w1[3], $1
+ ; MIPS32-MSA-NEXT: lw $1, 44($sp)
+-; MIPS32-MSA-NEXT: and.v $w2, $w2, $w3
+-; MIPS32-MSA-NEXT: or.v $w0, $w2, $w0
++; MIPS32-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS32-MSA-NEXT: and.v $w1, $w1, $w2
+ ; MIPS32-MSA-NEXT: fill.w $w2, $5
++; MIPS32-MSA-NEXT: xor.v $w0, $w0, $w1
++; MIPS32-MSA-NEXT: insert.w $w1[0], $2
+ ; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31
+-; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31
+-; MIPS32-MSA-NEXT: and.v $w0, $w2, $w0
+-; MIPS32-MSA-NEXT: xor.v $w1, $w2, $w1
+-; MIPS32-MSA-NEXT: insert.w $w2[0], $2
+-; MIPS32-MSA-NEXT: insert.w $w2[1], $1
++; MIPS32-MSA-NEXT: insert.w $w1[1], $1
+ ; MIPS32-MSA-NEXT: lw $1, 48($sp)
+-; MIPS32-MSA-NEXT: insert.w $w2[2], $1
++; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31
++; MIPS32-MSA-NEXT: insert.w $w1[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 52($sp)
+-; MIPS32-MSA-NEXT: insert.w $w2[3], $1
+-; MIPS32-MSA-NEXT: and.v $w1, $w1, $w2
+-; MIPS32-MSA-NEXT: or.v $w0, $w0, $w1
++; MIPS32-MSA-NEXT: insert.w $w1[3], $1
++; MIPS32-MSA-NEXT: xor.v $w0, $w0, $w1
++; MIPS32-MSA-NEXT: and.v $w0, $w0, $w2
++; MIPS32-MSA-NEXT: xor.v $w0, $w1, $w0
+ ; MIPS32-MSA-NEXT: copy_s.w $2, $w0[0]
+ ; MIPS32-MSA-NEXT: copy_s.w $3, $w0[1]
+ ; MIPS32-MSA-NEXT: copy_s.w $4, $w0[2]
+@@ -607,20 +565,18 @@ define <4 x float> @test_ctselect_v4f32_arithmetic(i1 %cond, <4 x float> %x, <4
+ ; MIPS64-MSA-NEXT: insert.d $w0[0], $7
+ ; MIPS64-MSA-NEXT: insert.d $w1[0], $5
+ ; MIPS64-MSA-NEXT: sll $1, $4, 0
+-; MIPS64-MSA-NEXT: fill.w $w3, $1
+ ; MIPS64-MSA-NEXT: insert.d $w0[1], $8
+ ; MIPS64-MSA-NEXT: insert.d $w1[1], $6
+-; MIPS64-MSA-NEXT: slli.w $w3, $w3, 31
+ ; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177
+ ; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177
+-; MIPS64-MSA-NEXT: srai.w $w3, $w3, 31
+ ; MIPS64-MSA-NEXT: fadd.w $w2, $w1, $w0
+ ; MIPS64-MSA-NEXT: fsub.w $w0, $w1, $w0
+-; MIPS64-MSA-NEXT: ldi.b $w1, -1
+-; MIPS64-MSA-NEXT: xor.v $w1, $w3, $w1
+-; MIPS64-MSA-NEXT: and.v $w2, $w3, $w2
+-; MIPS64-MSA-NEXT: and.v $w0, $w1, $w0
+-; MIPS64-MSA-NEXT: or.v $w0, $w2, $w0
++; MIPS64-MSA-NEXT: xor.v $w1, $w2, $w0
++; MIPS64-MSA-NEXT: fill.w $w2, $1
++; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: and.v $w1, $w1, $w2
++; MIPS64-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177
+ ; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0]
+ ; MIPS64-MSA-NEXT: jr $ra
+@@ -631,11 +587,8 @@ define <4 x float> @test_ctselect_v4f32_arithmetic(i1 %cond, <4 x float> %x, <4
+ ; MIPS32-MSA-NEXT: lw $2, 24($sp)
+ ; MIPS32-MSA-NEXT: lw $1, 28($sp)
+ ; MIPS32-MSA-NEXT: insert.w $w1[0], $6
+-; MIPS32-MSA-NEXT: fill.w $w3, $5
+ ; MIPS32-MSA-NEXT: insert.w $w0[0], $2
+ ; MIPS32-MSA-NEXT: insert.w $w1[1], $7
+-; MIPS32-MSA-NEXT: slli.w $w3, $w3, 31
+-; MIPS32-MSA-NEXT: srai.w $w3, $w3, 31
+ ; MIPS32-MSA-NEXT: insert.w $w0[1], $1
+ ; MIPS32-MSA-NEXT: lw $1, 32($sp)
+ ; MIPS32-MSA-NEXT: insert.w $w0[2], $1
+@@ -647,11 +600,12 @@ define <4 x float> @test_ctselect_v4f32_arithmetic(i1 %cond, <4 x float> %x, <4
+ ; MIPS32-MSA-NEXT: insert.w $w1[3], $1
+ ; MIPS32-MSA-NEXT: fadd.w $w2, $w1, $w0
+ ; MIPS32-MSA-NEXT: fsub.w $w0, $w1, $w0
+-; MIPS32-MSA-NEXT: ldi.b $w1, -1
+-; MIPS32-MSA-NEXT: xor.v $w1, $w3, $w1
+-; MIPS32-MSA-NEXT: and.v $w2, $w3, $w2
+-; MIPS32-MSA-NEXT: and.v $w0, $w1, $w0
+-; MIPS32-MSA-NEXT: or.v $w0, $w2, $w0
++; MIPS32-MSA-NEXT: xor.v $w1, $w2, $w0
++; MIPS32-MSA-NEXT: fill.w $w2, $5
++; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31
++; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31
++; MIPS32-MSA-NEXT: and.v $w1, $w1, $w2
++; MIPS32-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS32-MSA-NEXT: jr $ra
+ ; MIPS32-MSA-NEXT: st.w $w0, 0($4)
+ %sum = fadd <4 x float> %x, %y
+@@ -664,36 +618,32 @@ define <4 x float> @test_ctselect_v4f32_arithmetic(i1 %cond, <4 x float> %x, <4
+ define void @test_ctselect_v4i32_mixed(i1 %cond, ptr %p1, ptr %p2, ptr %out) {
+ ; MIPS64-MSA-LABEL: test_ctselect_v4i32_mixed:
+ ; MIPS64-MSA: # %bb.0:
++; MIPS64-MSA-NEXT: ld.w $w0, 0($6)
++; MIPS64-MSA-NEXT: ld.w $w1, 0($5)
+ ; MIPS64-MSA-NEXT: sll $1, $4, 0
+-; MIPS64-MSA-NEXT: ld.w $w0, 0($5)
+-; MIPS64-MSA-NEXT: ldi.b $w2, -1
+-; MIPS64-MSA-NEXT: fill.w $w1, $1
+-; MIPS64-MSA-NEXT: addvi.w $w0, $w0, 1
+-; MIPS64-MSA-NEXT: slli.w $w1, $w1, 31
+-; MIPS64-MSA-NEXT: srai.w $w1, $w1, 31
+-; MIPS64-MSA-NEXT: and.v $w0, $w1, $w0
+-; MIPS64-MSA-NEXT: xor.v $w1, $w1, $w2
+-; MIPS64-MSA-NEXT: ld.w $w2, 0($6)
+-; MIPS64-MSA-NEXT: addvi.w $w2, $w2, 2
++; MIPS64-MSA-NEXT: fill.w $w2, $1
++; MIPS64-MSA-NEXT: addvi.w $w0, $w0, 2
++; MIPS64-MSA-NEXT: addvi.w $w1, $w1, 1
++; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31
+ ; MIPS64-MSA-NEXT: and.v $w1, $w1, $w2
+-; MIPS64-MSA-NEXT: or.v $w0, $w0, $w1
++; MIPS64-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS64-MSA-NEXT: jr $ra
+ ; MIPS64-MSA-NEXT: st.w $w0, 0($7)
+ ;
+ ; MIPS32-MSA-LABEL: test_ctselect_v4i32_mixed:
+ ; MIPS32-MSA: # %bb.0:
+-; MIPS32-MSA-NEXT: ld.w $w0, 0($5)
+-; MIPS32-MSA-NEXT: fill.w $w1, $4
+-; MIPS32-MSA-NEXT: ldi.b $w2, -1
+-; MIPS32-MSA-NEXT: slli.w $w1, $w1, 31
+-; MIPS32-MSA-NEXT: addvi.w $w0, $w0, 1
+-; MIPS32-MSA-NEXT: srai.w $w1, $w1, 31
+-; MIPS32-MSA-NEXT: and.v $w0, $w1, $w0
+-; MIPS32-MSA-NEXT: xor.v $w1, $w1, $w2
+-; MIPS32-MSA-NEXT: ld.w $w2, 0($6)
+-; MIPS32-MSA-NEXT: addvi.w $w2, $w2, 2
++; MIPS32-MSA-NEXT: ld.w $w0, 0($6)
++; MIPS32-MSA-NEXT: ld.w $w1, 0($5)
++; MIPS32-MSA-NEXT: fill.w $w2, $4
++; MIPS32-MSA-NEXT: addvi.w $w0, $w0, 2
++; MIPS32-MSA-NEXT: addvi.w $w1, $w1, 1
++; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31
++; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31
++; MIPS32-MSA-NEXT: xor.v $w1, $w1, $w0
+ ; MIPS32-MSA-NEXT: and.v $w1, $w1, $w2
+-; MIPS32-MSA-NEXT: or.v $w0, $w0, $w1
++; MIPS32-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS32-MSA-NEXT: jr $ra
+ ; MIPS32-MSA-NEXT: st.w $w0, 0($7)
+ %a = load <4 x i32>, ptr %p1, align 16
+@@ -709,21 +659,19 @@ define void @test_ctselect_v4i32_mixed(i1 %cond, ptr %p1, ptr %p2, ptr %out) {
+ define <4 x i32> @test_ctselect_v4i32_args(i1 %cond, <4 x i32> %a, <4 x i32> %b) nounwind {
+ ; MIPS64-MSA-LABEL: test_ctselect_v4i32_args:
+ ; MIPS64-MSA: # %bb.0:
+-; MIPS64-MSA-NEXT: insert.d $w2[0], $7
++; MIPS64-MSA-NEXT: insert.d $w0[0], $7
++; MIPS64-MSA-NEXT: insert.d $w1[0], $5
+ ; MIPS64-MSA-NEXT: sll $1, $4, 0
+-; MIPS64-MSA-NEXT: ldi.b $w0, -1
+-; MIPS64-MSA-NEXT: fill.w $w1, $1
+-; MIPS64-MSA-NEXT: insert.d $w2[1], $8
+-; MIPS64-MSA-NEXT: slli.w $w1, $w1, 31
+-; MIPS64-MSA-NEXT: srai.w $w1, $w1, 31
+-; MIPS64-MSA-NEXT: shf.w $w2, $w2, 177
+-; MIPS64-MSA-NEXT: xor.v $w0, $w1, $w0
+-; MIPS64-MSA-NEXT: and.v $w0, $w0, $w2
+-; MIPS64-MSA-NEXT: insert.d $w2[0], $5
+-; MIPS64-MSA-NEXT: insert.d $w2[1], $6
+-; MIPS64-MSA-NEXT: shf.w $w2, $w2, 177
++; MIPS64-MSA-NEXT: fill.w $w2, $1
++; MIPS64-MSA-NEXT: insert.d $w0[1], $8
++; MIPS64-MSA-NEXT: insert.d $w1[1], $6
++; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177
++; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177
+ ; MIPS64-MSA-NEXT: and.v $w1, $w1, $w2
+-; MIPS64-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS64-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177
+ ; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0]
+ ; MIPS64-MSA-NEXT: jr $ra
+@@ -733,26 +681,24 @@ define <4 x i32> @test_ctselect_v4i32_args(i1 %cond, <4 x i32> %a, <4 x i32> %b)
+ ; MIPS32-MSA: # %bb.0:
+ ; MIPS32-MSA-NEXT: lw $2, 24($sp)
+ ; MIPS32-MSA-NEXT: lw $1, 28($sp)
++; MIPS32-MSA-NEXT: insert.w $w1[0], $6
+ ; MIPS32-MSA-NEXT: fill.w $w2, $4
+-; MIPS32-MSA-NEXT: ldi.b $w1, -1
+ ; MIPS32-MSA-NEXT: insert.w $w0[0], $2
++; MIPS32-MSA-NEXT: insert.w $w1[1], $7
+ ; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31
+ ; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31
+ ; MIPS32-MSA-NEXT: insert.w $w0[1], $1
+ ; MIPS32-MSA-NEXT: lw $1, 32($sp)
+-; MIPS32-MSA-NEXT: xor.v $w1, $w2, $w1
+ ; MIPS32-MSA-NEXT: insert.w $w0[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 36($sp)
+ ; MIPS32-MSA-NEXT: insert.w $w0[3], $1
+ ; MIPS32-MSA-NEXT: lw $1, 16($sp)
+-; MIPS32-MSA-NEXT: and.v $w0, $w1, $w0
+-; MIPS32-MSA-NEXT: insert.w $w1[0], $6
+-; MIPS32-MSA-NEXT: insert.w $w1[1], $7
+ ; MIPS32-MSA-NEXT: insert.w $w1[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 20($sp)
+ ; MIPS32-MSA-NEXT: insert.w $w1[3], $1
+-; MIPS32-MSA-NEXT: and.v $w1, $w2, $w1
+-; MIPS32-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS32-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS32-MSA-NEXT: and.v $w1, $w1, $w2
++; MIPS32-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS32-MSA-NEXT: copy_s.w $2, $w0[0]
+ ; MIPS32-MSA-NEXT: copy_s.w $3, $w0[1]
+ ; MIPS32-MSA-NEXT: copy_s.w $4, $w0[2]
+@@ -766,21 +712,19 @@ define <4 x i32> @test_ctselect_v4i32_args(i1 %cond, <4 x i32> %a, <4 x i32> %b)
+ define <4 x i32> @test_ctselect_v4i32_multi_use(i1 %cond, <4 x i32> %a, <4 x i32> %b) {
+ ; MIPS64-MSA-LABEL: test_ctselect_v4i32_multi_use:
+ ; MIPS64-MSA: # %bb.0:
+-; MIPS64-MSA-NEXT: insert.d $w2[0], $7
++; MIPS64-MSA-NEXT: insert.d $w0[0], $7
++; MIPS64-MSA-NEXT: insert.d $w1[0], $5
+ ; MIPS64-MSA-NEXT: sll $1, $4, 0
+-; MIPS64-MSA-NEXT: ldi.b $w0, -1
+-; MIPS64-MSA-NEXT: fill.w $w1, $1
+-; MIPS64-MSA-NEXT: insert.d $w2[1], $8
+-; MIPS64-MSA-NEXT: slli.w $w1, $w1, 31
+-; MIPS64-MSA-NEXT: srai.w $w1, $w1, 31
+-; MIPS64-MSA-NEXT: shf.w $w2, $w2, 177
+-; MIPS64-MSA-NEXT: xor.v $w0, $w1, $w0
+-; MIPS64-MSA-NEXT: and.v $w0, $w0, $w2
+-; MIPS64-MSA-NEXT: insert.d $w2[0], $5
+-; MIPS64-MSA-NEXT: insert.d $w2[1], $6
+-; MIPS64-MSA-NEXT: shf.w $w2, $w2, 177
++; MIPS64-MSA-NEXT: fill.w $w2, $1
++; MIPS64-MSA-NEXT: insert.d $w0[1], $8
++; MIPS64-MSA-NEXT: insert.d $w1[1], $6
++; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177
++; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177
+ ; MIPS64-MSA-NEXT: and.v $w1, $w1, $w2
+-; MIPS64-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS64-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS64-MSA-NEXT: addv.w $w0, $w0, $w0
+ ; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177
+ ; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0]
+@@ -791,26 +735,24 @@ define <4 x i32> @test_ctselect_v4i32_multi_use(i1 %cond, <4 x i32> %a, <4 x i32
+ ; MIPS32-MSA: # %bb.0:
+ ; MIPS32-MSA-NEXT: lw $2, 24($sp)
+ ; MIPS32-MSA-NEXT: lw $1, 28($sp)
++; MIPS32-MSA-NEXT: insert.w $w1[0], $6
+ ; MIPS32-MSA-NEXT: fill.w $w2, $4
+-; MIPS32-MSA-NEXT: ldi.b $w1, -1
+ ; MIPS32-MSA-NEXT: insert.w $w0[0], $2
++; MIPS32-MSA-NEXT: insert.w $w1[1], $7
+ ; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31
+ ; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31
+ ; MIPS32-MSA-NEXT: insert.w $w0[1], $1
+ ; MIPS32-MSA-NEXT: lw $1, 32($sp)
+-; MIPS32-MSA-NEXT: xor.v $w1, $w2, $w1
+ ; MIPS32-MSA-NEXT: insert.w $w0[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 36($sp)
+ ; MIPS32-MSA-NEXT: insert.w $w0[3], $1
+ ; MIPS32-MSA-NEXT: lw $1, 16($sp)
+-; MIPS32-MSA-NEXT: and.v $w0, $w1, $w0
+-; MIPS32-MSA-NEXT: insert.w $w1[0], $6
+-; MIPS32-MSA-NEXT: insert.w $w1[1], $7
+ ; MIPS32-MSA-NEXT: insert.w $w1[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 20($sp)
+ ; MIPS32-MSA-NEXT: insert.w $w1[3], $1
+-; MIPS32-MSA-NEXT: and.v $w1, $w2, $w1
+-; MIPS32-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS32-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS32-MSA-NEXT: and.v $w1, $w1, $w2
++; MIPS32-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS32-MSA-NEXT: addv.w $w0, $w0, $w0
+ ; MIPS32-MSA-NEXT: copy_s.w $2, $w0[0]
+ ; MIPS32-MSA-NEXT: copy_s.w $3, $w0[1]
+diff --git a/llvm/test/CodeGen/Mips/ctselect-fallback.ll b/llvm/test/CodeGen/Mips/ctselect-fallback.ll
+index d89d7fc69871..6a61412367f7 100644
+--- a/llvm/test/CodeGen/Mips/ctselect-fallback.ll
++++ b/llvm/test/CodeGen/Mips/ctselect-fallback.ll
+@@ -11,7 +11,7 @@ define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) {
+ ; M32-NEXT: negu $2, $2
+ ; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: xor $2, $1, $6
++; M32-NEXT: xor $2, $6, $1
+ ;
+ ; M64-LABEL: test_ctselect_i8:
+ ; M64: # %bb.0:
+@@ -23,7 +23,7 @@ define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) {
+ ; M64-NEXT: and $1, $2, $1
+ ; M64-NEXT: sll $2, $6, 0
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: xor $2, $1, $2
++; M64-NEXT: xor $2, $2, $1
+ %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b)
+ ret i8 %result
+ }
+@@ -36,7 +36,7 @@ define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) {
+ ; M32-NEXT: negu $2, $2
+ ; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: xor $2, $1, $6
++; M32-NEXT: xor $2, $6, $1
+ ;
+ ; M64-LABEL: test_ctselect_i16:
+ ; M64: # %bb.0:
+@@ -48,7 +48,7 @@ define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) {
+ ; M64-NEXT: and $1, $2, $1
+ ; M64-NEXT: sll $2, $6, 0
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: xor $2, $1, $2
++; M64-NEXT: xor $2, $2, $1
+ %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b)
+ ret i16 %result
+ }
+@@ -56,26 +56,24 @@ define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) {
+ define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) {
+ ; M32-LABEL: test_ctselect_i32:
+ ; M32: # %bb.0:
+-; M32-NEXT: andi $1, $4, 1
+-; M32-NEXT: negu $2, $1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: and $2, $2, $5
+-; M32-NEXT: and $1, $1, $6
++; M32-NEXT: andi $2, $4, 1
++; M32-NEXT: xor $1, $5, $6
++; M32-NEXT: negu $2, $2
++; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: xor $2, $6, $1
+ ;
+ ; M64-LABEL: test_ctselect_i32:
+ ; M64: # %bb.0:
+ ; M64-NEXT: sll $1, $4, 0
+-; M64-NEXT: sll $3, $5, 0
++; M64-NEXT: xor $2, $5, $6
+ ; M64-NEXT: andi $1, $1, 1
+-; M64-NEXT: negu $2, $1
+-; M64-NEXT: addiu $1, $1, -1
+-; M64-NEXT: and $2, $2, $3
+-; M64-NEXT: sll $3, $6, 0
+-; M64-NEXT: and $1, $1, $3
++; M64-NEXT: sll $2, $2, 0
++; M64-NEXT: negu $1, $1
++; M64-NEXT: and $1, $2, $1
++; M64-NEXT: sll $2, $6, 0
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $2, $1
++; M64-NEXT: xor $2, $2, $1
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+ ret i32 %result
+ }
+@@ -88,22 +86,21 @@ define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) {
+ ; M32-NEXT: negu $3, $3
+ ; M32-NEXT: xor $2, $6, $1
+ ; M32-NEXT: and $2, $2, $3
+-; M32-NEXT: xor $2, $2, $1
++; M32-NEXT: xor $2, $1, $2
+ ; M32-NEXT: lw $1, 20($sp)
+ ; M32-NEXT: xor $4, $7, $1
+ ; M32-NEXT: and $3, $4, $3
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: xor $3, $3, $1
++; M32-NEXT: xor $3, $1, $3
+ ;
+ ; M64-LABEL: test_ctselect_i64:
+ ; M64: # %bb.0:
+-; M64-NEXT: andi $1, $4, 1
+-; M64-NEXT: dnegu $2, $1
+-; M64-NEXT: daddiu $1, $1, -1
+-; M64-NEXT: and $2, $2, $5
+-; M64-NEXT: and $1, $1, $6
++; M64-NEXT: andi $2, $4, 1
++; M64-NEXT: xor $1, $5, $6
++; M64-NEXT: dnegu $2, $2
++; M64-NEXT: and $1, $1, $2
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $2, $1
++; M64-NEXT: xor $2, $6, $1
+ %result = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b)
+ ret i64 %result
+ }
+@@ -111,23 +108,21 @@ define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) {
+ define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) {
+ ; M32-LABEL: test_ctselect_ptr:
+ ; M32: # %bb.0:
+-; M32-NEXT: andi $1, $4, 1
+-; M32-NEXT: negu $2, $1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: and $2, $2, $5
+-; M32-NEXT: and $1, $1, $6
++; M32-NEXT: andi $2, $4, 1
++; M32-NEXT: xor $1, $5, $6
++; M32-NEXT: negu $2, $2
++; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: xor $2, $6, $1
+ ;
+ ; M64-LABEL: test_ctselect_ptr:
+ ; M64: # %bb.0:
+-; M64-NEXT: andi $1, $4, 1
+-; M64-NEXT: dnegu $2, $1
+-; M64-NEXT: daddiu $1, $1, -1
+-; M64-NEXT: and $2, $2, $5
+-; M64-NEXT: and $1, $1, $6
++; M64-NEXT: andi $2, $4, 1
++; M64-NEXT: xor $1, $5, $6
++; M64-NEXT: dnegu $2, $2
++; M64-NEXT: and $1, $1, $2
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $2, $1
++; M64-NEXT: xor $2, $6, $1
+ %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b)
+ ret ptr %result
+ }
+@@ -151,13 +146,12 @@ define i32 @test_ctselect_const_false(i32 %a, i32 %b) {
+ ; M32-LABEL: test_ctselect_const_false:
+ ; M32: # %bb.0:
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $zero, $5
++; M32-NEXT: move $2, $5
+ ;
+ ; M64-LABEL: test_ctselect_const_false:
+ ; M64: # %bb.0:
+-; M64-NEXT: sll $1, $5, 0
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $zero, $1
++; M64-NEXT: sll $2, $5, 0
+ %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b)
+ ret i32 %result
+ }
+@@ -166,29 +160,27 @@ define i32 @test_ctselect_const_false(i32 %a, i32 %b) {
+ define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) {
+ ; M32-LABEL: test_ctselect_icmp_eq:
+ ; M32: # %bb.0:
+-; M32-NEXT: xor $1, $4, $5
+-; M32-NEXT: sltu $1, $zero, $1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: and $2, $1, $6
+-; M32-NEXT: not $1, $1
+-; M32-NEXT: and $1, $1, $7
++; M32-NEXT: xor $2, $4, $5
++; M32-NEXT: xor $1, $6, $7
++; M32-NEXT: sltiu $2, $2, 1
++; M32-NEXT: negu $2, $2
++; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: xor $2, $7, $1
+ ;
+ ; M64-LABEL: test_ctselect_icmp_eq:
+ ; M64: # %bb.0:
+ ; M64-NEXT: sll $1, $5, 0
+ ; M64-NEXT: sll $2, $4, 0
+-; M64-NEXT: sll $3, $7, 0
+ ; M64-NEXT: xor $1, $2, $1
+-; M64-NEXT: sll $2, $6, 0
+-; M64-NEXT: sltu $1, $zero, $1
+-; M64-NEXT: addiu $1, $1, -1
+-; M64-NEXT: and $2, $1, $2
+-; M64-NEXT: not $1, $1
+-; M64-NEXT: and $1, $1, $3
++; M64-NEXT: xor $2, $6, $7
++; M64-NEXT: sltiu $1, $1, 1
++; M64-NEXT: sll $2, $2, 0
++; M64-NEXT: negu $1, $1
++; M64-NEXT: and $1, $2, $1
++; M64-NEXT: sll $2, $7, 0
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $2, $1
++; M64-NEXT: xor $2, $2, $1
+ %cond = icmp eq i32 %x, %y
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+ ret i32 %result
+@@ -197,29 +189,27 @@ define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) {
+ define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) {
+ ; M32-LABEL: test_ctselect_icmp_ne:
+ ; M32: # %bb.0:
+-; M32-NEXT: xor $1, $4, $5
+-; M32-NEXT: sltiu $1, $1, 1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: and $2, $1, $6
+-; M32-NEXT: not $1, $1
+-; M32-NEXT: and $1, $1, $7
++; M32-NEXT: xor $2, $4, $5
++; M32-NEXT: xor $1, $6, $7
++; M32-NEXT: sltu $2, $zero, $2
++; M32-NEXT: negu $2, $2
++; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: xor $2, $7, $1
+ ;
+ ; M64-LABEL: test_ctselect_icmp_ne:
+ ; M64: # %bb.0:
+ ; M64-NEXT: sll $1, $5, 0
+ ; M64-NEXT: sll $2, $4, 0
+-; M64-NEXT: sll $3, $7, 0
+ ; M64-NEXT: xor $1, $2, $1
+-; M64-NEXT: sll $2, $6, 0
+-; M64-NEXT: sltiu $1, $1, 1
+-; M64-NEXT: addiu $1, $1, -1
+-; M64-NEXT: and $2, $1, $2
+-; M64-NEXT: not $1, $1
+-; M64-NEXT: and $1, $1, $3
++; M64-NEXT: xor $2, $6, $7
++; M64-NEXT: sltu $1, $zero, $1
++; M64-NEXT: sll $2, $2, 0
++; M64-NEXT: negu $1, $1
++; M64-NEXT: and $1, $2, $1
++; M64-NEXT: sll $2, $7, 0
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $2, $1
++; M64-NEXT: xor $2, $2, $1
+ %cond = icmp ne i32 %x, %y
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+ ret i32 %result
+@@ -228,29 +218,25 @@ define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) {
+ define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) {
+ ; M32-LABEL: test_ctselect_icmp_slt:
+ ; M32: # %bb.0:
+-; M32-NEXT: slt $1, $4, $5
+-; M32-NEXT: xori $1, $1, 1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: and $2, $1, $6
+-; M32-NEXT: not $1, $1
+-; M32-NEXT: and $1, $1, $7
++; M32-NEXT: slt $2, $4, $5
++; M32-NEXT: xor $1, $6, $7
++; M32-NEXT: negu $2, $2
++; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: xor $2, $7, $1
+ ;
+ ; M64-LABEL: test_ctselect_icmp_slt:
+ ; M64: # %bb.0:
+ ; M64-NEXT: sll $1, $5, 0
+ ; M64-NEXT: sll $2, $4, 0
+-; M64-NEXT: sll $3, $7, 0
+ ; M64-NEXT: slt $1, $2, $1
+-; M64-NEXT: sll $2, $6, 0
+-; M64-NEXT: xori $1, $1, 1
+-; M64-NEXT: addiu $1, $1, -1
+-; M64-NEXT: and $2, $1, $2
+-; M64-NEXT: not $1, $1
+-; M64-NEXT: and $1, $1, $3
++; M64-NEXT: xor $2, $6, $7
++; M64-NEXT: negu $1, $1
++; M64-NEXT: sll $2, $2, 0
++; M64-NEXT: and $1, $2, $1
++; M64-NEXT: sll $2, $7, 0
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $2, $1
++; M64-NEXT: xor $2, $2, $1
+ %cond = icmp slt i32 %x, %y
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+ ret i32 %result
+@@ -259,29 +245,25 @@ define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) {
+ define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) {
+ ; M32-LABEL: test_ctselect_icmp_ult:
+ ; M32: # %bb.0:
+-; M32-NEXT: sltu $1, $4, $5
+-; M32-NEXT: xori $1, $1, 1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: and $2, $1, $6
+-; M32-NEXT: not $1, $1
+-; M32-NEXT: and $1, $1, $7
++; M32-NEXT: sltu $2, $4, $5
++; M32-NEXT: xor $1, $6, $7
++; M32-NEXT: negu $2, $2
++; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: xor $2, $7, $1
+ ;
+ ; M64-LABEL: test_ctselect_icmp_ult:
+ ; M64: # %bb.0:
+ ; M64-NEXT: sll $1, $5, 0
+ ; M64-NEXT: sll $2, $4, 0
+-; M64-NEXT: sll $3, $7, 0
+ ; M64-NEXT: sltu $1, $2, $1
+-; M64-NEXT: sll $2, $6, 0
+-; M64-NEXT: xori $1, $1, 1
+-; M64-NEXT: addiu $1, $1, -1
+-; M64-NEXT: and $2, $1, $2
+-; M64-NEXT: not $1, $1
+-; M64-NEXT: and $1, $1, $3
++; M64-NEXT: xor $2, $6, $7
++; M64-NEXT: negu $1, $1
++; M64-NEXT: sll $2, $2, 0
++; M64-NEXT: and $1, $2, $1
++; M64-NEXT: sll $2, $7, 0
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $2, $1
++; M64-NEXT: xor $2, $2, $1
+ %cond = icmp ult i32 %x, %y
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+ ret i32 %result
+@@ -291,28 +273,26 @@ define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) {
+ define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) {
+ ; M32-LABEL: test_ctselect_load:
+ ; M32: # %bb.0:
+-; M32-NEXT: andi $1, $4, 1
++; M32-NEXT: lw $2, 0($6)
+ ; M32-NEXT: lw $3, 0($5)
+-; M32-NEXT: negu $2, $1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: and $2, $2, $3
+-; M32-NEXT: lw $3, 0($6)
+-; M32-NEXT: and $1, $1, $3
++; M32-NEXT: andi $1, $4, 1
++; M32-NEXT: negu $1, $1
++; M32-NEXT: xor $3, $3, $2
++; M32-NEXT: and $1, $3, $1
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: xor $2, $2, $1
+ ;
+ ; M64-LABEL: test_ctselect_load:
+ ; M64: # %bb.0:
+-; M64-NEXT: sll $1, $4, 0
+-; M64-NEXT: lw $3, 0($5)
+-; M64-NEXT: andi $1, $1, 1
+-; M64-NEXT: negu $2, $1
+-; M64-NEXT: addiu $1, $1, -1
++; M64-NEXT: sll $3, $4, 0
++; M64-NEXT: lw $1, 0($6)
++; M64-NEXT: lw $2, 0($5)
++; M64-NEXT: andi $3, $3, 1
++; M64-NEXT: xor $2, $2, $1
++; M64-NEXT: negu $3, $3
+ ; M64-NEXT: and $2, $2, $3
+-; M64-NEXT: lw $3, 0($6)
+-; M64-NEXT: and $1, $1, $3
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $2, $1
++; M64-NEXT: xor $2, $1, $2
+ %a = load i32, ptr %p1
+ %b = load i32, ptr %p2
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+@@ -323,41 +303,37 @@ define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) {
+ define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) {
+ ; M32-LABEL: test_ctselect_nested:
+ ; M32: # %bb.0:
+-; M32-NEXT: andi $1, $5, 1
++; M32-NEXT: andi $2, $5, 1
++; M32-NEXT: xor $1, $6, $7
+ ; M32-NEXT: andi $3, $4, 1
+-; M32-NEXT: negu $2, $1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: negu $4, $3
+-; M32-NEXT: and $2, $2, $6
+-; M32-NEXT: and $1, $1, $7
+-; M32-NEXT: or $1, $2, $1
+-; M32-NEXT: addiu $2, $3, -1
+-; M32-NEXT: lw $3, 16($sp)
+-; M32-NEXT: and $1, $4, $1
+-; M32-NEXT: and $2, $2, $3
++; M32-NEXT: negu $2, $2
++; M32-NEXT: negu $3, $3
++; M32-NEXT: and $1, $1, $2
++; M32-NEXT: lw $2, 16($sp)
++; M32-NEXT: xor $1, $7, $1
++; M32-NEXT: xor $1, $1, $2
++; M32-NEXT: and $1, $1, $3
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $1, $2
++; M32-NEXT: xor $2, $2, $1
+ ;
+ ; M64-LABEL: test_ctselect_nested:
+ ; M64: # %bb.0:
+ ; M64-NEXT: sll $1, $5, 0
+-; M64-NEXT: sll $3, $6, 0
+-; M64-NEXT: sll $4, $4, 0
++; M64-NEXT: xor $2, $6, $7
++; M64-NEXT: sll $3, $4, 0
+ ; M64-NEXT: andi $1, $1, 1
+-; M64-NEXT: andi $4, $4, 1
+-; M64-NEXT: negu $2, $1
+-; M64-NEXT: addiu $1, $1, -1
+-; M64-NEXT: negu $5, $4
+-; M64-NEXT: and $2, $2, $3
+-; M64-NEXT: sll $3, $7, 0
+-; M64-NEXT: and $1, $1, $3
+-; M64-NEXT: addiu $3, $4, -1
+-; M64-NEXT: or $1, $2, $1
++; M64-NEXT: sll $2, $2, 0
++; M64-NEXT: andi $3, $3, 1
++; M64-NEXT: negu $1, $1
++; M64-NEXT: negu $3, $3
++; M64-NEXT: and $1, $2, $1
++; M64-NEXT: sll $2, $7, 0
++; M64-NEXT: xor $1, $2, $1
+ ; M64-NEXT: sll $2, $8, 0
+-; M64-NEXT: and $1, $5, $1
+-; M64-NEXT: and $2, $3, $2
++; M64-NEXT: xor $1, $1, $2
++; M64-NEXT: and $1, $1, $3
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $1, $2
++; M64-NEXT: xor $2, $2, $1
+ %inner = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b)
+ %result = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %inner, i32 %c)
+ ret i32 %result
+diff --git a/llvm/test/CodeGen/Mips/ctselect-side-effects.ll b/llvm/test/CodeGen/Mips/ctselect-side-effects.ll
+index 6cfa07afdd51..069100e2d2a7 100644
+--- a/llvm/test/CodeGen/Mips/ctselect-side-effects.ll
++++ b/llvm/test/CodeGen/Mips/ctselect-side-effects.ll
+@@ -38,26 +38,24 @@ define i32 @test_constant_fold() {
+ define i32 @test_protected_no_branch(i1 %cond, i32 %a, i32 %b) {
+ ; M32-LABEL: test_protected_no_branch:
+ ; M32: # %bb.0:
+-; M32-NEXT: andi $1, $4, 1
+-; M32-NEXT: negu $2, $1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: and $2, $2, $5
+-; M32-NEXT: and $1, $1, $6
++; M32-NEXT: andi $2, $4, 1
++; M32-NEXT: xor $1, $5, $6
++; M32-NEXT: negu $2, $2
++; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: xor $2, $6, $1
+ ;
+ ; M64-LABEL: test_protected_no_branch:
+ ; M64: # %bb.0:
+ ; M64-NEXT: sll $1, $4, 0
+-; M64-NEXT: sll $3, $5, 0
++; M64-NEXT: xor $2, $5, $6
+ ; M64-NEXT: andi $1, $1, 1
+-; M64-NEXT: negu $2, $1
+-; M64-NEXT: addiu $1, $1, -1
+-; M64-NEXT: and $2, $2, $3
+-; M64-NEXT: sll $3, $6, 0
+-; M64-NEXT: and $1, $1, $3
++; M64-NEXT: sll $2, $2, 0
++; M64-NEXT: negu $1, $1
++; M64-NEXT: and $1, $2, $1
++; M64-NEXT: sll $2, $6, 0
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $2, $1
++; M64-NEXT: xor $2, $2, $1
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+ ret i32 %result
+ }
+diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback.ll
+index d4617c7e75da..ee8072703ee3 100644
+--- a/llvm/test/CodeGen/RISCV/ctselect-fallback.ll
++++ b/llvm/test/CodeGen/RISCV/ctselect-fallback.ll
+@@ -101,8 +101,6 @@ define i32 @test_ctselect_const_true(i32 %a, i32 %b) {
+ ;
+ ; RV32-LABEL: test_ctselect_const_true:
+ ; RV32: # %bb.0:
+-; RV32-NEXT: xor a0, a0, a1
+-; RV32-NEXT: xor a0, a1, a0
+ ; RV32-NEXT: ret
+ %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b)
+ ret i32 %result
+@@ -208,7 +206,7 @@ define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) {
+ define i32 @test_ctselect_nested_and_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
+ ; RV64-LABEL: test_ctselect_nested_and_i1_to_i32:
+ ; RV64: # %bb.0:
+-; RV64-NEXT: and a0, a1, a0
++; RV64-NEXT: and a0, a0, a1
+ ; RV64-NEXT: xor a2, a2, a3
+ ; RV64-NEXT: slli a0, a0, 63
+ ; RV64-NEXT: srai a0, a0, 63
+@@ -218,7 +216,7 @@ define i32 @test_ctselect_nested_and_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
+ ;
+ ; RV32-LABEL: test_ctselect_nested_and_i1_to_i32:
+ ; RV32: # %bb.0:
+-; RV32-NEXT: and a0, a1, a0
++; RV32-NEXT: and a0, a0, a1
+ ; RV32-NEXT: xor a2, a2, a3
+ ; RV32-NEXT: slli a0, a0, 31
+ ; RV32-NEXT: srai a0, a0, 31
+@@ -265,8 +263,8 @@ define i32 @test_ctselect_nested_or_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
+ define i32 @test_ctselect_double_nested_and_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x, i32 %y) {
+ ; RV64-LABEL: test_ctselect_double_nested_and_i1:
+ ; RV64: # %bb.0:
+-; RV64-NEXT: and a1, a2, a1
+-; RV64-NEXT: and a0, a1, a0
++; RV64-NEXT: and a0, a0, a1
++; RV64-NEXT: and a0, a0, a2
+ ; RV64-NEXT: xor a3, a3, a4
+ ; RV64-NEXT: slli a0, a0, 63
+ ; RV64-NEXT: srai a0, a0, 63
+@@ -276,8 +274,8 @@ define i32 @test_ctselect_double_nested_and_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x, i
+ ;
+ ; RV32-LABEL: test_ctselect_double_nested_and_i1:
+ ; RV32: # %bb.0:
+-; RV32-NEXT: and a1, a2, a1
+-; RV32-NEXT: and a0, a1, a0
++; RV32-NEXT: and a0, a0, a1
++; RV32-NEXT: and a0, a0, a2
+ ; RV32-NEXT: xor a3, a3, a4
+ ; RV32-NEXT: slli a0, a0, 31
+ ; RV32-NEXT: srai a0, a0, 31
+@@ -295,7 +293,7 @@ define i32 @test_ctselect_double_nested_and_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x, i
+ define i32 @test_ctselect_double_nested_mixed_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x, i32 %y, i32 %z) {
+ ; RV64-LABEL: test_ctselect_double_nested_mixed_i1:
+ ; RV64: # %bb.0:
+-; RV64-NEXT: and a0, a1, a0
++; RV64-NEXT: and a0, a0, a1
+ ; RV64-NEXT: xor a3, a3, a4
+ ; RV64-NEXT: or a0, a0, a2
+ ; RV64-NEXT: slli a0, a0, 63
+@@ -309,7 +307,7 @@ define i32 @test_ctselect_double_nested_mixed_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x,
+ ;
+ ; RV32-LABEL: test_ctselect_double_nested_mixed_i1:
+ ; RV32: # %bb.0:
+-; RV32-NEXT: and a0, a1, a0
++; RV32-NEXT: and a0, a0, a1
+ ; RV32-NEXT: xor a3, a3, a4
+ ; RV32-NEXT: or a0, a0, a2
+ ; RV32-NEXT: slli a0, a0, 31
+@@ -382,7 +380,7 @@ define float @test_ctselect_f32_nan_inf(i1 %cond) {
+ ; RV32-NEXT: srai a0, a0, 31
+ ; RV32-NEXT: and a0, a0, a1
+ ; RV32-NEXT: lui a1, 522240
+-; RV32-NEXT: xor a0, a0, a1
++; RV32-NEXT: or a0, a0, a1
+ ; RV32-NEXT: ret
+ %result = call float @llvm.ct.select.f32(i1 %cond, float 0x7FF8000000000000, float 0x7FF0000000000000)
+ ret float %result
+@@ -398,7 +396,7 @@ define double @test_ctselect_f64_nan_inf(i1 %cond) {
+ ; RV64-NEXT: and a0, a0, a1
+ ; RV64-NEXT: li a1, 2047
+ ; RV64-NEXT: slli a1, a1, 52
+-; RV64-NEXT: xor a0, a0, a1
++; RV64-NEXT: or a0, a0, a1
+ ; RV64-NEXT: ret
+ ;
+ ; RV32-LABEL: test_ctselect_f64_nan_inf:
+diff --git a/llvm/test/CodeGen/X86/ctselect.ll b/llvm/test/CodeGen/X86/ctselect.ll
+index bf65e04721df..e1abae80cef4 100644
+--- a/llvm/test/CodeGen/X86/ctselect.ll
++++ b/llvm/test/CodeGen/X86/ctselect.ll
+@@ -9,8 +9,8 @@ define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) {
+ ; X64-LABEL: test_ctselect_i8:
+ ; X64: # %bb.0:
+ ; X64-NEXT: movl %edi, %eax
+-; X64-NEXT: xorl %edx, %esi
+ ; X64-NEXT: andb $1, %al
++; X64-NEXT: xorl %edx, %esi
+ ; X64-NEXT: negb %al
+ ; X64-NEXT: andb %sil, %al
+ ; X64-NEXT: xorb %dl, %al
+@@ -20,10 +20,10 @@ define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) {
+ ; X32-LABEL: test_ctselect_i8:
+ ; X32: # %bb.0:
+ ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
++; X32-NEXT: andb $1, %al
+ ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+ ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+ ; X32-NEXT: xorb %cl, %dl
+-; X32-NEXT: andb $1, %al
+ ; X32-NEXT: negb %al
+ ; X32-NEXT: andb %dl, %al
+ ; X32-NEXT: xorb %cl, %al
+@@ -32,10 +32,10 @@ define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) {
+ ; X32-NOCMOV-LABEL: test_ctselect_i8:
+ ; X32-NOCMOV: # %bb.0:
+ ; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
++; X32-NOCMOV-NEXT: andb $1, %al
+ ; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+ ; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+ ; X32-NOCMOV-NEXT: xorb %cl, %dl
+-; X32-NOCMOV-NEXT: andb $1, %al
+ ; X32-NOCMOV-NEXT: negb %al
+ ; X32-NOCMOV-NEXT: andb %dl, %al
+ ; X32-NOCMOV-NEXT: xorb %cl, %al
+@@ -58,10 +58,11 @@ define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) {
+ ; X32-LABEL: test_ctselect_i32:
+ ; X32: # %bb.0:
+ ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
++; X32-NEXT: andb $1, %al
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+ ; X32-NEXT: xorl %ecx, %edx
+-; X32-NEXT: andl $1, %eax
++; X32-NEXT: movzbl %al, %eax
+ ; X32-NEXT: negl %eax
+ ; X32-NEXT: andl %edx, %eax
+ ; X32-NEXT: xorl %ecx, %eax
+@@ -70,10 +71,11 @@ define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) {
+ ; X32-NOCMOV-LABEL: test_ctselect_i32:
+ ; X32-NOCMOV: # %bb.0:
+ ; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
++; X32-NOCMOV-NEXT: andb $1, %al
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+ ; X32-NOCMOV-NEXT: xorl %ecx, %edx
+-; X32-NOCMOV-NEXT: andl $1, %eax
++; X32-NOCMOV-NEXT: movzbl %al, %eax
+ ; X32-NOCMOV-NEXT: negl %eax
+ ; X32-NOCMOV-NEXT: andl %edx, %eax
+ ; X32-NOCMOV-NEXT: xorl %ecx, %eax
+@@ -95,45 +97,57 @@ define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) {
+ ;
+ ; X32-LABEL: test_ctselect_i64:
+ ; X32: # %bb.0:
+-; X32-NEXT: pushl %esi
++; X32-NEXT: pushl %edi
+ ; X32-NEXT: .cfi_def_cfa_offset 8
+-; X32-NEXT: .cfi_offset %esi, -8
+-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %esi
+-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
++; X32-NEXT: pushl %esi
++; X32-NEXT: .cfi_def_cfa_offset 12
++; X32-NEXT: .cfi_offset %esi, -12
++; X32-NEXT: .cfi_offset %edi, -8
++; X32-NEXT: movzbl {{[0-9]+}}(%esp), %edx
++; X32-NEXT: andb $1, %dl
++; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+-; X32-NEXT: xorl %edx, %eax
+-; X32-NEXT: andl $1, %esi
+-; X32-NEXT: negl %esi
+-; X32-NEXT: andl %esi, %eax
+-; X32-NEXT: xorl %edx, %eax
++; X32-NEXT: xorl %esi, %eax
++; X32-NEXT: movzbl %dl, %edi
++; X32-NEXT: negl %edi
++; X32-NEXT: andl %edi, %eax
++; X32-NEXT: xorl %esi, %eax
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+ ; X32-NEXT: xorl %ecx, %edx
+-; X32-NEXT: andl %esi, %edx
++; X32-NEXT: andl %edi, %edx
+ ; X32-NEXT: xorl %ecx, %edx
+ ; X32-NEXT: popl %esi
++; X32-NEXT: .cfi_def_cfa_offset 8
++; X32-NEXT: popl %edi
+ ; X32-NEXT: .cfi_def_cfa_offset 4
+ ; X32-NEXT: retl
+ ;
+ ; X32-NOCMOV-LABEL: test_ctselect_i64:
+ ; X32-NOCMOV: # %bb.0:
+-; X32-NOCMOV-NEXT: pushl %esi
++; X32-NOCMOV-NEXT: pushl %edi
+ ; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+-; X32-NOCMOV-NEXT: .cfi_offset %esi, -8
+-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %esi
+-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
++; X32-NOCMOV-NEXT: pushl %esi
++; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
++; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
++; X32-NOCMOV-NEXT: .cfi_offset %edi, -8
++; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %edx
++; X32-NOCMOV-NEXT: andb $1, %dl
++; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+-; X32-NOCMOV-NEXT: xorl %edx, %eax
+-; X32-NOCMOV-NEXT: andl $1, %esi
+-; X32-NOCMOV-NEXT: negl %esi
+-; X32-NOCMOV-NEXT: andl %esi, %eax
+-; X32-NOCMOV-NEXT: xorl %edx, %eax
++; X32-NOCMOV-NEXT: xorl %esi, %eax
++; X32-NOCMOV-NEXT: movzbl %dl, %edi
++; X32-NOCMOV-NEXT: negl %edi
++; X32-NOCMOV-NEXT: andl %edi, %eax
++; X32-NOCMOV-NEXT: xorl %esi, %eax
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+ ; X32-NOCMOV-NEXT: xorl %ecx, %edx
+-; X32-NOCMOV-NEXT: andl %esi, %edx
++; X32-NOCMOV-NEXT: andl %edi, %edx
+ ; X32-NOCMOV-NEXT: xorl %ecx, %edx
+ ; X32-NOCMOV-NEXT: popl %esi
++; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
++; X32-NOCMOV-NEXT: popl %edi
+ ; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
+ ; X32-NOCMOV-NEXT: retl
+ %result = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b)
+@@ -155,37 +169,47 @@ define float @test_ctselect_f32(i1 %cond, float %a, float %b) {
+ ;
+ ; X32-LABEL: test_ctselect_f32:
+ ; X32: # %bb.0:
+-; X32-NEXT: pushl %eax
+-; X32-NEXT: .cfi_def_cfa_offset 8
++; X32-NEXT: subl $12, %esp
++; X32-NEXT: .cfi_def_cfa_offset 16
++; X32-NEXT: flds {{[0-9]+}}(%esp)
++; X32-NEXT: fstps {{[0-9]+}}(%esp)
++; X32-NEXT: flds {{[0-9]+}}(%esp)
++; X32-NEXT: fstps (%esp)
+ ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
++; X32-NEXT: andb $1, %al
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+-; X32-NEXT: xorl %ecx, %edx
+-; X32-NEXT: andl $1, %eax
++; X32-NEXT: movzbl %al, %eax
+ ; X32-NEXT: negl %eax
+-; X32-NEXT: andl %edx, %eax
+-; X32-NEXT: xorl %ecx, %eax
+-; X32-NEXT: movl %eax, (%esp)
+-; X32-NEXT: flds (%esp)
+-; X32-NEXT: popl %eax
++; X32-NEXT: movl (%esp), %edx
++; X32-NEXT: xorl %ecx, %edx
++; X32-NEXT: andl %eax, %edx
++; X32-NEXT: xorl %ecx, %edx
++; X32-NEXT: movl %edx, {{[0-9]+}}(%esp)
++; X32-NEXT: flds {{[0-9]+}}(%esp)
++; X32-NEXT: addl $12, %esp
+ ; X32-NEXT: .cfi_def_cfa_offset 4
+ ; X32-NEXT: retl
+ ;
+ ; X32-NOCMOV-LABEL: test_ctselect_f32:
+ ; X32-NOCMOV: # %bb.0:
+-; X32-NOCMOV-NEXT: pushl %eax
+-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
++; X32-NOCMOV-NEXT: subl $12, %esp
++; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
++; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
++; X32-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp)
++; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
++; X32-NOCMOV-NEXT: fstps (%esp)
+ ; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
++; X32-NOCMOV-NEXT: andb $1, %al
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+-; X32-NOCMOV-NEXT: xorl %ecx, %edx
+-; X32-NOCMOV-NEXT: andl $1, %eax
++; X32-NOCMOV-NEXT: movzbl %al, %eax
+ ; X32-NOCMOV-NEXT: negl %eax
+-; X32-NOCMOV-NEXT: andl %edx, %eax
+-; X32-NOCMOV-NEXT: xorl %ecx, %eax
+-; X32-NOCMOV-NEXT: movl %eax, (%esp)
+-; X32-NOCMOV-NEXT: flds (%esp)
+-; X32-NOCMOV-NEXT: popl %eax
++; X32-NOCMOV-NEXT: movl (%esp), %edx
++; X32-NOCMOV-NEXT: xorl %ecx, %edx
++; X32-NOCMOV-NEXT: andl %eax, %edx
++; X32-NOCMOV-NEXT: xorl %ecx, %edx
++; X32-NOCMOV-NEXT: movl %edx, {{[0-9]+}}(%esp)
++; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
++; X32-NOCMOV-NEXT: addl $12, %esp
+ ; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
+ ; X32-NOCMOV-NEXT: retl
+ %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b)
+@@ -281,10 +305,11 @@ define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) {
+ ; X32-LABEL: test_ctselect_ptr:
+ ; X32: # %bb.0:
+ ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
++; X32-NEXT: andb $1, %al
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+ ; X32-NEXT: xorl %ecx, %edx
+-; X32-NEXT: andl $1, %eax
++; X32-NEXT: movzbl %al, %eax
+ ; X32-NEXT: negl %eax
+ ; X32-NEXT: andl %edx, %eax
+ ; X32-NEXT: xorl %ecx, %eax
+@@ -293,10 +318,11 @@ define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) {
+ ; X32-NOCMOV-LABEL: test_ctselect_ptr:
+ ; X32-NOCMOV: # %bb.0:
+ ; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
++; X32-NOCMOV-NEXT: andb $1, %al
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+ ; X32-NOCMOV-NEXT: xorl %ecx, %edx
+-; X32-NOCMOV-NEXT: andl $1, %eax
++; X32-NOCMOV-NEXT: movzbl %al, %eax
+ ; X32-NOCMOV-NEXT: negl %eax
+ ; X32-NOCMOV-NEXT: andl %edx, %eax
+ ; X32-NOCMOV-NEXT: xorl %ecx, %eax
+@@ -310,24 +336,16 @@ define i32 @test_ctselect_const_true(i32 %a, i32 %b) {
+ ; X64-LABEL: test_ctselect_const_true:
+ ; X64: # %bb.0:
+ ; X64-NEXT: movl %edi, %eax
+-; X64-NEXT: xorl %esi, %eax
+-; X64-NEXT: xorl %esi, %eax
+ ; X64-NEXT: retq
+ ;
+ ; X32-LABEL: test_ctselect_const_true:
+ ; X32: # %bb.0:
+-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+-; X32-NEXT: xorl %ecx, %eax
+-; X32-NEXT: xorl %ecx, %eax
+ ; X32-NEXT: retl
+ ;
+ ; X32-NOCMOV-LABEL: test_ctselect_const_true:
+ ; X32-NOCMOV: # %bb.0:
+-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+-; X32-NOCMOV-NEXT: xorl %ecx, %eax
+-; X32-NOCMOV-NEXT: xorl %ecx, %eax
+ ; X32-NOCMOV-NEXT: retl
+ %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b)
+ ret i32 %result
+@@ -341,14 +359,12 @@ define i32 @test_ctselect_const_false(i32 %a, i32 %b) {
+ ;
+ ; X32-LABEL: test_ctselect_const_false:
+ ; X32: # %bb.0:
+-; X32-NEXT: xorl %eax, %eax
+-; X32-NEXT: xorl {{[0-9]+}}(%esp), %eax
++; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+ ; X32-NEXT: retl
+ ;
+ ; X32-NOCMOV-LABEL: test_ctselect_const_false:
+ ; X32-NOCMOV: # %bb.0:
+-; X32-NOCMOV-NEXT: xorl %eax, %eax
+-; X32-NOCMOV-NEXT: xorl {{[0-9]+}}(%esp), %eax
++; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+ ; X32-NOCMOV-NEXT: retl
+ %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b)
+ ret i32 %result
+@@ -443,19 +459,20 @@ define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) {
+ define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) {
+ ; X64-LABEL: test_ctselect_fcmp_oeq:
+ ; X64: # %bb.0:
+-; X64-NEXT: movd %xmm3, %eax
+ ; X64-NEXT: cmpeqss %xmm1, %xmm0
+-; X64-NEXT: pxor %xmm3, %xmm2
+-; X64-NEXT: pand %xmm0, %xmm2
+-; X64-NEXT: movd %xmm2, %ecx
+-; X64-NEXT: xorl %eax, %ecx
+-; X64-NEXT: movd %ecx, %xmm0
++; X64-NEXT: xorps %xmm3, %xmm2
++; X64-NEXT: andps %xmm2, %xmm0
++; X64-NEXT: xorps %xmm3, %xmm0
+ ; X64-NEXT: retq
+ ;
+ ; X32-LABEL: test_ctselect_fcmp_oeq:
+ ; X32: # %bb.0:
+-; X32-NEXT: pushl %eax
+-; X32-NEXT: .cfi_def_cfa_offset 8
++; X32-NEXT: subl $12, %esp
++; X32-NEXT: .cfi_def_cfa_offset 16
++; X32-NEXT: flds {{[0-9]+}}(%esp)
++; X32-NEXT: fstps {{[0-9]+}}(%esp)
++; X32-NEXT: flds {{[0-9]+}}(%esp)
++; X32-NEXT: fstps (%esp)
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+ ; X32-NEXT: flds {{[0-9]+}}(%esp)
+ ; X32-NEXT: flds {{[0-9]+}}(%esp)
+@@ -466,20 +483,24 @@ define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) {
+ ; X32-NEXT: andb %cl, %dl
+ ; X32-NEXT: movzbl %dl, %ecx
+ ; X32-NEXT: negl %ecx
+-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
++; X32-NEXT: movl (%esp), %edx
+ ; X32-NEXT: xorl %eax, %edx
+ ; X32-NEXT: andl %ecx, %edx
+ ; X32-NEXT: xorl %eax, %edx
+-; X32-NEXT: movl %edx, (%esp)
+-; X32-NEXT: flds (%esp)
+-; X32-NEXT: popl %eax
++; X32-NEXT: movl %edx, {{[0-9]+}}(%esp)
++; X32-NEXT: flds {{[0-9]+}}(%esp)
++; X32-NEXT: addl $12, %esp
+ ; X32-NEXT: .cfi_def_cfa_offset 4
+ ; X32-NEXT: retl
+ ;
+ ; X32-NOCMOV-LABEL: test_ctselect_fcmp_oeq:
+ ; X32-NOCMOV: # %bb.0:
+-; X32-NOCMOV-NEXT: pushl %eax
+-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
++; X32-NOCMOV-NEXT: subl $12, %esp
++; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
++; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
++; X32-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp)
++; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
++; X32-NOCMOV-NEXT: fstps (%esp)
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+ ; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
+ ; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
+@@ -492,13 +513,13 @@ define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) {
+ ; X32-NOCMOV-NEXT: andb %al, %dl
+ ; X32-NOCMOV-NEXT: movzbl %dl, %eax
+ ; X32-NOCMOV-NEXT: negl %eax
+-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
++; X32-NOCMOV-NEXT: movl (%esp), %edx
+ ; X32-NOCMOV-NEXT: xorl %ecx, %edx
+ ; X32-NOCMOV-NEXT: andl %eax, %edx
+ ; X32-NOCMOV-NEXT: xorl %ecx, %edx
+-; X32-NOCMOV-NEXT: movl %edx, (%esp)
+-; X32-NOCMOV-NEXT: flds (%esp)
+-; X32-NOCMOV-NEXT: popl %eax
++; X32-NOCMOV-NEXT: movl %edx, {{[0-9]+}}(%esp)
++; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
++; X32-NOCMOV-NEXT: addl $12, %esp
+ ; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
+ ; X32-NOCMOV-NEXT: retl
+ %cond = fcmp oeq float %x, %y
+@@ -522,12 +543,13 @@ define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) {
+ ; X32-LABEL: test_ctselect_load:
+ ; X32: # %bb.0:
+ ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
++; X32-NEXT: andb $1, %al
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+ ; X32-NEXT: movl (%edx), %edx
+ ; X32-NEXT: movl (%ecx), %ecx
+ ; X32-NEXT: xorl %edx, %ecx
+-; X32-NEXT: andl $1, %eax
++; X32-NEXT: movzbl %al, %eax
+ ; X32-NEXT: negl %eax
+ ; X32-NEXT: andl %ecx, %eax
+ ; X32-NEXT: xorl %edx, %eax
+@@ -536,12 +558,13 @@ define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) {
+ ; X32-NOCMOV-LABEL: test_ctselect_load:
+ ; X32-NOCMOV: # %bb.0:
+ ; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
++; X32-NOCMOV-NEXT: andb $1, %al
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+ ; X32-NOCMOV-NEXT: movl (%edx), %edx
+ ; X32-NOCMOV-NEXT: movl (%ecx), %ecx
+ ; X32-NOCMOV-NEXT: xorl %edx, %ecx
+-; X32-NOCMOV-NEXT: andl $1, %eax
++; X32-NOCMOV-NEXT: movzbl %al, %eax
+ ; X32-NOCMOV-NEXT: negl %eax
+ ; X32-NOCMOV-NEXT: andl %ecx, %eax
+ ; X32-NOCMOV-NEXT: xorl %edx, %eax
+@@ -578,17 +601,19 @@ define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) {
+ ; X32-NEXT: .cfi_offset %esi, -12
+ ; X32-NEXT: .cfi_offset %edi, -8
+ ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
++; X32-NEXT: andb $1, %al
++; X32-NEXT: movb {{[0-9]+}}(%esp), %ah
++; X32-NEXT: andb $1, %ah
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %esi
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+-; X32-NEXT: xorl %edx, %edi
+-; X32-NEXT: andl $1, %esi
+-; X32-NEXT: negl %esi
+-; X32-NEXT: andl %edi, %esi
++; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
++; X32-NEXT: xorl %edx, %esi
++; X32-NEXT: movzbl %ah, %edi
++; X32-NEXT: negl %edi
++; X32-NEXT: andl %esi, %edi
+ ; X32-NEXT: xorl %ecx, %edx
+-; X32-NEXT: xorl %esi, %edx
+-; X32-NEXT: andl $1, %eax
++; X32-NEXT: xorl %edi, %edx
++; X32-NEXT: movzbl %al, %eax
+ ; X32-NEXT: negl %eax
+ ; X32-NEXT: andl %edx, %eax
+ ; X32-NEXT: xorl %ecx, %eax
+@@ -607,17 +632,19 @@ define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) {
+ ; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+ ; X32-NOCMOV-NEXT: .cfi_offset %edi, -8
+ ; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
++; X32-NOCMOV-NEXT: andb $1, %al
++; X32-NOCMOV-NEXT: movb {{[0-9]+}}(%esp), %ah
++; X32-NOCMOV-NEXT: andb $1, %ah
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %esi
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi
+-; X32-NOCMOV-NEXT: xorl %edx, %edi
+-; X32-NOCMOV-NEXT: andl $1, %esi
+-; X32-NOCMOV-NEXT: negl %esi
+-; X32-NOCMOV-NEXT: andl %edi, %esi
++; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
++; X32-NOCMOV-NEXT: xorl %edx, %esi
++; X32-NOCMOV-NEXT: movzbl %ah, %edi
++; X32-NOCMOV-NEXT: negl %edi
++; X32-NOCMOV-NEXT: andl %esi, %edi
+ ; X32-NOCMOV-NEXT: xorl %ecx, %edx
+-; X32-NOCMOV-NEXT: xorl %esi, %edx
+-; X32-NOCMOV-NEXT: andl $1, %eax
++; X32-NOCMOV-NEXT: xorl %edi, %edx
++; X32-NOCMOV-NEXT: movzbl %al, %eax
+ ; X32-NOCMOV-NEXT: negl %eax
+ ; X32-NOCMOV-NEXT: andl %edx, %eax
+ ; X32-NOCMOV-NEXT: xorl %ecx, %eax
+@@ -651,10 +678,10 @@ define i32 @test_ctselect_nested_and_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+ ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+ ; X32-NEXT: andb {{[0-9]+}}(%esp), %al
+-; X32-NEXT: movzbl %al, %eax
++; X32-NEXT: andb $1, %al
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+ ; X32-NEXT: xorl %ecx, %edx
+-; X32-NEXT: andl $1, %eax
++; X32-NEXT: movzbl %al, %eax
+ ; X32-NEXT: negl %eax
+ ; X32-NEXT: andl %edx, %eax
+ ; X32-NEXT: xorl %ecx, %eax
+@@ -665,10 +692,10 @@ define i32 @test_ctselect_nested_and_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+ ; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+ ; X32-NOCMOV-NEXT: andb {{[0-9]+}}(%esp), %al
+-; X32-NOCMOV-NEXT: movzbl %al, %eax
++; X32-NOCMOV-NEXT: andb $1, %al
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+ ; X32-NOCMOV-NEXT: xorl %ecx, %edx
+-; X32-NOCMOV-NEXT: andl $1, %eax
++; X32-NOCMOV-NEXT: movzbl %al, %eax
+ ; X32-NOCMOV-NEXT: negl %eax
+ ; X32-NOCMOV-NEXT: andl %edx, %eax
+ ; X32-NOCMOV-NEXT: xorl %ecx, %eax
+@@ -699,10 +726,10 @@ define i32 @test_ctselect_nested_or_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+ ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+ ; X32-NEXT: orb {{[0-9]+}}(%esp), %al
+-; X32-NEXT: movzbl %al, %eax
++; X32-NEXT: andb $1, %al
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+ ; X32-NEXT: xorl %ecx, %edx
+-; X32-NEXT: andl $1, %eax
++; X32-NEXT: movzbl %al, %eax
+ ; X32-NEXT: negl %eax
+ ; X32-NEXT: andl %edx, %eax
+ ; X32-NEXT: xorl %ecx, %eax
+@@ -713,10 +740,10 @@ define i32 @test_ctselect_nested_or_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+ ; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+ ; X32-NOCMOV-NEXT: orb {{[0-9]+}}(%esp), %al
+-; X32-NOCMOV-NEXT: movzbl %al, %eax
++; X32-NOCMOV-NEXT: andb $1, %al
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+ ; X32-NOCMOV-NEXT: xorl %ecx, %edx
+-; X32-NOCMOV-NEXT: andl $1, %eax
++; X32-NOCMOV-NEXT: movzbl %al, %eax
+ ; X32-NOCMOV-NEXT: negl %eax
+ ; X32-NOCMOV-NEXT: andl %edx, %eax
+ ; X32-NOCMOV-NEXT: xorl %ecx, %eax
+@@ -735,9 +762,9 @@ define i32 @test_ctselect_nested_or_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
+ define i32 @test_ctselect_double_nested_and_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x, i32 %y) {
+ ; X64-LABEL: test_ctselect_double_nested_and_i1:
+ ; X64: # %bb.0:
+-; X64-NEXT: movl %esi, %eax
++; X64-NEXT: movl %edi, %eax
++; X64-NEXT: andl %esi, %eax
+ ; X64-NEXT: andl %edx, %eax
+-; X64-NEXT: andl %edi, %eax
+ ; X64-NEXT: xorl %r8d, %ecx
+ ; X64-NEXT: andl $1, %eax
+ ; X64-NEXT: negl %eax
+@@ -751,10 +778,10 @@ define i32 @test_ctselect_double_nested_and_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x, i
+ ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+ ; X32-NEXT: andb {{[0-9]+}}(%esp), %al
+ ; X32-NEXT: andb {{[0-9]+}}(%esp), %al
+-; X32-NEXT: movzbl %al, %eax
++; X32-NEXT: andb $1, %al
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+ ; X32-NEXT: xorl %ecx, %edx
+-; X32-NEXT: andl $1, %eax
++; X32-NEXT: movzbl %al, %eax
+ ; X32-NEXT: negl %eax
+ ; X32-NEXT: andl %edx, %eax
+ ; X32-NEXT: xorl %ecx, %eax
+@@ -766,10 +793,10 @@ define i32 @test_ctselect_double_nested_and_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x, i
+ ; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+ ; X32-NOCMOV-NEXT: andb {{[0-9]+}}(%esp), %al
+ ; X32-NOCMOV-NEXT: andb {{[0-9]+}}(%esp), %al
+-; X32-NOCMOV-NEXT: movzbl %al, %eax
++; X32-NOCMOV-NEXT: andb $1, %al
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+ ; X32-NOCMOV-NEXT: xorl %ecx, %edx
+-; X32-NOCMOV-NEXT: andl $1, %eax
++; X32-NOCMOV-NEXT: movzbl %al, %eax
+ ; X32-NOCMOV-NEXT: negl %eax
+ ; X32-NOCMOV-NEXT: andl %edx, %eax
+ ; X32-NOCMOV-NEXT: xorl %ecx, %eax
+@@ -1403,7 +1430,7 @@ define float @test_ctselect_f32_nan_inf(i1 %cond) {
+ ; X64-NEXT: andl $1, %edi
+ ; X64-NEXT: negl %edi
+ ; X64-NEXT: andl $4194304, %edi # imm = 0x400000
+-; X64-NEXT: xorl $2139095040, %edi # imm = 0x7F800000
++; X64-NEXT: orl $2139095040, %edi # imm = 0x7F800000
+ ; X64-NEXT: movd %edi, %xmm0
+ ; X64-NEXT: retq
+ ;
+@@ -1412,10 +1439,11 @@ define float @test_ctselect_f32_nan_inf(i1 %cond) {
+ ; X32-NEXT: pushl %eax
+ ; X32-NEXT: .cfi_def_cfa_offset 8
+ ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+-; X32-NEXT: andl $1, %eax
++; X32-NEXT: andb $1, %al
++; X32-NEXT: movzbl %al, %eax
+ ; X32-NEXT: negl %eax
+ ; X32-NEXT: andl $4194304, %eax # imm = 0x400000
+-; X32-NEXT: xorl $2139095040, %eax # imm = 0x7F800000
++; X32-NEXT: orl $2139095040, %eax # imm = 0x7F800000
+ ; X32-NEXT: movl %eax, (%esp)
+ ; X32-NEXT: flds (%esp)
+ ; X32-NEXT: popl %eax
+@@ -1427,10 +1455,11 @@ define float @test_ctselect_f32_nan_inf(i1 %cond) {
+ ; X32-NOCMOV-NEXT: pushl %eax
+ ; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+ ; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+-; X32-NOCMOV-NEXT: andl $1, %eax
++; X32-NOCMOV-NEXT: andb $1, %al
++; X32-NOCMOV-NEXT: movzbl %al, %eax
+ ; X32-NOCMOV-NEXT: negl %eax
+ ; X32-NOCMOV-NEXT: andl $4194304, %eax # imm = 0x400000
+-; X32-NOCMOV-NEXT: xorl $2139095040, %eax # imm = 0x7F800000
++; X32-NOCMOV-NEXT: orl $2139095040, %eax # imm = 0x7F800000
+ ; X32-NOCMOV-NEXT: movl %eax, (%esp)
+ ; X32-NOCMOV-NEXT: flds (%esp)
+ ; X32-NOCMOV-NEXT: popl %eax
+@@ -1449,7 +1478,7 @@ define double @test_ctselect_f64_nan_inf(i1 %cond) {
+ ; X64-NEXT: movabsq $2251799813685248, %rax # imm = 0x8000000000000
+ ; X64-NEXT: andq %rdi, %rax
+ ; X64-NEXT: movabsq $9218868437227405312, %rcx # imm = 0x7FF0000000000000
+-; X64-NEXT: xorq %rax, %rcx
++; X64-NEXT: orq %rax, %rcx
+ ; X64-NEXT: movq %rcx, %xmm0
+ ; X64-NEXT: retq
+ ;
>From 257f7d254671bc21d342c28d4660b6ba7a23ea56 Mon Sep 17 00:00:00 2001
From: wizardengineer <juliuswoosebert at gmail.com>
Date: Wed, 5 Nov 2025 23:56:12 -0500
Subject: [PATCH 2/2] [LLVM][X86] Add f80 support for ct.select
Add special handling for x86_fp80 types in CTSELECT lowering by splitting
them into three 32-bit chunks, performing constant-time selection on each
chunk, and reassembling the result. This fixes crashes when compiling
tests with f80 types.
Also updated ctselect.ll to match current generic fallback implementation.
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 63 ++
llvm/lib/Target/X86/X86InstrInfo.cpp | 919 +++++++++++-----------
llvm/lib/Target/X86/X86InstrInfo.h | 21 +-
llvm/test/CodeGen/X86/ctselect-i386-fp.ll | 272 +++----
4 files changed, 663 insertions(+), 612 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 401c1953323f4..7a3bb3c648fbb 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -26170,6 +26170,69 @@ SDValue X86TargetLowering::LowerCT_SELECT(SDValue Op, SelectionDAG &DAG) const {
return DAG.getBitcast(VT, CtSelect);
}
+ // Handle f80 types by splitting into three 32-bit chunks
+ if (VT == MVT::f80) {
+ SDValue Chain = DAG.getEntryNode();
+
+ // Create temporary stack slots for input f80 values
+ SDValue TrueSlot = DAG.CreateStackTemporary(MVT::f80);
+ SDValue FalseSlot = DAG.CreateStackTemporary(MVT::f80);
+
+ // Store f80 values to memory
+ SDValue StoreTrueF80 =
+ DAG.getStore(Chain, DL, TrueOp, TrueSlot, MachinePointerInfo());
+ SDValue StoreFalseF80 =
+ DAG.getStore(Chain, DL, FalseOp, FalseSlot, MachinePointerInfo());
+
+ // Load i32 parts from memory (3 chunks for 96-bit f80 storage)
+ SDValue TruePart0 =
+ DAG.getLoad(MVT::i32, DL, StoreTrueF80, TrueSlot, MachinePointerInfo());
+ SDValue TruePart1Ptr =
+ DAG.getMemBasePlusOffset(TrueSlot, TypeSize::getFixed(4), DL);
+ SDValue TruePart1 = DAG.getLoad(MVT::i32, DL, StoreTrueF80, TruePart1Ptr,
+ MachinePointerInfo());
+ SDValue TruePart2Ptr =
+ DAG.getMemBasePlusOffset(TrueSlot, TypeSize::getFixed(8), DL);
+ SDValue TruePart2 = DAG.getLoad(MVT::i32, DL, StoreTrueF80, TruePart2Ptr,
+ MachinePointerInfo());
+
+ SDValue FalsePart0 = DAG.getLoad(MVT::i32, DL, StoreFalseF80, FalseSlot,
+ MachinePointerInfo());
+ SDValue FalsePart1Ptr =
+ DAG.getMemBasePlusOffset(FalseSlot, TypeSize::getFixed(4), DL);
+ SDValue FalsePart1 = DAG.getLoad(MVT::i32, DL, StoreFalseF80, FalsePart1Ptr,
+ MachinePointerInfo());
+ SDValue FalsePart2Ptr =
+ DAG.getMemBasePlusOffset(FalseSlot, TypeSize::getFixed(8), DL);
+ SDValue FalsePart2 = DAG.getLoad(MVT::i32, DL, StoreFalseF80, FalsePart2Ptr,
+ MachinePointerInfo());
+
+ // Perform CT_SELECT on each 32-bit chunk
+ SDValue Part0Ops[] = {FalsePart0, TruePart0, CC, ProcessedCond};
+ SDValue Part0Select = DAG.getNode(X86ISD::CT_SELECT, DL, MVT::i32, Part0Ops);
+ SDValue Part1Ops[] = {FalsePart1, TruePart1, CC, ProcessedCond};
+ SDValue Part1Select = DAG.getNode(X86ISD::CT_SELECT, DL, MVT::i32, Part1Ops);
+ SDValue Part2Ops[] = {FalsePart2, TruePart2, CC, ProcessedCond};
+ SDValue Part2Select = DAG.getNode(X86ISD::CT_SELECT, DL, MVT::i32, Part2Ops);
+
+ // Create result stack slot and store the selected parts
+ SDValue ResultSlot = DAG.CreateStackTemporary(MVT::f80);
+ SDValue StorePart0 =
+ DAG.getStore(Chain, DL, Part0Select, ResultSlot, MachinePointerInfo());
+ SDValue ResPart1Ptr =
+ DAG.getMemBasePlusOffset(ResultSlot, TypeSize::getFixed(4), DL);
+ SDValue StorePart1 = DAG.getStore(StorePart0, DL, Part1Select, ResPart1Ptr,
+ MachinePointerInfo());
+ SDValue ResPart2Ptr =
+ DAG.getMemBasePlusOffset(ResultSlot, TypeSize::getFixed(8), DL);
+ SDValue StorePart2 = DAG.getStore(StorePart1, DL, Part2Select, ResPart2Ptr,
+ MachinePointerInfo());
+
+ // Load complete f80 result from memory
+ return DAG.getLoad(MVT::f80, DL, StorePart2, ResultSlot,
+ MachinePointerInfo());
+ }
+
// Create final CT_SELECT node
SDValue Ops[] = {FalseOp, TrueOp, CC, ProcessedCond};
return DAG.getNode(X86ISD::CT_SELECT, DL, Op.getValueType(), Ops,
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index d4a46048a1d20..f98501da82104 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -689,8 +689,7 @@ bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const {
.addImm(31));
} else {
// Negate to convert 1 -> 0xFFFFFFFF, 0 -> 0x00000000 (negl %eax)
- recordInstr(BuildMI(*MBB, MI, DL, get(X86::NEG32r), TmpGPR)
- .addReg(TmpGPR));
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::NEG32r), TmpGPR).addReg(TmpGPR));
}
// Broadcast to TmpX (vector mask)
@@ -847,7 +846,8 @@ bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const {
.setMIFlags(MachineInstr::MIFlag::NoMerge));
}
- assert(FirstInstr && LastInstr && "Expected at least one expanded instruction");
+ assert(FirstInstr && LastInstr &&
+ "Expected at least one expanded instruction");
auto BundleEnd = LastInstr->getIterator();
finalizeBundle(*MBB, FirstInstr->getIterator(), std::next(BundleEnd));
@@ -915,25 +915,28 @@ bool X86InstrInfo::expandCtSelectWithCMOV(MachineInstr &MI) const {
/// Expand i386-specific CT_SELECT pseudo instructions (post-RA, constant-time)
/// These internal pseudos receive a pre-materialized condition byte from the
-/// custom inserter, avoiding EFLAGS corruption issues during i64 type legalization.
+/// custom inserter, avoiding EFLAGS corruption issues during i64 type
+/// legalization.
bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const {
MachineBasicBlock *MBB = MI.getParent();
DebugLoc DL = MI.getDebugLoc();
// CT_SELECT_I386_INT_GRxxrr has operands: (outs dst, tmp_byte, tmp_mask),
// (ins src1, src2, cond_byte)
- // Note: cond_byte is pre-materialized by custom inserter, not EFLAGS-dependent
+ // Note: cond_byte is pre-materialized by custom inserter, not
+ // EFLAGS-dependent
Register DstReg = MI.getOperand(0).getReg();
Register TmpByteReg = MI.getOperand(1).getReg();
Register TmpMaskReg = MI.getOperand(2).getReg();
Register Src1Reg = MI.getOperand(3).getReg();
Register Src2Reg = MI.getOperand(4).getReg();
- Register CondByteReg = MI.getOperand(5).getReg(); // Pre-materialized condition byte
+ Register CondByteReg =
+ MI.getOperand(5).getReg(); // Pre-materialized condition byte
// Determine instruction opcodes based on register width
unsigned MovZXOp, NegOp, MovOp, AndOp, NotOp, OrOp;
if (MI.getOpcode() == X86::CT_SELECT_I386_INT_GR8rr) {
- MovZXOp = 0; // No zero-extend needed for GR8
+ MovZXOp = 0; // No zero-extend needed for GR8
NegOp = X86::NEG8r;
MovOp = X86::MOV8rr;
AndOp = X86::AND8rr;
@@ -962,8 +965,8 @@ bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const {
// Step 1: Copy pre-materialized condition byte to TmpByteReg
// This allows the bundle to work with allocated temporaries
auto I1 = BuildMI(*MBB, MI, DL, get(X86::MOV8rr), TmpByteReg)
- .addReg(CondByteReg)
- .setMIFlag(MachineInstr::MIFlag::NoMerge);
+ .addReg(CondByteReg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
auto BundleStart = I1->getIterator();
// Step 2: Zero-extend condition byte to register width (0 or 1)
@@ -974,7 +977,9 @@ bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const {
}
// Step 3: Convert condition to bitmask (NEG: 1 -> 0xFFFF..., 0 -> 0x0000...)
- Register MaskReg = (MI.getOpcode() == X86::CT_SELECT_I386_INT_GR8rr) ? TmpByteReg : TmpMaskReg;
+ Register MaskReg = (MI.getOpcode() == X86::CT_SELECT_I386_INT_GR8rr)
+ ? TmpByteReg
+ : TmpMaskReg;
BuildMI(*MBB, MI, DL, get(NegOp), MaskReg)
.addReg(MaskReg)
.setMIFlag(MachineInstr::MIFlag::NoMerge);
@@ -1002,9 +1007,9 @@ bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const {
// Step 8: Final result: (src1 & mask) | (src2 & ~mask)
auto LI = BuildMI(*MBB, MI, DL, get(OrOp), DstReg)
- .addReg(DstReg)
- .addReg(MaskReg)
- .setMIFlag(MachineInstr::MIFlag::NoMerge);
+ .addReg(DstReg)
+ .addReg(MaskReg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
// Bundle all generated instructions for atomic execution before removing MI
auto BundleEnd = std::next(LI->getIterator());
@@ -1013,11 +1018,12 @@ bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const {
finalizeBundle(*MBB, BundleStart, BundleEnd);
}
- // TODO: Optimization opportunity - The register allocator may choose callee-saved
- // registers (e.g., %ebx, %esi) for TmpByteReg/TmpMaskReg, causing unnecessary
- // save/restore overhead. Consider constraining these to caller-saved register
- // classes (e.g., GR8_AL, GR32_CallSaved) in the TableGen definitions to improve
- // constant-time performance by eliminating prologue/epilogue instructions.
+ // TODO: Optimization opportunity - The register allocator may choose
+ // callee-saved registers (e.g., %ebx, %esi) for TmpByteReg/TmpMaskReg,
+ // causing unnecessary save/restore overhead. Consider constraining these to
+ // caller-saved register classes (e.g., GR8_AL, GR32_CallSaved) in the
+ // TableGen definitions to improve constant-time performance by eliminating
+ // prologue/epilogue instructions.
// Remove the original pseudo instruction
MI.eraseFromParent();
@@ -1305,8 +1311,7 @@ static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) {
return isPICBase;
}
-bool X86InstrInfo::isReMaterializableImpl(
- const MachineInstr &MI) const {
+bool X86InstrInfo::isReMaterializableImpl(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
default:
// This function should only be called for opcodes with the ReMaterializable
@@ -1823,32 +1828,32 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
switch (MIOpc) {
default:
llvm_unreachable("Unreachable!");
- CASE_NF(SHL8ri)
- CASE_NF(SHL16ri) {
- unsigned ShAmt = MI.getOperand(2).getImm();
- MIB.addReg(0)
- .addImm(1LL << ShAmt)
- .addReg(InRegLEA, RegState::Kill)
- .addImm(0)
- .addReg(0);
- break;
- }
- CASE_NF(INC8r)
- CASE_NF(INC16r)
+ CASE_NF(SHL8ri)
+ CASE_NF(SHL16ri) {
+ unsigned ShAmt = MI.getOperand(2).getImm();
+ MIB.addReg(0)
+ .addImm(1LL << ShAmt)
+ .addReg(InRegLEA, RegState::Kill)
+ .addImm(0)
+ .addReg(0);
+ break;
+ }
+ CASE_NF(INC8r)
+ CASE_NF(INC16r)
addRegOffset(MIB, InRegLEA, true, 1);
break;
- CASE_NF(DEC8r)
- CASE_NF(DEC16r)
+ CASE_NF(DEC8r)
+ CASE_NF(DEC16r)
addRegOffset(MIB, InRegLEA, true, -1);
break;
- CASE_NF(ADD8ri)
- CASE_NF(ADD16ri)
+ CASE_NF(ADD8ri)
+ CASE_NF(ADD16ri)
case X86::ADD8ri_DB:
case X86::ADD16ri_DB:
addRegOffset(MIB, InRegLEA, true, MI.getOperand(2).getImm());
break;
- CASE_NF(ADD8rr)
- CASE_NF(ADD16rr)
+ CASE_NF(ADD8rr)
+ CASE_NF(ADD16rr)
case X86::ADD8rr_DB:
case X86::ADD16rr_DB: {
Src2 = MI.getOperand(2).getReg();
@@ -1986,128 +1991,129 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI,
switch (MIOpc) {
default:
llvm_unreachable("Unreachable!");
- CASE_NF(SHL64ri) {
- assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
- unsigned ShAmt = getTruncatedShiftCount(MI, 2);
- if (!isTruncatedShiftCountForLEA(ShAmt))
- return nullptr;
-
- // LEA can't handle RSP.
- if (Src.getReg().isVirtual() && !MF.getRegInfo().constrainRegClass(
- Src.getReg(), &X86::GR64_NOSPRegClass))
- return nullptr;
+ CASE_NF(SHL64ri) {
+ assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
+ unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+ if (!isTruncatedShiftCountForLEA(ShAmt))
+ return nullptr;
- NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
- .add(Dest)
- .addReg(0)
- .addImm(1LL << ShAmt)
- .add(Src)
- .addImm(0)
- .addReg(0);
- break;
- }
- CASE_NF(SHL32ri) {
- assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
- unsigned ShAmt = getTruncatedShiftCount(MI, 2);
- if (!isTruncatedShiftCountForLEA(ShAmt))
- return nullptr;
+ // LEA can't handle RSP.
+ if (Src.getReg().isVirtual() &&
+ !MF.getRegInfo().constrainRegClass(Src.getReg(),
+ &X86::GR64_NOSPRegClass))
+ return nullptr;
- unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
+ NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
+ .add(Dest)
+ .addReg(0)
+ .addImm(1LL << ShAmt)
+ .add(Src)
+ .addImm(0)
+ .addReg(0);
+ break;
+ }
+ CASE_NF(SHL32ri) {
+ assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
+ unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+ if (!isTruncatedShiftCountForLEA(ShAmt))
+ return nullptr;
- // LEA can't handle ESP.
- bool isKill;
- MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
- if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
- isKill, ImplicitOp, LV, LIS))
- return nullptr;
+ unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
- MachineInstrBuilder MIB =
- BuildMI(MF, MI.getDebugLoc(), get(Opc))
- .add(Dest)
- .addReg(0)
- .addImm(1LL << ShAmt)
- .addReg(SrcReg, getKillRegState(isKill), SrcSubReg)
- .addImm(0)
- .addReg(0);
- if (ImplicitOp.getReg() != 0)
- MIB.add(ImplicitOp);
- NewMI = MIB;
+ // LEA can't handle ESP.
+ bool isKill;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
+ isKill, ImplicitOp, LV, LIS))
+ return nullptr;
- // Add kills if classifyLEAReg created a new register.
- if (LV && SrcReg != Src.getReg())
- LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
- break;
- }
- CASE_NF(SHL8ri)
+ MachineInstrBuilder MIB =
+ BuildMI(MF, MI.getDebugLoc(), get(Opc))
+ .add(Dest)
+ .addReg(0)
+ .addImm(1LL << ShAmt)
+ .addReg(SrcReg, getKillRegState(isKill), SrcSubReg)
+ .addImm(0)
+ .addReg(0);
+ if (ImplicitOp.getReg() != 0)
+ MIB.add(ImplicitOp);
+ NewMI = MIB;
+
+ // Add kills if classifyLEAReg created a new register.
+ if (LV && SrcReg != Src.getReg())
+ LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
+ break;
+ }
+ CASE_NF(SHL8ri)
Is8BitOp = true;
[[fallthrough]];
- CASE_NF(SHL16ri) {
- assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
- unsigned ShAmt = getTruncatedShiftCount(MI, 2);
- if (!isTruncatedShiftCountForLEA(ShAmt))
- return nullptr;
- return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
- }
- CASE_NF(INC64r)
- CASE_NF(INC32r) {
- assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
- unsigned Opc = (MIOpc == X86::INC64r || MIOpc == X86::INC64r_NF)
- ? X86::LEA64r
- : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
- bool isKill;
- MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
- if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
- isKill, ImplicitOp, LV, LIS))
- return nullptr;
-
- MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
- .add(Dest)
- .addReg(SrcReg, getKillRegState(isKill));
- if (ImplicitOp.getReg() != 0)
- MIB.add(ImplicitOp);
+ CASE_NF(SHL16ri) {
+ assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
+ unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+ if (!isTruncatedShiftCountForLEA(ShAmt))
+ return nullptr;
+ return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
+ }
+ CASE_NF(INC64r)
+ CASE_NF(INC32r) {
+ assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
+ unsigned Opc = (MIOpc == X86::INC64r || MIOpc == X86::INC64r_NF)
+ ? X86::LEA64r
+ : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
+ bool isKill;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
+ isKill, ImplicitOp, LV, LIS))
+ return nullptr;
- NewMI = addOffset(MIB, 1);
+ MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
+ .add(Dest)
+ .addReg(SrcReg, getKillRegState(isKill));
+ if (ImplicitOp.getReg() != 0)
+ MIB.add(ImplicitOp);
- // Add kills if classifyLEAReg created a new register.
- if (LV && SrcReg != Src.getReg())
- LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
- break;
- }
- CASE_NF(DEC64r)
- CASE_NF(DEC32r) {
- assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
- unsigned Opc = (MIOpc == X86::DEC64r || MIOpc == X86::DEC64r_NF)
- ? X86::LEA64r
- : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
+ NewMI = addOffset(MIB, 1);
- bool isKill;
- MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
- if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
- isKill, ImplicitOp, LV, LIS))
- return nullptr;
+ // Add kills if classifyLEAReg created a new register.
+ if (LV && SrcReg != Src.getReg())
+ LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
+ break;
+ }
+ CASE_NF(DEC64r)
+ CASE_NF(DEC32r) {
+ assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
+ unsigned Opc = (MIOpc == X86::DEC64r || MIOpc == X86::DEC64r_NF)
+ ? X86::LEA64r
+ : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
+
+ bool isKill;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
+ isKill, ImplicitOp, LV, LIS))
+ return nullptr;
- MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
- .add(Dest)
- .addReg(SrcReg, getKillRegState(isKill));
- if (ImplicitOp.getReg() != 0)
- MIB.add(ImplicitOp);
+ MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
+ .add(Dest)
+ .addReg(SrcReg, getKillRegState(isKill));
+ if (ImplicitOp.getReg() != 0)
+ MIB.add(ImplicitOp);
- NewMI = addOffset(MIB, -1);
+ NewMI = addOffset(MIB, -1);
- // Add kills if classifyLEAReg created a new register.
- if (LV && SrcReg != Src.getReg())
- LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
- break;
- }
- CASE_NF(DEC8r)
- CASE_NF(INC8r)
+ // Add kills if classifyLEAReg created a new register.
+ if (LV && SrcReg != Src.getReg())
+ LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
+ break;
+ }
+ CASE_NF(DEC8r)
+ CASE_NF(INC8r)
Is8BitOp = true;
[[fallthrough]];
- CASE_NF(DEC16r)
- CASE_NF(INC16r)
+ CASE_NF(DEC16r)
+ CASE_NF(INC16r)
return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
- CASE_NF(ADD64rr)
- CASE_NF(ADD32rr)
+ CASE_NF(ADD64rr)
+ CASE_NF(ADD32rr)
case X86::ADD64rr_DB:
case X86::ADD32rr_DB: {
assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
@@ -2158,21 +2164,21 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI,
NumRegOperands = 3;
break;
}
- CASE_NF(ADD8rr)
+ CASE_NF(ADD8rr)
case X86::ADD8rr_DB:
Is8BitOp = true;
[[fallthrough]];
- CASE_NF(ADD16rr)
+ CASE_NF(ADD16rr)
case X86::ADD16rr_DB:
return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
- CASE_NF(ADD64ri32)
+ CASE_NF(ADD64ri32)
case X86::ADD64ri32_DB:
assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
NewMI = addOffset(
BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src),
MI.getOperand(2));
break;
- CASE_NF(ADD32ri)
+ CASE_NF(ADD32ri)
case X86::ADD32ri_DB: {
assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
@@ -2197,62 +2203,62 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI,
LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
break;
}
- CASE_NF(ADD8ri)
+ CASE_NF(ADD8ri)
case X86::ADD8ri_DB:
Is8BitOp = true;
[[fallthrough]];
- CASE_NF(ADD16ri)
+ CASE_NF(ADD16ri)
case X86::ADD16ri_DB:
return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
- CASE_NF(SUB8ri)
- CASE_NF(SUB16ri)
+ CASE_NF(SUB8ri)
+ CASE_NF(SUB16ri)
/// FIXME: Support these similar to ADD8ri/ADD16ri*.
return nullptr;
- CASE_NF(SUB32ri) {
- if (!MI.getOperand(2).isImm())
- return nullptr;
- int64_t Imm = MI.getOperand(2).getImm();
- if (!isInt<32>(-Imm))
- return nullptr;
+ CASE_NF(SUB32ri) {
+ if (!MI.getOperand(2).isImm())
+ return nullptr;
+ int64_t Imm = MI.getOperand(2).getImm();
+ if (!isInt<32>(-Imm))
+ return nullptr;
- assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
- unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
+ assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
+ unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
- bool isKill;
- MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
- if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, SrcSubReg,
- isKill, ImplicitOp, LV, LIS))
- return nullptr;
+ bool isKill;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, SrcSubReg,
+ isKill, ImplicitOp, LV, LIS))
+ return nullptr;
- MachineInstrBuilder MIB =
- BuildMI(MF, MI.getDebugLoc(), get(Opc))
- .add(Dest)
- .addReg(SrcReg, getKillRegState(isKill), SrcSubReg);
- if (ImplicitOp.getReg() != 0)
- MIB.add(ImplicitOp);
+ MachineInstrBuilder MIB =
+ BuildMI(MF, MI.getDebugLoc(), get(Opc))
+ .add(Dest)
+ .addReg(SrcReg, getKillRegState(isKill), SrcSubReg);
+ if (ImplicitOp.getReg() != 0)
+ MIB.add(ImplicitOp);
- NewMI = addOffset(MIB, -Imm);
+ NewMI = addOffset(MIB, -Imm);
- // Add kills if classifyLEAReg created a new register.
- if (LV && SrcReg != Src.getReg())
- LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
- break;
- }
+ // Add kills if classifyLEAReg created a new register.
+ if (LV && SrcReg != Src.getReg())
+ LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
+ break;
+ }
- CASE_NF(SUB64ri32) {
- if (!MI.getOperand(2).isImm())
- return nullptr;
- int64_t Imm = MI.getOperand(2).getImm();
- if (!isInt<32>(-Imm))
- return nullptr;
+ CASE_NF(SUB64ri32) {
+ if (!MI.getOperand(2).isImm())
+ return nullptr;
+ int64_t Imm = MI.getOperand(2).getImm();
+ if (!isInt<32>(-Imm))
+ return nullptr;
- assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!");
+ assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!");
- MachineInstrBuilder MIB =
- BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src);
- NewMI = addOffset(MIB, -Imm);
- break;
- }
+ MachineInstrBuilder MIB =
+ BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src);
+ NewMI = addOffset(MIB, -Imm);
+ break;
+ }
case X86::VMOVDQU8Z128rmk:
case X86::VMOVDQU8Z256rmk:
@@ -2852,17 +2858,17 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
case X86::OP##_ND:
switch (Opc) {
- // SHLD B, C, I <-> SHRD C, B, (BitWidth - I)
- CASE_ND(SHRD16rri8)
- CASE_ND(SHLD16rri8)
- CASE_ND(SHRD32rri8)
- CASE_ND(SHLD32rri8)
- CASE_ND(SHRD64rri8)
- CASE_ND(SHLD64rri8) {
- unsigned Size;
- switch (Opc) {
- default:
- llvm_unreachable("Unreachable!");
+ // SHLD B, C, I <-> SHRD C, B, (BitWidth - I)
+ CASE_ND(SHRD16rri8)
+ CASE_ND(SHLD16rri8)
+ CASE_ND(SHRD32rri8)
+ CASE_ND(SHLD32rri8)
+ CASE_ND(SHRD64rri8)
+ CASE_ND(SHLD64rri8) {
+ unsigned Size;
+ switch (Opc) {
+ default:
+ llvm_unreachable("Unreachable!");
#define FROM_TO_SIZE(A, B, S) \
case X86::A: \
Opc = X86::B; \
@@ -2881,16 +2887,16 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
Size = S; \
break;
- FROM_TO_SIZE(SHRD16rri8, SHLD16rri8, 16)
- FROM_TO_SIZE(SHRD32rri8, SHLD32rri8, 32)
- FROM_TO_SIZE(SHRD64rri8, SHLD64rri8, 64)
+ FROM_TO_SIZE(SHRD16rri8, SHLD16rri8, 16)
+ FROM_TO_SIZE(SHRD32rri8, SHLD32rri8, 32)
+ FROM_TO_SIZE(SHRD64rri8, SHLD64rri8, 64)
#undef FROM_TO_SIZE
+ }
+ WorkingMI = CloneIfNew(MI);
+ WorkingMI->setDesc(get(Opc));
+ WorkingMI->getOperand(3).setImm(Size - MI.getOperand(3).getImm());
+ break;
}
- WorkingMI = CloneIfNew(MI);
- WorkingMI->setDesc(get(Opc));
- WorkingMI->getOperand(3).setImm(Size - MI.getOperand(3).getImm());
- break;
- }
case X86::PFSUBrr:
case X86::PFSUBRrr:
// PFSUB x, y: x = x - y
@@ -3174,15 +3180,16 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
WorkingMI = CloneIfNew(MI);
WorkingMI->setDesc(get(Opc));
break;
- CASE_ND(CMOV16rr)
- CASE_ND(CMOV32rr)
- CASE_ND(CMOV64rr) {
- WorkingMI = CloneIfNew(MI);
- unsigned OpNo = MI.getDesc().getNumOperands() - 1;
- X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm());
- WorkingMI->getOperand(OpNo).setImm(X86::GetOppositeBranchCondition(CC));
- break;
- }
+ CASE_ND(CMOV16rr)
+ CASE_ND(CMOV32rr)
+ CASE_ND(CMOV64rr) {
+ WorkingMI = CloneIfNew(MI);
+ unsigned OpNo = MI.getDesc().getNumOperands() - 1;
+ X86::CondCode CC =
+ static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm());
+ WorkingMI->getOperand(OpNo).setImm(X86::GetOppositeBranchCondition(CC));
+ break;
+ }
case X86::VPTERNLOGDZrri:
case X86::VPTERNLOGDZrmi:
case X86::VPTERNLOGDZ128rri:
@@ -5391,29 +5398,29 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
CmpMask = CmpValue = 0;
}
return true;
- // A SUB can be used to perform comparison.
- CASE_ND(SUB64rm)
- CASE_ND(SUB32rm)
- CASE_ND(SUB16rm)
- CASE_ND(SUB8rm)
+ // A SUB can be used to perform comparison.
+ CASE_ND(SUB64rm)
+ CASE_ND(SUB32rm)
+ CASE_ND(SUB16rm)
+ CASE_ND(SUB8rm)
SrcReg = MI.getOperand(1).getReg();
SrcReg2 = 0;
CmpMask = 0;
CmpValue = 0;
return true;
- CASE_ND(SUB64rr)
- CASE_ND(SUB32rr)
- CASE_ND(SUB16rr)
- CASE_ND(SUB8rr)
+ CASE_ND(SUB64rr)
+ CASE_ND(SUB32rr)
+ CASE_ND(SUB16rr)
+ CASE_ND(SUB8rr)
SrcReg = MI.getOperand(1).getReg();
SrcReg2 = MI.getOperand(2).getReg();
CmpMask = 0;
CmpValue = 0;
return true;
- CASE_ND(SUB64ri32)
- CASE_ND(SUB32ri)
- CASE_ND(SUB16ri)
- CASE_ND(SUB8ri)
+ CASE_ND(SUB64ri32)
+ CASE_ND(SUB32ri)
+ CASE_ND(SUB16ri)
+ CASE_ND(SUB8ri)
SrcReg = MI.getOperand(1).getReg();
SrcReg2 = 0;
if (MI.getOperand(2).isImm()) {
@@ -5468,27 +5475,27 @@ bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI,
case X86::CMP32rr:
case X86::CMP16rr:
case X86::CMP8rr:
- CASE_ND(SUB64rr)
- CASE_ND(SUB32rr)
- CASE_ND(SUB16rr)
- CASE_ND(SUB8rr) {
- Register OISrcReg;
- Register OISrcReg2;
- int64_t OIMask;
- int64_t OIValue;
- if (!analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) ||
- OIMask != ImmMask || OIValue != ImmValue)
+ CASE_ND(SUB64rr)
+ CASE_ND(SUB32rr)
+ CASE_ND(SUB16rr)
+ CASE_ND(SUB8rr) {
+ Register OISrcReg;
+ Register OISrcReg2;
+ int64_t OIMask;
+ int64_t OIValue;
+ if (!analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) ||
+ OIMask != ImmMask || OIValue != ImmValue)
+ return false;
+ if (SrcReg == OISrcReg && SrcReg2 == OISrcReg2) {
+ *IsSwapped = false;
+ return true;
+ }
+ if (SrcReg == OISrcReg2 && SrcReg2 == OISrcReg) {
+ *IsSwapped = true;
+ return true;
+ }
return false;
- if (SrcReg == OISrcReg && SrcReg2 == OISrcReg2) {
- *IsSwapped = false;
- return true;
}
- if (SrcReg == OISrcReg2 && SrcReg2 == OISrcReg) {
- *IsSwapped = true;
- return true;
- }
- return false;
- }
case X86::CMP64ri32:
case X86::CMP32ri:
case X86::CMP16ri:
@@ -5497,10 +5504,10 @@ bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI,
case X86::TEST32ri:
case X86::TEST16ri:
case X86::TEST8ri:
- CASE_ND(SUB64ri32)
- CASE_ND(SUB32ri)
- CASE_ND(SUB16ri)
- CASE_ND(SUB8ri)
+ CASE_ND(SUB64ri32)
+ CASE_ND(SUB32ri)
+ CASE_ND(SUB16ri)
+ CASE_ND(SUB8ri)
case X86::TEST64rr:
case X86::TEST32rr:
case X86::TEST16rr:
@@ -5557,98 +5564,98 @@ inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag,
default:
return false;
- // The shift instructions only modify ZF if their shift count is non-zero.
- // N.B.: The processor truncates the shift count depending on the encoding.
- CASE_ND(SAR8ri)
- CASE_ND(SAR16ri)
- CASE_ND(SAR32ri)
- CASE_ND(SAR64ri)
- CASE_ND(SHR8ri)
- CASE_ND(SHR16ri)
- CASE_ND(SHR32ri)
- CASE_ND(SHR64ri)
+ // The shift instructions only modify ZF if their shift count is non-zero.
+ // N.B.: The processor truncates the shift count depending on the encoding.
+ CASE_ND(SAR8ri)
+ CASE_ND(SAR16ri)
+ CASE_ND(SAR32ri)
+ CASE_ND(SAR64ri)
+ CASE_ND(SHR8ri)
+ CASE_ND(SHR16ri)
+ CASE_ND(SHR32ri)
+ CASE_ND(SHR64ri)
return getTruncatedShiftCount(MI, 2) != 0;
- // Some left shift instructions can be turned into LEA instructions but only
- // if their flags aren't used. Avoid transforming such instructions.
- CASE_ND(SHL8ri)
- CASE_ND(SHL16ri)
- CASE_ND(SHL32ri)
- CASE_ND(SHL64ri) {
- unsigned ShAmt = getTruncatedShiftCount(MI, 2);
- if (isTruncatedShiftCountForLEA(ShAmt))
- return false;
- return ShAmt != 0;
- }
+ // Some left shift instructions can be turned into LEA instructions but only
+ // if their flags aren't used. Avoid transforming such instructions.
+ CASE_ND(SHL8ri)
+ CASE_ND(SHL16ri)
+ CASE_ND(SHL32ri)
+ CASE_ND(SHL64ri) {
+ unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+ if (isTruncatedShiftCountForLEA(ShAmt))
+ return false;
+ return ShAmt != 0;
+ }
- CASE_ND(SHRD16rri8)
- CASE_ND(SHRD32rri8)
- CASE_ND(SHRD64rri8)
- CASE_ND(SHLD16rri8)
- CASE_ND(SHLD32rri8)
- CASE_ND(SHLD64rri8)
+ CASE_ND(SHRD16rri8)
+ CASE_ND(SHRD32rri8)
+ CASE_ND(SHRD64rri8)
+ CASE_ND(SHLD16rri8)
+ CASE_ND(SHLD32rri8)
+ CASE_ND(SHLD64rri8)
return getTruncatedShiftCount(MI, 3) != 0;
- CASE_ND(SUB64ri32)
- CASE_ND(SUB32ri)
- CASE_ND(SUB16ri)
- CASE_ND(SUB8ri)
- CASE_ND(SUB64rr)
- CASE_ND(SUB32rr)
- CASE_ND(SUB16rr)
- CASE_ND(SUB8rr)
- CASE_ND(SUB64rm)
- CASE_ND(SUB32rm)
- CASE_ND(SUB16rm)
- CASE_ND(SUB8rm)
- CASE_ND(DEC64r)
- CASE_ND(DEC32r)
- CASE_ND(DEC16r)
- CASE_ND(DEC8r)
- CASE_ND(ADD64ri32)
- CASE_ND(ADD32ri)
- CASE_ND(ADD16ri)
- CASE_ND(ADD8ri)
- CASE_ND(ADD64rr)
- CASE_ND(ADD32rr)
- CASE_ND(ADD16rr)
- CASE_ND(ADD8rr)
- CASE_ND(ADD64rm)
- CASE_ND(ADD32rm)
- CASE_ND(ADD16rm)
- CASE_ND(ADD8rm)
- CASE_ND(INC64r)
- CASE_ND(INC32r)
- CASE_ND(INC16r)
- CASE_ND(INC8r)
- CASE_ND(ADC64ri32)
- CASE_ND(ADC32ri)
- CASE_ND(ADC16ri)
- CASE_ND(ADC8ri)
- CASE_ND(ADC64rr)
- CASE_ND(ADC32rr)
- CASE_ND(ADC16rr)
- CASE_ND(ADC8rr)
- CASE_ND(ADC64rm)
- CASE_ND(ADC32rm)
- CASE_ND(ADC16rm)
- CASE_ND(ADC8rm)
- CASE_ND(SBB64ri32)
- CASE_ND(SBB32ri)
- CASE_ND(SBB16ri)
- CASE_ND(SBB8ri)
- CASE_ND(SBB64rr)
- CASE_ND(SBB32rr)
- CASE_ND(SBB16rr)
- CASE_ND(SBB8rr)
- CASE_ND(SBB64rm)
- CASE_ND(SBB32rm)
- CASE_ND(SBB16rm)
- CASE_ND(SBB8rm)
- CASE_ND(NEG8r)
- CASE_ND(NEG16r)
- CASE_ND(NEG32r)
- CASE_ND(NEG64r)
+ CASE_ND(SUB64ri32)
+ CASE_ND(SUB32ri)
+ CASE_ND(SUB16ri)
+ CASE_ND(SUB8ri)
+ CASE_ND(SUB64rr)
+ CASE_ND(SUB32rr)
+ CASE_ND(SUB16rr)
+ CASE_ND(SUB8rr)
+ CASE_ND(SUB64rm)
+ CASE_ND(SUB32rm)
+ CASE_ND(SUB16rm)
+ CASE_ND(SUB8rm)
+ CASE_ND(DEC64r)
+ CASE_ND(DEC32r)
+ CASE_ND(DEC16r)
+ CASE_ND(DEC8r)
+ CASE_ND(ADD64ri32)
+ CASE_ND(ADD32ri)
+ CASE_ND(ADD16ri)
+ CASE_ND(ADD8ri)
+ CASE_ND(ADD64rr)
+ CASE_ND(ADD32rr)
+ CASE_ND(ADD16rr)
+ CASE_ND(ADD8rr)
+ CASE_ND(ADD64rm)
+ CASE_ND(ADD32rm)
+ CASE_ND(ADD16rm)
+ CASE_ND(ADD8rm)
+ CASE_ND(INC64r)
+ CASE_ND(INC32r)
+ CASE_ND(INC16r)
+ CASE_ND(INC8r)
+ CASE_ND(ADC64ri32)
+ CASE_ND(ADC32ri)
+ CASE_ND(ADC16ri)
+ CASE_ND(ADC8ri)
+ CASE_ND(ADC64rr)
+ CASE_ND(ADC32rr)
+ CASE_ND(ADC16rr)
+ CASE_ND(ADC8rr)
+ CASE_ND(ADC64rm)
+ CASE_ND(ADC32rm)
+ CASE_ND(ADC16rm)
+ CASE_ND(ADC8rm)
+ CASE_ND(SBB64ri32)
+ CASE_ND(SBB32ri)
+ CASE_ND(SBB16ri)
+ CASE_ND(SBB8ri)
+ CASE_ND(SBB64rr)
+ CASE_ND(SBB32rr)
+ CASE_ND(SBB16rr)
+ CASE_ND(SBB8rr)
+ CASE_ND(SBB64rm)
+ CASE_ND(SBB32rm)
+ CASE_ND(SBB16rm)
+ CASE_ND(SBB8rm)
+ CASE_ND(NEG8r)
+ CASE_ND(NEG16r)
+ CASE_ND(NEG32r)
+ CASE_ND(NEG64r)
case X86::LZCNT16rr:
case X86::LZCNT16rm:
case X86::LZCNT32rr:
@@ -5668,42 +5675,42 @@ inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag,
case X86::TZCNT64rr:
case X86::TZCNT64rm:
return true;
- CASE_ND(AND64ri32)
- CASE_ND(AND32ri)
- CASE_ND(AND16ri)
- CASE_ND(AND8ri)
- CASE_ND(AND64rr)
- CASE_ND(AND32rr)
- CASE_ND(AND16rr)
- CASE_ND(AND8rr)
- CASE_ND(AND64rm)
- CASE_ND(AND32rm)
- CASE_ND(AND16rm)
- CASE_ND(AND8rm)
- CASE_ND(XOR64ri32)
- CASE_ND(XOR32ri)
- CASE_ND(XOR16ri)
- CASE_ND(XOR8ri)
- CASE_ND(XOR64rr)
- CASE_ND(XOR32rr)
- CASE_ND(XOR16rr)
- CASE_ND(XOR8rr)
- CASE_ND(XOR64rm)
- CASE_ND(XOR32rm)
- CASE_ND(XOR16rm)
- CASE_ND(XOR8rm)
- CASE_ND(OR64ri32)
- CASE_ND(OR32ri)
- CASE_ND(OR16ri)
- CASE_ND(OR8ri)
- CASE_ND(OR64rr)
- CASE_ND(OR32rr)
- CASE_ND(OR16rr)
- CASE_ND(OR8rr)
- CASE_ND(OR64rm)
- CASE_ND(OR32rm)
- CASE_ND(OR16rm)
- CASE_ND(OR8rm)
+ CASE_ND(AND64ri32)
+ CASE_ND(AND32ri)
+ CASE_ND(AND16ri)
+ CASE_ND(AND8ri)
+ CASE_ND(AND64rr)
+ CASE_ND(AND32rr)
+ CASE_ND(AND16rr)
+ CASE_ND(AND8rr)
+ CASE_ND(AND64rm)
+ CASE_ND(AND32rm)
+ CASE_ND(AND16rm)
+ CASE_ND(AND8rm)
+ CASE_ND(XOR64ri32)
+ CASE_ND(XOR32ri)
+ CASE_ND(XOR16ri)
+ CASE_ND(XOR8ri)
+ CASE_ND(XOR64rr)
+ CASE_ND(XOR32rr)
+ CASE_ND(XOR16rr)
+ CASE_ND(XOR8rr)
+ CASE_ND(XOR64rm)
+ CASE_ND(XOR32rm)
+ CASE_ND(XOR16rm)
+ CASE_ND(XOR8rm)
+ CASE_ND(OR64ri32)
+ CASE_ND(OR32ri)
+ CASE_ND(OR16ri)
+ CASE_ND(OR8ri)
+ CASE_ND(OR64rr)
+ CASE_ND(OR32rr)
+ CASE_ND(OR16rr)
+ CASE_ND(OR8rr)
+ CASE_ND(OR64rm)
+ CASE_ND(OR32rm)
+ CASE_ND(OR16rm)
+ CASE_ND(OR8rm)
case X86::ANDN32rr:
case X86::ANDN32rm:
case X86::ANDN64rr:
@@ -5781,15 +5788,17 @@ inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag,
}
/// Check whether the use can be converted to remove a comparison against zero.
-/// Returns the EFLAGS condition and the operand that we are comparing against zero.
-static std::pair<X86::CondCode, unsigned> isUseDefConvertible(const MachineInstr &MI) {
+/// Returns the EFLAGS condition and the operand that we are comparing against
+/// zero.
+static std::pair<X86::CondCode, unsigned>
+isUseDefConvertible(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
return std::make_pair(X86::COND_INVALID, ~0U);
- CASE_ND(NEG8r)
- CASE_ND(NEG16r)
- CASE_ND(NEG32r)
- CASE_ND(NEG64r)
+ CASE_ND(NEG8r)
+ CASE_ND(NEG16r)
+ CASE_ND(NEG32r)
+ CASE_ND(NEG64r)
return std::make_pair(X86::COND_AE, 1U);
case X86::LZCNT16rr:
case X86::LZCNT32rr:
@@ -5833,51 +5842,53 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
switch (CmpInstr.getOpcode()) {
default:
break;
- CASE_ND(SUB64ri32)
- CASE_ND(SUB32ri)
- CASE_ND(SUB16ri)
- CASE_ND(SUB8ri)
- CASE_ND(SUB64rm)
- CASE_ND(SUB32rm)
- CASE_ND(SUB16rm)
- CASE_ND(SUB8rm)
- CASE_ND(SUB64rr)
- CASE_ND(SUB32rr)
- CASE_ND(SUB16rr)
- CASE_ND(SUB8rr) {
- if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
- return false;
- // There is no use of the destination register, we can replace SUB with CMP.
- unsigned NewOpcode = 0;
+ CASE_ND(SUB64ri32)
+ CASE_ND(SUB32ri)
+ CASE_ND(SUB16ri)
+ CASE_ND(SUB8ri)
+ CASE_ND(SUB64rm)
+ CASE_ND(SUB32rm)
+ CASE_ND(SUB16rm)
+ CASE_ND(SUB8rm)
+ CASE_ND(SUB64rr)
+ CASE_ND(SUB32rr)
+ CASE_ND(SUB16rr)
+ CASE_ND(SUB8rr) {
+ if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
+ return false;
+ // There is no use of the destination register, we can replace SUB with
+ // CMP.
+ unsigned NewOpcode = 0;
#define FROM_TO(A, B) \
CASE_ND(A) NewOpcode = X86::B; \
break;
- switch (CmpInstr.getOpcode()) {
- default:
- llvm_unreachable("Unreachable!");
- FROM_TO(SUB64rm, CMP64rm)
- FROM_TO(SUB32rm, CMP32rm)
- FROM_TO(SUB16rm, CMP16rm)
- FROM_TO(SUB8rm, CMP8rm)
- FROM_TO(SUB64rr, CMP64rr)
- FROM_TO(SUB32rr, CMP32rr)
- FROM_TO(SUB16rr, CMP16rr)
- FROM_TO(SUB8rr, CMP8rr)
- FROM_TO(SUB64ri32, CMP64ri32)
- FROM_TO(SUB32ri, CMP32ri)
- FROM_TO(SUB16ri, CMP16ri)
- FROM_TO(SUB8ri, CMP8ri)
- }
+ switch (CmpInstr.getOpcode()) {
+ default:
+ llvm_unreachable("Unreachable!");
+ FROM_TO(SUB64rm, CMP64rm)
+ FROM_TO(SUB32rm, CMP32rm)
+ FROM_TO(SUB16rm, CMP16rm)
+ FROM_TO(SUB8rm, CMP8rm)
+ FROM_TO(SUB64rr, CMP64rr)
+ FROM_TO(SUB32rr, CMP32rr)
+ FROM_TO(SUB16rr, CMP16rr)
+ FROM_TO(SUB8rr, CMP8rr)
+ FROM_TO(SUB64ri32, CMP64ri32)
+ FROM_TO(SUB32ri, CMP32ri)
+ FROM_TO(SUB16ri, CMP16ri)
+ FROM_TO(SUB8ri, CMP8ri)
+ }
#undef FROM_TO
- CmpInstr.setDesc(get(NewOpcode));
- CmpInstr.removeOperand(0);
- // Mutating this instruction invalidates any debug data associated with it.
- CmpInstr.dropDebugNumber();
- // Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
- if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
- NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
- return false;
- }
+ CmpInstr.setDesc(get(NewOpcode));
+ CmpInstr.removeOperand(0);
+ // Mutating this instruction invalidates any debug data associated with
+ // it.
+ CmpInstr.dropDebugNumber();
+ // Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
+ if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
+ NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
+ return false;
+ }
}
// The following code tries to remove the comparison by re-using EFLAGS
@@ -6234,14 +6245,14 @@ static bool canConvert2Copy(unsigned Opc) {
switch (Opc) {
default:
return false;
- CASE_ND(ADD64ri32)
- CASE_ND(SUB64ri32)
- CASE_ND(OR64ri32)
- CASE_ND(XOR64ri32)
- CASE_ND(ADD32ri)
- CASE_ND(SUB32ri)
- CASE_ND(OR32ri)
- CASE_ND(XOR32ri)
+ CASE_ND(ADD64ri32)
+ CASE_ND(SUB64ri32)
+ CASE_ND(OR64ri32)
+ CASE_ND(XOR64ri32)
+ CASE_ND(ADD32ri)
+ CASE_ND(SUB32ri)
+ CASE_ND(OR32ri)
+ CASE_ND(XOR32ri)
return true;
}
}
@@ -9656,7 +9667,7 @@ Register X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
static const uint16_t *lookup(unsigned opcode, unsigned domain,
ArrayRef<uint16_t[3]> Table) {
- for (const uint16_t(&Row)[3] : Table)
+ for (const uint16_t (&Row)[3] : Table)
if (Row[domain - 1] == opcode)
return Row;
return nullptr;
@@ -9665,7 +9676,7 @@ static const uint16_t *lookup(unsigned opcode, unsigned domain,
static const uint16_t *lookupAVX512(unsigned opcode, unsigned domain,
ArrayRef<uint16_t[4]> Table) {
// If this is the integer domain make sure to check both integer columns.
- for (const uint16_t(&Row)[4] : Table)
+ for (const uint16_t (&Row)[4] : Table)
if (Row[domain - 1] == opcode || (domain == 3 && Row[3] == opcode))
return Row;
return nullptr;
@@ -10421,25 +10432,25 @@ bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
if (Invert)
return false;
switch (Inst.getOpcode()) {
- CASE_ND(ADD8rr)
- CASE_ND(ADD16rr)
- CASE_ND(ADD32rr)
- CASE_ND(ADD64rr)
- CASE_ND(AND8rr)
- CASE_ND(AND16rr)
- CASE_ND(AND32rr)
- CASE_ND(AND64rr)
- CASE_ND(OR8rr)
- CASE_ND(OR16rr)
- CASE_ND(OR32rr)
- CASE_ND(OR64rr)
- CASE_ND(XOR8rr)
- CASE_ND(XOR16rr)
- CASE_ND(XOR32rr)
- CASE_ND(XOR64rr)
- CASE_ND(IMUL16rr)
- CASE_ND(IMUL32rr)
- CASE_ND(IMUL64rr)
+ CASE_ND(ADD8rr)
+ CASE_ND(ADD16rr)
+ CASE_ND(ADD32rr)
+ CASE_ND(ADD64rr)
+ CASE_ND(AND8rr)
+ CASE_ND(AND16rr)
+ CASE_ND(AND32rr)
+ CASE_ND(AND64rr)
+ CASE_ND(OR8rr)
+ CASE_ND(OR16rr)
+ CASE_ND(OR32rr)
+ CASE_ND(OR64rr)
+ CASE_ND(XOR8rr)
+ CASE_ND(XOR16rr)
+ CASE_ND(XOR32rr)
+ CASE_ND(XOR64rr)
+ CASE_ND(IMUL16rr)
+ CASE_ND(IMUL32rr)
+ CASE_ND(IMUL64rr)
case X86::PANDrr:
case X86::PORrr:
case X86::PXORrr:
@@ -11263,8 +11274,8 @@ bool X86InstrInfo::getMachineCombinerPatterns(
break;
}
}
- return TargetInstrInfo::getMachineCombinerPatterns(Root,
- Patterns, DoRegPressureReduce);
+ return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
+ DoRegPressureReduce);
}
static void
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index 76f18803c2e3d..846bcc85b7ad6 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -319,8 +319,7 @@ class X86InstrInfo final : public X86GenInstrInfo {
Register isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
- Register isLoadFromStackSlot(const MachineInstr &MI,
- int &FrameIndex,
+ Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex,
TypeSize &MemBytes) const override;
/// isLoadFromStackSlotPostFE - Check for post-frame ptr elimination
/// stack locations as well. This uses a heuristic so it isn't
@@ -330,8 +329,7 @@ class X86InstrInfo final : public X86GenInstrInfo {
Register isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
- Register isStoreToStackSlot(const MachineInstr &MI,
- int &FrameIndex,
+ Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex,
TypeSize &MemBytes) const override;
/// isStoreToStackSlotPostFE - Check for post-frame ptr elimination
/// stack locations as well. This uses a heuristic so it isn't
@@ -491,12 +489,12 @@ class X86InstrInfo final : public X86GenInstrInfo {
/// is likely that the referenced instruction has been changed.
///
/// \returns true on success.
- MachineInstr *
- foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
- ArrayRef<unsigned> Ops,
- MachineBasicBlock::iterator InsertPt, int FrameIndex,
- LiveIntervals *LIS = nullptr,
- VirtRegMap *VRM = nullptr) const override;
+ MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
+ ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt,
+ int FrameIndex,
+ LiveIntervals *LIS = nullptr,
+ VirtRegMap *VRM = nullptr) const override;
/// Same as the previous version except it allows folding of any load and
/// store from / to any address, not just from a specific stack slot.
@@ -745,8 +743,7 @@ class X86InstrInfo final : public X86GenInstrInfo {
///
/// If IsIntrinsic is set, operand 1 will be ignored for commuting.
bool findThreeSrcCommutedOpIndices(const MachineInstr &MI,
- unsigned &SrcOpIdx1,
- unsigned &SrcOpIdx2,
+ unsigned &SrcOpIdx1, unsigned &SrcOpIdx2,
bool IsIntrinsic = false) const;
/// Returns true when instruction \p FlagI produces the same flags as \p OI.
diff --git a/llvm/test/CodeGen/X86/ctselect-i386-fp.ll b/llvm/test/CodeGen/X86/ctselect-i386-fp.ll
index b88ec72a37925..4b5f31bad8313 100644
--- a/llvm/test/CodeGen/X86/ctselect-i386-fp.ll
+++ b/llvm/test/CodeGen/X86/ctselect-i386-fp.ll
@@ -209,94 +209,84 @@ define double @test_ctselect_f64_basic(i1 %cond, double %a, double %b) nounwind
define x86_fp80 @test_ctselect_f80_basic(i1 %cond, x86_fp80 %a, x86_fp80 %b) nounwind {
; I386-NOCMOV-LABEL: test_ctselect_f80_basic:
; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebp
+; I386-NOCMOV-NEXT: pushl %ebx
; I386-NOCMOV-NEXT: pushl %edi
; I386-NOCMOV-NEXT: pushl %esi
-; I386-NOCMOV-NEXT: subl $12, %esp
-; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
-; I386-NOCMOV-NEXT: sete %al
-; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: subl $40, %esp
+; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fstpt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fstpt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: testb $1, %cl
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-NOCMOV-NEXT: movl %eax, (%esp) # 4-byte Spill
; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-NOCMOV-NEXT: movb %al, %ah
-; I386-NOCMOV-NEXT: movzbl %ah, %edi
-; I386-NOCMOV-NEXT: negl %edi
-; I386-NOCMOV-NEXT: movl %edx, %esi
-; I386-NOCMOV-NEXT: andl %edi, %esi
-; I386-NOCMOV-NEXT: notl %edi
-; I386-NOCMOV-NEXT: andl %ecx, %edi
-; I386-NOCMOV-NEXT: orl %edi, %esi
-; I386-NOCMOV-NEXT: movl %esi, (%esp)
-; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi
+; I386-NOCMOV-NEXT: sete %ch
+; I386-NOCMOV-NEXT: movb %ch, %al
+; I386-NOCMOV-NEXT: movzbl %al, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %edi, %ebx
+; I386-NOCMOV-NEXT: andl %ebp, %ebx
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %edx, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %ebx
+; I386-NOCMOV-NEXT: testb $1, %cl
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-NOCMOV-NEXT: movb %al, %ah
-; I386-NOCMOV-NEXT: movzbl %ah, %edi
-; I386-NOCMOV-NEXT: negl %edi
-; I386-NOCMOV-NEXT: movl %edx, %esi
-; I386-NOCMOV-NEXT: andl %edi, %esi
-; I386-NOCMOV-NEXT: notl %edi
-; I386-NOCMOV-NEXT: andl %ecx, %edi
-; I386-NOCMOV-NEXT: orl %edi, %esi
-; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-NOCMOV-NEXT: movb %al, %ah
-; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: sete %ch
+; I386-NOCMOV-NEXT: movb %ch, %cl
+; I386-NOCMOV-NEXT: movzbl %cl, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %edx, %edi
+; I386-NOCMOV-NEXT: andl %ebp, %edi
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %eax, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %edi
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl (%esp), %ebx # 4-byte Reload
+; I386-NOCMOV-NEXT: movb %al, %dl
+; I386-NOCMOV-NEXT: movzbl %dl, %edi
; I386-NOCMOV-NEXT: negl %edi
-; I386-NOCMOV-NEXT: movl %edx, %esi
-; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, %ecx
+; I386-NOCMOV-NEXT: andl %edi, %ecx
; I386-NOCMOV-NEXT: notl %edi
-; I386-NOCMOV-NEXT: andl %ecx, %edi
-; I386-NOCMOV-NEXT: orl %edi, %esi
-; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; I386-NOCMOV-NEXT: fldt (%esp)
-; I386-NOCMOV-NEXT: addl $12, %esp
+; I386-NOCMOV-NEXT: andl %ebx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %ecx
+; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: addl $40, %esp
; I386-NOCMOV-NEXT: popl %esi
; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: popl %ebp
; I386-NOCMOV-NEXT: retl
;
; I386-CMOV-LABEL: test_ctselect_f80_basic:
; I386-CMOV: # %bb.0:
-; I386-CMOV-NEXT: pushl %edi
-; I386-CMOV-NEXT: pushl %esi
-; I386-CMOV-NEXT: subl $12, %esp
+; I386-CMOV-NEXT: subl $36, %esp
+; I386-CMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fstpt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fstpt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
-; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-CMOV-NEXT: movb %al, %ah
-; I386-CMOV-NEXT: movzbl %ah, %edi
-; I386-CMOV-NEXT: negl %edi
-; I386-CMOV-NEXT: movl %edx, %esi
-; I386-CMOV-NEXT: andl %edi, %esi
-; I386-CMOV-NEXT: notl %edi
-; I386-CMOV-NEXT: andl %ecx, %edi
-; I386-CMOV-NEXT: orl %edi, %esi
-; I386-CMOV-NEXT: movl %esi, (%esp)
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-CMOV-NEXT: movb %al, %ah
-; I386-CMOV-NEXT: movzbl %ah, %edi
-; I386-CMOV-NEXT: negl %edi
-; I386-CMOV-NEXT: movl %edx, %esi
-; I386-CMOV-NEXT: andl %edi, %esi
-; I386-CMOV-NEXT: notl %edi
-; I386-CMOV-NEXT: andl %ecx, %edi
-; I386-CMOV-NEXT: orl %edi, %esi
-; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-CMOV-NEXT: movb %al, %ah
-; I386-CMOV-NEXT: movzbl %ah, %edi
-; I386-CMOV-NEXT: negl %edi
-; I386-CMOV-NEXT: movl %edx, %esi
-; I386-CMOV-NEXT: andl %edi, %esi
-; I386-CMOV-NEXT: notl %edi
-; I386-CMOV-NEXT: andl %ecx, %edi
-; I386-CMOV-NEXT: orl %edi, %esi
-; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl %eax, (%esp)
; I386-CMOV-NEXT: fldt (%esp)
-; I386-CMOV-NEXT: addl $12, %esp
-; I386-CMOV-NEXT: popl %esi
-; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: addl $36, %esp
; I386-CMOV-NEXT: retl
%result = call x86_fp80 @llvm.ct.select.f80(i1 %cond, x86_fp80 %a, x86_fp80 %b)
ret x86_fp80 %result
@@ -543,94 +533,84 @@ define float @test_ctselect_f32_nan(i1 %cond) nounwind {
define x86_fp80 @test_ctselect_f80_alignment(i1 %cond, x86_fp80 %a, x86_fp80 %b) nounwind {
; I386-NOCMOV-LABEL: test_ctselect_f80_alignment:
; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebp
+; I386-NOCMOV-NEXT: pushl %ebx
; I386-NOCMOV-NEXT: pushl %edi
; I386-NOCMOV-NEXT: pushl %esi
-; I386-NOCMOV-NEXT: subl $12, %esp
-; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
-; I386-NOCMOV-NEXT: sete %al
-; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: subl $40, %esp
+; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fstpt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fstpt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: testb $1, %cl
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-NOCMOV-NEXT: movl %eax, (%esp) # 4-byte Spill
; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-NOCMOV-NEXT: movb %al, %ah
-; I386-NOCMOV-NEXT: movzbl %ah, %edi
-; I386-NOCMOV-NEXT: negl %edi
-; I386-NOCMOV-NEXT: movl %edx, %esi
-; I386-NOCMOV-NEXT: andl %edi, %esi
-; I386-NOCMOV-NEXT: notl %edi
-; I386-NOCMOV-NEXT: andl %ecx, %edi
-; I386-NOCMOV-NEXT: orl %edi, %esi
-; I386-NOCMOV-NEXT: movl %esi, (%esp)
-; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi
+; I386-NOCMOV-NEXT: sete %ch
+; I386-NOCMOV-NEXT: movb %ch, %al
+; I386-NOCMOV-NEXT: movzbl %al, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %edi, %ebx
+; I386-NOCMOV-NEXT: andl %ebp, %ebx
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %edx, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %ebx
+; I386-NOCMOV-NEXT: testb $1, %cl
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-NOCMOV-NEXT: movb %al, %ah
-; I386-NOCMOV-NEXT: movzbl %ah, %edi
-; I386-NOCMOV-NEXT: negl %edi
-; I386-NOCMOV-NEXT: movl %edx, %esi
-; I386-NOCMOV-NEXT: andl %edi, %esi
-; I386-NOCMOV-NEXT: notl %edi
-; I386-NOCMOV-NEXT: andl %ecx, %edi
-; I386-NOCMOV-NEXT: orl %edi, %esi
-; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-NOCMOV-NEXT: movb %al, %ah
-; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: sete %ch
+; I386-NOCMOV-NEXT: movb %ch, %cl
+; I386-NOCMOV-NEXT: movzbl %cl, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %edx, %edi
+; I386-NOCMOV-NEXT: andl %ebp, %edi
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %eax, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %edi
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl (%esp), %ebx # 4-byte Reload
+; I386-NOCMOV-NEXT: movb %al, %dl
+; I386-NOCMOV-NEXT: movzbl %dl, %edi
; I386-NOCMOV-NEXT: negl %edi
-; I386-NOCMOV-NEXT: movl %edx, %esi
-; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, %ecx
+; I386-NOCMOV-NEXT: andl %edi, %ecx
; I386-NOCMOV-NEXT: notl %edi
-; I386-NOCMOV-NEXT: andl %ecx, %edi
-; I386-NOCMOV-NEXT: orl %edi, %esi
-; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; I386-NOCMOV-NEXT: fldt (%esp)
-; I386-NOCMOV-NEXT: addl $12, %esp
+; I386-NOCMOV-NEXT: andl %ebx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %ecx
+; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: addl $40, %esp
; I386-NOCMOV-NEXT: popl %esi
; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: popl %ebp
; I386-NOCMOV-NEXT: retl
;
; I386-CMOV-LABEL: test_ctselect_f80_alignment:
; I386-CMOV: # %bb.0:
-; I386-CMOV-NEXT: pushl %edi
-; I386-CMOV-NEXT: pushl %esi
-; I386-CMOV-NEXT: subl $12, %esp
+; I386-CMOV-NEXT: subl $36, %esp
+; I386-CMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fstpt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fstpt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
-; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-CMOV-NEXT: movb %al, %ah
-; I386-CMOV-NEXT: movzbl %ah, %edi
-; I386-CMOV-NEXT: negl %edi
-; I386-CMOV-NEXT: movl %edx, %esi
-; I386-CMOV-NEXT: andl %edi, %esi
-; I386-CMOV-NEXT: notl %edi
-; I386-CMOV-NEXT: andl %ecx, %edi
-; I386-CMOV-NEXT: orl %edi, %esi
-; I386-CMOV-NEXT: movl %esi, (%esp)
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-CMOV-NEXT: movb %al, %ah
-; I386-CMOV-NEXT: movzbl %ah, %edi
-; I386-CMOV-NEXT: negl %edi
-; I386-CMOV-NEXT: movl %edx, %esi
-; I386-CMOV-NEXT: andl %edi, %esi
-; I386-CMOV-NEXT: notl %edi
-; I386-CMOV-NEXT: andl %ecx, %edi
-; I386-CMOV-NEXT: orl %edi, %esi
-; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-CMOV-NEXT: movb %al, %ah
-; I386-CMOV-NEXT: movzbl %ah, %edi
-; I386-CMOV-NEXT: negl %edi
-; I386-CMOV-NEXT: movl %edx, %esi
-; I386-CMOV-NEXT: andl %edi, %esi
-; I386-CMOV-NEXT: notl %edi
-; I386-CMOV-NEXT: andl %ecx, %edi
-; I386-CMOV-NEXT: orl %edi, %esi
-; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl %eax, (%esp)
; I386-CMOV-NEXT: fldt (%esp)
-; I386-CMOV-NEXT: addl $12, %esp
-; I386-CMOV-NEXT: popl %esi
-; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: addl $36, %esp
; I386-CMOV-NEXT: retl
%result = call x86_fp80 @llvm.ct.select.f80(i1 %cond, x86_fp80 %a, x86_fp80 %b)
ret x86_fp80 %result
More information about the llvm-branch-commits
mailing list