[llvm-branch-commits] [llvm] [ConstantTime] Native ct.select support for X86 and i386 (PR #166704)
Julius Alexandre via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Nov 6 09:28:57 PST 2025
https://github.com/wizardengineer updated https://github.com/llvm/llvm-project/pull/166704
>From cee41562976955a1e4c7b911a304b989a73be16d Mon Sep 17 00:00:00 2001
From: wizardengineer <juliuswoosebert at gmail.com>
Date: Wed, 5 Nov 2025 17:09:23 -0500
Subject: [PATCH 1/2] [LLVM][X86] Add native ct.select support for X86 and i386
Add native X86 implementation with CMOV instructions and comprehensive tests:
- X86 ISelLowering with CMOV for x86_64 and i386
- Fallback bitwise operations for i386 targets without CMOV
- Post-RA expansion for pseudo-instructions
- Comprehensive test coverage:
  - Edge cases (zero conditions, large integers)
  - i386-specific tests (FP, MMX, non-CMOV fallback)
  - Vector operations
  - Optimization patterns
The basic test demonstrating fallback is in the core infrastructure PR.
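For reference, a minimal IR sketch of the input these lowerings are meant to
handle (the intrinsic name and mangling follow the core infrastructure PR and
are assumed here; they are not introduced by this patch):

  define i32 @ct_sel(i1 %cond, i32 %a, i32 %b) {
    %r = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
    ret i32 %r
  }
  declare i32 @llvm.ct.select.i32(i1, i32, i32)

On x86-64 this is expected to lower through X86ISD::CTSELECT to a TEST + CMOV
pair; on i386 without CMOV it is routed to the post-RA pseudo expansion added
below.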
---
llvm/lib/Target/X86/X86.td | 8 +-
llvm/lib/Target/X86/X86ISelLowering.cpp | 791 +++++++++-
llvm/lib/Target/X86/X86ISelLowering.h | 7 +
llvm/lib/Target/X86/X86InstrCMovSetCC.td | 205 +++
llvm/lib/Target/X86/X86InstrCompiler.td | 81 ++
llvm/lib/Target/X86/X86InstrFragments.td | 5 +
llvm/lib/Target/X86/X86InstrInfo.cpp | 609 +++++++-
llvm/lib/Target/X86/X86InstrInfo.h | 6 +
llvm/lib/Target/X86/X86InstrPredicates.td | 5 +
llvm/lib/Target/X86/X86TargetMachine.cpp | 5 +-
llvm/test/CodeGen/X86/ctselect-edge-cases.ll | 409 ++++++
llvm/test/CodeGen/X86/ctselect-i386-fp.ll | 722 ++++++++++
llvm/test/CodeGen/X86/ctselect-i386-mmx.ll | 428 ++++++
llvm/test/CodeGen/X86/ctselect-i386.ll | 267 ++++
.../test/CodeGen/X86/ctselect-optimization.ll | 304 ++++
llvm/test/CodeGen/X86/ctselect-vector.ll | 1274 +++++++++++++++++
llvm/test/CodeGen/X86/ctselect.ll | 996 +++++++------
17 files changed, 5671 insertions(+), 451 deletions(-)
create mode 100644 llvm/test/CodeGen/X86/ctselect-edge-cases.ll
create mode 100644 llvm/test/CodeGen/X86/ctselect-i386-fp.ll
create mode 100644 llvm/test/CodeGen/X86/ctselect-i386-mmx.ll
create mode 100644 llvm/test/CodeGen/X86/ctselect-i386.ll
create mode 100644 llvm/test/CodeGen/X86/ctselect-optimization.ll
create mode 100644 llvm/test/CodeGen/X86/ctselect-vector.ll
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 9e291a6ae431f..21826d8289bb9 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -825,9 +825,10 @@ include "X86SchedSapphireRapids.td"
def ProcessorFeatures {
// x86-64 micro-architecture levels: x86-64 and x86-64-v[234]
- list<SubtargetFeature> X86_64V1Features = [
- FeatureX87, FeatureCX8, FeatureCMOV, FeatureMMX, FeatureSSE2,
- FeatureFXSR, FeatureNOPL, FeatureX86_64,
+ list<SubtargetFeature> X86_64V1Features = [FeatureX87, FeatureCX8,
+ FeatureCMOV, FeatureMMX,
+ FeatureSSE2, FeatureFXSR,
+ FeatureNOPL, FeatureX86_64,
];
list<SubtargetFeature> X86_64V1Tuning = [
TuningMacroFusion,
@@ -1161,6 +1162,7 @@ def ProcessorFeatures {
FeatureAVXNECONVERT,
FeatureAVXVNNIINT8,
FeatureAVXVNNIINT16,
+ FeatureUSERMSR,
FeatureSHA512,
FeatureSM3,
FeatureEGPR,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6edf0185df813..833afa717c32c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "X86ISelLowering.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
#include "MCTargetDesc/X86ShuffleDecode.h"
#include "X86.h"
#include "X86FrameLowering.h"
@@ -29,6 +30,8 @@
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -48,6 +51,7 @@
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/PatternMatch.h"
@@ -488,6 +492,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// X86 wants to expand cmov itself.
for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::CTSELECT, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
@@ -496,11 +501,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::CTSELECT, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
}
// Custom action for SELECT MMX and expand action for SELECT_CC MMX
setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
+ setOperationAction(ISD::CTSELECT, MVT::x86mmx, Custom);
setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
@@ -630,6 +637,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::BR_CC, VT, Action);
setOperationAction(ISD::SETCC, VT, Action);
setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::CTSELECT, VT, Custom);
setOperationAction(ISD::SELECT_CC, VT, Action);
setOperationAction(ISD::FROUND, VT, Action);
setOperationAction(ISD::FROUNDEVEN, VT, Action);
@@ -1067,6 +1075,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
+ setOperationAction(ISD::CTSELECT, MVT::v4f32, Custom);
setOperationAction(ISD::FCANONICALIZE, MVT::v4f32, Custom);
setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
@@ -1220,6 +1229,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SELECT, MVT::v8f16, Custom);
setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
+ setOperationAction(ISD::CTSELECT, MVT::v2f64, Custom);
+ setOperationAction(ISD::CTSELECT, MVT::v2i64, Custom);
+ setOperationAction(ISD::CTSELECT, MVT::v4i32, Custom);
+ setOperationAction(ISD::CTSELECT, MVT::v8i16, Custom);
+ setOperationAction(ISD::CTSELECT, MVT::v8f16, Custom);
+ setOperationAction(ISD::CTSELECT, MVT::v16i8, Custom);
+
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
@@ -1541,6 +1557,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
+ setOperationAction(ISD::CTSELECT, MVT::v4f64, Custom);
+ setOperationAction(ISD::CTSELECT, MVT::v4i64, Custom);
+ setOperationAction(ISD::CTSELECT, MVT::v8i32, Custom);
+ setOperationAction(ISD::CTSELECT, MVT::v16i16, Custom);
+ setOperationAction(ISD::CTSELECT, MVT::v16f16, Custom);
+ setOperationAction(ISD::CTSELECT, MVT::v32i8, Custom);
+ setOperationAction(ISD::CTSELECT, MVT::v8f32, Custom);
+
for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
@@ -1727,6 +1751,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
+ setOperationAction(ISD::CTSELECT, MVT::v1i1, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
@@ -1772,6 +1797,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::CTSELECT, VT, Custom);
setOperationAction(ISD::TRUNCATE, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
@@ -2038,6 +2064,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::CTSELECT, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
@@ -2203,6 +2230,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::CTSELECT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
@@ -2269,6 +2297,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::VSELECT, VT, Legal);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::CTSELECT, VT, Custom);
setOperationAction(ISD::FNEG, VT, Custom);
setOperationAction(ISD::FABS, VT, Custom);
@@ -2538,6 +2567,22 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::x86amx, &X86::TILERegClass);
}
+ // Handle 512-bit vector CTSELECT without AVX512 by setting them to Expand
+ // This allows type legalization to split them into smaller vectors
+ for (auto VT : {MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64, MVT::v32f16,
+ MVT::v16f32, MVT::v8f64}) {
+ setOperationAction(ISD::CTSELECT, VT, Expand);
+ }
+
+ // Handle 256-bit vector CTSELECT without AVX by setting them to Expand
+ // This allows type legalization to split them into 128-bit vectors
+ if (!Subtarget.hasAVX()) {
+ for (auto VT : {MVT::v4f64, MVT::v4i64, MVT::v8i32, MVT::v16i16,
+ MVT::v16f16, MVT::v32i8, MVT::v8f32}) {
+ setOperationAction(ISD::CTSELECT, VT, Expand);
+ }
+ }
+
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
@@ -2644,6 +2689,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
ISD::BITCAST,
ISD::VSELECT,
ISD::SELECT,
+ ISD::CTSELECT,
ISD::SHL,
ISD::SRA,
ISD::SRL,
@@ -25325,6 +25371,174 @@ static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl,
return V;
}
+SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const {
+ SDValue Cond = Op.getOperand(0); // condition
+ SDValue TrueOp = Op.getOperand(1); // true_value
+ SDValue FalseOp = Op.getOperand(2); // false_value
+ SDLoc DL(Op);
+ MVT VT = TrueOp.getSimpleValueType();
+
+ // Special handling for i386 targets (no CMOV): route to post-RA expansion
+ // pseudos. Let standard type legalization handle i64 automatically (it is
+ // split into EDX:EAX).
+
+ // Handle soft float16 by converting to integer operations
+ if (isSoftF16(VT, Subtarget)) {
+ MVT NVT = VT.changeTypeToInteger();
+ SDValue CtSelect =
+ DAG.getNode(ISD::CTSELECT, DL, NVT, Cond, DAG.getBitcast(NVT, FalseOp),
+ DAG.getBitcast(NVT, TrueOp));
+ return DAG.getBitcast(VT, CtSelect);
+ }
+
+ // Handle vector types
+ if (VT.isVector()) {
+ // Handle soft float16 vectors
+ if (isSoftF16(VT, Subtarget)) {
+ MVT NVT = VT.changeVectorElementTypeToInteger();
+ SDValue CtSelect = DAG.getNode(ISD::CTSELECT, DL, NVT, Cond,
+ DAG.getBitcast(NVT, FalseOp),
+ DAG.getBitcast(NVT, TrueOp));
+ return DAG.getBitcast(VT, CtSelect);
+ }
+
+ unsigned VectorWidth = VT.getSizeInBits();
+ MVT EltVT = VT.getVectorElementType();
+
+ // 512-bit vectors without AVX512 and 256-bit vectors without AVX are now
+ // handled by type legalization (Expand action).
+
+ if (VectorWidth == 128 && !Subtarget.hasSSE1())
+ return SDValue();
+
+ // Handle special cases for floating point vectors
+ if (EltVT.isFloatingPoint()) {
+ // For vector floating point with AVX, use VBLENDV-style operations
+ if (Subtarget.hasAVX() && (VectorWidth == 256 || VectorWidth == 128)) {
+ // Convert to bitwise operations using the condition
+ MVT IntVT = VT.changeVectorElementTypeToInteger();
+ SDValue IntOp1 = DAG.getBitcast(IntVT, TrueOp);
+ SDValue IntOp2 = DAG.getBitcast(IntVT, FalseOp);
+
+ // Create the CTSELECT node with integer types
+ SDValue IntResult =
+ DAG.getNode(X86ISD::CTSELECT, DL, IntVT, IntOp2, IntOp1,
+ DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8),
+ EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget));
+ return DAG.getBitcast(VT, IntResult);
+ }
+ }
+
+ // For integer vectors or when we don't have advanced SIMD support,
+ // use the generic X86 CTSELECT node which will be matched by the patterns
+ SDValue CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
+ SDValue EFLAGS = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
+ // Create the X86 CTSELECT node - note operand order: false, true, cc, flags
+ return DAG.getNode(X86ISD::CTSELECT, DL, VT, FalseOp, TrueOp, CC, EFLAGS);
+ }
+
+ // Look past (and (setcc_carry (cmp ...)), 1)
+ if (Cond.getOpcode() == ISD::AND &&
+ Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
+ isOneConstant(Cond.getOperand(1)))
+ Cond = Cond.getOperand(0);
+
+ /// Process condition flags and prepare for CTSELECT node creation
+ auto ProcessConditionFlags =
+ [&](SDValue Cond, MVT VT, SDLoc DL, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) -> std::pair<SDValue, SDValue> {
+ SDValue CC;
+ bool AddTest = true;
+
+ unsigned CondOpcode = Cond.getOpcode();
+ if (CondOpcode == X86ISD::SETCC || CondOpcode == X86ISD::SETCC_CARRY) {
+ CC = Cond.getOperand(0);
+ SDValue Cmp = Cond.getOperand(1);
+
+ if ((isX86LogicalCmp(Cmp)) || Cmp.getOpcode() == X86ISD::BT) {
+ Cond = Cmp;
+ AddTest = false;
+ }
+ } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
+ CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
+ CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
+ SDValue Value;
+ X86::CondCode X86Cond;
+ std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
+ CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
+ AddTest = false;
+ }
+
+ if (AddTest) {
+ // Look past the truncate if the high bits are known zero
+ if (isTruncWithZeroHighBitsInput(Cond, DAG))
+ Cond = Cond.getOperand(0);
+
+ // Try to match AND to BT instruction
+ if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
+ X86::CondCode X86CondCode;
+ if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
+ CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
+ Cond = BT;
+ AddTest = false;
+ }
+ }
+ }
+
+ if (AddTest) {
+ CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
+ Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
+ }
+
+ return {CC, Cond};
+ };
+
+ // Process condition flags and prepare for CTSELECT
+ auto [CC, ProcessedCond] =
+ ProcessConditionFlags(Cond, VT, DL, DAG, Subtarget);
+
+ // Handle i8 CTSELECT with truncate optimization
+ if (Op.getValueType() == MVT::i8 && TrueOp.getOpcode() == ISD::TRUNCATE &&
+ FalseOp.getOpcode() == ISD::TRUNCATE) {
+ SDValue T1 = TrueOp.getOperand(0), T2 = FalseOp.getOperand(0);
+ if (T1.getValueType() == T2.getValueType() &&
+ T1.getOpcode() != ISD::CopyFromReg &&
+ T2.getOpcode() != ISD::CopyFromReg) {
+ SDValue CtSelect = DAG.getNode(X86ISD::CTSELECT, DL, T1.getValueType(),
+ T2, T1, CC, ProcessedCond);
+ return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), CtSelect);
+ }
+ }
+
+ // Promote small integer types to avoid partial register stalls
+ // Exception: For i8 without CMOV, we can generate a shorter instruction
+ // sequence without movzx so keep it as is.
+ if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMOV()) ||
+ (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(TrueOp, Subtarget) &&
+ !X86::mayFoldLoad(FalseOp, Subtarget))) {
+ TrueOp = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, TrueOp);
+ FalseOp = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, FalseOp);
+ SDValue Ops[] = {FalseOp, TrueOp, CC, ProcessedCond};
+ SDValue CtSelect = DAG.getNode(X86ISD::CTSELECT, DL, MVT::i32, Ops);
+ return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), CtSelect);
+ }
+
+ if (isScalarFPTypeInSSEReg(VT)) {
+ MVT IntVT = (VT == MVT::f32) ? MVT::i32 : MVT::i64;
+ TrueOp = DAG.getBitcast(IntVT, TrueOp);
+ FalseOp = DAG.getBitcast(IntVT, FalseOp);
+ SDValue Ops[] = {FalseOp, TrueOp, CC, ProcessedCond};
+ SDValue CtSelect = DAG.getNode(X86ISD::CTSELECT, DL, IntVT, Ops);
+ return DAG.getBitcast(VT, CtSelect);
+ }
+
+ // Create final CTSELECT node
+ SDValue Ops[] = {FalseOp, TrueOp, CC, ProcessedCond};
+ return DAG.getNode(X86ISD::CTSELECT, DL, Op.getValueType(), Ops,
+ Op->getFlags());
+}
+
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue In = Op->getOperand(0);
@@ -29695,30 +29909,65 @@ static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
const X86Subtarget &Subtarget,
SelectionDAG &DAG,
SDValue *Low = nullptr) {
+ unsigned NumElts = VT.getVectorNumElements();
+
// For vXi8 we will unpack the low and high half of each 128 bit lane to widen
// to a vXi16 type. Do the multiplies, shift the results and pack the half
// lane results back together.
// We'll take different approaches for signed and unsigned.
- // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes to
- // words and use pmullw to calculate the full 16-bit product.
+ // For unsigned we'll use punpcklbw/punpckhbw to put zero extend the bytes
+ // and use pmullw to calculate the full 16-bit product.
// For signed we'll use punpcklbw/punpckbw to extend the bytes to words and
// shift them left into the upper byte of each word. This allows us to use
// pmulhw to calculate the full 16-bit product. This trick means we don't
// need to sign extend the bytes to use pmullw.
- MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
+
+ MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
SDValue Zero = DAG.getConstant(0, dl, VT);
- SDValue ALo, AHi, BLo, BHi;
+ SDValue ALo, AHi;
if (IsSigned) {
ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
- BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
- BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
} else {
ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
- BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
+ }
+
+ SDValue BLo, BHi;
+ if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
+ // If the RHS is a constant, manually unpackl/unpackh and extend.
+ SmallVector<SDValue, 16> LoOps, HiOps;
+ for (unsigned i = 0; i != NumElts; i += 16) {
+ for (unsigned j = 0; j != 8; ++j) {
+ SDValue LoOp = B.getOperand(i + j);
+ SDValue HiOp = B.getOperand(i + j + 8);
+
+ if (IsSigned) {
+ LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
+ HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
+ LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
+ DAG.getConstant(8, dl, MVT::i16));
+ HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
+ DAG.getConstant(8, dl, MVT::i16));
+ } else {
+ LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
+ HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
+ }
+
+ LoOps.push_back(LoOp);
+ HiOps.push_back(HiOp);
+ }
+ }
+
+ BLo = DAG.getBuildVector(ExVT, dl, LoOps);
+ BHi = DAG.getBuildVector(ExVT, dl, HiOps);
+ } else if (IsSigned) {
+ BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
+ BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
+ } else {
+ BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
}
@@ -29731,7 +29980,7 @@ static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
if (Low)
*Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
- return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf=*/true);
+ return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
}
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
@@ -33594,6 +33843,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
case ISD::SELECT: return LowerSELECT(Op, DAG);
+ case ISD::CTSELECT: return LowerCTSELECT(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::JumpTable: return LowerJumpTable(Op, DAG);
case ISD::VASTART: return LowerVASTART(Op, DAG);
@@ -33677,6 +33927,12 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
}
}
+bool X86TargetLowering::isSelectSupported(SelectSupportKind Kind) const {
+ if (Kind == SelectSupportKind::CtSelect) {
+ return true;
+ }
+ return TargetLoweringBase::isSelectSupported(Kind);
+}
/// Replace a node with an illegal result type with a new node built out of
/// custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
@@ -34904,6 +35160,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(STRICT_CMPM)
NODE_NAME_CASE(CMPMM_SAE)
NODE_NAME_CASE(SETCC)
+ NODE_NAME_CASE(CTSELECT)
NODE_NAME_CASE(SETCC_CARRY)
NODE_NAME_CASE(FSETCC)
NODE_NAME_CASE(FSETCCM)
@@ -37677,6 +37934,480 @@ X86TargetLowering::emitPatchableEventCall(MachineInstr &MI,
return BB;
}
+/// Helper function to emit i386 CTSELECT with condition materialization.
+/// This converts EFLAGS-based CTSELECT into a condition byte that can be
+/// shared across multiple operations (critical for i64 type legalization).
+///
+/// Phase 1: Materialize condition byte from EFLAGS using SETCC
+/// Phase 2: Create internal pseudo with condition byte for post-RA expansion
+///
+/// This approach ensures that when i64 is type-legalized into two i32
+/// operations, both operations share the same condition byte rather than
+/// each independently reading (and destroying) EFLAGS.
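+///
+/// For example (a rough sketch of the shape, with the tmp_byte/tmp_mask defs
+/// omitted): an i64 ct.select on i386 is type-legalized into two i32 halves,
+/// and after this lowering both halves read the same SETcc-produced GR8 byte:
+///
+///   %cond:gr8 = SETCCr <inverted cc>        ; materialized once from EFLAGS
+///   %lo:gr32, ... = CTSELECT_I386_INT_GR32rr %false_lo, %true_lo, %cond
+///   %hi:gr32, ... = CTSELECT_I386_INT_GR32rr %false_hi, %true_hi, %cond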
+static MachineBasicBlock *
+emitCTSelectI386WithConditionMaterialization(MachineInstr &MI,
+ MachineBasicBlock *BB,
+ unsigned InternalPseudoOpcode) {
+ const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
+ const MIMetadata MIMD(MI);
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ // Original pseudo operands: (outs dst), (ins src1, src2, cond)
+ Register Src1Reg = MI.getOperand(1).getReg();
+ Register Src2Reg = MI.getOperand(2).getReg();
+ X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(3).getImm());
+
+ // Get opposite condition (SETCC sets to 1 when condition is TRUE,
+ // but we want to select src1 when condition is FALSE for X86 semantics)
+ X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
+
+ // Step 1: Materialize condition byte from EFLAGS
+ // This is done OUTSIDE the constant-time bundle, before any EFLAGS corruption
+ Register CondByteReg = MRI.createVirtualRegister(&X86::GR8RegClass);
+ BuildMI(*BB, MI, MIMD, TII->get(X86::SETCCr), CondByteReg).addImm(OppCC);
+
+ // Step 2: Create internal pseudo that takes condition byte as input
+ // This pseudo will be expanded post-RA into the actual constant-time bundle
+ // The condition byte can now be safely shared between multiple pseudos
+
+ // Internal pseudo has operands: (outs dst, tmp_byte, tmp_mask), (ins src1,
+ // src2, cond_byte)
+ Register DstReg = MI.getOperand(0).getReg();
+
+ // Create virtual registers for the temporary outputs
+ Register TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass);
+ Register TmpMaskReg;
+
+ // Determine the register class for tmp_mask based on the data type
+ if (InternalPseudoOpcode == X86::CTSELECT_I386_INT_GR8rr) {
+ TmpMaskReg = MRI.createVirtualRegister(&X86::GR8RegClass);
+ } else if (InternalPseudoOpcode == X86::CTSELECT_I386_INT_GR16rr) {
+ TmpMaskReg = MRI.createVirtualRegister(&X86::GR16RegClass);
+ } else if (InternalPseudoOpcode == X86::CTSELECT_I386_INT_GR32rr) {
+ TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+ } else {
+ llvm_unreachable("Unknown internal pseudo opcode");
+ }
+
+ BuildMI(*BB, MI, MIMD, TII->get(InternalPseudoOpcode))
+ .addDef(DstReg) // dst (output)
+ .addDef(TmpByteReg) // tmp_byte (output)
+ .addDef(TmpMaskReg) // tmp_mask (output)
+ .addReg(Src1Reg) // src1 (input)
+ .addReg(Src2Reg) // src2 (input)
+ .addReg(CondByteReg); // pre-materialized condition byte (input)
+
+ MI.eraseFromParent();
+ return BB;
+}
+
+// Helper structure to hold memory operand information for FP loads
+struct FPLoadMemOperands {
+ bool IsValid = false;
+ unsigned BaseReg = 0;
+ int64_t ScaleVal = 1;
+ unsigned IndexReg = 0;
+ int64_t Disp = 0;
+ unsigned SegReg = 0;
+ int FrameIndex = -1;
+ bool IsFrameIndex = false;
+ int ConstantPoolIndex = -1;
+ bool IsConstantPool = false;
+ const GlobalValue *Global = nullptr;
+ int64_t GlobalOffset = 0;
+ bool IsGlobal = false;
+};
+
+// Check if a virtual register is defined by a simple FP load instruction
+// Returns the memory operands if it's a simple load, otherwise returns invalid
+static FPLoadMemOperands getFPLoadMemOperands(Register Reg,
+ MachineRegisterInfo &MRI,
+ unsigned ExpectedLoadOpcode) {
+ FPLoadMemOperands Result;
+
+ if (!Reg.isVirtual())
+ return Result;
+
+ MachineInstr *DefMI = MRI.getVRegDef(Reg);
+ if (!DefMI)
+ return Result;
+
+ // Check if it's the expected load opcode (e.g., LD_Fp32m, LD_Fp64m, LD_Fp80m)
+ if (DefMI->getOpcode() != ExpectedLoadOpcode)
+ return Result;
+
+ // Check that this is a simple load - not volatile, not atomic, etc.
+ // FP loads have hasSideEffects = 0 in their definition for simple loads
+ if (DefMI->hasOrderedMemoryRef())
+ return Result;
+
+ // The load should have a single def (the destination register) and memory operands
+ // Format: %reg = LD_Fpxxm <fi#N>, 1, %noreg, 0, %noreg
+ // or: %reg = LD_Fpxxm %base, scale, %index, disp, %segment
+ if (DefMI->getNumOperands() < 6)
+ return Result;
+
+ // Operand 0 is the destination, operands 1-5 are the memory reference
+ MachineOperand &BaseMO = DefMI->getOperand(1);
+ MachineOperand &ScaleMO = DefMI->getOperand(2);
+ MachineOperand &IndexMO = DefMI->getOperand(3);
+ MachineOperand &DispMO = DefMI->getOperand(4);
+ MachineOperand &SegMO = DefMI->getOperand(5);
+
+ // Check if this is a frame index load
+ if (BaseMO.isFI()) {
+ Result.IsValid = true;
+ Result.IsFrameIndex = true;
+ Result.FrameIndex = BaseMO.getIndex();
+ Result.ScaleVal = ScaleMO.getImm();
+ Result.IndexReg = IndexMO.getReg();
+ Result.Disp = DispMO.getImm();
+ Result.SegReg = SegMO.getReg();
+ return Result;
+ }
+
+ // Check if this is a constant pool load
+ // Format: %reg = LD_Fpxxm $noreg, 1, $noreg, %const.N, $noreg
+ if (BaseMO.isReg() && BaseMO.getReg() == X86::NoRegister &&
+ ScaleMO.isImm() && IndexMO.isReg() &&
+ IndexMO.getReg() == X86::NoRegister &&
+ DispMO.isCPI() && SegMO.isReg()) {
+ Result.IsValid = true;
+ Result.IsConstantPool = true;
+ Result.ConstantPoolIndex = DispMO.getIndex();
+ Result.ScaleVal = ScaleMO.getImm();
+ Result.IndexReg = IndexMO.getReg();
+ Result.Disp = 0;
+ Result.SegReg = SegMO.getReg();
+ return Result;
+ }
+
+ // Check if this is a global variable load
+ // Format: %reg = LD_Fpxxm $noreg, 1, $noreg, @global_name, $noreg
+ if (BaseMO.isReg() && BaseMO.getReg() == X86::NoRegister &&
+ ScaleMO.isImm() && IndexMO.isReg() &&
+ IndexMO.getReg() == X86::NoRegister &&
+ DispMO.isGlobal() && SegMO.isReg()) {
+ Result.IsValid = true;
+ Result.IsGlobal = true;
+ Result.Global = DispMO.getGlobal();
+ Result.GlobalOffset = DispMO.getOffset();
+ Result.ScaleVal = ScaleMO.getImm();
+ Result.IndexReg = IndexMO.getReg();
+ Result.Disp = 0;
+ Result.SegReg = SegMO.getReg();
+ return Result;
+ }
+
+ // Regular memory operands (e.g., pointer loads)
+ if (BaseMO.isReg() && ScaleMO.isImm() && IndexMO.isReg() &&
+ DispMO.isImm() && SegMO.isReg()) {
+ Result.IsValid = true;
+ Result.IsFrameIndex = false;
+ Result.IsConstantPool = false;
+ Result.BaseReg = BaseMO.getReg();
+ Result.ScaleVal = ScaleMO.getImm();
+ Result.IndexReg = IndexMO.getReg();
+ Result.Disp = DispMO.getImm();
+ Result.SegReg = SegMO.getReg();
+ return Result;
+ }
+
+ return Result;
+}
+
+static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI,
+ MachineBasicBlock *BB,
+ unsigned pseudoInstr) {
+ const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
+ const MIMetadata MIMD(MI);
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MachineFrameInfo &MFI = MF->getFrameInfo();
+ unsigned RegSizeInByte = 4;
+
+ // Get operands
+ // MI operands: %result:rfp80 = CTSELECT_I386 %false:rfp80, %true:rfp80, %cond:i8imm
+ unsigned DestReg = MI.getOperand(0).getReg();
+ unsigned FalseReg = MI.getOperand(1).getReg();
+ unsigned TrueReg = MI.getOperand(2).getReg();
+ X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(3).getImm());
+ X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
+
+ // Materialize condition byte from EFLAGS
+ Register CondByteReg = MRI.createVirtualRegister(&X86::GR8RegClass);
+ BuildMI(*BB, MI, MIMD, TII->get(X86::SETCCr), CondByteReg).addImm(OppCC);
+
+ auto storeFpToSlot = [&](unsigned Opcode, int Slot, Register Reg) {
+ addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(Opcode)), Slot)
+ .addReg(Reg, RegState::Kill);
+ };
+
+ // Helper to load integer from memory operands
+ auto loadIntFromMemOperands = [&](const FPLoadMemOperands &MemOps,
+ unsigned Offset) -> unsigned {
+ unsigned IntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), IntReg);
+
+ if (MemOps.IsFrameIndex) {
+ // Frame index: addFrameIndex + scale + index + disp + segment
+ MIB.addFrameIndex(MemOps.FrameIndex)
+ .addImm(MemOps.ScaleVal)
+ .addReg(MemOps.IndexReg)
+ .addImm(MemOps.Disp + Offset)
+ .addReg(MemOps.SegReg);
+ } else if (MemOps.IsConstantPool) {
+ // Constant pool: base_reg + scale + index + CP_index + segment
+ // MOV32rm format: base, scale, index, displacement, segment
+ MIB.addReg(X86::NoRegister) // Base register
+ .addImm(MemOps.ScaleVal) // Scale
+ .addReg(MemOps.IndexReg) // Index register
+ .addConstantPoolIndex(MemOps.ConstantPoolIndex, Offset) // Displacement (CP index)
+ .addReg(MemOps.SegReg); // Segment
+ } else if (MemOps.IsGlobal) {
+ // Global variable: base_reg + scale + index + global + segment
+ // MOV32rm format: base, scale, index, displacement, segment
+ MIB.addReg(X86::NoRegister) // Base register
+ .addImm(MemOps.ScaleVal) // Scale
+ .addReg(MemOps.IndexReg) // Index register
+ .addGlobalAddress(MemOps.Global, MemOps.GlobalOffset + Offset) // Displacement (global address)
+ .addReg(MemOps.SegReg); // Segment
+ } else {
+ // Regular memory: base_reg + scale + index + disp + segment
+ MIB.addReg(MemOps.BaseReg)
+ .addImm(MemOps.ScaleVal)
+ .addReg(MemOps.IndexReg)
+ .addImm(MemOps.Disp + Offset)
+ .addReg(MemOps.SegReg);
+ }
+
+ return IntReg;
+ };
+
+ // Optimized path: load integers directly from memory when both operands are
+ // memory loads, avoiding FP register round-trip
+ auto emitCtSelectFromMemory = [&](unsigned NumValues,
+ const FPLoadMemOperands &TrueMemOps,
+ const FPLoadMemOperands &FalseMemOps,
+ int ResultSlot) {
+ for (unsigned Val = 0; Val < NumValues; ++Val) {
+ unsigned Offset = Val * RegSizeInByte;
+
+ // Load true and false values directly from their memory locations as integers
+ unsigned TrueIntReg = loadIntFromMemOperands(TrueMemOps, Offset);
+ unsigned FalseIntReg = loadIntFromMemOperands(FalseMemOps, Offset);
+
+ // Use CTSELECT_I386_INT_GR32 pseudo instruction for constant-time selection
+ unsigned ResultIntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+ unsigned TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass);
+ unsigned TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+
+ BuildMI(*BB, MI, MIMD, TII->get(X86::CTSELECT_I386_INT_GR32rr))
+ .addDef(ResultIntReg) // dst (output)
+ .addDef(TmpByteReg) // tmp_byte (output)
+ .addDef(TmpMaskReg) // tmp_mask (output)
+ .addReg(FalseIntReg) // src1 (input) - false value
+ .addReg(TrueIntReg) // src2 (input) - true value
+ .addReg(CondByteReg); // pre-materialized condition byte (input)
+
+ // Store result back to result slot
+ BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32mr))
+ .addFrameIndex(ResultSlot)
+ .addImm(1)
+ .addReg(0)
+ .addImm(Offset)
+ .addReg(0)
+ .addReg(ResultIntReg, RegState::Kill);
+ }
+ };
+
+ auto emitCtSelectWithPseudo = [&](unsigned NumValues, int TrueSlot, int FalseSlot, int ResultSlot) {
+ for (unsigned Val = 0; Val < NumValues; ++Val) {
+ unsigned Offset = Val * RegSizeInByte;
+
+ // Load true and false values from stack as 32-bit integers
+ unsigned TrueIntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+ BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), TrueIntReg)
+ .addFrameIndex(TrueSlot)
+ .addImm(1)
+ .addReg(0)
+ .addImm(Offset)
+ .addReg(0);
+
+ unsigned FalseIntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+ BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), FalseIntReg)
+ .addFrameIndex(FalseSlot)
+ .addImm(1)
+ .addReg(0)
+ .addImm(Offset)
+ .addReg(0);
+
+ // Use CTSELECT_I386_INT_GR32 pseudo instruction for constant-time selection
+ unsigned ResultIntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+ unsigned TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass);
+ unsigned TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+
+ BuildMI(*BB, MI, MIMD, TII->get(X86::CTSELECT_I386_INT_GR32rr))
+ .addDef(ResultIntReg) // dst (output)
+ .addDef(TmpByteReg) // tmp_byte (output)
+ .addDef(TmpMaskReg) // tmp_mask (output)
+ .addReg(FalseIntReg) // src1 (input) - false value
+ .addReg(TrueIntReg) // src2 (input) - true value
+ .addReg(CondByteReg); // pre-materialized condition byte (input)
+
+ // Store result back to result slot
+ BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32mr))
+ .addFrameIndex(ResultSlot)
+ .addImm(1)
+ .addReg(0)
+ .addImm(Offset)
+ .addReg(0)
+ .addReg(ResultIntReg, RegState::Kill);
+ }
+ };
+
+ switch (pseudoInstr) {
+ case X86::CTSELECT_I386_FP32rr: {
+ // Check if both operands are simple memory loads
+ FPLoadMemOperands TrueMemOps =
+ getFPLoadMemOperands(TrueReg, MRI, X86::LD_Fp32m);
+ FPLoadMemOperands FalseMemOps =
+ getFPLoadMemOperands(FalseReg, MRI, X86::LD_Fp32m);
+
+ int ResultSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false);
+
+ if (TrueMemOps.IsValid && FalseMemOps.IsValid) {
+ // Optimized path: load directly from memory as integers
+ // Works for both frame index loads (stack parameters) and
+ // constant pool loads (constants)
+ emitCtSelectFromMemory(1, TrueMemOps, FalseMemOps, ResultSlot);
+
+ // Erase the original FP load instructions since we're not using them
+ // and have loaded the data directly as integers instead
+ if (MRI.hasOneUse(TrueReg)) {
+ if (MachineInstr *TrueDefMI = MRI.getVRegDef(TrueReg))
+ TrueDefMI->eraseFromParent();
+ }
+ if (MRI.hasOneUse(FalseReg)) {
+ if (MachineInstr *FalseDefMI = MRI.getVRegDef(FalseReg))
+ FalseDefMI->eraseFromParent();
+ }
+ } else {
+ // General path: spill FP registers to stack first
+ int TrueSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false);
+ int FalseSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false);
+
+ storeFpToSlot(X86::ST_Fp32m, TrueSlot, TrueReg);
+ storeFpToSlot(X86::ST_Fp32m, FalseSlot, FalseReg);
+
+ emitCtSelectWithPseudo(1, TrueSlot, FalseSlot, ResultSlot);
+ }
+
+ // Load result back as f32
+ addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp32m), DestReg),
+ ResultSlot);
+ break;
+ }
+ case X86::CTSELECT_I386_FP64rr: {
+ unsigned StackSlotSize = 8;
+
+ // Check if both operands are simple memory loads
+ FPLoadMemOperands TrueMemOps =
+ getFPLoadMemOperands(TrueReg, MRI, X86::LD_Fp64m);
+ FPLoadMemOperands FalseMemOps =
+ getFPLoadMemOperands(FalseReg, MRI, X86::LD_Fp64m);
+
+ int ResultSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false);
+
+ if (TrueMemOps.IsValid && FalseMemOps.IsValid) {
+ // Optimized path: load directly from memory as integers
+ // Works for both frame index loads (stack parameters) and
+ // constant pool loads (constants)
+ emitCtSelectFromMemory(StackSlotSize / RegSizeInByte, TrueMemOps,
+ FalseMemOps, ResultSlot);
+
+ // Erase the original FP load instructions since we're not using them
+ if (MRI.hasOneUse(TrueReg)) {
+ if (MachineInstr *TrueDefMI = MRI.getVRegDef(TrueReg))
+ TrueDefMI->eraseFromParent();
+ }
+ if (MRI.hasOneUse(FalseReg)) {
+ if (MachineInstr *FalseDefMI = MRI.getVRegDef(FalseReg))
+ FalseDefMI->eraseFromParent();
+ }
+ } else {
+ // General path: spill FP registers to stack first
+ int TrueSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false);
+ int FalseSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false);
+
+ storeFpToSlot(X86::ST_Fp64m, TrueSlot, TrueReg);
+ storeFpToSlot(X86::ST_Fp64m, FalseSlot, FalseReg);
+
+ emitCtSelectWithPseudo(StackSlotSize / RegSizeInByte, TrueSlot, FalseSlot,
+ ResultSlot);
+ }
+
+ // Load result back as f64
+ addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp64m), DestReg),
+ ResultSlot);
+ break;
+ }
+ case X86::CTSELECT_I386_FP80rr: {
+ // f80 is 80 bits (10 bytes), but stored in a 12-byte stack slot here
+ unsigned StackObjectSize = 12;
+
+ // Check if both operands are simple memory loads
+ FPLoadMemOperands TrueMemOps =
+ getFPLoadMemOperands(TrueReg, MRI, X86::LD_Fp80m);
+ FPLoadMemOperands FalseMemOps =
+ getFPLoadMemOperands(FalseReg, MRI, X86::LD_Fp80m);
+
+ int ResultSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false);
+
+ if (TrueMemOps.IsValid && FalseMemOps.IsValid) {
+ // Optimized path: load directly from memory as integers
+ // Works for both frame index loads (stack parameters) and
+ // constant pool loads (constants)
+ emitCtSelectFromMemory(StackObjectSize / RegSizeInByte, TrueMemOps,
+ FalseMemOps, ResultSlot);
+
+ // Erase the original FP load instructions since we're not using them
+ if (MRI.hasOneUse(TrueReg)) {
+ if (MachineInstr *TrueDefMI = MRI.getVRegDef(TrueReg))
+ TrueDefMI->eraseFromParent();
+ }
+ if (MRI.hasOneUse(FalseReg)) {
+ if (MachineInstr *FalseDefMI = MRI.getVRegDef(FalseReg))
+ FalseDefMI->eraseFromParent();
+ }
+ } else {
+ // General path: spill FP registers to stack first
+ int TrueSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false);
+ int FalseSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false);
+
+ storeFpToSlot(X86::ST_FpP80m, TrueSlot, TrueReg);
+ storeFpToSlot(X86::ST_FpP80m, FalseSlot, FalseReg);
+
+ emitCtSelectWithPseudo(StackObjectSize / RegSizeInByte, TrueSlot,
+ FalseSlot, ResultSlot);
+ }
+
+ // Load result back as f80
+ addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp80m), DestReg),
+ ResultSlot);
+ break;
+ }
+ default:
+ llvm_unreachable("Invalid CTSELECT opcode");
+ }
+
+ MI.eraseFromParent();
+
+ return BB;
+}
+
MachineBasicBlock *
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
@@ -37734,6 +38465,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case X86::CMOV_VK64:
return EmitLoweredSelect(MI, BB);
+ case X86::CTSELECT_I386_GR8rr:
+ return emitCTSelectI386WithConditionMaterialization(
+ MI, BB, X86::CTSELECT_I386_INT_GR8rr);
+
+ case X86::CTSELECT_I386_GR16rr:
+ return emitCTSelectI386WithConditionMaterialization(
+ MI, BB, X86::CTSELECT_I386_INT_GR16rr);
+
+ case X86::CTSELECT_I386_GR32rr:
+ return emitCTSelectI386WithConditionMaterialization(
+ MI, BB, X86::CTSELECT_I386_INT_GR32rr);
+
+ case X86::CTSELECT_I386_FP32rr:
+ return emitCTSelectI386WithFpType(MI, BB, X86::CTSELECT_I386_FP32rr);
+ case X86::CTSELECT_I386_FP64rr:
+ return emitCTSelectI386WithFpType(MI, BB, X86::CTSELECT_I386_FP64rr);
+ case X86::CTSELECT_I386_FP80rr:
+ return emitCTSelectI386WithFpType(MI, BB, X86::CTSELECT_I386_FP80rr);
+
case X86::FP80_ADDr:
case X86::FP80_ADDm32: {
// Change the floating point control register to use double extended
@@ -41695,7 +42445,7 @@ static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
return SDValue();
- Imm = llvm::rotl<uint8_t>(Imm, 4);
+ Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
DAG.getTargetConstant(Imm, DL, MVT::i8));
};
@@ -44662,16 +45412,10 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
}
case X86ISD::PCMPGT:
// icmp sgt(0, R) == ashr(R, BitWidth-1).
- if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode())) {
- // iff we only need the signbit then we can use R directly.
- if (OriginalDemandedBits.isSignMask())
- return TLO.CombineTo(Op, Op.getOperand(1));
- // otherwise we just need R's signbit for the comparison.
- APInt SignMask = APInt::getSignMask(BitWidth);
- if (SimplifyDemandedBits(Op.getOperand(1), SignMask, OriginalDemandedElts,
- Known, TLO, Depth + 1))
- return true;
- }
+ // iff we only need the sign bit then we can use R directly.
+ if (OriginalDemandedBits.isSignMask() &&
+ ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
+ return TLO.CombineTo(Op, Op.getOperand(1));
break;
case X86ISD::MOVMSK: {
SDValue Src = Op.getOperand(0);
@@ -47581,15 +48325,6 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
DL, DAG, Subtarget))
return V;
- // If the sign bit is known then BLENDV can be folded away.
- if (N->getOpcode() == X86ISD::BLENDV) {
- KnownBits KnownCond = DAG.computeKnownBits(Cond);
- if (KnownCond.isNegative())
- return LHS;
- if (KnownCond.isNonNegative())
- return RHS;
- }
-
if (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV) {
SmallVector<int, 64> CondMask;
if (createShuffleMaskFromVSELECT(CondMask, Cond,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index b7151f65942b4..d759895719388 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -114,6 +114,10 @@ namespace llvm {
/// X86 Select
SELECTS,
+ /// X86 constant-time select (ct.select). Lowered to CMOV where the
+ /// instruction is available, and to constant-time pseudo expansions
+ /// otherwise.
+ CTSELECT,
+
// Same as SETCC except it's materialized with a sbb and the value is all
// one's or all zero's.
SETCC_CARRY, // R = carry_bit ? ~0 : 0
@@ -1139,6 +1143,8 @@ namespace llvm {
///
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+ bool isSelectSupported(SelectSupportKind Kind) const override;
+
/// Replace the results of node with an illegal result
/// type with new values built out of custom code.
///
@@ -1765,6 +1771,7 @@ namespace llvm {
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/llvm/lib/Target/X86/X86InstrCMovSetCC.td
index 7d5d7cf4a83ab..9c34889f03354 100644
--- a/llvm/lib/Target/X86/X86InstrCMovSetCC.td
+++ b/llvm/lib/Target/X86/X86InstrCMovSetCC.td
@@ -106,6 +106,211 @@ let Predicates = [HasCMOV, HasNDD] in {
def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, timm:$cond, EFLAGS),
(CMOV64rm_ND GR64:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>;
}
+
+// Create pseudo instructions and pattern-match to them. These pseudos are
+// lowered into CMOV by a machine pass so that backend optimizations cannot
+// rewrite the selection in the meantime.
+let Uses = [EFLAGS], isNotDuplicable = 1, isPseudo = 1 in {
+
+ multiclass CTSELECT<X86TypeInfo t> {
+ // register-only
+ let isCommutable = 0, SchedRW = [WriteCMOV], Predicates = [HasNativeCMOV],
+ AsmString = "ctselect\\t$dst, $src1, $src2, $cond" in {
+ def rr : PseudoI<(outs t.RegClass:$dst),
+ (ins t.RegClass:$src1, t.RegClass:$src2, i8imm:$cond),
+ [(set t.RegClass:$dst, (X86ctselect t.RegClass:$src1, t.RegClass:$src2, timm:$cond, EFLAGS))]>;
+ }
+
+ // register-memory
+ let SchedRW = [WriteCMOV.Folded, WriteCMOV.ReadAfterFold], Predicates = [HasNativeCMOV],
+ AsmString = "ctselect\\t$dst, $src1, $src2, $cond" in {
+ def rm : PseudoI<(outs t.RegClass:$dst),
+ (ins t.RegClass:$src1, t.MemOperand:$src2, i8imm:$cond),
+ [(set t.RegClass:$dst, (X86ctselect t.RegClass:$src1, (t.LoadNode addr:$src2), timm:$cond, EFLAGS))]>;
+ }
+ }
+}
+
+let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in {
+ let Constraints = "$dst = $src1" in {
+ defm CTSELECT16 : CTSELECT<Xi16>;
+ defm CTSELECT32 : CTSELECT<Xi32>;
+ defm CTSELECT64 : CTSELECT<Xi64>;
+ }
+}
+
+// CTSELECT_VEC base class
+class CTSELECT_VEC<RegisterClass VRc, RegisterClass GRc>
+ : PseudoI<
+ (outs VRc:$dst, VRc:$tmpx, GRc:$tmpg),
+ (ins VRc:$t, VRc:$f, i8imm:$cond),
+ []
+ > {
+ let Uses = [EFLAGS];
+ let isPseudo = 1;
+ let isNotDuplicable = 1;
+ let hasSideEffects = 1;
+ let AsmString = "ctselect\t$dst, $f, $t, $cond";
+ let SchedRW = [];
+}
+
+// Width-specific class aliases
+class CTSELECT_VEC128 : CTSELECT_VEC<VR128, GR32>;
+class CTSELECT_VEC128X : CTSELECT_VEC<VR128X, GR32>;
+class CTSELECT_VEC256 : CTSELECT_VEC<VR256, GR32>;
+class CTSELECT_VEC512 : CTSELECT_VEC<VR512, GR32>;
+
+
+//===----------------------------------------------------------------------===//
+// 128-bit pseudos (SSE2 baseline; we use PXOR/PAND/MOVD/PSHUFD in the expander)
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasSSE1] in {
+
+ def CTSELECT_V4F32 : CTSELECT_VEC128 {
+ let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+ }
+}
+
+let Predicates = [HasSSE2] in {
+
+ def CTSELECT_V2F64 : CTSELECT_VEC128 {
+ let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+ }
+ def CTSELECT_V4I32 : CTSELECT_VEC128 {
+ let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+ }
+ def CTSELECT_V2I64 : CTSELECT_VEC128 {
+ let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+ }
+ def CTSELECT_V8I16 : CTSELECT_VEC128 {
+ let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+ }
+ def CTSELECT_V16I8 : CTSELECT_VEC128 {
+ let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+ }
+
+ // If your build has v8f16, keep this; otherwise comment it out.
+ def CTSELECT_V8F16 : CTSELECT_VEC128 {
+ let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+ }
+}
+
+let Predicates = [HasAVX] in {
+
+ def CTSELECT_V4F32X : CTSELECT_VEC128X {
+ let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+ }
+ def CTSELECT_V2F64X : CTSELECT_VEC128X {
+ let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+ }
+ def CTSELECT_V4I32X : CTSELECT_VEC128X {
+ let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+ }
+ def CTSELECT_V2I64X : CTSELECT_VEC128X {
+ let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+ }
+ def CTSELECT_V8I16X : CTSELECT_VEC128X {
+ let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+ }
+ def CTSELECT_V16I8X : CTSELECT_VEC128X {
+ let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+ }
+
+ // If your build has v8f16, keep this; otherwise comment it out.
+ def CTSELECT_V8F16X : CTSELECT_VEC128X {
+ let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// 256-bit pseudos
+//===----------------------------------------------------------------------===//
+let Predicates = [HasAVX] in {
+
+ def CTSELECT_V8F32 : CTSELECT_VEC256 {
+ let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+ }
+ def CTSELECT_V4F64 : CTSELECT_VEC256 {
+ let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+ }
+ def CTSELECT_V8I32 : CTSELECT_VEC256 {
+ let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+ }
+ def CTSELECT_V4I64 : CTSELECT_VEC256 {
+ let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+ }
+ def CTSELECT_V16I16 : CTSELECT_VEC256 {
+ let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+ }
+ def CTSELECT_V32I8 : CTSELECT_VEC256 {
+ let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+ }
+
+ // If your build has v16f16, keep this; otherwise comment it out.
+ def CTSELECT_V16F16 : CTSELECT_VEC256 {
+ let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Selection patterns: X86ctselect(...), EFLAGS -> CTSELECT_V*
+//
+// NOTE:
+// * The SDNode carries Glue from CMP/TEST (due to SDNPInGlue).
+// * We list EFLAGS explicitly in the pattern (X86 style) to model the arch read.
+// * Temps (tmpx/tmpg) are not in the pattern; they're outs allocated by RA.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasSSE1] in {
+
+ // 128-bit float (bitwise-equivalent ops in expander)
+ def : Pat<(v4f32 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
+ (CTSELECT_V4F32 VR128:$t, VR128:$f, timm:$cc)>;
+}
+
+let Predicates = [HasSSE2] in {
+
+ // 128-bit integer
+ def : Pat<(v4i32 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
+ (CTSELECT_V4I32 VR128:$t, VR128:$f, timm:$cc)>;
+ def : Pat<(v2i64 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
+ (CTSELECT_V2I64 VR128:$t, VR128:$f, timm:$cc)>;
+ def : Pat<(v8i16 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
+ (CTSELECT_V8I16 VR128:$t, VR128:$f, timm:$cc)>;
+ def : Pat<(v16i8 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
+ (CTSELECT_V16I8 VR128:$t, VR128:$f, timm:$cc)>;
+ def : Pat<(v2f64 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
+ (CTSELECT_V2F64 VR128:$t, VR128:$f, timm:$cc)>;
+
+ // 128-bit f16 (optional)
+ def : Pat<(v8f16 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
+ (CTSELECT_V8F16 VR128:$t, VR128:$f, timm:$cc)>;
+}
+
+let Predicates = [HasAVX] in {
+
+ // 256-bit integer
+ def : Pat<(v8i32 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
+ (CTSELECT_V8I32 VR256:$t, VR256:$f, timm:$cc)>;
+ def : Pat<(v4i64 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
+ (CTSELECT_V4I64 VR256:$t, VR256:$f, timm:$cc)>;
+ def : Pat<(v16i16 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
+ (CTSELECT_V16I16 VR256:$t, VR256:$f, timm:$cc)>;
+ def : Pat<(v32i8 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
+ (CTSELECT_V32I8 VR256:$t, VR256:$f, timm:$cc)>;
+
+ // 256-bit float (bitwise-equivalent ops in expander)
+ def : Pat<(v8f32 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
+ (CTSELECT_V8F32 VR256:$t, VR256:$f, timm:$cc)>;
+ def : Pat<(v4f64 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
+ (CTSELECT_V4F64 VR256:$t, VR256:$f, timm:$cc)>;
+
+ // 256-bit f16 (optional)
+ def : Pat<(v16f16 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
+ (CTSELECT_V16F16 VR256:$t, VR256:$f, timm:$cc)>;
+}
+
let Predicates = [HasCMOV, HasCF] in {
def : Pat<(X86cmov GR16:$src1, 0, timm:$cond, EFLAGS),
(CFCMOV16rr GR16:$src1, (inv_cond_XFORM timm:$cond))>;
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index ec31675731b79..d40c91b52c808 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -693,6 +693,87 @@ def : Pat<(v16f32 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
def : Pat<(v8f64 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
(CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>;
+// CTSELECT
+// Enhanced CTSELECT pseudos for i386 with temporary register allocation
+// These use a two-phase approach:
+// 1. Custom inserter materializes condition byte from EFLAGS
+// 2. Post-RA expansion generates constant-time instruction bundles
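+//
+// Roughly (a sketch; the exact post-RA sequence is emitted in
+// X86InstrInfo.cpp), phase 2 widens and negates the pre-materialized condition
+// byte into an all-zeros/all-ones mask in $tmp_mask (the NEG used for this
+// clobbers EFLAGS, hence Defs = [EFLAGS] below), then blends the two sources
+// branchlessly with and/not/or so no data-dependent control flow is introduced.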
+
+let isPseudo = 1, isNotDuplicable = 1 in {
+ // Phase 1: Initial pseudos that consume EFLAGS (via custom inserter)
+ // These are matched by patterns and convert EFLAGS to condition byte
+ class CTSELECT_I386_INITIAL<RegisterClass RC, ValueType VT>
+ : PseudoI<(outs RC:$dst),
+ (ins RC:$src1, RC:$src2, i8imm:$cond),
+ [(set RC:$dst, (VT(X86ctselect RC:$src1, RC:$src2, timm:$cond,
+ EFLAGS)))]> {
+ let Uses = [EFLAGS];
+ let Defs = [EFLAGS];
+ let usesCustomInserter = 1;
+ let hasNoSchedulingInfo = 1;
+ }
+
+ // Phase 2: Internal pseudos with pre-materialized condition byte (post-RA expansion)
+ // These generate the actual constant-time instruction bundles
+ class CTSELECT_I386_INTERNAL<RegisterClass RC, RegisterClass ByteRC>
+ : PseudoI<(outs RC:$dst, ByteRC:$tmp_byte, RC:$tmp_mask),
+ (ins RC:$src1, RC:$src2, ByteRC:$cond_byte), []> {
+ let hasNoSchedulingInfo = 1;
+ let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_byte,@earlyclobber $tmp_mask";
+ let Defs = [EFLAGS]; // NEG instruction in post-RA expansion clobbers EFLAGS
+ }
+}
+
+// Phase 1 pseudos for non-CMOV targets (custom inserter materializes condition)
+let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in {
+ let Predicates = [NoNativeCMOV] in {
+ def CTSELECT_I386_GR8rr : CTSELECT_I386_INITIAL<GR8, i8>;
+ def CTSELECT_I386_GR16rr : CTSELECT_I386_INITIAL<GR16, i16>;
+ def CTSELECT_I386_GR32rr : CTSELECT_I386_INITIAL<GR32, i32>;
+ }
+}
+
+// Phase 2 pseudos (post-RA expansion with pre-materialized condition byte)
+let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in {
+ let Predicates = [NoNativeCMOV] in {
+ def CTSELECT_I386_INT_GR8rr :
+ CTSELECT_I386_INTERNAL<GR8, GR8>;
+ def CTSELECT_I386_INT_GR16rr :
+ CTSELECT_I386_INTERNAL<GR16, GR8>;
+ def CTSELECT_I386_INT_GR32rr :
+ CTSELECT_I386_INTERNAL<GR32, GR8>;
+ }
+}
+
+let hasSideEffects = 1,
+ ForceDisassemble = 1,
+ Constraints = "$dst = $src1" in {
+
+ let Predicates = [FPStackf32] in
+ def CTSELECT_I386_FP32rr : CTSELECT_I386_INITIAL<RFP32, f32>;
+
+ let Predicates = [FPStackf64] in
+ def CTSELECT_I386_FP64rr : CTSELECT_I386_INITIAL<RFP64, f64>;
+
+ def CTSELECT_I386_FP80rr : CTSELECT_I386_INITIAL<RFP80, f80>;
+}
+
+// Pattern matching for non-native-CMOV CTSELECT (routes to custom inserter for condition materialization)
+// NoNativeCMOV ensures these patterns are used when the actual CMOV instruction
+// is not available, even if canUseCMOV() is true (e.g., i386 with SSE, which can
+// emulate CMOV)
+let Predicates = [NoNativeCMOV] in {
+ def : Pat<(i8(X86ctselect GR8:$src1, GR8:$src2, timm:$cond, EFLAGS)),
+ (CTSELECT_I386_GR8rr GR8:$src1, GR8:$src2, timm:$cond)>;
+
+ def : Pat<(i16(X86ctselect GR16:$src1, GR16:$src2, timm:$cond, EFLAGS)),
+ (CTSELECT_I386_GR16rr GR16:$src1, GR16:$src2, timm:$cond)>;
+
+ def : Pat<(i32(X86ctselect GR32:$src1, GR32:$src2, timm:$cond, EFLAGS)),
+ (CTSELECT_I386_GR32rr GR32:$src1, GR32:$src2, timm:$cond)>;
+
+ // i64 patterns handled automatically by type legalization
+}
+
//===----------------------------------------------------------------------===//
// Normal-Instructions-With-Lock-Prefix Pseudo Instructions
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86InstrFragments.td b/llvm/lib/Target/X86/X86InstrFragments.td
index 116986a0fffea..4c9e5bae3b46c 100644
--- a/llvm/lib/Target/X86/X86InstrFragments.td
+++ b/llvm/lib/Target/X86/X86InstrFragments.td
@@ -28,6 +28,10 @@ def SDTX86Cmov : SDTypeProfile<1, 4,
[SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
+def SDTX86CtSelect : SDTypeProfile<1, 4,
+ [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
+ SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
+
// Unary and binary operator instructions that set EFLAGS as a side-effect.
def SDTUnaryArithWithFlags : SDTypeProfile<2, 1,
[SDTCisSameAs<0, 2>,
@@ -151,6 +155,7 @@ def X86ctest : SDNode<"X86ISD::CTEST", SDTX86Ccmp>;
def X86cload : SDNode<"X86ISD::CLOAD", SDTX86Cload, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def X86cstore : SDNode<"X86ISD::CSTORE", SDTX86Cstore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def X86ctselect: SDNode<"X86ISD::CTSELECT", SDTX86CtSelect, [SDNPInGlue]>;
def X86cmov : SDNode<"X86ISD::CMOV", SDTX86Cmov>;
def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond,
[SDNPHasChain]>;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 6b2a7a4ec3583..765db86ffafb3 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -475,6 +475,556 @@ bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op,
return false;
}
+struct CtSelectInstructions {
+ unsigned PAndOpc;
+ unsigned PAndnOpc;
+ unsigned POrOpc;
+ unsigned BroadcastOpc;
+ unsigned IntMoveOpc;
+ unsigned MoveOpc;
+ bool Use256;
+ bool UseBlendInstr;
+};
+
+static CtSelectInstructions
+getCtSelectInstructions(unsigned Opcode, const X86Subtarget &Subtarget) {
+ CtSelectInstructions Instructions = {};
+
+ switch (Opcode) {
+ case X86::CTSELECT_V2F64:
+ if (Subtarget.hasSSE2()) {
+ Instructions.PAndOpc = X86::PANDrr;
+ Instructions.PAndnOpc = X86::PANDNrr;
+ Instructions.POrOpc = X86::PORrr;
+ Instructions.BroadcastOpc = X86::PSHUFDri;
+ Instructions.IntMoveOpc = X86::MOVDI2PDIrr;
+ Instructions.MoveOpc = X86::MOVAPDrr;
+ Instructions.UseBlendInstr = true;
+ } else {
+ llvm_unreachable("Double precision vectors require SSE2");
+ }
+ break;
+ case X86::CTSELECT_V4F32:
+ if (Subtarget.hasSSE41()) {
+ Instructions.PAndOpc = X86::PANDrr;
+ Instructions.PAndnOpc = X86::PANDNrr;
+ Instructions.POrOpc = X86::PORrr;
+ Instructions.BroadcastOpc = X86::PSHUFDri;
+ Instructions.IntMoveOpc = X86::MOVDI2PDIrr;
+ Instructions.MoveOpc = X86::MOVAPSrr;
+ Instructions.UseBlendInstr = true;
+ } else if (Subtarget.hasSSE2()) {
+ Instructions.PAndOpc = X86::PANDrr;
+ Instructions.PAndnOpc = X86::PANDNrr;
+ Instructions.POrOpc = X86::PORrr;
+ Instructions.BroadcastOpc = X86::PSHUFDri;
+ Instructions.IntMoveOpc = X86::MOVDI2PDIrr;
+ Instructions.MoveOpc = X86::MOVAPSrr;
+ } else {
+ // Fall back to SSE1, which only supports four 32-bit single-precision
+ // floating-point values.
+ Instructions.PAndOpc = X86::ANDPSrr;
+ Instructions.PAndnOpc = X86::ANDNPSrr;
+ Instructions.POrOpc = X86::ORPSrr;
+ Instructions.BroadcastOpc = X86::SHUFPSrri;
+ Instructions.IntMoveOpc = X86::MOVSS2DIrr;
+ Instructions.MoveOpc = X86::MOVAPSrr;
+ }
+ break;
+ case X86::CTSELECT_V4I32:
+ case X86::CTSELECT_V2I64:
+ case X86::CTSELECT_V8I16:
+ case X86::CTSELECT_V16I8:
+ if (Subtarget.hasSSE2()) {
+ Instructions.PAndOpc = X86::PANDrr;
+ Instructions.PAndnOpc = X86::PANDNrr;
+ Instructions.POrOpc = X86::PORrr;
+ Instructions.BroadcastOpc = X86::PSHUFDri;
+ Instructions.IntMoveOpc = X86::MOVDI2PDIrr;
+ Instructions.MoveOpc = X86::MOVDQArr;
+ } else {
+ llvm_unreachable("Integer vector operations require SSE2");
+ }
+ break;
+ case X86::CTSELECT_V8F16:
+ if (Subtarget.hasSSE2()) {
+ Instructions.PAndOpc = X86::PANDrr;
+ Instructions.PAndnOpc = X86::PANDNrr;
+ Instructions.POrOpc = X86::PORrr;
+ Instructions.BroadcastOpc = X86::PSHUFDri;
+ Instructions.IntMoveOpc = X86::MOVDI2PDIrr;
+ Instructions.MoveOpc = X86::MOVDQArr;
+ } else {
+ llvm_unreachable("FP16 vector operations require SSE2");
+ }
+ break;
+ case X86::CTSELECT_V4F32X:
+ case X86::CTSELECT_V4I32X:
+ case X86::CTSELECT_V2F64X:
+ case X86::CTSELECT_V2I64X:
+ case X86::CTSELECT_V8I16X:
+ case X86::CTSELECT_V16I8X:
+ case X86::CTSELECT_V8F16X:
+ if (Subtarget.hasAVX()) {
+ Instructions.PAndOpc = X86::VPANDrr;
+ Instructions.PAndnOpc = X86::VPANDNrr;
+ Instructions.POrOpc = X86::VPORrr;
+ Instructions.BroadcastOpc = X86::VPSHUFDri;
+ Instructions.IntMoveOpc = X86::VMOVDI2PDIrr;
+ Instructions.MoveOpc = (Opcode == X86::CTSELECT_V4F32X) ? X86::VMOVAPSrr
+ : (Opcode == X86::CTSELECT_V2F64X)
+ ? X86::VMOVAPDrr
+ : X86::VMOVDQArr;
+ } else {
+ llvm_unreachable("AVX variants require AVX support");
+ }
+ break;
+ case X86::CTSELECT_V8F32:
+ case X86::CTSELECT_V8I32:
+ if (Subtarget.hasAVX()) {
+ Instructions.PAndOpc = X86::VPANDYrr;
+ Instructions.PAndnOpc = X86::VPANDNYrr;
+ Instructions.POrOpc = X86::VPORYrr;
+ Instructions.BroadcastOpc = X86::VPERMILPSYri;
+ Instructions.IntMoveOpc = X86::VMOVDI2PDIrr;
+ Instructions.MoveOpc =
+ (Opcode == X86::CTSELECT_V8F32) ? X86::VMOVAPSYrr : X86::VMOVDQAYrr;
+ Instructions.Use256 = true;
+ } else {
+ llvm_unreachable("256-bit vectors require AVX");
+ }
+ break;
+ case X86::CTSELECT_V4F64:
+ case X86::CTSELECT_V4I64:
+ if (Subtarget.hasAVX()) {
+ Instructions.PAndOpc = X86::VPANDYrr;
+ Instructions.PAndnOpc = X86::VPANDNYrr;
+ Instructions.POrOpc = X86::VPORYrr;
+ Instructions.BroadcastOpc = X86::VPERMILPDYri;
+ Instructions.IntMoveOpc = X86::VMOVDI2PDIrr;
+ Instructions.MoveOpc =
+ (Opcode == X86::CTSELECT_V4F64) ? X86::VMOVAPDYrr : X86::VMOVDQAYrr;
+ Instructions.Use256 = true;
+ } else {
+ llvm_unreachable("256-bit vectors require AVX");
+ }
+ break;
+ case X86::CTSELECT_V16I16:
+ case X86::CTSELECT_V32I8:
+ case X86::CTSELECT_V16F16:
+ if (Subtarget.hasAVX2()) {
+ Instructions.PAndOpc = X86::VPANDYrr;
+ Instructions.PAndnOpc = X86::VPANDNYrr;
+ Instructions.POrOpc = X86::VPORYrr;
+ Instructions.BroadcastOpc = X86::VPERMILPSYri;
+ Instructions.IntMoveOpc = X86::VMOVDI2PDIrr;
+ Instructions.MoveOpc = X86::VMOVDQAYrr;
+ Instructions.Use256 = true;
+ } else if (Subtarget.hasAVX()) {
+ Instructions.PAndOpc = X86::VPANDYrr;
+ Instructions.PAndnOpc = X86::VPANDNYrr;
+ Instructions.POrOpc = X86::VPORYrr;
+ Instructions.BroadcastOpc = X86::VPERMILPSYri;
+ Instructions.IntMoveOpc = X86::VMOVDI2PDIrr;
+ Instructions.MoveOpc = X86::VMOVDQAYrr;
+ Instructions.Use256 = true;
+ } else {
+ llvm_unreachable("256-bit integer vectors require AVX");
+ }
+ break;
+ default:
+ llvm_unreachable("Unexpected CTSELECT opcode");
+ }
+
+ return Instructions;
+}
+
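+/// Rough expansion sketch for the SSE2 bitwise path of a vector CTSELECT
+/// (e.g. CTSELECT_V4I32); register names are illustrative only and the
+/// condition code is assumed to be E:
+///   movl   $0, %eax           # clear scratch GPR
+///   sete   %al                # materialize condition byte
+///   movzbl %al, %eax          # 0 or 1
+///   negl   %eax               # 1 -> 0xFFFFFFFF, 0 -> 0x00000000
+///   pxor   %xmm1, %xmm1
+///   movd   %eax, %xmm1        # scalar mask into the vector temp
+///   pshufd $0, %xmm1, %xmm1   # broadcast mask to all lanes
+///   movdqa %xmm1, %xmm0       # dst = mask
+///   pand   %xmm2, %xmm1       # mask &= true_val
+///   pandn  %xmm3, %xmm0       # dst = ~mask & false_val
+///   por    %xmm1, %xmm0       # dst = (mask & t) | (~mask & f)
+/// SSE4.1 targets instead shift the condition into the sign bit (shll $31)
+/// and use BLENDVPS/BLENDVPD/PBLENDVB with the mask in XMM0; AVX/AVX2 targets
+/// use the corresponding VEX forms.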
+bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const {
+ unsigned Opcode = MI.getOpcode();
+ const DebugLoc &DL = MI.getDebugLoc();
+ auto Instruction = getCtSelectInstructions(Opcode, Subtarget);
+
+ MachineBasicBlock *MBB = MI.getParent();
+
+ // Operand layout matches the TableGen definition:
+ // (outs VR128:$dst, VR128:$tmpx, GR32:$tmpg),
+ // (ins VR128:$t, VR128:$f, i8imm:$cond)
+ Register Dst = MI.getOperand(0).getReg();
+ Register MaskReg = MI.getOperand(1).getReg(); // vector mask temp
+ Register TmpGPR = MI.getOperand(2).getReg(); // scalar mask temp (GPR32)
+ Register FalseVal = MI.getOperand(3).getReg(); // true_value
+ Register TrueVal = MI.getOperand(4).getReg(); // false_value
+ X86::CondCode CC = X86::CondCode(MI.getOperand(5).getImm()); // condition
+
+ MachineInstr *FirstInstr = nullptr;
+ MachineInstr *LastInstr = nullptr;
+ auto recordInstr = [&](MachineInstrBuilder MIB) {
+ MachineInstr *NewMI = MIB.getInstr();
+ LastInstr = NewMI;
+ if (!FirstInstr)
+ FirstInstr = NewMI;
+ };
+
+ // Create scalar mask in tempGPR and broadcast to vector mask
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::MOV32ri), TmpGPR)
+ .addImm(0)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ auto SubReg = TRI->getSubReg(TmpGPR, X86::sub_8bit);
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::SETCCr))
+ .addReg(SubReg)
+ .addImm(CC)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ // Zero-extend byte to 32-bit register (movzbl %al, %eax)
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::MOVZX32rr8), TmpGPR)
+ .addReg(SubReg)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ if (Instruction.UseBlendInstr && Subtarget.hasSSE41()) {
+ // Shift left 31 bits to convert 1 -> 0x80000000, 0 -> 0x00000000 (shll $31,
+ // %eax)
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::SHL32ri), TmpGPR)
+ .addReg(TmpGPR)
+ .addImm(31));
+ } else {
+ // Negate to convert 1 -> 0xFFFFFFFF, 0 -> 0x00000000 (negl %eax)
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::NEG32r), TmpGPR)
+ .addReg(TmpGPR));
+ }
+
+ // Zero the vector mask register before inserting the scalar mask
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::PXORrr), MaskReg)
+ .addReg(MaskReg)
+ .addReg(MaskReg)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ // Move scalar mask to vector register
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.IntMoveOpc), MaskReg)
+ .addReg(TmpGPR)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ if (Instruction.Use256) {
+ // Broadcast to 256-bit vector register
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.BroadcastOpc), MaskReg)
+ .addReg(MaskReg)
+ .addImm(0)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+ } else {
+ if (Subtarget.hasSSE2() || Subtarget.hasAVX()) {
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.BroadcastOpc), MaskReg)
+ .addReg(MaskReg)
+ .addImm(0x00)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+ } else {
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.BroadcastOpc), MaskReg)
+ .addReg(MaskReg)
+ .addReg(MaskReg)
+ .addImm(0x00)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+ }
+ }
+
+ if (Instruction.UseBlendInstr && Subtarget.hasSSE41()) {
+ // Use dedicated blend instructions for SSE4.1+
+ unsigned BlendOpc;
+ switch (Opcode) {
+ case X86::CTSELECT_V4F32:
+ BlendOpc = X86::BLENDVPSrr0;
+ break;
+ case X86::CTSELECT_V2F64:
+ BlendOpc = X86::BLENDVPDrr0;
+ break;
+ default:
+ // alias for pblendvb that takes xmm0 as implicit mask register
+ BlendOpc = X86::PBLENDVBrr0;
+ break;
+ }
+
+ // Check whether XMM0 is used as one of the source registers; if so, save it
+ // in the Dst register and retarget FalseVal/TrueVal to Dst.
+ bool DidSaveXMM0 = false;
+ Register SavedXMM0 = X86::XMM0;
+ if (FalseVal == X86::XMM0 || TrueVal == X86::XMM0) {
+ Register SrcXMM0 = (FalseVal == X86::XMM0) ? FalseVal : TrueVal;
+
+ // If XMM0 is one of the source registers, it will not be the same as Dst,
+ // so move it into the Dst register first.
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+ .addReg(SrcXMM0)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ // update FalseVal and TrueVal to Dst register
+ if (FalseVal == X86::XMM0)
+ FalseVal = Dst;
+ if (TrueVal == X86::XMM0)
+ TrueVal = Dst;
+
+ // update SavedXMM0 to Dst register
+ SavedXMM0 = Dst;
+
+ // set DidSaveXMM0 to true to indicate that we saved XMM0 into Dst
+ // register
+ DidSaveXMM0 = true;
+ } else if (MaskReg != X86::XMM0 && Dst != X86::XMM0) {
+
+ // If XMM0 is not allocated to any of our registers, we still need to save
+ // it and restore it after using it as the implicit mask register.
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+ .addReg(X86::XMM0)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+ SavedXMM0 = Dst;
+ DidSaveXMM0 = true;
+ }
+
+ if (MaskReg != X86::XMM0) {
+ // BLENDV uses XMM0 as implicit mask register
+ // https://www.felixcloutier.com/x86/pblendvb
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), X86::XMM0)
+ .addReg(MaskReg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge));
+
+ // move FalseVal to mask (use MaskReg as the dst of the blend)
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::MOVAPSrr), MaskReg)
+ .addReg(FalseVal)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ // MaskReg := blend(MaskReg /*false*/, TrueVal /*true*/) ; mask in
+ // xmm0
+ recordInstr(BuildMI(*MBB, MI, DL, get(BlendOpc), MaskReg)
+ .addReg(MaskReg)
+ .addReg(TrueVal)
+ .addReg(X86::XMM0)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ // restore XMM0 from SavedXMM0 if we saved it into Dst
+ if (DidSaveXMM0) {
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), X86::XMM0)
+ .addReg(SavedXMM0)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+ }
+ // dst = result (now in MaskReg)
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+ .addReg(MaskReg)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+ } else {
+ // move FalseVal to Dst register since MaskReg is XMM0 and Dst is not
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+ .addReg(FalseVal)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ // Dst := blend(Dst /*false*/, TrueVal /*true*/) ; mask in
+ // xmm0
+ recordInstr(BuildMI(*MBB, MI, DL, get(BlendOpc), Dst)
+ .addReg(Dst)
+ .addReg(TrueVal)
+ .addReg(X86::XMM0)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+ }
+ } else {
+
+ // dst = mask
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+ .addReg(MaskReg)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ // mask &= true_val
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::PANDrr), MaskReg)
+ .addReg(MaskReg)
+ .addReg(TrueVal)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ // dst = ~mask & false_val
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::PANDNrr), Dst)
+ .addReg(Dst)
+ .addReg(FalseVal)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ // dst |= mask; (mask & t) | (~mask & f)
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::PORrr), Dst)
+ .addReg(Dst)
+ .addReg(MaskReg)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+ }
+
+ assert(FirstInstr && LastInstr && "Expected at least one expanded instruction");
+ auto BundleEnd = LastInstr->getIterator();
+ finalizeBundle(*MBB, FirstInstr->getIterator(), std::next(BundleEnd));
+
+ MI.eraseFromParent();
+
+ return true;
+}
+
+bool X86InstrInfo::expandCtSelectWithCMOV(MachineInstr &MI) const {
+ MachineBasicBlock *MBB = MI.getParent();
+ DebugLoc DL = MI.getDebugLoc();
+
+ // CTSELECT pseudo has: (outs dst), (ins true_val, false_val, cond)
+ MachineOperand &OperandRes = MI.getOperand(0); // destination register
+ MachineOperand &OperandTrue = MI.getOperand(1); // true value
+ MachineOperand &OperandCond = MI.getOperand(3); // condition code
+
+ assert(OperandTrue.isReg() && OperandRes.isReg() && OperandCond.isImm() &&
+ "Invalid operand types");
+ assert(OperandTrue.getReg() == OperandRes.getReg() &&
+ "Result register different from True register");
+
+ assert(Subtarget.hasCMOV() && "target does not support CMOV instructions");
+
+ unsigned Opcode = 0;
+
+ switch (MI.getOpcode()) {
+ case X86::CTSELECT16rr:
+ Opcode = X86::CMOV16rr;
+ break;
+ case X86::CTSELECT32rr:
+ Opcode = X86::CMOV32rr;
+ break;
+ case X86::CTSELECT64rr:
+ Opcode = X86::CMOV64rr;
+ break;
+ case X86::CTSELECT16rm:
+ Opcode = X86::CMOV16rm;
+ break;
+ case X86::CTSELECT32rm:
+ Opcode = X86::CMOV32rm;
+ break;
+ case X86::CTSELECT64rm:
+ Opcode = X86::CMOV64rm;
+ break;
+ default:
+ llvm_unreachable("Invalid CTSELECT opcode");
+ }
+
+ if (!Subtarget.hasCMOV()) {
+ llvm_unreachable("target does not support cmov");
+ }
+
+ // Build the CMOV instruction by copying all operands from the pseudo
+ // (dst, true value, false value, condition code); the CTSELECT pseudos use
+ // the same operand layout as CMOV.
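+ // For example, with CC = NE a register-form CTSELECT32rr tied to %eax and
+ // reading %ecx becomes "cmovnel %ecx, %eax" (see the ctselect*.ll tests
+ // added by this patch).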
+ MachineInstrBuilder CmovBuilder = BuildMI(*MBB, MI, DL, get(Opcode));
+ for (unsigned i = 0u; i < MI.getNumOperands(); ++i) { // Copy
+ CmovBuilder.add(MI.getOperand(i));
+ }
+
+ // Remove the original CTSELECT instruction
+ MI.eraseFromParent();
+ return true;
+}
+
+/// Expand i386-specific CTSELECT pseudo instructions (post-RA, constant-time)
+/// These internal pseudos receive a pre-materialized condition byte from the
+/// custom inserter, avoiding EFLAGS corruption issues during i64 type legalization.
+bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const {
+ MachineBasicBlock *MBB = MI.getParent();
+ DebugLoc DL = MI.getDebugLoc();
+
+ // CTSELECT_I386_INT_GRxxrr has operands: (outs dst, tmp_byte, tmp_mask),
+ // (ins src1, src2, cond_byte)
+ // Note: cond_byte is pre-materialized by custom inserter, not EFLAGS-dependent
+ Register DstReg = MI.getOperand(0).getReg();
+ Register TmpByteReg = MI.getOperand(1).getReg();
+ Register TmpMaskReg = MI.getOperand(2).getReg();
+ Register Src1Reg = MI.getOperand(3).getReg();
+ Register Src2Reg = MI.getOperand(4).getReg();
+ Register CondByteReg = MI.getOperand(5).getReg(); // Pre-materialized condition byte
+
+ // Determine instruction opcodes based on register width
+ unsigned MovZXOp, NegOp, MovOp, AndOp, NotOp, OrOp;
+ if (MI.getOpcode() == X86::CTSELECT_I386_INT_GR8rr) {
+ MovZXOp = 0; // No zero-extend needed for GR8
+ NegOp = X86::NEG8r;
+ MovOp = X86::MOV8rr;
+ AndOp = X86::AND8rr;
+ NotOp = X86::NOT8r;
+ OrOp = X86::OR8rr;
+ } else if (MI.getOpcode() == X86::CTSELECT_I386_INT_GR16rr) {
+ MovZXOp = X86::MOVZX16rr8;
+ NegOp = X86::NEG16r;
+ MovOp = X86::MOV16rr;
+ AndOp = X86::AND16rr;
+ NotOp = X86::NOT16r;
+ OrOp = X86::OR16rr;
+ } else { // X86::CTSELECT_I386_INT_GR32rr
+ MovZXOp = X86::MOVZX32rr8;
+ NegOp = X86::NEG32r;
+ MovOp = X86::MOV32rr;
+ AndOp = X86::AND32rr;
+ NotOp = X86::NOT32r;
+ OrOp = X86::OR32rr;
+ }
+
+ // Constant-time selection bundle (no SETCC inside); 7 instructions for GR8,
+ // 8 otherwise, since byte selects skip the zero-extend:
+ //   result = (true_val & mask) | (false_val & ~mask)
+ // The condition byte is already materialized, avoiding any EFLAGS dependency.
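+ //
+ // For a 32-bit select the emitted bundle looks roughly like this (register
+ // assignments are illustrative; the allocator chooses the temporaries):
+ //   movb   %al, %ah        # copy pre-materialized condition byte
+ //   movzbl %ah, %edi       # zero-extend to 0 or 1
+ //   negl   %edi            # 1 -> 0xFFFFFFFF, 0 -> 0x00000000
+ //   movl   %edx, %esi      # dst = src1
+ //   andl   %edi, %esi      # dst &= mask
+ //   notl   %edi            # mask = ~mask
+ //   andl   %ecx, %edi      # mask &= src2
+ //   orl    %edi, %esi      # dst = (src1 & mask) | (src2 & ~mask)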
+
+ // Step 1: Copy pre-materialized condition byte to TmpByteReg
+ // This allows the bundle to work with allocated temporaries
+ auto I1 = BuildMI(*MBB, MI, DL, get(X86::MOV8rr), TmpByteReg)
+ .addReg(CondByteReg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
+ auto BundleStart = I1->getIterator();
+
+ // Step 2: Zero-extend condition byte to register width (0 or 1)
+ if (MI.getOpcode() != X86::CTSELECT_I386_INT_GR8rr) {
+ BuildMI(*MBB, MI, DL, get(MovZXOp), TmpMaskReg)
+ .addReg(TmpByteReg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
+ }
+
+ // Step 3: Convert condition to bitmask (NEG: 1 -> 0xFFFF..., 0 -> 0x0000...)
+ Register MaskReg = (MI.getOpcode() == X86::CTSELECT_I386_INT_GR8rr) ? TmpByteReg : TmpMaskReg;
+ BuildMI(*MBB, MI, DL, get(NegOp), MaskReg)
+ .addReg(MaskReg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+ // Step 4,5: Apply mask to true value - copy src1 to dest, then AND with mask
+ BuildMI(*MBB, MI, DL, get(MovOp), DstReg)
+ .addReg(Src1Reg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+ BuildMI(*MBB, MI, DL, get(AndOp), DstReg)
+ .addReg(DstReg)
+ .addReg(MaskReg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+ // Step 6: Create inverted mask inline (~mask)
+ BuildMI(*MBB, MI, DL, get(NotOp), MaskReg)
+ .addReg(MaskReg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+ // Step 7: Apply inverted mask to false value - reuse mask register directly
+ BuildMI(*MBB, MI, DL, get(AndOp), MaskReg)
+ .addReg(MaskReg)
+ .addReg(Src2Reg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+ // Step 8: Final result: (src1 & mask) | (src2 & ~mask)
+ auto LI = BuildMI(*MBB, MI, DL, get(OrOp), DstReg)
+ .addReg(DstReg)
+ .addReg(MaskReg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+ // Bundle all generated instructions for atomic execution before removing MI
+ auto BundleEnd = std::next(LI->getIterator());
+ if (BundleStart != BundleEnd) {
+ // Only bundle if at least one instruction was emitted
+ finalizeBundle(*MBB, BundleStart, BundleEnd);
+ }
+
+ // TODO: Optimization opportunity - The register allocator may choose callee-saved
+ // registers (e.g., %ebx, %esi) for TmpByteReg/TmpMaskReg, causing unnecessary
+ // save/restore overhead. Consider constraining these to caller-saved register
+ // classes (e.g., GR8_AL, GR32_CallSaved) in the TableGen definitions to improve
+ // constant-time performance by eliminating prologue/epilogue instructions.
+
+ // Remove the original pseudo instruction
+ MI.eraseFromParent();
+ return true;
+}
+
static bool isFrameLoadOpcode(int Opcode, TypeSize &MemBytes) {
switch (Opcode) {
default:
@@ -6402,6 +6952,43 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case X86::ADD64ri32_DB:
MIB->setDesc(get(X86::OR64ri32));
break;
+
+ case X86::CTSELECT64rr:
+ case X86::CTSELECT32rr:
+ case X86::CTSELECT16rr:
+ case X86::CTSELECT64rm:
+ case X86::CTSELECT32rm:
+ case X86::CTSELECT16rm:
+ // These CTSELECT pseudos are only selected when CMOV is available
+ // Pattern matching ensures we use CTSELECT_I386 when CMOV is not available
+ return expandCtSelectWithCMOV(MI);
+
+ // non-cmov CTSELECT expansion (post-RA, constant-time)
+ // These are the internal pseudos with pre-materialized condition byte
+ case X86::CTSELECT_I386_INT_GR8rr:
+ case X86::CTSELECT_I386_INT_GR16rr:
+ case X86::CTSELECT_I386_INT_GR32rr:
+ return expandCtSelectIntWithoutCMOV(MI);
+
+ case X86::CTSELECT_V2F64:
+ case X86::CTSELECT_V4F32:
+ case X86::CTSELECT_V2I64:
+ case X86::CTSELECT_V4I32:
+ case X86::CTSELECT_V8I16:
+ case X86::CTSELECT_V16I8:
+ case X86::CTSELECT_V2F64X:
+ case X86::CTSELECT_V4F32X:
+ case X86::CTSELECT_V2I64X:
+ case X86::CTSELECT_V4I32X:
+ case X86::CTSELECT_V8I16X:
+ case X86::CTSELECT_V16I8X:
+ case X86::CTSELECT_V4I64:
+ case X86::CTSELECT_V8I32:
+ case X86::CTSELECT_V16I16:
+ case X86::CTSELECT_V32I8:
+ case X86::CTSELECT_V4F64:
+ case X86::CTSELECT_V8F32:
+ return expandCtSelectVector(MI);
}
return false;
}
@@ -10800,27 +11387,39 @@ void X86InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
if (!ST.hasSSE1())
return;
- BuildMI(MBB, Iter, DL, get(X86::V_SET0), Reg);
+ // PXOR is safe to use because it doesn't affect flags.
+ BuildMI(MBB, Iter, DL, get(X86::PXORrr), Reg)
+ .addReg(Reg, RegState::Undef)
+ .addReg(Reg, RegState::Undef);
} else if (X86::VR256RegClass.contains(Reg)) {
// YMM#
if (!ST.hasAVX())
return;
- BuildMI(MBB, Iter, DL, get(X86::AVX_SET0), Reg);
+ // VPXOR is safe to use because it doesn't affect flags.
+ BuildMI(MBB, Iter, DL, get(X86::VPXORrr), Reg)
+ .addReg(Reg, RegState::Undef)
+ .addReg(Reg, RegState::Undef);
} else if (X86::VR512RegClass.contains(Reg)) {
// ZMM#
if (!ST.hasAVX512())
return;
- BuildMI(MBB, Iter, DL, get(X86::AVX512_512_SET0), Reg);
+ // VPXORY is safe to use because it doesn't affect flags.
+ BuildMI(MBB, Iter, DL, get(X86::VPXORYrr), Reg)
+ .addReg(Reg, RegState::Undef)
+ .addReg(Reg, RegState::Undef);
} else if (X86::VK1RegClass.contains(Reg) || X86::VK2RegClass.contains(Reg) ||
X86::VK4RegClass.contains(Reg) || X86::VK8RegClass.contains(Reg) ||
X86::VK16RegClass.contains(Reg)) {
if (!ST.hasVLX())
return;
- unsigned Op = ST.hasBWI() ? X86::KSET0Q : X86::KSET0W;
- BuildMI(MBB, Iter, DL, get(Op), Reg);
+ // KXOR is safe to use because it doesn't affect flags.
+ unsigned Op = ST.hasBWI() ? X86::KXORQkk : X86::KXORWkk;
+ BuildMI(MBB, Iter, DL, get(Op), Reg)
+ .addReg(Reg, RegState::Undef)
+ .addReg(Reg, RegState::Undef);
}
}
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index 5f75559bd9598..ebd7e070d5fe8 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -724,6 +724,12 @@ class X86InstrInfo final : public X86GenInstrInfo {
bool isFrameOperand(const MachineInstr &MI, unsigned int Op,
int &FrameIndex) const;
+ /// Expand the CTSELECT pseudo-instructions.
+ bool expandCtSelectWithCMOV(MachineInstr &MI) const;
+ bool expandCtSelectIntWithoutCMOV(MachineInstr &MI) const;
+
+ bool expandCtSelectVector(MachineInstr &MI) const;
+
/// Returns true iff the routine could find two commutable operands in the
/// given machine instruction with 3 vector inputs.
/// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index 98104a6fad1a9..6b585a5b0b436 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -49,6 +49,11 @@ def HasZU : Predicate<"Subtarget->hasZU()">;
def HasCF : Predicate<"Subtarget->hasCF()">;
def HasCMOV : Predicate<"Subtarget->canUseCMOV()">;
def NoCMOV : Predicate<"!Subtarget->canUseCMOV()">;
+// Predicates for native CMOV instruction (checks hasCMOV(), not canUseCMOV())
+// HasCMOV may be true even without native CMOV (e.g., via SSE emulation)
+// Use HasNativeCMOV/NoNativeCMOV for constant-time code that requires actual CMOV
+def HasNativeCMOV : Predicate<"Subtarget->hasCMOV()">;
+def NoNativeCMOV : Predicate<"!Subtarget->hasCMOV()">;
def HasNOPL : Predicate<"Subtarget->hasNOPL()">;
def HasMMX : Predicate<"Subtarget->hasMMX()">;
def HasSSE1 : Predicate<"Subtarget->hasSSE1()">;
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 9a76abcd351bf..66c9d75053640 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -617,10 +617,11 @@ void X86PassConfig::addPreEmitPass2() {
// ObjC runtime functions present in the module.
const Function &F = MF.getFunction();
const Module *M = F.getParent();
- return M->getModuleFlag("kcfi") ||
+ return M->getModuleFlag("kcfi") || F.hasFnAttribute("ct-select") ||
(TT.isOSDarwin() &&
(M->getFunction("objc_retainAutoreleasedReturnValue") ||
- M->getFunction("objc_unsafeClaimAutoreleasedReturnValue")));
+ M->getFunction("objc_unsafeClaimAutoreleasedReturnValue"))) ||
+ F.hasFnAttribute("ct-select");
}));
// Analyzes and emits pseudos to support Win x64 Unwind V2. This pass must run
diff --git a/llvm/test/CodeGen/X86/ctselect-edge-cases.ll b/llvm/test/CodeGen/X86/ctselect-edge-cases.ll
new file mode 100644
index 0000000000000..0797265972a1f
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ctselect-edge-cases.ll
@@ -0,0 +1,409 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=X32
+
+; Test ct.select edge cases and corner cases
+
+; Test with very large integers
+define i128 @test_ctselect_i128(i1 %cond, i128 %a, i128 %b) {
+; X64-LABEL: test_ctselect_i128:
+; X64: # %bb.0:
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovneq %rsi, %rax
+; X64-NEXT: cmovneq %rdx, %r8
+; X64-NEXT: movq %r8, %rdx
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_i128:
+; X32: # %bb.0:
+; X32-NEXT: pushl %edi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: pushl %eax
+; X32-NEXT: .cfi_def_cfa_offset 16
+; X32-NEXT: .cfi_offset %esi, -12
+; X32-NEXT: .cfi_offset %edi, -8
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %esi
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %edi
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %edx
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %ecx, 12(%eax)
+; X32-NEXT: movl %edx, 8(%eax)
+; X32-NEXT: movl %edi, 4(%eax)
+; X32-NEXT: movl %esi, (%eax)
+; X32-NEXT: addl $4, %esp
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: popl %edi
+; X32-NEXT: .cfi_def_cfa_offset 4
+; X32-NEXT: retl $4
+ %result = call i128 @llvm.ct.select.i128(i1 %cond, i128 %a, i128 %b)
+ ret i128 %result
+}
+
+; Test with small integer types
+define i1 @test_ctselect_i1(i1 %cond, i1 %a, i1 %b) {
+; X64-LABEL: test_ctselect_i1:
+; X64: # %bb.0:
+; X64-NEXT: movl %edx, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %esi, %eax
+; X64-NEXT: # kill: def $al killed $al killed $eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_i1:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; X32-NEXT: # kill: def $al killed $al killed $eax
+; X32-NEXT: retl
+ %result = call i1 @llvm.ct.select.i1(i1 %cond, i1 %a, i1 %b)
+ ret i1 %result
+}
+
+; Test with extremal values
+define i32 @test_ctselect_extremal_values(i1 %cond) {
+; X64-LABEL: test_ctselect_extremal_values:
+; X64: # %bb.0:
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X64-NEXT: movl $-2147483648, %eax # imm = 0x80000000
+; X64-NEXT: cmovnel %ecx, %eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_extremal_values:
+; X32: # %bb.0:
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X32-NEXT: movl $-2147483648, %eax # imm = 0x80000000
+; X32-NEXT: cmovnel %ecx, %eax
+; X32-NEXT: retl
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 2147483647, i32 -2147483648)
+ ret i32 %result
+}
+
+; Test with floating point special values
+define float @test_ctselect_f32_special_values(i1 %cond) {
+; X64-LABEL: test_ctselect_f32_special_values:
+; X64: # %bb.0:
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: movl $2143289344, %eax # imm = 0x7FC00000
+; X64-NEXT: movl $2139095040, %ecx # imm = 0x7F800000
+; X64-NEXT: cmovnel %eax, %ecx
+; X64-NEXT: movd %ecx, %xmm0
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_f32_special_values:
+; X32: # %bb.0:
+; X32-NEXT: pushl %edi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: pushl %eax
+; X32-NEXT: .cfi_def_cfa_offset 16
+; X32-NEXT: .cfi_offset %esi, -12
+; X32-NEXT: .cfi_offset %edi, -8
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: sete %al
+; X32-NEXT: movl {{\.?LCPI[0-9]+_[0-9]+}}, %ecx
+; X32-NEXT: movl {{\.?LCPI[0-9]+_[0-9]+}}, %edx
+; X32-NEXT: movb %al, %ah
+; X32-NEXT: movzbl %ah, %edi
+; X32-NEXT: negl %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: andl %edi, %esi
+; X32-NEXT: notl %edi
+; X32-NEXT: andl %ecx, %edi
+; X32-NEXT: orl %edi, %esi
+; X32-NEXT: movl %esi, (%esp)
+; X32-NEXT: flds (%esp)
+; X32-NEXT: addl $4, %esp
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: popl %edi
+; X32-NEXT: .cfi_def_cfa_offset 4
+; X32-NEXT: retl
+ %result = call float @llvm.ct.select.f32(i1 %cond, float 0x7FF8000000000000, float 0x7FF0000000000000)
+ ret float %result
+}
+
+define double @test_ctselect_f64_special_values(i1 %cond) {
+; X64-LABEL: test_ctselect_f64_special_values:
+; X64: # %bb.0:
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; X64-NEXT: movabsq $9218868437227405312, %rcx # imm = 0x7FF0000000000000
+; X64-NEXT: cmovneq %rax, %rcx
+; X64-NEXT: movq %rcx, %xmm0
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_f64_special_values:
+; X32: # %bb.0:
+; X32-NEXT: pushl %edi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: subl $24, %esp
+; X32-NEXT: .cfi_def_cfa_offset 36
+; X32-NEXT: .cfi_offset %esi, -12
+; X32-NEXT: .cfi_offset %edi, -8
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}}
+; X32-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}}
+; X32-NEXT: sete %al
+; X32-NEXT: fxch %st(1)
+; X32-NEXT: fstpl {{[0-9]+}}(%esp)
+; X32-NEXT: fstpl (%esp)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl (%esp), %edx
+; X32-NEXT: movb %al, %ah
+; X32-NEXT: movzbl %ah, %edi
+; X32-NEXT: negl %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: andl %edi, %esi
+; X32-NEXT: notl %edi
+; X32-NEXT: andl %ecx, %edi
+; X32-NEXT: orl %edi, %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movb %al, %ah
+; X32-NEXT: movzbl %ah, %edi
+; X32-NEXT: negl %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: andl %edi, %esi
+; X32-NEXT: notl %edi
+; X32-NEXT: andl %ecx, %edi
+; X32-NEXT: orl %edi, %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X32-NEXT: fldl {{[0-9]+}}(%esp)
+; X32-NEXT: addl $24, %esp
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: popl %edi
+; X32-NEXT: .cfi_def_cfa_offset 4
+; X32-NEXT: retl
+ %result = call double @llvm.ct.select.f64(i1 %cond, double 0x7FF8000000000000, double 0x7FF0000000000000)
+ ret double %result
+}
+
+; Test with null pointers
+define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) {
+; X64-LABEL: test_ctselect_null_ptr:
+; X64: # %bb.0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovneq %rsi, %rax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_null_ptr:
+; X32: # %bb.0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+ %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %ptr, ptr null)
+ ret ptr %result
+}
+
+; Test with function pointers
+define ptr @test_ctselect_function_ptr(i1 %cond, ptr %func1, ptr %func2) {
+; X64-LABEL: test_ctselect_function_ptr:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovneq %rsi, %rax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_function_ptr:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+ %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %func1, ptr %func2)
+ ret ptr %result
+}
+
+; Test with volatile loads
+define i32 @test_ctselect_volatile_load(i1 %cond, ptr %p1, ptr %p2) {
+; X64-LABEL: test_ctselect_volatile_load:
+; X64: # %bb.0:
+; X64-NEXT: movl (%rsi), %ecx
+; X64-NEXT: movl (%rdx), %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %ecx, %eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_volatile_load:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl (%ecx), %ecx
+; X32-NEXT: movl (%eax), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel %ecx, %eax
+; X32-NEXT: retl
+ %a = load volatile i32, ptr %p1
+ %b = load volatile i32, ptr %p2
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+ ret i32 %result
+}
+
+; Test with atomic loads
+define i32 @test_ctselect_atomic_load(i1 %cond, ptr %p1, ptr %p2) {
+; X64-LABEL: test_ctselect_atomic_load:
+; X64: # %bb.0:
+; X64-NEXT: movl (%rsi), %ecx
+; X64-NEXT: movl (%rdx), %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %ecx, %eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_atomic_load:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl (%ecx), %ecx
+; X32-NEXT: movl (%eax), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel %ecx, %eax
+; X32-NEXT: retl
+ %a = load atomic i32, ptr %p1 acquire, align 4
+ %b = load atomic i32, ptr %p2 acquire, align 4
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+ ret i32 %result
+}
+
+; Test with condition from icmp on pointers
+define ptr @test_ctselect_ptr_cmp(ptr %p1, ptr %p2, ptr %a, ptr %b) {
+; X64-LABEL: test_ctselect_ptr_cmp:
+; X64: # %bb.0:
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: cmpq %rsi, %rdi
+; X64-NEXT: sete %cl
+; X64-NEXT: testb %cl, %cl
+; X64-NEXT: cmovneq %rdx, %rax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_ptr_cmp:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: sete %cl
+; X32-NEXT: testb %cl, %cl
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+ %cmp = icmp eq ptr %p1, %p2
+ %result = call ptr @llvm.ct.select.p0(i1 %cmp, ptr %a, ptr %b)
+ ret ptr %result
+}
+
+; Test with struct pointer types (struct types themselves may not be directly supported)
+%struct.pair = type { i32, i32 }
+
+define ptr @test_ctselect_struct_ptr(i1 %cond, ptr %a, ptr %b) {
+; X64-LABEL: test_ctselect_struct_ptr:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovneq %rsi, %rax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_struct_ptr:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+ %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b)
+ ret ptr %result
+}
+
+; Test with deeply nested conditions (stress test for instruction selection)
+define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
+; X64-LABEL: test_ctselect_deeply_nested:
+; X64: # %bb.0:
+; X64-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movl {{[0-9]+}}(%rsp), %r10d
+; X64-NEXT: movl {{[0-9]+}}(%rsp), %r11d
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %r8d, %r9d
+; X64-NEXT: testb $1, %sil
+; X64-NEXT: cmovnel %r9d, %r11d
+; X64-NEXT: testb $1, %dl
+; X64-NEXT: cmovnel %r11d, %r10d
+; X64-NEXT: testb $1, %cl
+; X64-NEXT: cmovnel %r10d, %eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_deeply_nested:
+; X32: # %bb.0:
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: .cfi_offset %esi, -8
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %esi
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel %esi, %edx
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel %edx, %ecx
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel %ecx, %eax
+; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 4
+; X32-NEXT: retl
+ %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b)
+ %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c)
+ %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d)
+ %sel4 = call i32 @llvm.ct.select.i32(i1 %c4, i32 %sel3, i32 %e)
+ ret i32 %sel4
+}
+
+; Test with misaligned loads
+define i32 @test_ctselect_misaligned_load(i1 %cond, ptr %p1, ptr %p2) {
+; X64-LABEL: test_ctselect_misaligned_load:
+; X64: # %bb.0:
+; X64-NEXT: movl (%rdx), %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel (%rsi), %eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_misaligned_load:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl (%eax), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel (%ecx), %eax
+; X32-NEXT: retl
+ %a = load i32, ptr %p1, align 1
+ %b = load i32, ptr %p2, align 1
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+ ret i32 %result
+}
+
+; Declare the intrinsics
+declare i1 @llvm.ct.select.i1(i1, i1, i1)
+declare i128 @llvm.ct.select.i128(i1, i128, i128)
+declare i32 @llvm.ct.select.i32(i1, i32, i32)
+declare float @llvm.ct.select.f32(i1, float, float)
+declare double @llvm.ct.select.f64(i1, double, double)
+declare ptr @llvm.ct.select.p0(i1, ptr, ptr)
diff --git a/llvm/test/CodeGen/X86/ctselect-i386-fp.ll b/llvm/test/CodeGen/X86/ctselect-i386-fp.ll
new file mode 100644
index 0000000000000..ea943307c644f
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ctselect-i386-fp.ll
@@ -0,0 +1,722 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov | FileCheck %s --check-prefix=I386-NOCMOV
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=I386-CMOV
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov -verify-machineinstrs | FileCheck %s --check-prefix=I386-NOCMOV
+
+; Comprehensive CTSELECT tests for i386 targets with floating-point types
+; - Without CMOV: constant-time implementation using FP->int conversion + existing post-RA CTSELECT
+; - With CMOV: CMOV-based implementation
+; - Verifies security properties: no conditional branches, constant execution time
+; Strategy: FP values stored to memory, converted to integers, CTSELECT on integers, converted back to FP
+
+; Test basic f32 functionality
+define float @test_ctselect_f32_basic(i1 %cond, float %a, float %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f32_basic:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: pushl %eax
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, (%esp)
+; I386-NOCMOV-NEXT: flds (%esp)
+; I386-NOCMOV-NEXT: addl $4, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f32_basic:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: pushl %eax
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, (%esp)
+; I386-CMOV-NEXT: flds (%esp)
+; I386-CMOV-NEXT: addl $4, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b)
+ ret float %result
+}
+
+; Test f32 with different condition codes
+define float @test_ctselect_f32_eq(float %x, float %y, float %a, float %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f32_eq:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: pushl %eax
+; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fucompp
+; I386-NOCMOV-NEXT: fnstsw %ax
+; I386-NOCMOV-NEXT: # kill: def $ah killed $ah killed $ax
+; I386-NOCMOV-NEXT: sahf
+; I386-NOCMOV-NEXT: setnp %al
+; I386-NOCMOV-NEXT: sete %cl
+; I386-NOCMOV-NEXT: testb %al, %cl
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, (%esp)
+; I386-NOCMOV-NEXT: flds (%esp)
+; I386-NOCMOV-NEXT: addl $4, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f32_eq:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: pushl %eax
+; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fucompi %st(1), %st
+; I386-CMOV-NEXT: fstp %st(0)
+; I386-CMOV-NEXT: setnp %al
+; I386-CMOV-NEXT: sete %cl
+; I386-CMOV-NEXT: testb %al, %cl
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, (%esp)
+; I386-CMOV-NEXT: flds (%esp)
+; I386-CMOV-NEXT: addl $4, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %cmp = fcmp oeq float %x, %y
+ %result = call float @llvm.ct.select.f32(i1 %cmp, float %a, float %b)
+ ret float %result
+}
+
+; Test basic f64 functionality
+define double @test_ctselect_f64_basic(i1 %cond, double %a, double %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f64_basic:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: subl $8, %esp
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, (%esp)
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fldl (%esp)
+; I386-NOCMOV-NEXT: addl $8, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f64_basic:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: subl $8, %esp
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, (%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fldl (%esp)
+; I386-CMOV-NEXT: addl $8, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b)
+ ret double %result
+}
+
+; Test basic x86_fp80 functionality
+define x86_fp80 @test_ctselect_f80_basic(i1 %cond, x86_fp80 %a, x86_fp80 %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f80_basic:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: subl $12, %esp
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, (%esp)
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fldt (%esp)
+; I386-NOCMOV-NEXT: addl $12, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f80_basic:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: subl $12, %esp
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, (%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fldt (%esp)
+; I386-CMOV-NEXT: addl $12, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %result = call x86_fp80 @llvm.ct.select.f80(i1 %cond, x86_fp80 %a, x86_fp80 %b)
+ ret x86_fp80 %result
+}
+
+; Test f32 with complex conditions
+define float @test_ctselect_f32_gt(float %x, float %y, float %a, float %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f32_gt:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: pushl %eax
+; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fucompp
+; I386-NOCMOV-NEXT: fnstsw %ax
+; I386-NOCMOV-NEXT: # kill: def $ah killed $ah killed $ax
+; I386-NOCMOV-NEXT: sahf
+; I386-NOCMOV-NEXT: seta %al
+; I386-NOCMOV-NEXT: testb %al, %al
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, (%esp)
+; I386-NOCMOV-NEXT: flds (%esp)
+; I386-NOCMOV-NEXT: addl $4, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f32_gt:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: pushl %eax
+; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fucompi %st(1), %st
+; I386-CMOV-NEXT: fstp %st(0)
+; I386-CMOV-NEXT: seta %al
+; I386-CMOV-NEXT: testb %al, %al
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, (%esp)
+; I386-CMOV-NEXT: flds (%esp)
+; I386-CMOV-NEXT: addl $4, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %cmp = fcmp ogt float %x, %y
+ %result = call float @llvm.ct.select.f32(i1 %cmp, float %a, float %b)
+ ret float %result
+}
+
+; Test constant-time properties: verify no branches in generated code
+define float @test_ctselect_f32_no_branches(i1 %cond, float %a, float %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f32_no_branches:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: pushl %eax
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, (%esp)
+; I386-NOCMOV-NEXT: flds (%esp)
+; I386-NOCMOV-NEXT: addl $4, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f32_no_branches:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: pushl %eax
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, (%esp)
+; I386-CMOV-NEXT: flds (%esp)
+; I386-CMOV-NEXT: addl $4, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b)
+ ret float %result
+}
+
+; Test that BUNDLE directives are present for constant-time guarantees
+define float @test_ctselect_f32_bundled(i1 %cond, float %a, float %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f32_bundled:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: pushl %eax
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, (%esp)
+; I386-NOCMOV-NEXT: flds (%esp)
+; I386-NOCMOV-NEXT: addl $4, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f32_bundled:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: pushl %eax
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, (%esp)
+; I386-CMOV-NEXT: flds (%esp)
+; I386-CMOV-NEXT: addl $4, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b)
+ ret float %result
+}
+
+; Test edge case: NaN handling
+define float @test_ctselect_f32_nan(i1 %cond) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f32_nan:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: subl $12, %esp
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}}
+; I386-NOCMOV-NEXT: fldz
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: fxch %st(1)
+; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fstps (%esp)
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl (%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: addl $12, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f32_nan:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: subl $12, %esp
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}}
+; I386-CMOV-NEXT: fldz
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: fxch %st(1)
+; I386-CMOV-NEXT: fstps {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fstps (%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl (%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: addl $12, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %nan = bitcast i32 2139095040 to float ; 0x7F800000 = +inf
+ %zero = bitcast i32 0 to float
+ %result = call float @llvm.ct.select.f32(i1 %cond, float %nan, float %zero)
+ ret float %result
+}
+
+; Test memory alignment for f80
+define x86_fp80 @test_ctselect_f80_alignment(i1 %cond, x86_fp80 %a, x86_fp80 %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f80_alignment:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: subl $12, %esp
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, (%esp)
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fldt (%esp)
+; I386-NOCMOV-NEXT: addl $12, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f80_alignment:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: subl $12, %esp
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, (%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fldt (%esp)
+; I386-CMOV-NEXT: addl $12, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %result = call x86_fp80 @llvm.ct.select.f80(i1 %cond, x86_fp80 %a, x86_fp80 %b)
+ ret x86_fp80 %result
+}
+
+; Test chained CTSELECT operations (two dependent selects)
+define float @test_ctselect_f32_multiple(i1 %cond1, i1 %cond2, float %a, float %b, float %c, float %d) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f32_multiple:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: subl $8, %esp
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, (%esp)
+; I386-NOCMOV-NEXT: flds (%esp)
+; I386-NOCMOV-NEXT: addl $8, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f32_multiple:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: subl $8, %esp
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, (%esp)
+; I386-CMOV-NEXT: flds (%esp)
+; I386-CMOV-NEXT: addl $8, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %sel1 = call float @llvm.ct.select.f32(i1 %cond1, float %a, float %b)
+ %sel2 = call float @llvm.ct.select.f32(i1 %cond2, float %sel1, float %c)
+ ret float %sel2
+}
+
+; Declare intrinsics
+declare float @llvm.ct.select.f32(i1, float, float)
+declare double @llvm.ct.select.f64(i1, double, double)
+declare x86_fp80 @llvm.ct.select.f80(i1, x86_fp80, x86_fp80)
diff --git a/llvm/test/CodeGen/X86/ctselect-i386-mmx.ll b/llvm/test/CodeGen/X86/ctselect-i386-mmx.ll
new file mode 100644
index 0000000000000..bc7980c357e0e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ctselect-i386-mmx.ll
@@ -0,0 +1,428 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=-cmov,+mmx < %s | FileCheck %s --check-prefix=I386-NOCMOV
+; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+cmov,+mmx < %s | FileCheck %s --check-prefix=I386-CMOV
+; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=-cmov,+mmx -verify-machineinstrs < %s | FileCheck %s --check-prefix=I386-NOCMOV
+
+; Test constant-time selection with MMX intrinsics to exercise VR64 CTSELECT
+; These tests use MMX intrinsics to create <1 x i64> values that get allocated to VR64 registers
+
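A minimal C sketch of the test design described above, with an illustrative helper name (ct.select itself has no C spelling): consuming the selected 64-bit value with any MMX intrinsic is what pins it to a VR64 register.

#include <mmintrin.h>

/* Illustrative only: once the selected value is consumed by an MMX intrinsic,
   the backend must keep it in an MMX (VR64) register, which is how these
   tests reach the VR64 CTSELECT lowering. */
static __m64 touch_vr64(__m64 selected) {
  return _mm_add_pi32(selected, selected); /* same shape as the paddd in the first test */
}
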
+; Test MMX ct.select using paddd intrinsic to force VR64 allocation
+define <1 x i64> @test_mmx_ctselect_with_paddd(i32 %cond, i64 %a, i64 %b) {
+; I386-NOCMOV-LABEL: test_mmx_ctselect_with_paddd:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT: subl $20, %esp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 40
+; I386-NOCMOV-NEXT: .cfi_offset %esi, -20
+; I386-NOCMOV-NEXT: .cfi_offset %edi, -16
+; I386-NOCMOV-NEXT: .cfi_offset %ebx, -12
+; I386-NOCMOV-NEXT: .cfi_offset %ebp, -8
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
+; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: setne %bl
+; I386-NOCMOV-NEXT: testb %bl, %bl
+; I386-NOCMOV-NEXT: sete %bh
+; I386-NOCMOV-NEXT: movb %bh, %al
+; I386-NOCMOV-NEXT: movzbl %al, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %esi, %edi
+; I386-NOCMOV-NEXT: andl %ebp, %edi
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %ecx, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %edi
+; I386-NOCMOV-NEXT: testb %bl, %bl
+; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %esi
+; I386-NOCMOV-NEXT: negl %esi
+; I386-NOCMOV-NEXT: movl %edx, %ecx
+; I386-NOCMOV-NEXT: andl %esi, %ecx
+; I386-NOCMOV-NEXT: notl %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: orl %esi, %ecx
+; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0
+; I386-NOCMOV-NEXT: paddd %mm0, %mm0
+; I386-NOCMOV-NEXT: movq %mm0, (%esp)
+; I386-NOCMOV-NEXT: movl (%esp), %eax
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: addl $20, %esp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT: popl %ebp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 4
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_mmx_ctselect_with_paddd:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: subl $20, %esp
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 24
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: setne %dl
+; I386-CMOV-NEXT: testb %dl, %dl
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0
+; I386-CMOV-NEXT: paddd %mm0, %mm0
+; I386-CMOV-NEXT: movq %mm0, (%esp)
+; I386-CMOV-NEXT: movl (%esp), %eax
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: addl $20, %esp
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 4
+; I386-CMOV-NEXT: retl
+ %mmx_a = bitcast i64 %a to <1 x i64>
+ %mmx_b = bitcast i64 %b to <1 x i64>
+ %cmp = icmp ne i32 %cond, 0
+ %sel = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp, <1 x i64> %mmx_a, <1 x i64> %mmx_b)
+ %result = call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %sel, <1 x i64> %sel)
+ ret <1 x i64> %result
+}
+
+; Test MMX ct.select using psllw intrinsic
+define <1 x i64> @test_mmx_ctselect_with_psllw(i32 %cond, i64 %a, i64 %b) {
+; I386-NOCMOV-LABEL: test_mmx_ctselect_with_psllw:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT: subl $20, %esp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 40
+; I386-NOCMOV-NEXT: .cfi_offset %esi, -20
+; I386-NOCMOV-NEXT: .cfi_offset %edi, -16
+; I386-NOCMOV-NEXT: .cfi_offset %ebx, -12
+; I386-NOCMOV-NEXT: .cfi_offset %ebp, -8
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
+; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: setne %bl
+; I386-NOCMOV-NEXT: testb %bl, %bl
+; I386-NOCMOV-NEXT: sete %bh
+; I386-NOCMOV-NEXT: movb %bh, %al
+; I386-NOCMOV-NEXT: movzbl %al, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %esi, %edi
+; I386-NOCMOV-NEXT: andl %ebp, %edi
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %ecx, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %edi
+; I386-NOCMOV-NEXT: testb %bl, %bl
+; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %esi
+; I386-NOCMOV-NEXT: negl %esi
+; I386-NOCMOV-NEXT: movl %edx, %ecx
+; I386-NOCMOV-NEXT: andl %esi, %ecx
+; I386-NOCMOV-NEXT: notl %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: orl %esi, %ecx
+; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0
+; I386-NOCMOV-NEXT: psllw %mm0, %mm0
+; I386-NOCMOV-NEXT: movq %mm0, (%esp)
+; I386-NOCMOV-NEXT: movl (%esp), %eax
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: addl $20, %esp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT: popl %ebp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 4
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_mmx_ctselect_with_psllw:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: subl $20, %esp
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 24
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: setne %dl
+; I386-CMOV-NEXT: testb %dl, %dl
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0
+; I386-CMOV-NEXT: psllw %mm0, %mm0
+; I386-CMOV-NEXT: movq %mm0, (%esp)
+; I386-CMOV-NEXT: movl (%esp), %eax
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: addl $20, %esp
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 4
+; I386-CMOV-NEXT: retl
+ %mmx_a = bitcast i64 %a to <1 x i64>
+ %mmx_b = bitcast i64 %b to <1 x i64>
+ %cmp = icmp ne i32 %cond, 0
+ %sel = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp, <1 x i64> %mmx_a, <1 x i64> %mmx_b)
+ %result = call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> %sel, <1 x i64> %sel)
+ ret <1 x i64> %result
+}
+
+; Test nested MMX ct.selects with pand intrinsic
+define <1 x i64> @test_mmx_nested_ctselect_with_pand(i32 %cond1, i32 %cond2, i64 %a, i64 %b, i64 %c) {
+; I386-NOCMOV-LABEL: test_mmx_nested_ctselect_with_pand:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT: subl $20, %esp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 40
+; I386-NOCMOV-NEXT: .cfi_offset %esi, -20
+; I386-NOCMOV-NEXT: .cfi_offset %edi, -16
+; I386-NOCMOV-NEXT: .cfi_offset %ebx, -12
+; I386-NOCMOV-NEXT: .cfi_offset %ebp, -8
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
+; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: setne %bl
+; I386-NOCMOV-NEXT: testb %bl, %bl
+; I386-NOCMOV-NEXT: sete %bh
+; I386-NOCMOV-NEXT: movb %bh, %cl
+; I386-NOCMOV-NEXT: movzbl %cl, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %edx, %edi
+; I386-NOCMOV-NEXT: andl %ebp, %edi
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %eax, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %edi
+; I386-NOCMOV-NEXT: testb %bl, %bl
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: sete %dl
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-NOCMOV-NEXT: movb %dl, %dh
+; I386-NOCMOV-NEXT: movzbl %dh, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %esi, %ebx
+; I386-NOCMOV-NEXT: andl %ebp, %ebx
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %eax, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %ebx
+; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: setne %dl
+; I386-NOCMOV-NEXT: testb %dl, %dl
+; I386-NOCMOV-NEXT: sete %dh
+; I386-NOCMOV-NEXT: movb %dh, %al
+; I386-NOCMOV-NEXT: movzbl %al, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %ecx, %esi
+; I386-NOCMOV-NEXT: andl %ebp, %esi
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %ebx, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %esi
+; I386-NOCMOV-NEXT: testb %dl, %dl
+; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; I386-NOCMOV-NEXT: movb %al, %dl
+; I386-NOCMOV-NEXT: movzbl %dl, %esi
+; I386-NOCMOV-NEXT: negl %esi
+; I386-NOCMOV-NEXT: movl %ebx, %ecx
+; I386-NOCMOV-NEXT: andl %esi, %ecx
+; I386-NOCMOV-NEXT: notl %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: orl %esi, %ecx
+; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0
+; I386-NOCMOV-NEXT: pand %mm0, %mm0
+; I386-NOCMOV-NEXT: movq %mm0, (%esp)
+; I386-NOCMOV-NEXT: movl (%esp), %eax
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: addl $20, %esp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT: popl %ebp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 4
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_mmx_nested_ctselect_with_pand:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %ebx
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-CMOV-NEXT: subl $20, %esp
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 32
+; I386-CMOV-NEXT: .cfi_offset %esi, -12
+; I386-CMOV-NEXT: .cfi_offset %ebx, -8
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
+; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: setne %bl
+; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: setne %bh
+; I386-CMOV-NEXT: testb %bh, %bh
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %esi
+; I386-CMOV-NEXT: testb %bl, %bl
+; I386-CMOV-NEXT: cmovnel %esi, %edx
+; I386-CMOV-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel %ecx, %eax
+; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0
+; I386-CMOV-NEXT: pand %mm0, %mm0
+; I386-CMOV-NEXT: movq %mm0, (%esp)
+; I386-CMOV-NEXT: movl (%esp), %eax
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: addl $20, %esp
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-CMOV-NEXT: popl %ebx
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 4
+; I386-CMOV-NEXT: retl
+ %mmx_a = bitcast i64 %a to <1 x i64>
+ %mmx_b = bitcast i64 %b to <1 x i64>
+ %mmx_c = bitcast i64 %c to <1 x i64>
+ %cmp1 = icmp ne i32 %cond1, 0
+ %cmp2 = icmp ne i32 %cond2, 0
+ %sel1 = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp2, <1 x i64> %mmx_a, <1 x i64> %mmx_b)
+ %sel2 = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp1, <1 x i64> %sel1, <1 x i64> %mmx_c)
+ %result = call <1 x i64> @llvm.x86.mmx.pand(<1 x i64> %sel2, <1 x i64> %sel2)
+ ret <1 x i64> %result
+}
+
+; Test MMX ct.select with por intrinsic
+define <1 x i64> @test_mmx_ctselect_with_por(i32 %cond, i64 %a, i64 %b) {
+; I386-NOCMOV-LABEL: test_mmx_ctselect_with_por:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT: subl $20, %esp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 40
+; I386-NOCMOV-NEXT: .cfi_offset %esi, -20
+; I386-NOCMOV-NEXT: .cfi_offset %edi, -16
+; I386-NOCMOV-NEXT: .cfi_offset %ebx, -12
+; I386-NOCMOV-NEXT: .cfi_offset %ebp, -8
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
+; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: setne %bl
+; I386-NOCMOV-NEXT: testb %bl, %bl
+; I386-NOCMOV-NEXT: sete %bh
+; I386-NOCMOV-NEXT: movb %bh, %al
+; I386-NOCMOV-NEXT: movzbl %al, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %esi, %edi
+; I386-NOCMOV-NEXT: andl %ebp, %edi
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %ecx, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %edi
+; I386-NOCMOV-NEXT: testb %bl, %bl
+; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %esi
+; I386-NOCMOV-NEXT: negl %esi
+; I386-NOCMOV-NEXT: movl %edx, %ecx
+; I386-NOCMOV-NEXT: andl %esi, %ecx
+; I386-NOCMOV-NEXT: notl %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: orl %esi, %ecx
+; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0
+; I386-NOCMOV-NEXT: por %mm0, %mm0
+; I386-NOCMOV-NEXT: movq %mm0, (%esp)
+; I386-NOCMOV-NEXT: movl (%esp), %eax
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: addl $20, %esp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT: popl %ebp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 4
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_mmx_ctselect_with_por:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: subl $20, %esp
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 24
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: setne %dl
+; I386-CMOV-NEXT: testb %dl, %dl
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0
+; I386-CMOV-NEXT: por %mm0, %mm0
+; I386-CMOV-NEXT: movq %mm0, (%esp)
+; I386-CMOV-NEXT: movl (%esp), %eax
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: addl $20, %esp
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 4
+; I386-CMOV-NEXT: retl
+ %mmx_a = bitcast i64 %a to <1 x i64>
+ %mmx_b = bitcast i64 %b to <1 x i64>
+ %cmp = icmp ne i32 %cond, 0
+ %sel = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp, <1 x i64> %mmx_a, <1 x i64> %mmx_b)
+ %result = call <1 x i64> @llvm.x86.mmx.por(<1 x i64> %sel, <1 x i64> %sel)
+ ret <1 x i64> %result
+}
+
+; Declare MMX intrinsics
+declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.pand(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.por(<1 x i64>, <1 x i64>)
+
+; Declare constant-time selection intrinsic
+declare <1 x i64> @llvm.ct.select.v1i64(i1, <1 x i64>, <1 x i64>)
diff --git a/llvm/test/CodeGen/X86/ctselect-i386.ll b/llvm/test/CodeGen/X86/ctselect-i386.ll
new file mode 100644
index 0000000000000..d7345f1121540
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ctselect-i386.ll
@@ -0,0 +1,267 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov | FileCheck %s --check-prefix=I386-NOCMOV
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=I386-CMOV
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov -verify-machineinstrs | FileCheck %s --check-prefix=I386-NOCMOV
+
+; Comprehensive CTSELECT tests for i386 targets with scalar integer types
+; - Without CMOV: constant-time implementation using post-RA expansion with bundled instructions
+; - With CMOV: CMOV-based implementation
+; - Verifies security properties: no conditional branches, constant execution time
+; All expansion happens post-RA for better optimization control and constant-time guarantees
+
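The sete/movzbl/negl followed by and/not/and/or in the NOCMOV checks is the classic branchless select. A minimal C sketch of the same pattern, with illustrative names:

#include <stdint.h>

/* Branchless select: returns a when cond is non-zero, b otherwise.
   mask is all-ones when cond is false (sete + movzbl + negl), so the
   result is (b & mask) | (a & ~mask), as in the checks below. */
static uint32_t ct_select_u32(uint32_t cond, uint32_t a, uint32_t b) {
  uint32_t mask = (uint32_t)0 - (uint32_t)(cond == 0);
  return (b & mask) | (a & ~mask);
}
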
+; Test basic i32 functionality
+define i32 @test_ctselect_i32_basic(i1 %cond, i32 %a, i32 %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_i32_basic:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %bl
+; I386-NOCMOV-NEXT: movb %bl, %bh
+; I386-NOCMOV-NEXT: movzbl %bh, %esi
+; I386-NOCMOV-NEXT: negl %esi
+; I386-NOCMOV-NEXT: movl %edx, %eax
+; I386-NOCMOV-NEXT: andl %esi, %eax
+; I386-NOCMOV-NEXT: notl %esi
+; I386-NOCMOV-NEXT: andl %ecx, %esi
+; I386-NOCMOV-NEXT: orl %esi, %eax
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_i32_basic:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: retl
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+ ret i32 %result
+}
+
+; Test i16 functionality
+define i16 @test_ctselect_i16_basic(i1 %cond, i16 %a, i16 %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_i16_basic:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %bl
+; I386-NOCMOV-NEXT: movb %bl, %bh
+; I386-NOCMOV-NEXT: movzbw %bh, %si
+; I386-NOCMOV-NEXT: negw %si
+; I386-NOCMOV-NEXT: movw %dx, %ax
+; I386-NOCMOV-NEXT: andw %si, %ax
+; I386-NOCMOV-NEXT: notw %si
+; I386-NOCMOV-NEXT: andw %cx, %si
+; I386-NOCMOV-NEXT: orw %si, %ax
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_i16_basic:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnew {{[0-9]+}}(%esp), %ax
+; I386-CMOV-NEXT: retl
+ %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b)
+ ret i16 %result
+}
+
+; Test i8 functionality
+define i8 @test_ctselect_i8_basic(i1 %cond, i8 %a, i8 %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_i8_basic:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %ah
+; I386-NOCMOV-NEXT: movb %ah, %ch
+; I386-NOCMOV-NEXT: negb %ch
+; I386-NOCMOV-NEXT: movb %dl, %al
+; I386-NOCMOV-NEXT: andb %ch, %al
+; I386-NOCMOV-NEXT: notb %ch
+; I386-NOCMOV-NEXT: andb %cl, %ch
+; I386-NOCMOV-NEXT: orb %ch, %al
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_i8_basic:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: # kill: def $al killed $al killed $eax
+; I386-CMOV-NEXT: retl
+ %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b)
+ ret i8 %result
+}
+
+; Test security property: constant-time execution for cryptographic use case
+define i32 @test_crypto_key_select(i32 %secret_bit, i32 %key1, i32 %key2) nounwind {
+; I386-NOCMOV-LABEL: test_crypto_key_select:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: setne %al
+; I386-NOCMOV-NEXT: testb %al, %al
+; I386-NOCMOV-NEXT: sete %bl
+; I386-NOCMOV-NEXT: movb %bl, %bh
+; I386-NOCMOV-NEXT: movzbl %bh, %esi
+; I386-NOCMOV-NEXT: negl %esi
+; I386-NOCMOV-NEXT: movl %edx, %eax
+; I386-NOCMOV-NEXT: andl %esi, %eax
+; I386-NOCMOV-NEXT: notl %esi
+; I386-NOCMOV-NEXT: andl %ecx, %esi
+; I386-NOCMOV-NEXT: orl %esi, %eax
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_crypto_key_select:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: setne %cl
+; I386-CMOV-NEXT: testb %cl, %cl
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: retl
+ %cond = icmp ne i32 %secret_bit, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %key1, i32 %key2)
+ ret i32 %result
+}
+
+; Test that no conditional branches appear in constant-time path
+define i32 @test_no_conditional_branches(i32 %secret, i32 %val1, i32 %val2) nounwind {
+; I386-NOCMOV-LABEL: test_no_conditional_branches:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: setne %al
+; I386-NOCMOV-NEXT: testb %al, %al
+; I386-NOCMOV-NEXT: sete %bl
+; I386-NOCMOV-NEXT: movb %bl, %bh
+; I386-NOCMOV-NEXT: movzbl %bh, %esi
+; I386-NOCMOV-NEXT: negl %esi
+; I386-NOCMOV-NEXT: movl %edx, %eax
+; I386-NOCMOV-NEXT: andl %esi, %eax
+; I386-NOCMOV-NEXT: notl %esi
+; I386-NOCMOV-NEXT: andl %ecx, %esi
+; I386-NOCMOV-NEXT: orl %esi, %eax
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_no_conditional_branches:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: setne %cl
+; I386-CMOV-NEXT: testb %cl, %cl
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: retl
+ %cond = icmp ne i32 %secret, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %val1, i32 %val2)
+ ret i32 %result
+}
+
+; Test with comparison condition
+define i32 @test_ctselect_i32_cmp(i32 %a, i32 %b, i32 %c) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_i32_cmp:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: cmpl %edx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: testb %al, %al
+; I386-NOCMOV-NEXT: sete %bl
+; I386-NOCMOV-NEXT: movb %bl, %bh
+; I386-NOCMOV-NEXT: movzbl %bh, %esi
+; I386-NOCMOV-NEXT: negl %esi
+; I386-NOCMOV-NEXT: movl %edx, %eax
+; I386-NOCMOV-NEXT: andl %esi, %eax
+; I386-NOCMOV-NEXT: notl %esi
+; I386-NOCMOV-NEXT: andl %ecx, %esi
+; I386-NOCMOV-NEXT: orl %esi, %eax
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_i32_cmp:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: cmpl %eax, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: sete %cl
+; I386-CMOV-NEXT: testb %cl, %cl
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: retl
+ %cond = icmp eq i32 %a, %c
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %b, i32 %c)
+ ret i32 %result
+}
+
+; Test nested selects
+define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_nested:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %bl
+; I386-NOCMOV-NEXT: movb %bl, %bh
+; I386-NOCMOV-NEXT: movzbl %bh, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %eax, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %dl
+; I386-NOCMOV-NEXT: movb %dl, %dh
+; I386-NOCMOV-NEXT: movzbl %dh, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %ecx, %eax
+; I386-NOCMOV-NEXT: andl %edi, %eax
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %esi, %edi
+; I386-NOCMOV-NEXT: orl %edi, %eax
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_nested:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel %ecx, %eax
+; I386-CMOV-NEXT: retl
+ %sel1 = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b)
+ %sel2 = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %sel1, i32 %c)
+ ret i32 %sel2
+}
+
+; Declare ct.select intrinsics
+declare i8 @llvm.ct.select.i8(i1, i8, i8)
+declare i16 @llvm.ct.select.i16(i1, i16, i16)
+declare i32 @llvm.ct.select.i32(i1, i32, i32)
diff --git a/llvm/test/CodeGen/X86/ctselect-optimization.ll b/llvm/test/CodeGen/X86/ctselect-optimization.ll
new file mode 100644
index 0000000000000..481d49971a937
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ctselect-optimization.ll
@@ -0,0 +1,304 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov | FileCheck %s
+
+; Test ct.select optimization patterns
+
+; Test smin(x, 0) pattern optimization
+define i32 @test_ctselect_smin_zero(i32 %x) {
+; CHECK-LABEL: test_ctselect_smin_zero:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: sets %cl
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %edi, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp slt i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0)
+ ret i32 %result
+}
+
+; Test smax(x, 0) pattern optimization
+define i32 @test_ctselect_smax_zero(i32 %x) {
+; CHECK-LABEL: test_ctselect_smax_zero:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: setg %cl
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %edi, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp sgt i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0)
+ ret i32 %result
+}
+
+; Test generic smin pattern
+define i32 @test_ctselect_smin_generic(i32 %x, i32 %y) {
+; CHECK-LABEL: test_ctselect_smin_generic:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: cmpl %esi, %edi
+; CHECK-NEXT: setl %cl
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %edi, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp slt i32 %x, %y
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y)
+ ret i32 %result
+}
+
+; Test generic smax pattern
+define i32 @test_ctselect_smax_generic(i32 %x, i32 %y) {
+; CHECK-LABEL: test_ctselect_smax_generic:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: cmpl %esi, %edi
+; CHECK-NEXT: setg %cl
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %edi, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp sgt i32 %x, %y
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y)
+ ret i32 %result
+}
+
+; Test umin pattern
+define i32 @test_ctselect_umin_generic(i32 %x, i32 %y) {
+; CHECK-LABEL: test_ctselect_umin_generic:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: cmpl %esi, %edi
+; CHECK-NEXT: setb %cl
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %edi, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp ult i32 %x, %y
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y)
+ ret i32 %result
+}
+
+; Test umax pattern
+define i32 @test_ctselect_umax_generic(i32 %x, i32 %y) {
+; CHECK-LABEL: test_ctselect_umax_generic:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: cmpl %esi, %edi
+; CHECK-NEXT: seta %cl
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %edi, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp ugt i32 %x, %y
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y)
+ ret i32 %result
+}
+
+; Test abs pattern
+define i32 @test_ctselect_abs(i32 %x) {
+; CHECK-LABEL: test_ctselect_abs:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: movl %edi, %ecx
+; CHECK-NEXT: negl %ecx
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: sets %dl
+; CHECK-NEXT: testb %dl, %dl
+; CHECK-NEXT: cmovnel %ecx, %eax
+; CHECK-NEXT: retq
+ %neg = sub i32 0, %x
+ %cmp = icmp slt i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %neg, i32 %x)
+ ret i32 %result
+}
+
+; Test nabs pattern (negative abs)
+define i32 @test_ctselect_nabs(i32 %x) {
+; CHECK-LABEL: test_ctselect_nabs:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: negl %eax
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: sets %cl
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %edi, %eax
+; CHECK-NEXT: retq
+ %neg = sub i32 0, %x
+ %cmp = icmp slt i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %neg)
+ ret i32 %result
+}
+
+; Test sign extension pattern
+define i32 @test_ctselect_sign_extend(i32 %x) {
+; CHECK-LABEL: test_ctselect_sign_extend:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: sets %cl
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: movl $-1, %ecx
+; CHECK-NEXT: cmovnel %ecx, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp slt i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 -1, i32 0)
+ ret i32 %result
+}
+
+; Test zero extension pattern
+define i32 @test_ctselect_zero_extend(i32 %x) {
+; CHECK-LABEL: test_ctselect_zero_extend:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: setne %cl
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: movl $1, %ecx
+; CHECK-NEXT: cmovnel %ecx, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp ne i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 1, i32 0)
+ ret i32 %result
+}
+
+; Test mask generation pattern
+define i32 @test_ctselect_mask_generation(i32 %x) {
+; CHECK-LABEL: test_ctselect_mask_generation:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: sets %cl
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: movl $-1, %ecx
+; CHECK-NEXT: cmovnel %ecx, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp slt i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 -1, i32 0)
+ ret i32 %result
+}
+
+; Test constant folding with known condition
+define i32 @test_ctselect_constant_folding_true(i32 %a, i32 %b) {
+; CHECK-LABEL: test_ctselect_constant_folding_true:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: movb $1, %cl
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %edi, %eax
+; CHECK-NEXT: retq
+ %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b)
+ ret i32 %result
+}
+
+define i32 @test_ctselect_constant_folding_false(i32 %a, i32 %b) {
+; CHECK-LABEL: test_ctselect_constant_folding_false:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %edi, %eax
+; CHECK-NEXT: retq
+ %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b)
+ ret i32 %result
+}
+
+; Test with identical operands
+define i32 @test_ctselect_identical_operands(i1 %cond, i32 %x) {
+; CHECK-LABEL: test_ctselect_identical_operands:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: testb $1, %dil
+; CHECK-NEXT: cmovnel %esi, %eax
+; CHECK-NEXT: retq
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %x)
+ ret i32 %result
+}
+
+; Test with inverted condition
+define i32 @test_ctselect_inverted_condition(i32 %x, i32 %y, i32 %a, i32 %b) {
+; CHECK-LABEL: test_ctselect_inverted_condition:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: cmpl %esi, %edi
+; CHECK-NEXT: sete %dl
+; CHECK-NEXT: testb %dl, %dl
+; CHECK-NEXT: cmovnel %ecx, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp eq i32 %x, %y
+ %not_cmp = xor i1 %cmp, true
+ %result = call i32 @llvm.ct.select.i32(i1 %not_cmp, i32 %a, i32 %b)
+ ret i32 %result
+}
+
+; Test for 64-bit specific optimizations
+define i64 @test_ctselect_i64_smin_zero(i64 %x) {
+; CHECK-LABEL: test_ctselect_i64_smin_zero:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testq %rdi, %rdi
+; CHECK-NEXT: sets %cl
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovneq %rdi, %rax
+; CHECK-NEXT: retq
+ %cmp = icmp slt i64 %x, 0
+ %result = call i64 @llvm.ct.select.i64(i1 %cmp, i64 %x, i64 0)
+ ret i64 %result
+}
+
+; Test for floating point optimizations
+define float @test_ctselect_f32_zero_positive(float %x) {
+; CHECK-LABEL: test_ctselect_f32_zero_positive:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: ucomiss %xmm1, %xmm0
+; CHECK-NEXT: seta %cl
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %eax, %edx
+; CHECK-NEXT: movd %edx, %xmm0
+; CHECK-NEXT: retq
+ %cmp = fcmp ogt float %x, 0.0
+ %result = call float @llvm.ct.select.f32(i1 %cmp, float %x, float 0.0)
+ ret float %result
+}
+
+define double @test_ctselect_f64_zero_positive(double %x) {
+; CHECK-LABEL: test_ctselect_f64_zero_positive:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %xmm0, %rax
+; CHECK-NEXT: xorpd %xmm1, %xmm1
+; CHECK-NEXT: ucomisd %xmm1, %xmm0
+; CHECK-NEXT: seta %cl
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovneq %rax, %rdx
+; CHECK-NEXT: movq %rdx, %xmm0
+; CHECK-NEXT: retq
+ %cmp = fcmp ogt double %x, 0.0
+ %result = call double @llvm.ct.select.f64(i1 %cmp, double %x, double 0.0)
+ ret double %result
+}
+
+; Test chain of ct.select operations
+define i32 @test_ctselect_chain(i1 %c1, i1 %c2, i1 %c3, i32 %a, i32 %b, i32 %c, i32 %d) {
+; CHECK-LABEL: test_ctselect_chain:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: testb $1, %dil
+; CHECK-NEXT: cmovnel %ecx, %r8d
+; CHECK-NEXT: testb $1, %sil
+; CHECK-NEXT: cmovnel %r8d, %r9d
+; CHECK-NEXT: testb $1, %dl
+; CHECK-NEXT: cmovnel %r9d, %eax
+; CHECK-NEXT: retq
+ %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b)
+ %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c)
+ %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d)
+ ret i32 %sel3
+}
+
+; Declare the intrinsics
+declare i32 @llvm.ct.select.i32(i1, i32, i32)
+declare i64 @llvm.ct.select.i64(i1, i64, i64)
+declare float @llvm.ct.select.f32(i1, float, float)
+declare double @llvm.ct.select.f64(i1, double, double)
diff --git a/llvm/test/CodeGen/X86/ctselect-vector.ll b/llvm/test/CodeGen/X86/ctselect-vector.ll
new file mode 100644
index 0000000000000..2206e32cd6d34
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ctselect-vector.ll
@@ -0,0 +1,1274 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+
+; Test ct.select functionality for vector types
+
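The SSE2 lowering checked below broadcasts a scalar all-ones/all-zeros mask and blends with pand/pandn/por. A minimal C sketch of the same blend using SSE2 intrinsics, with illustrative names:

#include <emmintrin.h>

/* Blend two vectors without branches: mask is all-ones when cond is
   non-zero, so the result is (a & mask) | (b & ~mask), mirroring the
   movd + pshufd + pand/pandn/por sequence in the SSE2 checks. */
static __m128i ct_select_v4i32(int cond, __m128i a, __m128i b) {
  __m128i mask = _mm_set1_epi32(0 - (cond != 0));
  return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
}
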
+; 128-bit vectors
+define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test_ctselect_v4i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v4i32:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %xmm3, %xmm3
+; AVX-NEXT: movd %eax, %xmm3
+; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX-NEXT: movdqa %xmm3, %xmm2
+; AVX-NEXT: pand %xmm0, %xmm3
+; AVX-NEXT: pandn %xmm1, %xmm2
+; AVX-NEXT: por %xmm3, %xmm2
+; AVX-NEXT: vmovaps %xmm2, %xmm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %xmm3, %xmm3
+; AVX2-NEXT: movd %eax, %xmm3
+; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX2-NEXT: movdqa %xmm3, %xmm2
+; AVX2-NEXT: pand %xmm0, %xmm3
+; AVX2-NEXT: pandn %xmm1, %xmm2
+; AVX2-NEXT: por %xmm3, %xmm2
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v4i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB0_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %xmm0, %xmm1
+; AVX512-NEXT: .LBB0_2:
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-NEXT: retq
+ %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b)
+ ret <4 x i32> %result
+}
+
+define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: test_ctselect_v4f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT: movaps %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v4f32:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %xmm3, %xmm3
+; AVX-NEXT: movd %eax, %xmm3
+; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX-NEXT: movdqa %xmm3, %xmm2
+; AVX-NEXT: pand %xmm0, %xmm3
+; AVX-NEXT: pandn %xmm1, %xmm2
+; AVX-NEXT: por %xmm3, %xmm2
+; AVX-NEXT: vmovaps %xmm2, %xmm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v4f32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %xmm3, %xmm3
+; AVX2-NEXT: movd %eax, %xmm3
+; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX2-NEXT: movdqa %xmm3, %xmm2
+; AVX2-NEXT: pand %xmm0, %xmm3
+; AVX2-NEXT: pandn %xmm1, %xmm2
+; AVX2-NEXT: por %xmm3, %xmm2
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v4f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB1_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %xmm0, %xmm1
+; AVX512-NEXT: .LBB1_2:
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-NEXT: retq
+ %result = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, <4 x float> %b)
+ ret <4 x float> %result
+}
+
+define <2 x i64> @test_ctselect_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) {
+; SSE2-LABEL: test_ctselect_v2i64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v2i64:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %xmm3, %xmm3
+; AVX-NEXT: movd %eax, %xmm3
+; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX-NEXT: movdqa %xmm3, %xmm2
+; AVX-NEXT: pand %xmm0, %xmm3
+; AVX-NEXT: pandn %xmm1, %xmm2
+; AVX-NEXT: por %xmm3, %xmm2
+; AVX-NEXT: vmovaps %xmm2, %xmm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %xmm3, %xmm3
+; AVX2-NEXT: movd %eax, %xmm3
+; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX2-NEXT: movdqa %xmm3, %xmm2
+; AVX2-NEXT: pand %xmm0, %xmm3
+; AVX2-NEXT: pandn %xmm1, %xmm2
+; AVX2-NEXT: por %xmm3, %xmm2
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v2i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB2_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %xmm0, %xmm1
+; AVX512-NEXT: .LBB2_2:
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-NEXT: retq
+ %result = call <2 x i64> @llvm.ct.select.v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b)
+ ret <2 x i64> %result
+}
+
+define <2 x double> @test_ctselect_v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) {
+; SSE2-LABEL: test_ctselect_v2f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT: movapd %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v2f64:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %xmm3, %xmm3
+; AVX-NEXT: movd %eax, %xmm3
+; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX-NEXT: movdqa %xmm3, %xmm2
+; AVX-NEXT: pand %xmm0, %xmm3
+; AVX-NEXT: pandn %xmm1, %xmm2
+; AVX-NEXT: por %xmm3, %xmm2
+; AVX-NEXT: vmovaps %xmm2, %xmm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v2f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %xmm3, %xmm3
+; AVX2-NEXT: movd %eax, %xmm3
+; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX2-NEXT: movdqa %xmm3, %xmm2
+; AVX2-NEXT: pand %xmm0, %xmm3
+; AVX2-NEXT: pandn %xmm1, %xmm2
+; AVX2-NEXT: por %xmm3, %xmm2
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v2f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB3_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovapd %xmm0, %xmm1
+; AVX512-NEXT: .LBB3_2:
+; AVX512-NEXT: vmovapd %xmm1, %xmm0
+; AVX512-NEXT: retq
+ %result = call <2 x double> @llvm.ct.select.v2f64(i1 %cond, <2 x double> %a, <2 x double> %b)
+ ret <2 x double> %result
+}
+
+; 256-bit vectors
+define <8 x i32> @test_ctselect_v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b) {
+; SSE2-LABEL: test_ctselect_v8i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: movd %eax, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: pand %xmm0, %xmm5
+; SSE2-NEXT: pandn %xmm2, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: movaps %xmm4, %xmm0
+; SSE2-NEXT: movaps %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v8i32:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm3, %ymm3
+; AVX-NEXT: vmovd %eax, %ymm3
+; AVX-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX-NEXT: vmovdqa %ymm3, %ymm2
+; AVX-NEXT: pand %ymm0, %ymm3
+; AVX-NEXT: pandn %ymm1, %ymm2
+; AVX-NEXT: por %ymm3, %ymm2
+; AVX-NEXT: vmovaps %ymm2, %ymm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v8i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm3, %ymm3
+; AVX2-NEXT: vmovd %eax, %ymm3
+; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vmovdqa %ymm3, %ymm2
+; AVX2-NEXT: pand %ymm0, %ymm3
+; AVX2-NEXT: pandn %ymm1, %ymm2
+; AVX2-NEXT: por %ymm3, %ymm2
+; AVX2-NEXT: vmovaps %ymm2, %ymm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v8i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB4_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %ymm0, %ymm1
+; AVX512-NEXT: .LBB4_2:
+; AVX512-NEXT: vmovaps %ymm1, %ymm0
+; AVX512-NEXT: retq
+ %result = call <8 x i32> @llvm.ct.select.v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b)
+ ret <8 x i32> %result
+}
+
+define <8 x float> @test_ctselect_v8f32(i1 %cond, <8 x float> %a, <8 x float> %b) {
+; SSE2-LABEL: test_ctselect_v8f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: movd %eax, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0]
+; SSE2-NEXT: movaps %xmm5, %xmm4
+; SSE2-NEXT: pand %xmm0, %xmm5
+; SSE2-NEXT: pandn %xmm2, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: movaps %xmm4, %xmm0
+; SSE2-NEXT: movaps %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v8f32:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm3, %ymm3
+; AVX-NEXT: vmovd %eax, %ymm3
+; AVX-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX-NEXT: vmovdqa %ymm3, %ymm2
+; AVX-NEXT: pand %ymm0, %ymm3
+; AVX-NEXT: pandn %ymm1, %ymm2
+; AVX-NEXT: por %ymm3, %ymm2
+; AVX-NEXT: vmovaps %ymm2, %ymm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v8f32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm3, %ymm3
+; AVX2-NEXT: vmovd %eax, %ymm3
+; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vmovdqa %ymm3, %ymm2
+; AVX2-NEXT: pand %ymm0, %ymm3
+; AVX2-NEXT: pandn %ymm1, %ymm2
+; AVX2-NEXT: por %ymm3, %ymm2
+; AVX2-NEXT: vmovaps %ymm2, %ymm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v8f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB5_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %ymm0, %ymm1
+; AVX512-NEXT: .LBB5_2:
+; AVX512-NEXT: vmovaps %ymm1, %ymm0
+; AVX512-NEXT: retq
+ %result = call <8 x float> @llvm.ct.select.v8f32(i1 %cond, <8 x float> %a, <8 x float> %b)
+ ret <8 x float> %result
+}
+
+define <4 x i64> @test_ctselect_v4i64(i1 %cond, <4 x i64> %a, <4 x i64> %b) {
+; SSE2-LABEL: test_ctselect_v4i64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: movd %eax, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: pand %xmm0, %xmm5
+; SSE2-NEXT: pandn %xmm2, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: movaps %xmm4, %xmm0
+; SSE2-NEXT: movaps %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v4i64:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm3, %ymm3
+; AVX-NEXT: vmovd %eax, %ymm3
+; AVX-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2]
+; AVX-NEXT: vmovdqa %ymm3, %ymm2
+; AVX-NEXT: pand %ymm0, %ymm3
+; AVX-NEXT: pandn %ymm1, %ymm2
+; AVX-NEXT: por %ymm3, %ymm2
+; AVX-NEXT: vmovaps %ymm2, %ymm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v4i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm3, %ymm3
+; AVX2-NEXT: vmovd %eax, %ymm3
+; AVX2-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2]
+; AVX2-NEXT: vmovdqa %ymm3, %ymm2
+; AVX2-NEXT: pand %ymm0, %ymm3
+; AVX2-NEXT: pandn %ymm1, %ymm2
+; AVX2-NEXT: por %ymm3, %ymm2
+; AVX2-NEXT: vmovaps %ymm2, %ymm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v4i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB6_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %ymm0, %ymm1
+; AVX512-NEXT: .LBB6_2:
+; AVX512-NEXT: vmovaps %ymm1, %ymm0
+; AVX512-NEXT: retq
+ %result = call <4 x i64> @llvm.ct.select.v4i64(i1 %cond, <4 x i64> %a, <4 x i64> %b)
+ ret <4 x i64> %result
+}
+
+define <4 x double> @test_ctselect_v4f64(i1 %cond, <4 x double> %a, <4 x double> %b) {
+; SSE2-LABEL: test_ctselect_v4f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: movd %eax, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0]
+; SSE2-NEXT: movapd %xmm5, %xmm4
+; SSE2-NEXT: pand %xmm0, %xmm5
+; SSE2-NEXT: pandn %xmm2, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movapd %xmm0, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: movaps %xmm4, %xmm0
+; SSE2-NEXT: movaps %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v4f64:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm3, %ymm3
+; AVX-NEXT: vmovd %eax, %ymm3
+; AVX-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2]
+; AVX-NEXT: vmovdqa %ymm3, %ymm2
+; AVX-NEXT: pand %ymm0, %ymm3
+; AVX-NEXT: pandn %ymm1, %ymm2
+; AVX-NEXT: por %ymm3, %ymm2
+; AVX-NEXT: vmovaps %ymm2, %ymm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v4f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm3, %ymm3
+; AVX2-NEXT: vmovd %eax, %ymm3
+; AVX2-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2]
+; AVX2-NEXT: vmovdqa %ymm3, %ymm2
+; AVX2-NEXT: pand %ymm0, %ymm3
+; AVX2-NEXT: pandn %ymm1, %ymm2
+; AVX2-NEXT: por %ymm3, %ymm2
+; AVX2-NEXT: vmovaps %ymm2, %ymm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v4f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB7_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovapd %ymm0, %ymm1
+; AVX512-NEXT: .LBB7_2:
+; AVX512-NEXT: vmovapd %ymm1, %ymm0
+; AVX512-NEXT: retq
+ %result = call <4 x double> @llvm.ct.select.v4f64(i1 %cond, <4 x double> %a, <4 x double> %b)
+ ret <4 x double> %result
+}
+
+; 512-bit vectors (AVX512 only)
+define <16 x i32> @test_ctselect_v16i32(i1 %cond, <16 x i32> %a, <16 x i32> %b) {
+; SSE2-LABEL: test_ctselect_v16i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: movd %eax, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm9, %xmm8
+; SSE2-NEXT: pand %xmm0, %xmm9
+; SSE2-NEXT: pandn %xmm4, %xmm8
+; SSE2-NEXT: por %xmm9, %xmm8
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm5, %xmm4
+; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm6, %xmm5
+; SSE2-NEXT: por %xmm0, %xmm5
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm7, %xmm6
+; SSE2-NEXT: por %xmm0, %xmm6
+; SSE2-NEXT: movaps %xmm8, %xmm0
+; SSE2-NEXT: movaps %xmm4, %xmm1
+; SSE2-NEXT: movaps %xmm5, %xmm2
+; SSE2-NEXT: movaps %xmm6, %xmm3
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v16i32:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm5, %ymm5
+; AVX-NEXT: vmovd %eax, %ymm5
+; AVX-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
+; AVX-NEXT: vmovdqa %ymm5, %ymm4
+; AVX-NEXT: pand %ymm0, %ymm5
+; AVX-NEXT: pandn %ymm2, %ymm4
+; AVX-NEXT: por %ymm5, %ymm4
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm0, %ymm0
+; AVX-NEXT: vmovd %eax, %ymm0
+; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX-NEXT: vmovdqa %ymm0, %ymm2
+; AVX-NEXT: pand %ymm1, %ymm0
+; AVX-NEXT: pandn %ymm3, %ymm2
+; AVX-NEXT: por %ymm0, %ymm2
+; AVX-NEXT: vmovaps %ymm4, %ymm0
+; AVX-NEXT: vmovaps %ymm2, %ymm1
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v16i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm5, %ymm5
+; AVX2-NEXT: vmovd %eax, %ymm5
+; AVX2-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vmovdqa %ymm5, %ymm4
+; AVX2-NEXT: pand %ymm0, %ymm5
+; AVX2-NEXT: pandn %ymm2, %ymm4
+; AVX2-NEXT: por %ymm5, %ymm4
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm0, %ymm0
+; AVX2-NEXT: vmovd %eax, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vmovdqa %ymm0, %ymm2
+; AVX2-NEXT: pand %ymm1, %ymm0
+; AVX2-NEXT: pandn %ymm3, %ymm2
+; AVX2-NEXT: por %ymm0, %ymm2
+; AVX2-NEXT: vmovaps %ymm4, %ymm0
+; AVX2-NEXT: vmovaps %ymm2, %ymm1
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v16i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB8_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %zmm0, %zmm1
+; AVX512-NEXT: .LBB8_2:
+; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: retq
+ %result = call <16 x i32> @llvm.ct.select.v16i32(i1 %cond, <16 x i32> %a, <16 x i32> %b)
+ ret <16 x i32> %result
+}
+
+define <16 x float> @test_ctselect_v16f32(i1 %cond, <16 x float> %a, <16 x float> %b) {
+; SSE2-LABEL: test_ctselect_v16f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: movd %eax, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0]
+; SSE2-NEXT: movaps %xmm9, %xmm8
+; SSE2-NEXT: pand %xmm0, %xmm9
+; SSE2-NEXT: pandn %xmm4, %xmm8
+; SSE2-NEXT: por %xmm9, %xmm8
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movaps %xmm0, %xmm4
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm5, %xmm4
+; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movaps %xmm0, %xmm5
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm6, %xmm5
+; SSE2-NEXT: por %xmm0, %xmm5
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movaps %xmm0, %xmm6
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm7, %xmm6
+; SSE2-NEXT: por %xmm0, %xmm6
+; SSE2-NEXT: movaps %xmm8, %xmm0
+; SSE2-NEXT: movaps %xmm4, %xmm1
+; SSE2-NEXT: movaps %xmm5, %xmm2
+; SSE2-NEXT: movaps %xmm6, %xmm3
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v16f32:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm5, %ymm5
+; AVX-NEXT: vmovd %eax, %ymm5
+; AVX-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
+; AVX-NEXT: vmovdqa %ymm5, %ymm4
+; AVX-NEXT: pand %ymm0, %ymm5
+; AVX-NEXT: pandn %ymm2, %ymm4
+; AVX-NEXT: por %ymm5, %ymm4
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm0, %ymm0
+; AVX-NEXT: vmovd %eax, %ymm0
+; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX-NEXT: vmovdqa %ymm0, %ymm2
+; AVX-NEXT: pand %ymm1, %ymm0
+; AVX-NEXT: pandn %ymm3, %ymm2
+; AVX-NEXT: por %ymm0, %ymm2
+; AVX-NEXT: vmovaps %ymm4, %ymm0
+; AVX-NEXT: vmovaps %ymm2, %ymm1
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v16f32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm5, %ymm5
+; AVX2-NEXT: vmovd %eax, %ymm5
+; AVX2-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vmovdqa %ymm5, %ymm4
+; AVX2-NEXT: pand %ymm0, %ymm5
+; AVX2-NEXT: pandn %ymm2, %ymm4
+; AVX2-NEXT: por %ymm5, %ymm4
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm0, %ymm0
+; AVX2-NEXT: vmovd %eax, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vmovdqa %ymm0, %ymm2
+; AVX2-NEXT: pand %ymm1, %ymm0
+; AVX2-NEXT: pandn %ymm3, %ymm2
+; AVX2-NEXT: por %ymm0, %ymm2
+; AVX2-NEXT: vmovaps %ymm4, %ymm0
+; AVX2-NEXT: vmovaps %ymm2, %ymm1
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v16f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB9_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %zmm0, %zmm1
+; AVX512-NEXT: .LBB9_2:
+; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: retq
+ %result = call <16 x float> @llvm.ct.select.v16f32(i1 %cond, <16 x float> %a, <16 x float> %b)
+ ret <16 x float> %result
+}
+
+define <8 x i64> @test_ctselect_v8i64(i1 %cond, <8 x i64> %a, <8 x i64> %b) {
+; SSE2-LABEL: test_ctselect_v8i64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: movd %eax, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm9, %xmm8
+; SSE2-NEXT: pand %xmm0, %xmm9
+; SSE2-NEXT: pandn %xmm4, %xmm8
+; SSE2-NEXT: por %xmm9, %xmm8
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm5, %xmm4
+; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm6, %xmm5
+; SSE2-NEXT: por %xmm0, %xmm5
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm7, %xmm6
+; SSE2-NEXT: por %xmm0, %xmm6
+; SSE2-NEXT: movaps %xmm8, %xmm0
+; SSE2-NEXT: movaps %xmm4, %xmm1
+; SSE2-NEXT: movaps %xmm5, %xmm2
+; SSE2-NEXT: movaps %xmm6, %xmm3
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v8i64:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm5, %ymm5
+; AVX-NEXT: vmovd %eax, %ymm5
+; AVX-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[0,0,2,2]
+; AVX-NEXT: vmovdqa %ymm5, %ymm4
+; AVX-NEXT: pand %ymm0, %ymm5
+; AVX-NEXT: pandn %ymm2, %ymm4
+; AVX-NEXT: por %ymm5, %ymm4
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm0, %ymm0
+; AVX-NEXT: vmovd %eax, %ymm0
+; AVX-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX-NEXT: vmovdqa %ymm0, %ymm2
+; AVX-NEXT: pand %ymm1, %ymm0
+; AVX-NEXT: pandn %ymm3, %ymm2
+; AVX-NEXT: por %ymm0, %ymm2
+; AVX-NEXT: vmovaps %ymm4, %ymm0
+; AVX-NEXT: vmovaps %ymm2, %ymm1
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v8i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm5, %ymm5
+; AVX2-NEXT: vmovd %eax, %ymm5
+; AVX2-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[0,0,2,2]
+; AVX2-NEXT: vmovdqa %ymm5, %ymm4
+; AVX2-NEXT: pand %ymm0, %ymm5
+; AVX2-NEXT: pandn %ymm2, %ymm4
+; AVX2-NEXT: por %ymm5, %ymm4
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm0, %ymm0
+; AVX2-NEXT: vmovd %eax, %ymm0
+; AVX2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX2-NEXT: vmovdqa %ymm0, %ymm2
+; AVX2-NEXT: pand %ymm1, %ymm0
+; AVX2-NEXT: pandn %ymm3, %ymm2
+; AVX2-NEXT: por %ymm0, %ymm2
+; AVX2-NEXT: vmovaps %ymm4, %ymm0
+; AVX2-NEXT: vmovaps %ymm2, %ymm1
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v8i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB10_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %zmm0, %zmm1
+; AVX512-NEXT: .LBB10_2:
+; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: retq
+ %result = call <8 x i64> @llvm.ct.select.v8i64(i1 %cond, <8 x i64> %a, <8 x i64> %b)
+ ret <8 x i64> %result
+}
+
+define <8 x double> @test_ctselect_v8f64(i1 %cond, <8 x double> %a, <8 x double> %b) {
+; SSE2-LABEL: test_ctselect_v8f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: movd %eax, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0]
+; SSE2-NEXT: movapd %xmm9, %xmm8
+; SSE2-NEXT: pand %xmm0, %xmm9
+; SSE2-NEXT: pandn %xmm4, %xmm8
+; SSE2-NEXT: por %xmm9, %xmm8
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movapd %xmm0, %xmm4
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm5, %xmm4
+; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movapd %xmm0, %xmm5
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm6, %xmm5
+; SSE2-NEXT: por %xmm0, %xmm5
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movapd %xmm0, %xmm6
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm7, %xmm6
+; SSE2-NEXT: por %xmm0, %xmm6
+; SSE2-NEXT: movaps %xmm8, %xmm0
+; SSE2-NEXT: movaps %xmm4, %xmm1
+; SSE2-NEXT: movaps %xmm5, %xmm2
+; SSE2-NEXT: movaps %xmm6, %xmm3
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v8f64:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm5, %ymm5
+; AVX-NEXT: vmovd %eax, %ymm5
+; AVX-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[0,0,2,2]
+; AVX-NEXT: vmovdqa %ymm5, %ymm4
+; AVX-NEXT: pand %ymm0, %ymm5
+; AVX-NEXT: pandn %ymm2, %ymm4
+; AVX-NEXT: por %ymm5, %ymm4
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm0, %ymm0
+; AVX-NEXT: vmovd %eax, %ymm0
+; AVX-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX-NEXT: vmovdqa %ymm0, %ymm2
+; AVX-NEXT: pand %ymm1, %ymm0
+; AVX-NEXT: pandn %ymm3, %ymm2
+; AVX-NEXT: por %ymm0, %ymm2
+; AVX-NEXT: vmovaps %ymm4, %ymm0
+; AVX-NEXT: vmovaps %ymm2, %ymm1
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v8f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm5, %ymm5
+; AVX2-NEXT: vmovd %eax, %ymm5
+; AVX2-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[0,0,2,2]
+; AVX2-NEXT: vmovdqa %ymm5, %ymm4
+; AVX2-NEXT: pand %ymm0, %ymm5
+; AVX2-NEXT: pandn %ymm2, %ymm4
+; AVX2-NEXT: por %ymm5, %ymm4
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm0, %ymm0
+; AVX2-NEXT: vmovd %eax, %ymm0
+; AVX2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX2-NEXT: vmovdqa %ymm0, %ymm2
+; AVX2-NEXT: pand %ymm1, %ymm0
+; AVX2-NEXT: pandn %ymm3, %ymm2
+; AVX2-NEXT: por %ymm0, %ymm2
+; AVX2-NEXT: vmovaps %ymm4, %ymm0
+; AVX2-NEXT: vmovaps %ymm2, %ymm1
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v8f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB11_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovapd %zmm0, %zmm1
+; AVX512-NEXT: .LBB11_2:
+; AVX512-NEXT: vmovapd %zmm1, %zmm0
+; AVX512-NEXT: retq
+ %result = call <8 x double> @llvm.ct.select.v8f64(i1 %cond, <8 x double> %a, <8 x double> %b)
+ ret <8 x double> %result
+}
+
+; Test with constant conditions for vector types
+define <4 x i32> @test_ctselect_v4i32_const_true(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test_ctselect_v4i32_const_true:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movb $1, %al
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v4i32_const_true:
+; AVX: # %bb.0:
+; AVX-NEXT: movb $1, %al
+; AVX-NEXT: testb %al, %al
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %xmm3, %xmm3
+; AVX-NEXT: movd %eax, %xmm3
+; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX-NEXT: movdqa %xmm3, %xmm2
+; AVX-NEXT: pand %xmm0, %xmm3
+; AVX-NEXT: pandn %xmm1, %xmm2
+; AVX-NEXT: por %xmm3, %xmm2
+; AVX-NEXT: vmovaps %xmm2, %xmm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v4i32_const_true:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movb $1, %al
+; AVX2-NEXT: testb %al, %al
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %xmm3, %xmm3
+; AVX2-NEXT: movd %eax, %xmm3
+; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX2-NEXT: movdqa %xmm3, %xmm2
+; AVX2-NEXT: pand %xmm0, %xmm3
+; AVX2-NEXT: pandn %xmm1, %xmm2
+; AVX2-NEXT: por %xmm3, %xmm2
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v4i32_const_true:
+; AVX512: # %bb.0:
+; AVX512-NEXT: retq
+ %result = call <4 x i32> @llvm.ct.select.v4i32(i1 true, <4 x i32> %a, <4 x i32> %b)
+ ret <4 x i32> %result
+}
+
+define <4 x i32> @test_ctselect_v4i32_const_false(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test_ctselect_v4i32_const_false:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v4i32_const_false:
+; AVX: # %bb.0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: testb %al, %al
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %xmm3, %xmm3
+; AVX-NEXT: movd %eax, %xmm3
+; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX-NEXT: movdqa %xmm3, %xmm2
+; AVX-NEXT: pand %xmm0, %xmm3
+; AVX-NEXT: pandn %xmm1, %xmm2
+; AVX-NEXT: por %xmm3, %xmm2
+; AVX-NEXT: vmovaps %xmm2, %xmm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v4i32_const_false:
+; AVX2: # %bb.0:
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: testb %al, %al
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %xmm3, %xmm3
+; AVX2-NEXT: movd %eax, %xmm3
+; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX2-NEXT: movdqa %xmm3, %xmm2
+; AVX2-NEXT: pand %xmm0, %xmm3
+; AVX2-NEXT: pandn %xmm1, %xmm2
+; AVX2-NEXT: por %xmm3, %xmm2
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v4i32_const_false:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-NEXT: retq
+ %result = call <4 x i32> @llvm.ct.select.v4i32(i1 false, <4 x i32> %a, <4 x i32> %b)
+ ret <4 x i32> %result
+}
+
+; Test with comparison conditions for vector types
+define <4 x i32> @test_ctselect_v4i32_icmp(i32 %x, i32 %y, <4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test_ctselect_v4i32_icmp:
+; SSE2: # %bb.0:
+; SSE2-NEXT: cmpl %esi, %edi
+; SSE2-NEXT: sete %al
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v4i32_icmp:
+; AVX: # %bb.0:
+; AVX-NEXT: cmpl %esi, %edi
+; AVX-NEXT: sete %al
+; AVX-NEXT: testb %al, %al
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %xmm3, %xmm3
+; AVX-NEXT: movd %eax, %xmm3
+; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX-NEXT: movdqa %xmm3, %xmm2
+; AVX-NEXT: pand %xmm0, %xmm3
+; AVX-NEXT: pandn %xmm1, %xmm2
+; AVX-NEXT: por %xmm3, %xmm2
+; AVX-NEXT: vmovaps %xmm2, %xmm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v4i32_icmp:
+; AVX2: # %bb.0:
+; AVX2-NEXT: cmpl %esi, %edi
+; AVX2-NEXT: sete %al
+; AVX2-NEXT: testb %al, %al
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %xmm3, %xmm3
+; AVX2-NEXT: movd %eax, %xmm3
+; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX2-NEXT: movdqa %xmm3, %xmm2
+; AVX2-NEXT: pand %xmm0, %xmm3
+; AVX2-NEXT: pandn %xmm1, %xmm2
+; AVX2-NEXT: por %xmm3, %xmm2
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v4i32_icmp:
+; AVX512: # %bb.0:
+; AVX512-NEXT: cmpl %esi, %edi
+; AVX512-NEXT: je .LBB14_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-NEXT: .LBB14_2:
+; AVX512-NEXT: retq
+ %cond = icmp eq i32 %x, %y
+ %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b)
+ ret <4 x i32> %result
+}
+
+; Declare the intrinsics
+declare <4 x i32> @llvm.ct.select.v4i32(i1, <4 x i32>, <4 x i32>)
+declare <4 x float> @llvm.ct.select.v4f32(i1, <4 x float>, <4 x float>)
+declare <2 x i64> @llvm.ct.select.v2i64(i1, <2 x i64>, <2 x i64>)
+declare <2 x double> @llvm.ct.select.v2f64(i1, <2 x double>, <2 x double>)
+declare <8 x i32> @llvm.ct.select.v8i32(i1, <8 x i32>, <8 x i32>)
+declare <8 x float> @llvm.ct.select.v8f32(i1, <8 x float>, <8 x float>)
+declare <4 x i64> @llvm.ct.select.v4i64(i1, <4 x i64>, <4 x i64>)
+declare <4 x double> @llvm.ct.select.v4f64(i1, <4 x double>, <4 x double>)
+declare <16 x i32> @llvm.ct.select.v16i32(i1, <16 x i32>, <16 x i32>)
+declare <16 x float> @llvm.ct.select.v16f32(i1, <16 x float>, <16 x float>)
+declare <8 x i64> @llvm.ct.select.v8i64(i1, <8 x i64>, <8 x i64>)
+declare <8 x double> @llvm.ct.select.v8f64(i1, <8 x double>, <8 x double>)
diff --git a/llvm/test/CodeGen/X86/ctselect.ll b/llvm/test/CodeGen/X86/ctselect.ll
index 095787a5e2a4b..d76ae0365f28c 100644
--- a/llvm/test/CodeGen/X86/ctselect.ll
+++ b/llvm/test/CodeGen/X86/ctselect.ll
@@ -8,39 +8,33 @@
define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) {
; X64-LABEL: test_ctselect_i8:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-NEXT: andb $1, %dil
-; X64-NEXT: leal -1(%rdi), %eax
-; X64-NEXT: movl %edi, %ecx
-; X64-NEXT: negb %cl
-; X64-NEXT: andb %sil, %cl
-; X64-NEXT: andb %dl, %al
-; X64-NEXT: orb %cl, %al
+; X64-NEXT: movl %edx, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %esi, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_i8:
; X32: # %bb.0:
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: andb $1, %al
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: negb %cl
-; X32-NEXT: andb {{[0-9]+}}(%esp), %cl
-; X32-NEXT: decb %al
-; X32-NEXT: andb {{[0-9]+}}(%esp), %al
-; X32-NEXT: orb %cl, %al
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; X32-NEXT: # kill: def $al killed $al killed $eax
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_i8:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: andb $1, %al
-; X32-NOCMOV-NEXT: movl %eax, %ecx
-; X32-NOCMOV-NEXT: negb %cl
-; X32-NOCMOV-NEXT: andb {{[0-9]+}}(%esp), %cl
-; X32-NOCMOV-NEXT: decb %al
-; X32-NOCMOV-NEXT: andb {{[0-9]+}}(%esp), %al
-; X32-NOCMOV-NEXT: orb %cl, %al
+; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: sete %ah
+; X32-NOCMOV-NEXT: movb %ah, %ch
+; X32-NOCMOV-NEXT: negb %ch
+; X32-NOCMOV-NEXT: movb %dl, %al
+; X32-NOCMOV-NEXT: andb %ch, %al
+; X32-NOCMOV-NEXT: notb %ch
+; X32-NOCMOV-NEXT: andb %cl, %ch
+; X32-NOCMOV-NEXT: orb %ch, %al
; X32-NOCMOV-NEXT: retl
%result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b)
ret i8 %result
@@ -49,39 +43,43 @@ define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) {
define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) {
; X64-LABEL: test_ctselect_i16:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-NEXT: andl $1, %edi
-; X64-NEXT: leal -1(%rdi), %ecx
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: negl %eax
-; X64-NEXT: andl %esi, %eax
-; X64-NEXT: andl %edx, %ecx
-; X64-NEXT: orl %ecx, %eax
+; X64-NEXT: movl %edx, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %esi, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_i16:
; X32: # %bb.0:
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: andl $1, %eax
-; X32-NEXT: leal -1(%eax), %ecx
-; X32-NEXT: andw {{[0-9]+}}(%esp), %cx
-; X32-NEXT: negl %eax
-; X32-NEXT: andw {{[0-9]+}}(%esp), %ax
-; X32-NEXT: orl %ecx, %eax
-; X32-NEXT: # kill: def $ax killed $ax killed $eax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnew {{[0-9]+}}(%esp), %ax
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_i16:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: andl $1, %eax
-; X32-NOCMOV-NEXT: leal -1(%eax), %ecx
-; X32-NOCMOV-NEXT: andw {{[0-9]+}}(%esp), %cx
-; X32-NOCMOV-NEXT: negl %eax
-; X32-NOCMOV-NEXT: andw {{[0-9]+}}(%esp), %ax
-; X32-NOCMOV-NEXT: orl %ecx, %eax
-; X32-NOCMOV-NEXT: # kill: def $ax killed $ax killed $eax
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
+; X32-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbw %bh, %si
+; X32-NOCMOV-NEXT: negw %si
+; X32-NOCMOV-NEXT: movw %dx, %ax
+; X32-NOCMOV-NEXT: andw %si, %ax
+; X32-NOCMOV-NEXT: notw %si
+; X32-NOCMOV-NEXT: andw %cx, %si
+; X32-NOCMOV-NEXT: orw %si, %ax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b)
ret i16 %result
@@ -90,38 +88,42 @@ define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) {
define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) {
; X64-LABEL: test_ctselect_i32:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-NEXT: andl $1, %edi
-; X64-NEXT: leal -1(%rdi), %eax
-; X64-NEXT: movl %edi, %ecx
-; X64-NEXT: negl %ecx
-; X64-NEXT: andl %esi, %ecx
-; X64-NEXT: andl %edx, %eax
-; X64-NEXT: orl %ecx, %eax
+; X64-NEXT: movl %edx, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %esi, %eax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_i32:
; X32: # %bb.0:
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: andl $1, %eax
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: negl %ecx
-; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: decl %eax
-; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: orl %ecx, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_i32:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: andl $1, %eax
-; X32-NOCMOV-NEXT: movl %eax, %ecx
-; X32-NOCMOV-NEXT: negl %ecx
-; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: decl %eax
-; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: orl %ecx, %eax
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edx, %eax
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ecx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
ret i32 %result
@@ -130,56 +132,66 @@ define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) {
define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) {
; X64-LABEL: test_ctselect_i64:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-NEXT: andl $1, %edi
-; X64-NEXT: leaq -1(%rdi), %rax
-; X64-NEXT: negq %rdi
-; X64-NEXT: andq %rsi, %rdi
-; X64-NEXT: andq %rdx, %rax
-; X64-NEXT: orq %rdi, %rax
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovneq %rsi, %rax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_i64:
; X32: # %bb.0:
-; X32-NEXT: pushl %esi
-; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: .cfi_offset %esi, -8
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: xorl %edx, %eax
-; X32-NEXT: andl $1, %esi
-; X32-NEXT: negl %esi
-; X32-NEXT: andl %esi, %eax
-; X32-NEXT: xorl %edx, %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: xorl %ecx, %edx
-; X32-NEXT: andl %esi, %edx
-; X32-NEXT: xorl %ecx, %edx
-; X32-NEXT: popl %esi
-; X32-NEXT: .cfi_def_cfa_offset 4
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %edx
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_i64:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: pushl %ebp
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
-; X32-NOCMOV-NEXT: .cfi_offset %esi, -8
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %esi
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: pushl %edi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -20
+; X32-NOCMOV-NEXT: .cfi_offset %edi, -16
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebp, -8
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: xorl %edx, %eax
-; X32-NOCMOV-NEXT: andl $1, %esi
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
+; X32-NOCMOV-NEXT: testb $1, %bl
+; X32-NOCMOV-NEXT: sete %bh
+; X32-NOCMOV-NEXT: movb %bh, %cl
+; X32-NOCMOV-NEXT: movzbl %cl, %esi
; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edx, %eax
; X32-NOCMOV-NEXT: andl %esi, %eax
-; X32-NOCMOV-NEXT: xorl %edx, %eax
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: xorl %ecx, %edx
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ebp, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
+; X32-NOCMOV-NEXT: testb $1, %bl
+; X32-NOCMOV-NEXT: sete %cl
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X32-NOCMOV-NEXT: movb %cl, %ch
+; X32-NOCMOV-NEXT: movzbl %ch, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edi, %edx
; X32-NOCMOV-NEXT: andl %esi, %edx
-; X32-NOCMOV-NEXT: xorl %ecx, %edx
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ebx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %edx
; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT: popl %edi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebp
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%result = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b)
@@ -189,51 +201,74 @@ define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) {
define float @test_ctselect_f32(i1 %cond, float %a, float %b) {
; X64-LABEL: test_ctselect_f32:
; X64: # %bb.0:
-; X64-NEXT: movd %xmm1, %eax
-; X64-NEXT: movd %xmm0, %ecx
-; X64-NEXT: andl $1, %edi
-; X64-NEXT: movl %edi, %edx
-; X64-NEXT: negl %edx
-; X64-NEXT: andl %ecx, %edx
-; X64-NEXT: decl %edi
-; X64-NEXT: andl %eax, %edi
-; X64-NEXT: orl %edx, %edi
-; X64-NEXT: movd %edi, %xmm0
+; X64-NEXT: movd %xmm0, %eax
+; X64-NEXT: movd %xmm1, %ecx
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %eax, %ecx
+; X64-NEXT: movd %ecx, %xmm0
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_f32:
; X32: # %bb.0:
-; X32-NEXT: pushl %eax
+; X32-NEXT: pushl %edi
; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: andl $1, %eax
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: negl %ecx
-; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: decl %eax
-; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: orl %ecx, %eax
-; X32-NEXT: movl %eax, (%esp)
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: pushl %eax
+; X32-NEXT: .cfi_def_cfa_offset 16
+; X32-NEXT: .cfi_offset %esi, -12
+; X32-NEXT: .cfi_offset %edi, -8
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: sete %al
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movb %al, %ah
+; X32-NEXT: movzbl %ah, %edi
+; X32-NEXT: negl %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: andl %edi, %esi
+; X32-NEXT: notl %edi
+; X32-NEXT: andl %ecx, %edi
+; X32-NEXT: orl %edi, %esi
+; X32-NEXT: movl %esi, (%esp)
; X32-NEXT: flds (%esp)
-; X32-NEXT: popl %eax
+; X32-NEXT: addl $4, %esp
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: popl %edi
; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_f32:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: pushl %eax
+; X32-NOCMOV-NEXT: pushl %edi
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: andl $1, %eax
-; X32-NOCMOV-NEXT: movl %eax, %ecx
-; X32-NOCMOV-NEXT: negl %ecx
-; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: decl %eax
-; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: orl %ecx, %eax
-; X32-NOCMOV-NEXT: movl %eax, (%esp)
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: pushl %eax
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %edi, -8
+; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: sete %al
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT: movb %al, %ah
+; X32-NOCMOV-NEXT: movzbl %ah, %edi
+; X32-NOCMOV-NEXT: negl %edi
+; X32-NOCMOV-NEXT: movl %edx, %esi
+; X32-NOCMOV-NEXT: andl %edi, %esi
+; X32-NOCMOV-NEXT: notl %edi
+; X32-NOCMOV-NEXT: andl %ecx, %edi
+; X32-NOCMOV-NEXT: orl %edi, %esi
+; X32-NOCMOV-NEXT: movl %esi, (%esp)
; X32-NOCMOV-NEXT: flds (%esp)
-; X32-NOCMOV-NEXT: popl %eax
+; X32-NOCMOV-NEXT: addl $4, %esp
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %edi
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b)
@@ -243,74 +278,96 @@ define float @test_ctselect_f32(i1 %cond, float %a, float %b) {
define double @test_ctselect_f64(i1 %cond, double %a, double %b) {
; X64-LABEL: test_ctselect_f64:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-NEXT: movq %xmm1, %rax
-; X64-NEXT: movq %xmm0, %rcx
-; X64-NEXT: andl $1, %edi
-; X64-NEXT: movq %rdi, %rdx
-; X64-NEXT: negq %rdx
-; X64-NEXT: andq %rcx, %rdx
-; X64-NEXT: decq %rdi
-; X64-NEXT: andq %rax, %rdi
-; X64-NEXT: orq %rdx, %rdi
-; X64-NEXT: movq %rdi, %xmm0
+; X64-NEXT: movq %xmm0, %rax
+; X64-NEXT: movq %xmm1, %rcx
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovneq %rax, %rcx
+; X64-NEXT: movq %rcx, %xmm0
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_f64:
; X32: # %bb.0:
-; X32-NEXT: pushl %esi
+; X32-NEXT: pushl %edi
; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 12
; X32-NEXT: subl $8, %esp
-; X32-NEXT: .cfi_def_cfa_offset 16
-; X32-NEXT: .cfi_offset %esi, -8
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: .cfi_def_cfa_offset 20
+; X32-NEXT: .cfi_offset %esi, -12
+; X32-NEXT: .cfi_offset %edi, -8
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: sete %al
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: xorl %edx, %esi
-; X32-NEXT: andl $1, %ecx
-; X32-NEXT: negl %ecx
-; X32-NEXT: andl %ecx, %esi
-; X32-NEXT: xorl %edx, %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X32-NEXT: movb %al, %ah
+; X32-NEXT: movzbl %ah, %edi
+; X32-NEXT: negl %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: andl %edi, %esi
+; X32-NEXT: notl %edi
+; X32-NEXT: andl %ecx, %edi
+; X32-NEXT: orl %edi, %esi
+; X32-NEXT: movl %esi, (%esp)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: xorl %eax, %edx
-; X32-NEXT: andl %ecx, %edx
-; X32-NEXT: xorl %eax, %edx
-; X32-NEXT: movl %edx, (%esp)
+; X32-NEXT: movb %al, %ah
+; X32-NEXT: movzbl %ah, %edi
+; X32-NEXT: negl %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: andl %edi, %esi
+; X32-NEXT: notl %edi
+; X32-NEXT: andl %ecx, %edi
+; X32-NEXT: orl %edi, %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X32-NEXT: fldl (%esp)
; X32-NEXT: addl $8, %esp
-; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: .cfi_def_cfa_offset 12
; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: popl %edi
; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_f64:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: pushl %edi
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
; X32-NOCMOV-NEXT: subl $8, %esp
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
-; X32-NOCMOV-NEXT: .cfi_offset %esi, -8
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %edi, -8
+; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: sete %al
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NOCMOV-NEXT: xorl %edx, %esi
-; X32-NOCMOV-NEXT: andl $1, %ecx
-; X32-NOCMOV-NEXT: negl %ecx
-; X32-NOCMOV-NEXT: andl %ecx, %esi
-; X32-NOCMOV-NEXT: xorl %edx, %esi
-; X32-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: movb %al, %ah
+; X32-NOCMOV-NEXT: movzbl %ah, %edi
+; X32-NOCMOV-NEXT: negl %edi
+; X32-NOCMOV-NEXT: movl %edx, %esi
+; X32-NOCMOV-NEXT: andl %edi, %esi
+; X32-NOCMOV-NEXT: notl %edi
+; X32-NOCMOV-NEXT: andl %ecx, %edi
+; X32-NOCMOV-NEXT: orl %edi, %esi
+; X32-NOCMOV-NEXT: movl %esi, (%esp)
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: xorl %eax, %edx
-; X32-NOCMOV-NEXT: andl %ecx, %edx
-; X32-NOCMOV-NEXT: xorl %eax, %edx
-; X32-NOCMOV-NEXT: movl %edx, (%esp)
+; X32-NOCMOV-NEXT: movb %al, %ah
+; X32-NOCMOV-NEXT: movzbl %ah, %edi
+; X32-NOCMOV-NEXT: negl %edi
+; X32-NOCMOV-NEXT: movl %edx, %esi
+; X32-NOCMOV-NEXT: andl %edi, %esi
+; X32-NOCMOV-NEXT: notl %edi
+; X32-NOCMOV-NEXT: andl %ecx, %edi
+; X32-NOCMOV-NEXT: orl %edi, %esi
+; X32-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X32-NOCMOV-NEXT: fldl (%esp)
; X32-NOCMOV-NEXT: addl $8, %esp
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %edi
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b)
@@ -320,37 +377,42 @@ define double @test_ctselect_f64(i1 %cond, double %a, double %b) {
define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) {
; X64-LABEL: test_ctselect_ptr:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-NEXT: andl $1, %edi
-; X64-NEXT: leaq -1(%rdi), %rax
-; X64-NEXT: negq %rdi
-; X64-NEXT: andq %rsi, %rdi
-; X64-NEXT: andq %rdx, %rax
-; X64-NEXT: orq %rdi, %rax
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovneq %rsi, %rax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_ptr:
; X32: # %bb.0:
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: andl $1, %eax
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: negl %ecx
-; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: decl %eax
-; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: orl %ecx, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_ptr:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: andl $1, %eax
-; X32-NOCMOV-NEXT: movl %eax, %ecx
-; X32-NOCMOV-NEXT: negl %ecx
-; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: decl %eax
-; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: orl %ecx, %eax
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edx, %eax
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ecx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b)
ret ptr %result
@@ -360,17 +422,45 @@ define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) {
define i32 @test_ctselect_const_true(i32 %a, i32 %b) {
; X64-LABEL: test_ctselect_const_true:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: movb $1, %cl
+; X64-NEXT: testb %cl, %cl
+; X64-NEXT: cmovnel %edi, %eax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_const_true:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movb $1, %cl
+; X32-NEXT: testb %cl, %cl
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_const_true:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT: movb $1, %al
+; X32-NOCMOV-NEXT: testb %al, %al
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edx, %eax
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ecx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b)
ret i32 %result
@@ -380,18 +470,44 @@ define i32 @test_ctselect_const_false(i32 %a, i32 %b) {
; X64-LABEL: test_ctselect_const_false:
; X64: # %bb.0:
; X64-NEXT: movl %esi, %eax
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: testb %cl, %cl
+; X64-NEXT: cmovnel %edi, %eax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_const_false:
; X32: # %bb.0:
-; X32-NEXT: xorl %eax, %eax
-; X32-NEXT: orl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: xorl %ecx, %ecx
+; X32-NEXT: testb %cl, %cl
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_const_false:
; X32-NOCMOV: # %bb.0:
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NOCMOV-NEXT: xorl %eax, %eax
-; X32-NOCMOV-NEXT: orl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT: testb %al, %al
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edx, %eax
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ecx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b)
ret i32 %result
@@ -401,43 +517,50 @@ define i32 @test_ctselect_const_false(i32 %a, i32 %b) {
define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) {
; X64-LABEL: test_ctselect_icmp_eq:
; X64: # %bb.0:
-; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: movl %ecx, %eax
; X64-NEXT: cmpl %esi, %edi
-; X64-NEXT: sete %al
-; X64-NEXT: movl %eax, %esi
-; X64-NEXT: negl %esi
-; X64-NEXT: andl %edx, %esi
-; X64-NEXT: decl %eax
-; X64-NEXT: andl %ecx, %eax
-; X64-NEXT: orl %esi, %eax
+; X64-NEXT: sete %cl
+; X64-NEXT: testb %cl, %cl
+; X64-NEXT: cmovnel %edx, %eax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_icmp_eq:
; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: xorl %eax, %eax
; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: sete %al
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: negl %ecx
-; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: decl %eax
-; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: orl %ecx, %eax
+; X32-NEXT: sete %cl
+; X32-NEXT: testb %cl, %cl
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_icmp_eq:
; X32-NOCMOV: # %bb.0:
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: xorl %eax, %eax
-; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %eax
; X32-NOCMOV-NEXT: sete %al
-; X32-NOCMOV-NEXT: movl %eax, %ecx
-; X32-NOCMOV-NEXT: negl %ecx
-; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: decl %eax
-; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: orl %ecx, %eax
+; X32-NOCMOV-NEXT: testb %al, %al
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edx, %eax
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ecx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%cond = icmp eq i32 %x, %y
%result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
@@ -447,43 +570,50 @@ define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) {
define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) {
; X64-LABEL: test_ctselect_icmp_ne:
; X64: # %bb.0:
-; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: movl %ecx, %eax
; X64-NEXT: cmpl %esi, %edi
-; X64-NEXT: setne %al
-; X64-NEXT: movl %eax, %esi
-; X64-NEXT: negl %esi
-; X64-NEXT: andl %edx, %esi
-; X64-NEXT: decl %eax
-; X64-NEXT: andl %ecx, %eax
-; X64-NEXT: orl %esi, %eax
+; X64-NEXT: setne %cl
+; X64-NEXT: testb %cl, %cl
+; X64-NEXT: cmovnel %edx, %eax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_icmp_ne:
; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: xorl %eax, %eax
; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: setne %al
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: negl %ecx
-; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: decl %eax
-; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: orl %ecx, %eax
+; X32-NEXT: setne %cl
+; X32-NEXT: testb %cl, %cl
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_icmp_ne:
; X32-NOCMOV: # %bb.0:
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: xorl %eax, %eax
-; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %eax
; X32-NOCMOV-NEXT: setne %al
-; X32-NOCMOV-NEXT: movl %eax, %ecx
-; X32-NOCMOV-NEXT: negl %ecx
-; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: decl %eax
-; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: orl %ecx, %eax
+; X32-NOCMOV-NEXT: testb %al, %al
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edx, %eax
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ecx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%cond = icmp ne i32 %x, %y
%result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
@@ -493,43 +623,50 @@ define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) {
define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) {
; X64-LABEL: test_ctselect_icmp_slt:
; X64: # %bb.0:
-; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: movl %ecx, %eax
; X64-NEXT: cmpl %esi, %edi
-; X64-NEXT: setl %al
-; X64-NEXT: movl %eax, %esi
-; X64-NEXT: negl %esi
-; X64-NEXT: andl %edx, %esi
-; X64-NEXT: decl %eax
-; X64-NEXT: andl %ecx, %eax
-; X64-NEXT: orl %esi, %eax
+; X64-NEXT: setl %cl
+; X64-NEXT: testb %cl, %cl
+; X64-NEXT: cmovnel %edx, %eax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_icmp_slt:
; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: xorl %eax, %eax
; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: setl %al
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: negl %ecx
-; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: decl %eax
-; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: orl %ecx, %eax
+; X32-NEXT: setl %cl
+; X32-NEXT: testb %cl, %cl
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_icmp_slt:
; X32-NOCMOV: # %bb.0:
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: xorl %eax, %eax
-; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %eax
; X32-NOCMOV-NEXT: setl %al
-; X32-NOCMOV-NEXT: movl %eax, %ecx
-; X32-NOCMOV-NEXT: negl %ecx
-; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: decl %eax
-; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: orl %ecx, %eax
+; X32-NOCMOV-NEXT: testb %al, %al
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edx, %eax
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ecx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%cond = icmp slt i32 %x, %y
%result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
@@ -539,39 +676,50 @@ define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) {
define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) {
; X64-LABEL: test_ctselect_icmp_ult:
; X64: # %bb.0:
-; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: movl %ecx, %eax
; X64-NEXT: cmpl %esi, %edi
-; X64-NEXT: sbbl %eax, %eax
-; X64-NEXT: andl %eax, %edx
-; X64-NEXT: notl %eax
-; X64-NEXT: andl %ecx, %eax
-; X64-NEXT: orl %edx, %eax
+; X64-NEXT: setb %cl
+; X64-NEXT: testb %cl, %cl
+; X64-NEXT: cmovnel %edx, %eax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_icmp_ult:
; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: xorl %eax, %eax
; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: sbbl %eax, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: andl %eax, %ecx
-; X32-NEXT: notl %eax
-; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: orl %ecx, %eax
+; X32-NEXT: setb %cl
+; X32-NEXT: testb %cl, %cl
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_icmp_ult:
; X32-NOCMOV: # %bb.0:
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: xorl %eax, %eax
-; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: sbbl %eax, %eax
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: andl %eax, %ecx
-; X32-NOCMOV-NEXT: notl %eax
-; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: orl %ecx, %eax
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT: setb %al
+; X32-NOCMOV-NEXT: testb %al, %al
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edx, %eax
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ecx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%cond = icmp ult i32 %x, %y
%result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
@@ -581,45 +729,64 @@ define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) {
define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) {
; X64-LABEL: test_ctselect_fcmp_oeq:
; X64: # %bb.0:
-; X64-NEXT: movd %xmm3, %eax
-; X64-NEXT: cmpeqss %xmm1, %xmm0
-; X64-NEXT: movd %xmm0, %ecx
-; X64-NEXT: pand %xmm2, %xmm0
-; X64-NEXT: movd %xmm0, %edx
-; X64-NEXT: notl %ecx
-; X64-NEXT: andl %eax, %ecx
-; X64-NEXT: orl %edx, %ecx
+; X64-NEXT: movd %xmm2, %eax
+; X64-NEXT: movd %xmm3, %ecx
+; X64-NEXT: ucomiss %xmm1, %xmm0
+; X64-NEXT: setnp %dl
+; X64-NEXT: sete %sil
+; X64-NEXT: testb %dl, %sil
+; X64-NEXT: cmovnel %eax, %ecx
; X64-NEXT: movd %ecx, %xmm0
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_fcmp_oeq:
; X32: # %bb.0:
-; X32-NEXT: pushl %eax
+; X32-NEXT: pushl %edi
; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: pushl %eax
+; X32-NEXT: .cfi_def_cfa_offset 16
+; X32-NEXT: .cfi_offset %esi, -12
+; X32-NEXT: .cfi_offset %edi, -8
; X32-NEXT: flds {{[0-9]+}}(%esp)
; X32-NEXT: flds {{[0-9]+}}(%esp)
; X32-NEXT: fucompi %st(1), %st
; X32-NEXT: fstp %st(0)
; X32-NEXT: setnp %al
; X32-NEXT: sete %cl
-; X32-NEXT: andb %al, %cl
-; X32-NEXT: movzbl %cl, %eax
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: negl %ecx
-; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: decl %eax
-; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: orl %ecx, %eax
-; X32-NEXT: movl %eax, (%esp)
+; X32-NEXT: testb %al, %cl
+; X32-NEXT: sete %al
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movb %al, %ah
+; X32-NEXT: movzbl %ah, %edi
+; X32-NEXT: negl %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: andl %edi, %esi
+; X32-NEXT: notl %edi
+; X32-NEXT: andl %ecx, %edi
+; X32-NEXT: orl %edi, %esi
+; X32-NEXT: movl %esi, (%esp)
; X32-NEXT: flds (%esp)
-; X32-NEXT: popl %eax
+; X32-NEXT: addl $4, %esp
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: popl %edi
; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_fcmp_oeq:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: pushl %eax
+; X32-NOCMOV-NEXT: pushl %edi
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: pushl %eax
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %edi, -8
; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
; X32-NOCMOV-NEXT: fucompp
@@ -628,17 +795,25 @@ define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) {
; X32-NOCMOV-NEXT: sahf
; X32-NOCMOV-NEXT: setnp %al
; X32-NOCMOV-NEXT: sete %cl
-; X32-NOCMOV-NEXT: andb %al, %cl
-; X32-NOCMOV-NEXT: movzbl %cl, %eax
-; X32-NOCMOV-NEXT: movl %eax, %ecx
-; X32-NOCMOV-NEXT: negl %ecx
-; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: decl %eax
-; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: orl %ecx, %eax
-; X32-NOCMOV-NEXT: movl %eax, (%esp)
+; X32-NOCMOV-NEXT: testb %al, %cl
+; X32-NOCMOV-NEXT: sete %al
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT: movb %al, %ah
+; X32-NOCMOV-NEXT: movzbl %ah, %edi
+; X32-NOCMOV-NEXT: negl %edi
+; X32-NOCMOV-NEXT: movl %edx, %esi
+; X32-NOCMOV-NEXT: andl %edi, %esi
+; X32-NOCMOV-NEXT: notl %edi
+; X32-NOCMOV-NEXT: andl %ecx, %edi
+; X32-NOCMOV-NEXT: orl %edi, %esi
+; X32-NOCMOV-NEXT: movl %esi, (%esp)
; X32-NOCMOV-NEXT: flds (%esp)
-; X32-NOCMOV-NEXT: popl %eax
+; X32-NOCMOV-NEXT: addl $4, %esp
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %edi
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%cond = fcmp oeq float %x, %y
@@ -650,51 +825,45 @@ define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) {
define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) {
; X64-LABEL: test_ctselect_load:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-NEXT: andl $1, %edi
-; X64-NEXT: leal -1(%rdi), %eax
-; X64-NEXT: movl %edi, %ecx
-; X64-NEXT: negl %ecx
-; X64-NEXT: andl (%rsi), %ecx
-; X64-NEXT: andl (%rdx), %eax
-; X64-NEXT: orl %ecx, %eax
+; X64-NEXT: movl (%rdx), %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel (%rsi), %eax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_load:
; X32: # %bb.0:
-; X32-NEXT: pushl %esi
-; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: .cfi_offset %esi, -8
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: andl $1, %eax
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: negl %esi
-; X32-NEXT: andl (%edx), %esi
-; X32-NEXT: decl %eax
-; X32-NEXT: andl (%ecx), %eax
-; X32-NEXT: orl %esi, %eax
-; X32-NEXT: popl %esi
-; X32-NEXT: .cfi_def_cfa_offset 4
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl (%eax), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel (%ecx), %eax
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_load:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: pushl %ebx
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
-; X32-NOCMOV-NEXT: .cfi_offset %esi, -8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: andl $1, %eax
-; X32-NOCMOV-NEXT: movl %eax, %esi
+; X32-NOCMOV-NEXT: movl (%ecx), %ecx
+; X32-NOCMOV-NEXT: movl (%eax), %edx
+; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %esi
; X32-NOCMOV-NEXT: negl %esi
-; X32-NOCMOV-NEXT: andl (%edx), %esi
-; X32-NOCMOV-NEXT: decl %eax
-; X32-NOCMOV-NEXT: andl (%ecx), %eax
+; X32-NOCMOV-NEXT: movl %edx, %eax
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ecx, %esi
; X32-NOCMOV-NEXT: orl %esi, %eax
; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%a = load i32, ptr %p1
@@ -707,62 +876,63 @@ define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) {
define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) {
; X64-LABEL: test_ctselect_nested:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $esi killed $esi def $rsi
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-NEXT: andl $1, %esi
-; X64-NEXT: leal -1(%rsi), %r9d
-; X64-NEXT: movl %esi, %eax
-; X64-NEXT: negl %eax
-; X64-NEXT: andl %edx, %eax
-; X64-NEXT: andl %ecx, %r9d
-; X64-NEXT: orl %eax, %r9d
-; X64-NEXT: andl $1, %edi
-; X64-NEXT: leal -1(%rdi), %eax
-; X64-NEXT: movl %edi, %ecx
-; X64-NEXT: negl %ecx
-; X64-NEXT: andl %r9d, %ecx
-; X64-NEXT: andl %r8d, %eax
-; X64-NEXT: orl %ecx, %eax
+; X64-NEXT: movl %r8d, %eax
+; X64-NEXT: testb $1, %sil
+; X64-NEXT: cmovnel %edx, %ecx
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %ecx, %eax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_nested:
; X32: # %bb.0:
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: andl $1, %ecx
-; X32-NEXT: movl %ecx, %edx
-; X32-NEXT: negl %edx
-; X32-NEXT: andl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: decl %ecx
-; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: orl %edx, %ecx
-; X32-NEXT: andl $1, %eax
-; X32-NEXT: movl %eax, %edx
-; X32-NEXT: negl %edx
-; X32-NEXT: andl %ecx, %edx
-; X32-NEXT: decl %eax
-; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: orl %edx, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel %ecx, %eax
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_nested:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: andl $1, %ecx
-; X32-NOCMOV-NEXT: movl %ecx, %edx
-; X32-NOCMOV-NEXT: negl %edx
-; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: decl %ecx
-; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: orl %edx, %ecx
-; X32-NOCMOV-NEXT: andl $1, %eax
-; X32-NOCMOV-NEXT: movl %eax, %edx
-; X32-NOCMOV-NEXT: negl %edx
-; X32-NOCMOV-NEXT: andl %ecx, %edx
-; X32-NOCMOV-NEXT: decl %eax
-; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: orl %edx, %eax
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %edi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -16
+; X32-NOCMOV-NEXT: .cfi_offset %edi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %edi
+; X32-NOCMOV-NEXT: negl %edi
+; X32-NOCMOV-NEXT: movl %edx, %esi
+; X32-NOCMOV-NEXT: andl %edi, %esi
+; X32-NOCMOV-NEXT: notl %edi
+; X32-NOCMOV-NEXT: andl %eax, %edi
+; X32-NOCMOV-NEXT: orl %edi, %esi
+; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: sete %dl
+; X32-NOCMOV-NEXT: movb %dl, %dh
+; X32-NOCMOV-NEXT: movzbl %dh, %edi
+; X32-NOCMOV-NEXT: negl %edi
+; X32-NOCMOV-NEXT: movl %ecx, %eax
+; X32-NOCMOV-NEXT: andl %edi, %eax
+; X32-NOCMOV-NEXT: notl %edi
+; X32-NOCMOV-NEXT: andl %esi, %edi
+; X32-NOCMOV-NEXT: orl %edi, %eax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: popl %edi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%inner = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b)
%result = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %inner, i32 %c)
>From d0a67b041f35d4a1df0d4f725957fe995908a48c Mon Sep 17 00:00:00 2001
From: wizardengineer <juliuswoosebert at gmail.com>
Date: Wed, 5 Nov 2025 23:56:12 -0500
Subject: [PATCH 2/2] [LLVM][X86] Add f80 support for ct.select
Add special handling for x86_fp80 types in CTSELECT lowering by splitting
them into three 32-bit chunks, performing constant-time selection on each
chunk, and reassembling the result. This fixes crashes when compiling
tests that use f80 types.
Also updated ctselect.ll to match the current generic fallback implementation.
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 63 +++++
llvm/test/CodeGen/X86/ctselect-i386-fp.ll | 272 ++++++++++------------
2 files changed, 189 insertions(+), 146 deletions(-)
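For readers skimming the test diffs below, here is a minimal, illustrative C++ sketch
(not part of this patch; the helper name is hypothetical) of the branchless
mask-and-select pattern that the generic non-CMOV fallback in these tests expands to
for a single 32-bit lane:

  #include <cstdint>

  // Hypothetical helper showing the data-independent select used by the
  // i386 non-CMOV fallback: build an all-ones/all-zeros mask from the
  // condition, then combine both inputs so no branch depends on cond.
  static inline uint32_t ct_select_u32(bool cond, uint32_t a, uint32_t b) {
    uint32_t mask = 0u - static_cast<uint32_t>(cond); // 0xFFFFFFFF if cond, else 0
    return (a & mask) | (b & ~mask);                  // a when cond, b otherwise
  }

The f80 path described above applies this same per-chunk selection (or CMOV, when
available) to each of the three 32-bit words of the stored value before reloading
the result from the stack slot.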
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 833afa717c32c..69340cbdde344 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -25533,6 +25533,69 @@ SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const {
return DAG.getBitcast(VT, CtSelect);
}
+ // Handle f80 types by splitting into three 32-bit chunks
+ if (VT == MVT::f80) {
+ SDValue Chain = DAG.getEntryNode();
+
+ // Create temporary stack slots for input f80 values
+ SDValue TrueSlot = DAG.CreateStackTemporary(MVT::f80);
+ SDValue FalseSlot = DAG.CreateStackTemporary(MVT::f80);
+
+ // Store f80 values to memory
+ SDValue StoreTrueF80 =
+ DAG.getStore(Chain, DL, TrueOp, TrueSlot, MachinePointerInfo());
+ SDValue StoreFalseF80 =
+ DAG.getStore(Chain, DL, FalseOp, FalseSlot, MachinePointerInfo());
+
+ // Load i32 parts from memory (3 chunks for 96-bit f80 storage)
+ SDValue TruePart0 =
+ DAG.getLoad(MVT::i32, DL, StoreTrueF80, TrueSlot, MachinePointerInfo());
+ SDValue TruePart1Ptr =
+ DAG.getMemBasePlusOffset(TrueSlot, TypeSize::getFixed(4), DL);
+ SDValue TruePart1 = DAG.getLoad(MVT::i32, DL, StoreTrueF80, TruePart1Ptr,
+ MachinePointerInfo());
+ SDValue TruePart2Ptr =
+ DAG.getMemBasePlusOffset(TrueSlot, TypeSize::getFixed(8), DL);
+ SDValue TruePart2 = DAG.getLoad(MVT::i32, DL, StoreTrueF80, TruePart2Ptr,
+ MachinePointerInfo());
+
+ SDValue FalsePart0 = DAG.getLoad(MVT::i32, DL, StoreFalseF80, FalseSlot,
+ MachinePointerInfo());
+ SDValue FalsePart1Ptr =
+ DAG.getMemBasePlusOffset(FalseSlot, TypeSize::getFixed(4), DL);
+ SDValue FalsePart1 = DAG.getLoad(MVT::i32, DL, StoreFalseF80, FalsePart1Ptr,
+ MachinePointerInfo());
+ SDValue FalsePart2Ptr =
+ DAG.getMemBasePlusOffset(FalseSlot, TypeSize::getFixed(8), DL);
+ SDValue FalsePart2 = DAG.getLoad(MVT::i32, DL, StoreFalseF80, FalsePart2Ptr,
+ MachinePointerInfo());
+
+ // Perform CTSELECT on each 32-bit chunk
+ SDValue Part0Ops[] = {FalsePart0, TruePart0, CC, ProcessedCond};
+ SDValue Part0Select = DAG.getNode(X86ISD::CTSELECT, DL, MVT::i32, Part0Ops);
+ SDValue Part1Ops[] = {FalsePart1, TruePart1, CC, ProcessedCond};
+ SDValue Part1Select = DAG.getNode(X86ISD::CTSELECT, DL, MVT::i32, Part1Ops);
+ SDValue Part2Ops[] = {FalsePart2, TruePart2, CC, ProcessedCond};
+ SDValue Part2Select = DAG.getNode(X86ISD::CTSELECT, DL, MVT::i32, Part2Ops);
+
+ // Create result stack slot and store the selected parts
+ SDValue ResultSlot = DAG.CreateStackTemporary(MVT::f80);
+ SDValue StorePart0 =
+ DAG.getStore(Chain, DL, Part0Select, ResultSlot, MachinePointerInfo());
+ SDValue ResPart1Ptr =
+ DAG.getMemBasePlusOffset(ResultSlot, TypeSize::getFixed(4), DL);
+ SDValue StorePart1 = DAG.getStore(StorePart0, DL, Part1Select, ResPart1Ptr,
+ MachinePointerInfo());
+ SDValue ResPart2Ptr =
+ DAG.getMemBasePlusOffset(ResultSlot, TypeSize::getFixed(8), DL);
+ SDValue StorePart2 = DAG.getStore(StorePart1, DL, Part2Select, ResPart2Ptr,
+ MachinePointerInfo());
+
+ // Load complete f80 result from memory
+ return DAG.getLoad(MVT::f80, DL, StorePart2, ResultSlot,
+ MachinePointerInfo());
+ }
+
// Create final CTSELECT node
SDValue Ops[] = {FalseOp, TrueOp, CC, ProcessedCond};
return DAG.getNode(X86ISD::CTSELECT, DL, Op.getValueType(), Ops,
diff --git a/llvm/test/CodeGen/X86/ctselect-i386-fp.ll b/llvm/test/CodeGen/X86/ctselect-i386-fp.ll
index ea943307c644f..eec38fa581c6f 100644
--- a/llvm/test/CodeGen/X86/ctselect-i386-fp.ll
+++ b/llvm/test/CodeGen/X86/ctselect-i386-fp.ll
@@ -209,94 +209,84 @@ define double @test_ctselect_f64_basic(i1 %cond, double %a, double %b) nounwind
define x86_fp80 @test_ctselect_f80_basic(i1 %cond, x86_fp80 %a, x86_fp80 %b) nounwind {
; I386-NOCMOV-LABEL: test_ctselect_f80_basic:
; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebp
+; I386-NOCMOV-NEXT: pushl %ebx
; I386-NOCMOV-NEXT: pushl %edi
; I386-NOCMOV-NEXT: pushl %esi
-; I386-NOCMOV-NEXT: subl $12, %esp
-; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
-; I386-NOCMOV-NEXT: sete %al
-; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: subl $40, %esp
+; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fstpt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fstpt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: testb $1, %cl
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-NOCMOV-NEXT: movl %eax, (%esp) # 4-byte Spill
; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-NOCMOV-NEXT: movb %al, %ah
-; I386-NOCMOV-NEXT: movzbl %ah, %edi
-; I386-NOCMOV-NEXT: negl %edi
-; I386-NOCMOV-NEXT: movl %edx, %esi
-; I386-NOCMOV-NEXT: andl %edi, %esi
-; I386-NOCMOV-NEXT: notl %edi
-; I386-NOCMOV-NEXT: andl %ecx, %edi
-; I386-NOCMOV-NEXT: orl %edi, %esi
-; I386-NOCMOV-NEXT: movl %esi, (%esp)
-; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi
+; I386-NOCMOV-NEXT: sete %ch
+; I386-NOCMOV-NEXT: movb %ch, %al
+; I386-NOCMOV-NEXT: movzbl %al, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %edi, %ebx
+; I386-NOCMOV-NEXT: andl %ebp, %ebx
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %edx, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %ebx
+; I386-NOCMOV-NEXT: testb $1, %cl
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-NOCMOV-NEXT: movb %al, %ah
-; I386-NOCMOV-NEXT: movzbl %ah, %edi
-; I386-NOCMOV-NEXT: negl %edi
-; I386-NOCMOV-NEXT: movl %edx, %esi
-; I386-NOCMOV-NEXT: andl %edi, %esi
-; I386-NOCMOV-NEXT: notl %edi
-; I386-NOCMOV-NEXT: andl %ecx, %edi
-; I386-NOCMOV-NEXT: orl %edi, %esi
-; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-NOCMOV-NEXT: movb %al, %ah
-; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: sete %ch
+; I386-NOCMOV-NEXT: movb %ch, %cl
+; I386-NOCMOV-NEXT: movzbl %cl, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %edx, %edi
+; I386-NOCMOV-NEXT: andl %ebp, %edi
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %eax, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %edi
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl (%esp), %ebx # 4-byte Reload
+; I386-NOCMOV-NEXT: movb %al, %dl
+; I386-NOCMOV-NEXT: movzbl %dl, %edi
; I386-NOCMOV-NEXT: negl %edi
-; I386-NOCMOV-NEXT: movl %edx, %esi
-; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, %ecx
+; I386-NOCMOV-NEXT: andl %edi, %ecx
; I386-NOCMOV-NEXT: notl %edi
-; I386-NOCMOV-NEXT: andl %ecx, %edi
-; I386-NOCMOV-NEXT: orl %edi, %esi
-; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; I386-NOCMOV-NEXT: fldt (%esp)
-; I386-NOCMOV-NEXT: addl $12, %esp
+; I386-NOCMOV-NEXT: andl %ebx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %ecx
+; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: addl $40, %esp
; I386-NOCMOV-NEXT: popl %esi
; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: popl %ebp
; I386-NOCMOV-NEXT: retl
;
; I386-CMOV-LABEL: test_ctselect_f80_basic:
; I386-CMOV: # %bb.0:
-; I386-CMOV-NEXT: pushl %edi
-; I386-CMOV-NEXT: pushl %esi
-; I386-CMOV-NEXT: subl $12, %esp
+; I386-CMOV-NEXT: subl $36, %esp
+; I386-CMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fstpt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fstpt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
-; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-CMOV-NEXT: movb %al, %ah
-; I386-CMOV-NEXT: movzbl %ah, %edi
-; I386-CMOV-NEXT: negl %edi
-; I386-CMOV-NEXT: movl %edx, %esi
-; I386-CMOV-NEXT: andl %edi, %esi
-; I386-CMOV-NEXT: notl %edi
-; I386-CMOV-NEXT: andl %ecx, %edi
-; I386-CMOV-NEXT: orl %edi, %esi
-; I386-CMOV-NEXT: movl %esi, (%esp)
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-CMOV-NEXT: movb %al, %ah
-; I386-CMOV-NEXT: movzbl %ah, %edi
-; I386-CMOV-NEXT: negl %edi
-; I386-CMOV-NEXT: movl %edx, %esi
-; I386-CMOV-NEXT: andl %edi, %esi
-; I386-CMOV-NEXT: notl %edi
-; I386-CMOV-NEXT: andl %ecx, %edi
-; I386-CMOV-NEXT: orl %edi, %esi
-; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-CMOV-NEXT: movb %al, %ah
-; I386-CMOV-NEXT: movzbl %ah, %edi
-; I386-CMOV-NEXT: negl %edi
-; I386-CMOV-NEXT: movl %edx, %esi
-; I386-CMOV-NEXT: andl %edi, %esi
-; I386-CMOV-NEXT: notl %edi
-; I386-CMOV-NEXT: andl %ecx, %edi
-; I386-CMOV-NEXT: orl %edi, %esi
-; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl %eax, (%esp)
; I386-CMOV-NEXT: fldt (%esp)
-; I386-CMOV-NEXT: addl $12, %esp
-; I386-CMOV-NEXT: popl %esi
-; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: addl $36, %esp
; I386-CMOV-NEXT: retl
%result = call x86_fp80 @llvm.ct.select.f80(i1 %cond, x86_fp80 %a, x86_fp80 %b)
ret x86_fp80 %result
@@ -543,94 +533,84 @@ define float @test_ctselect_f32_nan(i1 %cond) nounwind {
define x86_fp80 @test_ctselect_f80_alignment(i1 %cond, x86_fp80 %a, x86_fp80 %b) nounwind {
; I386-NOCMOV-LABEL: test_ctselect_f80_alignment:
; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebp
+; I386-NOCMOV-NEXT: pushl %ebx
; I386-NOCMOV-NEXT: pushl %edi
; I386-NOCMOV-NEXT: pushl %esi
-; I386-NOCMOV-NEXT: subl $12, %esp
-; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
-; I386-NOCMOV-NEXT: sete %al
-; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: subl $40, %esp
+; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fstpt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fstpt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: testb $1, %cl
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-NOCMOV-NEXT: movl %eax, (%esp) # 4-byte Spill
; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-NOCMOV-NEXT: movb %al, %ah
-; I386-NOCMOV-NEXT: movzbl %ah, %edi
-; I386-NOCMOV-NEXT: negl %edi
-; I386-NOCMOV-NEXT: movl %edx, %esi
-; I386-NOCMOV-NEXT: andl %edi, %esi
-; I386-NOCMOV-NEXT: notl %edi
-; I386-NOCMOV-NEXT: andl %ecx, %edi
-; I386-NOCMOV-NEXT: orl %edi, %esi
-; I386-NOCMOV-NEXT: movl %esi, (%esp)
-; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi
+; I386-NOCMOV-NEXT: sete %ch
+; I386-NOCMOV-NEXT: movb %ch, %al
+; I386-NOCMOV-NEXT: movzbl %al, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %edi, %ebx
+; I386-NOCMOV-NEXT: andl %ebp, %ebx
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %edx, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %ebx
+; I386-NOCMOV-NEXT: testb $1, %cl
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-NOCMOV-NEXT: movb %al, %ah
-; I386-NOCMOV-NEXT: movzbl %ah, %edi
-; I386-NOCMOV-NEXT: negl %edi
-; I386-NOCMOV-NEXT: movl %edx, %esi
-; I386-NOCMOV-NEXT: andl %edi, %esi
-; I386-NOCMOV-NEXT: notl %edi
-; I386-NOCMOV-NEXT: andl %ecx, %edi
-; I386-NOCMOV-NEXT: orl %edi, %esi
-; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-NOCMOV-NEXT: movb %al, %ah
-; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: sete %ch
+; I386-NOCMOV-NEXT: movb %ch, %cl
+; I386-NOCMOV-NEXT: movzbl %cl, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %edx, %edi
+; I386-NOCMOV-NEXT: andl %ebp, %edi
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %eax, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %edi
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl (%esp), %ebx # 4-byte Reload
+; I386-NOCMOV-NEXT: movb %al, %dl
+; I386-NOCMOV-NEXT: movzbl %dl, %edi
; I386-NOCMOV-NEXT: negl %edi
-; I386-NOCMOV-NEXT: movl %edx, %esi
-; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, %ecx
+; I386-NOCMOV-NEXT: andl %edi, %ecx
; I386-NOCMOV-NEXT: notl %edi
-; I386-NOCMOV-NEXT: andl %ecx, %edi
-; I386-NOCMOV-NEXT: orl %edi, %esi
-; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; I386-NOCMOV-NEXT: fldt (%esp)
-; I386-NOCMOV-NEXT: addl $12, %esp
+; I386-NOCMOV-NEXT: andl %ebx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %ecx
+; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: addl $40, %esp
; I386-NOCMOV-NEXT: popl %esi
; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: popl %ebp
; I386-NOCMOV-NEXT: retl
;
; I386-CMOV-LABEL: test_ctselect_f80_alignment:
; I386-CMOV: # %bb.0:
-; I386-CMOV-NEXT: pushl %edi
-; I386-CMOV-NEXT: pushl %esi
-; I386-CMOV-NEXT: subl $12, %esp
+; I386-CMOV-NEXT: subl $36, %esp
+; I386-CMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fstpt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fstpt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
-; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-CMOV-NEXT: movb %al, %ah
-; I386-CMOV-NEXT: movzbl %ah, %edi
-; I386-CMOV-NEXT: negl %edi
-; I386-CMOV-NEXT: movl %edx, %esi
-; I386-CMOV-NEXT: andl %edi, %esi
-; I386-CMOV-NEXT: notl %edi
-; I386-CMOV-NEXT: andl %ecx, %edi
-; I386-CMOV-NEXT: orl %edi, %esi
-; I386-CMOV-NEXT: movl %esi, (%esp)
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-CMOV-NEXT: movb %al, %ah
-; I386-CMOV-NEXT: movzbl %ah, %edi
-; I386-CMOV-NEXT: negl %edi
-; I386-CMOV-NEXT: movl %edx, %esi
-; I386-CMOV-NEXT: andl %edi, %esi
-; I386-CMOV-NEXT: notl %edi
-; I386-CMOV-NEXT: andl %ecx, %edi
-; I386-CMOV-NEXT: orl %edi, %esi
-; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-CMOV-NEXT: movb %al, %ah
-; I386-CMOV-NEXT: movzbl %ah, %edi
-; I386-CMOV-NEXT: negl %edi
-; I386-CMOV-NEXT: movl %edx, %esi
-; I386-CMOV-NEXT: andl %edi, %esi
-; I386-CMOV-NEXT: notl %edi
-; I386-CMOV-NEXT: andl %ecx, %edi
-; I386-CMOV-NEXT: orl %edi, %esi
-; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl %eax, (%esp)
; I386-CMOV-NEXT: fldt (%esp)
-; I386-CMOV-NEXT: addl $12, %esp
-; I386-CMOV-NEXT: popl %esi
-; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: addl $36, %esp
; I386-CMOV-NEXT: retl
%result = call x86_fp80 @llvm.ct.select.f80(i1 %cond, x86_fp80 %a, x86_fp80 %b)
ret x86_fp80 %result