[llvm-branch-commits] [llvm] [ConstantTime] Native ct.select support for X86 and i386 (PR #166704)
Julius Alexandre via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Sat Mar 7 13:34:29 PST 2026
https://github.com/wizardengineer updated https://github.com/llvm/llvm-project/pull/166704
>From cca6da444d2edc19dc58cd4376db2c82dce6ccc8 Mon Sep 17 00:00:00 2001
From: wizardengineer <juliuswoosebert at gmail.com>
Date: Wed, 5 Nov 2025 17:09:23 -0500
Subject: [PATCH 1/2] [LLVM][X86] Add native ct.select support for X86 and i386
Add native X86 implementation with CMOV instructions and comprehensive tests:
- X86 ISelLowering with CMOV for x86_64 and i386
- Fallback bitwise operations for i386 targets without CMOV
- Post-RA expansion for pseudo-instructions
- Comprehensive test coverage:
  - Edge cases (zero conditions, large integers)
  - i386-specific tests (FP, MMX, non-CMOV fallback)
  - Vector operations
  - Optimization patterns
The basic test demonstrating fallback is in the core infrastructure PR.
---
llvm/lib/Target/X86/X86.td | 8 +-
llvm/lib/Target/X86/X86ISelLowering.cpp | 791 ++++-
llvm/lib/Target/X86/X86ISelLowering.h | 7 +
llvm/lib/Target/X86/X86InstrCMovSetCC.td | 205 ++
llvm/lib/Target/X86/X86InstrCompiler.td | 81 +
llvm/lib/Target/X86/X86InstrFragments.td | 5 +
llvm/lib/Target/X86/X86InstrInfo.cpp | 609 +++-
llvm/lib/Target/X86/X86InstrInfo.h | 6 +
llvm/lib/Target/X86/X86InstrPredicates.td | 5 +
llvm/lib/Target/X86/X86TargetMachine.cpp | 5 +-
llvm/test/CodeGen/X86/ctselect-edge-cases.ll | 409 +++
llvm/test/CodeGen/X86/ctselect-i386-fp.ll | 722 ++++
llvm/test/CodeGen/X86/ctselect-i386-mmx.ll | 428 +++
llvm/test/CodeGen/X86/ctselect-i386.ll | 267 ++
.../test/CodeGen/X86/ctselect-optimization.ll | 304 ++
llvm/test/CodeGen/X86/ctselect-vector.ll | 1274 +++++++
llvm/test/CodeGen/X86/ctselect.ll | 1825 ++++------
nasty-fix-constant.patch | 2994 +++++++++++++++++
18 files changed, 8700 insertions(+), 1245 deletions(-)
create mode 100644 llvm/test/CodeGen/X86/ctselect-edge-cases.ll
create mode 100644 llvm/test/CodeGen/X86/ctselect-i386-fp.ll
create mode 100644 llvm/test/CodeGen/X86/ctselect-i386-mmx.ll
create mode 100644 llvm/test/CodeGen/X86/ctselect-i386.ll
create mode 100644 llvm/test/CodeGen/X86/ctselect-optimization.ll
create mode 100644 llvm/test/CodeGen/X86/ctselect-vector.ll
create mode 100644 nasty-fix-constant.patch
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index eca763735c315..755a7070d84d3 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -841,9 +841,10 @@ include "X86SchedSapphireRapids.td"
def ProcessorFeatures {
// x86-64 micro-architecture levels: x86-64 and x86-64-v[234]
- list<SubtargetFeature> X86_64V1Features = [
- FeatureX87, FeatureCX8, FeatureCMOV, FeatureMMX, FeatureSSE2,
- FeatureFXSR, FeatureNOPL, FeatureX86_64,
+ list<SubtargetFeature> X86_64V1Features = [FeatureX87, FeatureCX8,
+ FeatureCMOV, FeatureMMX,
+ FeatureSSE2, FeatureFXSR,
+ FeatureNOPL, FeatureX86_64,
];
list<SubtargetFeature> X86_64V1Tuning = [
TuningMacroFusion,
@@ -1179,6 +1180,7 @@ def ProcessorFeatures {
FeatureAVXNECONVERT,
FeatureAVXVNNIINT8,
FeatureAVXVNNIINT16,
+ FeatureUSERMSR,
FeatureSHA512,
FeatureSM3,
FeatureEGPR,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1ebfd5defdc40..401c1953323f4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "X86ISelLowering.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
#include "MCTargetDesc/X86ShuffleDecode.h"
#include "X86.h"
#include "X86FrameLowering.h"
@@ -30,6 +31,8 @@
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -49,6 +52,7 @@
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/PatternMatch.h"
@@ -489,6 +493,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// X86 wants to expand cmov itself.
for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::CT_SELECT, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
@@ -497,6 +502,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::CT_SELECT, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
}
@@ -504,6 +510,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// Custom action for SELECT MMX and expand action for SELECT_CC MMX
setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
+ setOperationAction(ISD::CT_SELECT, MVT::x86mmx, Custom);
setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
@@ -633,6 +640,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::BR_CC, VT, Action);
setOperationAction(ISD::SETCC, VT, Action);
setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::CT_SELECT, VT, Custom);
setOperationAction(ISD::SELECT_CC, VT, Action);
setOperationAction(ISD::FROUND, VT, Action);
setOperationAction(ISD::FROUNDEVEN, VT, Action);
@@ -1079,6 +1087,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
+ setOperationAction(ISD::CT_SELECT, MVT::v4f32, Custom);
setOperationAction(ISD::FCANONICALIZE, MVT::v4f32, Custom);
setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
@@ -1247,6 +1256,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SELECT, MVT::v8f16, Custom);
setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
+ setOperationAction(ISD::CT_SELECT, MVT::v2f64, Custom);
+ setOperationAction(ISD::CT_SELECT, MVT::v2i64, Custom);
+ setOperationAction(ISD::CT_SELECT, MVT::v4i32, Custom);
+ setOperationAction(ISD::CT_SELECT, MVT::v8i16, Custom);
+ setOperationAction(ISD::CT_SELECT, MVT::v8f16, Custom);
+ setOperationAction(ISD::CT_SELECT, MVT::v16i8, Custom);
+
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
@@ -1576,6 +1592,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
+ setOperationAction(ISD::CT_SELECT, MVT::v4f64, Custom);
+ setOperationAction(ISD::CT_SELECT, MVT::v4i64, Custom);
+ setOperationAction(ISD::CT_SELECT, MVT::v8i32, Custom);
+ setOperationAction(ISD::CT_SELECT, MVT::v16i16, Custom);
+ setOperationAction(ISD::CT_SELECT, MVT::v16f16, Custom);
+ setOperationAction(ISD::CT_SELECT, MVT::v32i8, Custom);
+ setOperationAction(ISD::CT_SELECT, MVT::v8f32, Custom);
+
for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
@@ -1775,6 +1799,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
+ setOperationAction(ISD::CT_SELECT, MVT::v1i1, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
@@ -1820,6 +1845,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::CT_SELECT, VT, Custom);
setOperationAction(ISD::TRUNCATE, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
@@ -2099,6 +2125,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::CT_SELECT, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
@@ -2295,6 +2322,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::CT_SELECT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
@@ -2361,6 +2389,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::VSELECT, VT, Legal);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::CT_SELECT, VT, Custom);
setOperationAction(ISD::FNEG, VT, Custom);
setOperationAction(ISD::FABS, VT, Custom);
@@ -2630,6 +2659,22 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::x86amx, &X86::TILERegClass);
}
+ // Handle 512-bit vector CT_SELECT without AVX512 by setting them to Expand
+ // This allows type legalization to split them into smaller vectors
+ for (auto VT : {MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64, MVT::v32f16,
+ MVT::v16f32, MVT::v8f64}) {
+ setOperationAction(ISD::CT_SELECT, VT, Expand);
+ }
+
+ // Handle 256-bit vector CT_SELECT without AVX by setting them to Expand
+ // This allows type legalization to split them into 128-bit vectors
+ if (!Subtarget.hasAVX()) {
+ for (auto VT : {MVT::v4f64, MVT::v4i64, MVT::v8i32, MVT::v16i16,
+ MVT::v16f16, MVT::v32i8, MVT::v8f32}) {
+ setOperationAction(ISD::CT_SELECT, VT, Expand);
+ }
+ }
+
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
@@ -2736,6 +2781,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
ISD::BITCAST,
ISD::VSELECT,
ISD::SELECT,
+ ISD::CT_SELECT,
ISD::SHL,
ISD::SRA,
ISD::SRL,
@@ -25962,6 +26008,174 @@ static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl,
return V;
}
+SDValue X86TargetLowering::LowerCT_SELECT(SDValue Op, SelectionDAG &DAG) const {
+ SDValue Cond = Op.getOperand(0); // condition
+ SDValue TrueOp = Op.getOperand(1); // true_value
+ SDValue FalseOp = Op.getOperand(2); // false_value
+ SDLoc DL(Op);
+ MVT VT = TrueOp.getSimpleValueType();
+
+  // Special handling for i386 targets (no CMOV) - route to post-RA expansion
+  // pseudos. Let standard type legalization handle i64 automatically (splits
+  // into EDX:EAX).
+
+ // Handle soft float16 by converting to integer operations
+ if (isSoftF16(VT, Subtarget)) {
+ MVT NVT = VT.changeTypeToInteger();
+ SDValue CtSelect =
+ DAG.getNode(ISD::CT_SELECT, DL, NVT, Cond, DAG.getBitcast(NVT, FalseOp),
+ DAG.getBitcast(NVT, TrueOp));
+ return DAG.getBitcast(VT, CtSelect);
+ }
+
+ // Handle vector types
+ if (VT.isVector()) {
+ // Handle soft float16 vectors
+ if (isSoftF16(VT, Subtarget)) {
+ MVT NVT = VT.changeVectorElementTypeToInteger();
+ SDValue CtSelect = DAG.getNode(ISD::CT_SELECT, DL, NVT, Cond,
+ DAG.getBitcast(NVT, FalseOp),
+ DAG.getBitcast(NVT, TrueOp));
+ return DAG.getBitcast(VT, CtSelect);
+ }
+
+ unsigned VectorWidth = VT.getSizeInBits();
+ MVT EltVT = VT.getVectorElementType();
+
+ // 512-bit vectors without AVX512 are now handled by type legalization
+ // (Expand action) 256-bit vectors without AVX are now handled by type
+ // legalization (Expand action)
+
+ if (VectorWidth == 128 && !Subtarget.hasSSE1())
+ return SDValue();
+
+ // Handle special cases for floating point vectors
+ if (EltVT.isFloatingPoint()) {
+ // For vector floating point with AVX, use VBLENDV-style operations
+ if (Subtarget.hasAVX() && (VectorWidth == 256 || VectorWidth == 128)) {
+ // Convert to bitwise operations using the condition
+ MVT IntVT = VT.changeVectorElementTypeToInteger();
+ SDValue IntOp1 = DAG.getBitcast(IntVT, TrueOp);
+ SDValue IntOp2 = DAG.getBitcast(IntVT, FalseOp);
+
+ // Create the CT_SELECT node with integer types
+ SDValue IntResult =
+ DAG.getNode(X86ISD::CT_SELECT, DL, IntVT, IntOp2, IntOp1,
+ DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8),
+ EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget));
+ return DAG.getBitcast(VT, IntResult);
+ }
+ }
+
+ // For integer vectors or when we don't have advanced SIMD support,
+ // use the generic X86 CT_SELECT node which will be matched by the patterns
+ SDValue CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
+ SDValue EFLAGS = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
+ // Create the X86 CT_SELECT node - note operand order: true, false, cc, flags
+ return DAG.getNode(X86ISD::CT_SELECT, DL, VT, FalseOp, TrueOp, CC, EFLAGS);
+ }
+
+ // Look past (and (setcc_carry (cmp ...)), 1)
+ if (Cond.getOpcode() == ISD::AND &&
+ Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
+ isOneConstant(Cond.getOperand(1)))
+ Cond = Cond.getOperand(0);
+
+ /// Process condition flags and prepare for CT_SELECT node creation
+ auto ProcessConditionFlags =
+ [&](SDValue Cond, MVT VT, SDLoc DL, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) -> std::pair<SDValue, SDValue> {
+ SDValue CC;
+ bool AddTest = true;
+
+ unsigned CondOpcode = Cond.getOpcode();
+ if (CondOpcode == X86ISD::SETCC || CondOpcode == X86ISD::SETCC_CARRY) {
+ CC = Cond.getOperand(0);
+ SDValue Cmp = Cond.getOperand(1);
+
+ if ((isX86LogicalCmp(Cmp)) || Cmp.getOpcode() == X86ISD::BT) {
+ Cond = Cmp;
+ AddTest = false;
+ }
+ } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
+ CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
+ CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
+ SDValue Value;
+ X86::CondCode X86Cond;
+ std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
+ CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
+ AddTest = false;
+ }
+
+ if (AddTest) {
+ // Look past the truncate if the high bits are known zero
+ if (isTruncWithZeroHighBitsInput(Cond, DAG))
+ Cond = Cond.getOperand(0);
+
+ // Try to match AND to BT instruction
+ if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
+ X86::CondCode X86CondCode;
+ if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
+ CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
+ Cond = BT;
+ AddTest = false;
+ }
+ }
+ }
+
+ if (AddTest) {
+ CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
+ Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
+ }
+
+ return {CC, Cond};
+ };
+
+ // Process condition flags and prepare for CT_SELECT
+ auto [CC, ProcessedCond] =
+ ProcessConditionFlags(Cond, VT, DL, DAG, Subtarget);
+
+ // Handle i8 CT_SELECT with truncate optimization
+ if (Op.getValueType() == MVT::i8 && TrueOp.getOpcode() == ISD::TRUNCATE &&
+ FalseOp.getOpcode() == ISD::TRUNCATE) {
+ SDValue T1 = TrueOp.getOperand(0), T2 = FalseOp.getOperand(0);
+ if (T1.getValueType() == T2.getValueType() &&
+ T1.getOpcode() != ISD::CopyFromReg &&
+ T2.getOpcode() != ISD::CopyFromReg) {
+ SDValue CtSelect = DAG.getNode(X86ISD::CT_SELECT, DL, T1.getValueType(),
+ T2, T1, CC, ProcessedCond);
+ return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), CtSelect);
+ }
+ }
+
+ // Promote small integer types to avoid partial register stalls
+ // Exception: For i8 without CMOV, we can generate a shorter instruction
+ // sequence without movzx so keep it as is.
+ if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMOV()) ||
+ (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(TrueOp, Subtarget) &&
+ !X86::mayFoldLoad(FalseOp, Subtarget))) {
+ TrueOp = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, TrueOp);
+ FalseOp = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, FalseOp);
+ SDValue Ops[] = {FalseOp, TrueOp, CC, ProcessedCond};
+ SDValue CtSelect = DAG.getNode(X86ISD::CT_SELECT, DL, MVT::i32, Ops);
+ return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), CtSelect);
+ }
+
+ if (isScalarFPTypeInSSEReg(VT)) {
+ MVT IntVT = (VT == MVT::f32) ? MVT::i32 : MVT::i64;
+ TrueOp = DAG.getBitcast(IntVT, TrueOp);
+ FalseOp = DAG.getBitcast(IntVT, FalseOp);
+ SDValue Ops[] = {FalseOp, TrueOp, CC, ProcessedCond};
+ SDValue CtSelect = DAG.getNode(X86ISD::CT_SELECT, DL, IntVT, Ops);
+ return DAG.getBitcast(VT, CtSelect);
+ }
+
+ // Create final CT_SELECT node
+ SDValue Ops[] = {FalseOp, TrueOp, CC, ProcessedCond};
+ return DAG.getNode(X86ISD::CT_SELECT, DL, Op.getValueType(), Ops,
+ Op->getFlags());
+}
+
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue In = Op->getOperand(0);
@@ -30251,30 +30465,65 @@ static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
const X86Subtarget &Subtarget,
SelectionDAG &DAG,
SDValue *Low = nullptr) {
+ unsigned NumElts = VT.getVectorNumElements();
+
// For vXi8 we will unpack the low and high half of each 128 bit lane to widen
// to a vXi16 type. Do the multiplies, shift the results and pack the half
// lane results back together.
// We'll take different approaches for signed and unsigned.
- // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes to
- // words and use pmullw to calculate the full 16-bit product.
+  // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes to
+  // words and use pmullw to calculate the full 16-bit product.
// For signed we'll use punpcklbw/punpckbw to extend the bytes to words and
// shift them left into the upper byte of each word. This allows us to use
// pmulhw to calculate the full 16-bit product. This trick means we don't
// need to sign extend the bytes to use pmullw.
- MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
+
+ MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
SDValue Zero = DAG.getConstant(0, dl, VT);
- SDValue ALo, AHi, BLo, BHi;
+ SDValue ALo, AHi;
if (IsSigned) {
ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
- BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
- BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
} else {
ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
- BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
+ }
+
+ SDValue BLo, BHi;
+ if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
+ // If the RHS is a constant, manually unpackl/unpackh and extend.
+ SmallVector<SDValue, 16> LoOps, HiOps;
+ for (unsigned i = 0; i != NumElts; i += 16) {
+ for (unsigned j = 0; j != 8; ++j) {
+ SDValue LoOp = B.getOperand(i + j);
+ SDValue HiOp = B.getOperand(i + j + 8);
+
+ if (IsSigned) {
+ LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
+ HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
+ LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
+ DAG.getConstant(8, dl, MVT::i16));
+ HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
+ DAG.getConstant(8, dl, MVT::i16));
+ } else {
+ LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
+ HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
+ }
+
+ LoOps.push_back(LoOp);
+ HiOps.push_back(HiOp);
+ }
+ }
+
+ BLo = DAG.getBuildVector(ExVT, dl, LoOps);
+ BHi = DAG.getBuildVector(ExVT, dl, HiOps);
+ } else if (IsSigned) {
+ BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
+ BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
+ } else {
+ BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
}
@@ -30287,7 +30536,7 @@ static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
if (Low)
*Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
- return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf=*/true);
+ return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
}
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
@@ -34174,6 +34423,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
case ISD::SELECT: return LowerSELECT(Op, DAG);
+ case ISD::CT_SELECT: return LowerCT_SELECT(Op, DAG);
case ISD::COND_LOOP:
case ISD::BRCOND: return LowerConditionalBranch(Op, DAG);
case ISD::JumpTable: return LowerJumpTable(Op, DAG);
@@ -34258,6 +34508,12 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
}
}
+bool X86TargetLowering::isSelectSupported(SelectSupportKind Kind) const {
+ if (Kind == SelectSupportKind::CtSelect) {
+ return true;
+ }
+ return TargetLoweringBase::isSelectSupported(Kind);
+}
/// Replace a node with an illegal result type with a new node built out of
/// custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
@@ -35797,6 +36053,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(STRICT_CMPM)
NODE_NAME_CASE(CMPMM_SAE)
NODE_NAME_CASE(SETCC)
+ NODE_NAME_CASE(CT_SELECT)
NODE_NAME_CASE(SETCC_CARRY)
NODE_NAME_CASE(FSETCC)
NODE_NAME_CASE(FSETCCM)
@@ -38578,6 +38835,480 @@ X86TargetLowering::emitPatchableEventCall(MachineInstr &MI,
return BB;
}
+/// Helper function to emit i386 CT_SELECT with condition materialization.
+/// This converts EFLAGS-based CT_SELECT into a condition byte that can be
+/// shared across multiple operations (critical for i64 type legalization).
+///
+/// Phase 1: Materialize condition byte from EFLAGS using SETCC
+/// Phase 2: Create internal pseudo with condition byte for post-RA expansion
+///
+/// This approach ensures that when i64 is type-legalized into two i32
+/// operations, both operations share the same condition byte rather than
+/// each independently reading (and destroying) EFLAGS.
+static MachineBasicBlock *
+emitCTSelectI386WithConditionMaterialization(MachineInstr &MI,
+ MachineBasicBlock *BB,
+ unsigned InternalPseudoOpcode) {
+ const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
+ const MIMetadata MIMD(MI);
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ // Original pseudo operands: (outs dst), (ins src1, src2, cond)
+ Register Src1Reg = MI.getOperand(1).getReg();
+ Register Src2Reg = MI.getOperand(2).getReg();
+ X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(3).getImm());
+
+ // Get opposite condition (SETCC sets to 1 when condition is TRUE,
+ // but we want to select src1 when condition is FALSE for X86 semantics)
+ X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
+
+ // Step 1: Materialize condition byte from EFLAGS
+ // This is done OUTSIDE the constant-time bundle, before any EFLAGS corruption
+ Register CondByteReg = MRI.createVirtualRegister(&X86::GR8RegClass);
+ BuildMI(*BB, MI, MIMD, TII->get(X86::SETCCr), CondByteReg).addImm(OppCC);
+
+ // Step 2: Create internal pseudo that takes condition byte as input
+ // This pseudo will be expanded post-RA into the actual constant-time bundle
+ // The condition byte can now be safely shared between multiple pseudos
+
+ // Internal pseudo has operands: (outs dst, tmp_byte, tmp_mask), (ins src1,
+ // src2, cond_byte)
+ Register DstReg = MI.getOperand(0).getReg();
+
+ // Create virtual registers for the temporary outputs
+ Register TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass);
+ Register TmpMaskReg;
+
+ // Determine the register class for tmp_mask based on the data type
+ if (InternalPseudoOpcode == X86::CT_SELECT_I386_INT_GR8rr) {
+ TmpMaskReg = MRI.createVirtualRegister(&X86::GR8RegClass);
+ } else if (InternalPseudoOpcode == X86::CT_SELECT_I386_INT_GR16rr) {
+ TmpMaskReg = MRI.createVirtualRegister(&X86::GR16RegClass);
+ } else if (InternalPseudoOpcode == X86::CT_SELECT_I386_INT_GR32rr) {
+ TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+ } else {
+ llvm_unreachable("Unknown internal pseudo opcode");
+ }
+
+ BuildMI(*BB, MI, MIMD, TII->get(InternalPseudoOpcode))
+ .addDef(DstReg) // dst (output)
+ .addDef(TmpByteReg) // tmp_byte (output)
+ .addDef(TmpMaskReg) // tmp_mask (output)
+ .addReg(Src1Reg) // src1 (input)
+ .addReg(Src2Reg) // src2 (input)
+ .addReg(CondByteReg); // pre-materialized condition byte (input)
+
+ MI.eraseFromParent();
+ return BB;
+}
+
+// Helper structure to hold memory operand information for FP loads
+struct FPLoadMemOperands {
+ bool IsValid = false;
+ unsigned BaseReg = 0;
+ int64_t ScaleVal = 1;
+ unsigned IndexReg = 0;
+ int64_t Disp = 0;
+ unsigned SegReg = 0;
+ int FrameIndex = -1;
+ bool IsFrameIndex = false;
+ int ConstantPoolIndex = -1;
+ bool IsConstantPool = false;
+ const GlobalValue *Global = nullptr;
+ int64_t GlobalOffset = 0;
+ bool IsGlobal = false;
+};
+
+// Check if a virtual register is defined by a simple FP load instruction
+// Returns the memory operands if it's a simple load, otherwise returns invalid
+static FPLoadMemOperands getFPLoadMemOperands(Register Reg,
+ MachineRegisterInfo &MRI,
+ unsigned ExpectedLoadOpcode) {
+ FPLoadMemOperands Result;
+
+ if (!Reg.isVirtual())
+ return Result;
+
+ MachineInstr *DefMI = MRI.getVRegDef(Reg);
+ if (!DefMI)
+ return Result;
+
+ // Check if it's the expected load opcode (e.g., LD_Fp32m, LD_Fp64m, LD_Fp80m)
+ if (DefMI->getOpcode() != ExpectedLoadOpcode)
+ return Result;
+
+ // Check that this is a simple load - not volatile, not atomic, etc.
+ // FP loads have hasSideEffects = 0 in their definition for simple loads
+ if (DefMI->hasOrderedMemoryRef())
+ return Result;
+
+ // The load should have a single def (the destination register) and memory operands
+ // Format: %reg = LD_Fpxxm <fi#N>, 1, %noreg, 0, %noreg
+ // or: %reg = LD_Fpxxm %base, scale, %index, disp, %segment
+ if (DefMI->getNumOperands() < 6)
+ return Result;
+
+ // Operand 0 is the destination, operands 1-5 are the memory reference
+ MachineOperand &BaseMO = DefMI->getOperand(1);
+ MachineOperand &ScaleMO = DefMI->getOperand(2);
+ MachineOperand &IndexMO = DefMI->getOperand(3);
+ MachineOperand &DispMO = DefMI->getOperand(4);
+ MachineOperand &SegMO = DefMI->getOperand(5);
+
+ // Check if this is a frame index load
+ if (BaseMO.isFI()) {
+ Result.IsValid = true;
+ Result.IsFrameIndex = true;
+ Result.FrameIndex = BaseMO.getIndex();
+ Result.ScaleVal = ScaleMO.getImm();
+ Result.IndexReg = IndexMO.getReg();
+ Result.Disp = DispMO.getImm();
+ Result.SegReg = SegMO.getReg();
+ return Result;
+ }
+
+ // Check if this is a constant pool load
+ // Format: %reg = LD_Fpxxm $noreg, 1, $noreg, %const.N, $noreg
+ if (BaseMO.isReg() && BaseMO.getReg() == X86::NoRegister &&
+ ScaleMO.isImm() && IndexMO.isReg() &&
+ IndexMO.getReg() == X86::NoRegister &&
+ DispMO.isCPI() && SegMO.isReg()) {
+ Result.IsValid = true;
+ Result.IsConstantPool = true;
+ Result.ConstantPoolIndex = DispMO.getIndex();
+ Result.ScaleVal = ScaleMO.getImm();
+ Result.IndexReg = IndexMO.getReg();
+ Result.Disp = 0;
+ Result.SegReg = SegMO.getReg();
+ return Result;
+ }
+
+ // Check if this is a global variable load
+ // Format: %reg = LD_Fpxxm $noreg, 1, $noreg, @global_name, $noreg
+ if (BaseMO.isReg() && BaseMO.getReg() == X86::NoRegister &&
+ ScaleMO.isImm() && IndexMO.isReg() &&
+ IndexMO.getReg() == X86::NoRegister &&
+ DispMO.isGlobal() && SegMO.isReg()) {
+ Result.IsValid = true;
+ Result.IsGlobal = true;
+ Result.Global = DispMO.getGlobal();
+ Result.GlobalOffset = DispMO.getOffset();
+ Result.ScaleVal = ScaleMO.getImm();
+ Result.IndexReg = IndexMO.getReg();
+ Result.Disp = 0;
+ Result.SegReg = SegMO.getReg();
+ return Result;
+ }
+
+ // Regular memory operands (e.g., pointer loads)
+ if (BaseMO.isReg() && ScaleMO.isImm() && IndexMO.isReg() &&
+ DispMO.isImm() && SegMO.isReg()) {
+ Result.IsValid = true;
+ Result.IsFrameIndex = false;
+ Result.IsConstantPool = false;
+ Result.BaseReg = BaseMO.getReg();
+ Result.ScaleVal = ScaleMO.getImm();
+ Result.IndexReg = IndexMO.getReg();
+ Result.Disp = DispMO.getImm();
+ Result.SegReg = SegMO.getReg();
+ return Result;
+ }
+
+ return Result;
+}
+
+static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI,
+ MachineBasicBlock *BB,
+ unsigned pseudoInstr) {
+ const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
+ const MIMetadata MIMD(MI);
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MachineFrameInfo &MFI = MF->getFrameInfo();
+ unsigned RegSizeInByte = 4;
+
+ // Get operands
+ // MI operands: %result:rfp80 = CT_SELECT_I386 %false:rfp80, %true:rfp80, %cond:i8imm
+ unsigned DestReg = MI.getOperand(0).getReg();
+ unsigned FalseReg = MI.getOperand(1).getReg();
+ unsigned TrueReg = MI.getOperand(2).getReg();
+ X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(3).getImm());
+ X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
+
+ // Materialize condition byte from EFLAGS
+ Register CondByteReg = MRI.createVirtualRegister(&X86::GR8RegClass);
+ BuildMI(*BB, MI, MIMD, TII->get(X86::SETCCr), CondByteReg).addImm(OppCC);
+
+ auto storeFpToSlot = [&](unsigned Opcode, int Slot, Register Reg) {
+ addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(Opcode)), Slot)
+ .addReg(Reg, RegState::Kill);
+ };
+
+ // Helper to load integer from memory operands
+ auto loadIntFromMemOperands = [&](const FPLoadMemOperands &MemOps,
+ unsigned Offset) -> unsigned {
+ unsigned IntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), IntReg);
+
+ if (MemOps.IsFrameIndex) {
+ // Frame index: addFrameIndex + scale + index + disp + segment
+ MIB.addFrameIndex(MemOps.FrameIndex)
+ .addImm(MemOps.ScaleVal)
+ .addReg(MemOps.IndexReg)
+ .addImm(MemOps.Disp + Offset)
+ .addReg(MemOps.SegReg);
+ } else if (MemOps.IsConstantPool) {
+ // Constant pool: base_reg + scale + index + CP_index + segment
+ // MOV32rm format: base, scale, index, displacement, segment
+ MIB.addReg(X86::NoRegister) // Base register
+ .addImm(MemOps.ScaleVal) // Scale
+ .addReg(MemOps.IndexReg) // Index register
+ .addConstantPoolIndex(MemOps.ConstantPoolIndex, Offset) // Displacement (CP index)
+ .addReg(MemOps.SegReg); // Segment
+ } else if (MemOps.IsGlobal) {
+ // Global variable: base_reg + scale + index + global + segment
+ // MOV32rm format: base, scale, index, displacement, segment
+ MIB.addReg(X86::NoRegister) // Base register
+ .addImm(MemOps.ScaleVal) // Scale
+ .addReg(MemOps.IndexReg) // Index register
+ .addGlobalAddress(MemOps.Global, MemOps.GlobalOffset + Offset) // Displacement (global address)
+ .addReg(MemOps.SegReg); // Segment
+ } else {
+ // Regular memory: base_reg + scale + index + disp + segment
+ MIB.addReg(MemOps.BaseReg)
+ .addImm(MemOps.ScaleVal)
+ .addReg(MemOps.IndexReg)
+ .addImm(MemOps.Disp + Offset)
+ .addReg(MemOps.SegReg);
+ }
+
+ return IntReg;
+ };
+
+ // Optimized path: load integers directly from memory when both operands are
+ // memory loads, avoiding FP register round-trip
+ auto emitCtSelectFromMemory = [&](unsigned NumValues,
+ const FPLoadMemOperands &TrueMemOps,
+ const FPLoadMemOperands &FalseMemOps,
+ int ResultSlot) {
+ for (unsigned Val = 0; Val < NumValues; ++Val) {
+ unsigned Offset = Val * RegSizeInByte;
+
+ // Load true and false values directly from their memory locations as integers
+ unsigned TrueIntReg = loadIntFromMemOperands(TrueMemOps, Offset);
+ unsigned FalseIntReg = loadIntFromMemOperands(FalseMemOps, Offset);
+
+ // Use CT_SELECT_I386_INT_GR32 pseudo instruction for constant-time selection
+ unsigned ResultIntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+ unsigned TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass);
+ unsigned TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+
+ BuildMI(*BB, MI, MIMD, TII->get(X86::CT_SELECT_I386_INT_GR32rr))
+ .addDef(ResultIntReg) // dst (output)
+ .addDef(TmpByteReg) // tmp_byte (output)
+ .addDef(TmpMaskReg) // tmp_mask (output)
+ .addReg(FalseIntReg) // src1 (input) - false value
+ .addReg(TrueIntReg) // src2 (input) - true value
+ .addReg(CondByteReg); // pre-materialized condition byte (input)
+
+ // Store result back to result slot
+ BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32mr))
+ .addFrameIndex(ResultSlot)
+ .addImm(1)
+ .addReg(0)
+ .addImm(Offset)
+ .addReg(0)
+ .addReg(ResultIntReg, RegState::Kill);
+ }
+ };
+
+ auto emitCtSelectWithPseudo = [&](unsigned NumValues, int TrueSlot, int FalseSlot, int ResultSlot) {
+ for (unsigned Val = 0; Val < NumValues; ++Val) {
+ unsigned Offset = Val * RegSizeInByte;
+
+ // Load true and false values from stack as 32-bit integers
+ unsigned TrueIntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+ BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), TrueIntReg)
+ .addFrameIndex(TrueSlot)
+ .addImm(1)
+ .addReg(0)
+ .addImm(Offset)
+ .addReg(0);
+
+ unsigned FalseIntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+ BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), FalseIntReg)
+ .addFrameIndex(FalseSlot)
+ .addImm(1)
+ .addReg(0)
+ .addImm(Offset)
+ .addReg(0);
+
+ // Use CT_SELECT_I386_INT_GR32 pseudo instruction for constant-time selection
+ unsigned ResultIntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+ unsigned TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass);
+ unsigned TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+
+ BuildMI(*BB, MI, MIMD, TII->get(X86::CT_SELECT_I386_INT_GR32rr))
+ .addDef(ResultIntReg) // dst (output)
+ .addDef(TmpByteReg) // tmp_byte (output)
+ .addDef(TmpMaskReg) // tmp_mask (output)
+ .addReg(FalseIntReg) // src1 (input) - false value
+ .addReg(TrueIntReg) // src2 (input) - true value
+ .addReg(CondByteReg); // pre-materialized condition byte (input)
+
+ // Store result back to result slot
+ BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32mr))
+ .addFrameIndex(ResultSlot)
+ .addImm(1)
+ .addReg(0)
+ .addImm(Offset)
+ .addReg(0)
+ .addReg(ResultIntReg, RegState::Kill);
+ }
+ };
+
+ switch (pseudoInstr) {
+ case X86::CT_SELECT_I386_FP32rr: {
+ // Check if both operands are simple memory loads
+ FPLoadMemOperands TrueMemOps =
+ getFPLoadMemOperands(TrueReg, MRI, X86::LD_Fp32m);
+ FPLoadMemOperands FalseMemOps =
+ getFPLoadMemOperands(FalseReg, MRI, X86::LD_Fp32m);
+
+ int ResultSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false);
+
+ if (TrueMemOps.IsValid && FalseMemOps.IsValid) {
+ // Optimized path: load directly from memory as integers
+ // Works for both frame index loads (stack parameters) and
+ // constant pool loads (constants)
+ emitCtSelectFromMemory(1, TrueMemOps, FalseMemOps, ResultSlot);
+
+ // Erase the original FP load instructions since we're not using them
+ // and have loaded the data directly as integers instead
+ if (MRI.hasOneUse(TrueReg)) {
+ if (MachineInstr *TrueDefMI = MRI.getVRegDef(TrueReg))
+ TrueDefMI->eraseFromParent();
+ }
+ if (MRI.hasOneUse(FalseReg)) {
+ if (MachineInstr *FalseDefMI = MRI.getVRegDef(FalseReg))
+ FalseDefMI->eraseFromParent();
+ }
+ } else {
+ // General path: spill FP registers to stack first
+ int TrueSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false);
+ int FalseSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false);
+
+ storeFpToSlot(X86::ST_Fp32m, TrueSlot, TrueReg);
+ storeFpToSlot(X86::ST_Fp32m, FalseSlot, FalseReg);
+
+ emitCtSelectWithPseudo(1, TrueSlot, FalseSlot, ResultSlot);
+ }
+
+ // Load result back as f32
+ addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp32m), DestReg),
+ ResultSlot);
+ break;
+ }
+ case X86::CT_SELECT_I386_FP64rr: {
+ unsigned StackSlotSize = 8;
+
+ // Check if both operands are simple memory loads
+ FPLoadMemOperands TrueMemOps =
+ getFPLoadMemOperands(TrueReg, MRI, X86::LD_Fp64m);
+ FPLoadMemOperands FalseMemOps =
+ getFPLoadMemOperands(FalseReg, MRI, X86::LD_Fp64m);
+
+ int ResultSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false);
+
+ if (TrueMemOps.IsValid && FalseMemOps.IsValid) {
+ // Optimized path: load directly from memory as integers
+ // Works for both frame index loads (stack parameters) and
+ // constant pool loads (constants)
+ emitCtSelectFromMemory(StackSlotSize / RegSizeInByte, TrueMemOps,
+ FalseMemOps, ResultSlot);
+
+ // Erase the original FP load instructions since we're not using them
+ if (MRI.hasOneUse(TrueReg)) {
+ if (MachineInstr *TrueDefMI = MRI.getVRegDef(TrueReg))
+ TrueDefMI->eraseFromParent();
+ }
+ if (MRI.hasOneUse(FalseReg)) {
+ if (MachineInstr *FalseDefMI = MRI.getVRegDef(FalseReg))
+ FalseDefMI->eraseFromParent();
+ }
+ } else {
+ // General path: spill FP registers to stack first
+ int TrueSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false);
+ int FalseSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false);
+
+ storeFpToSlot(X86::ST_Fp64m, TrueSlot, TrueReg);
+ storeFpToSlot(X86::ST_Fp64m, FalseSlot, FalseReg);
+
+ emitCtSelectWithPseudo(StackSlotSize / RegSizeInByte, TrueSlot, FalseSlot,
+ ResultSlot);
+ }
+
+ // Load result back as f64
+ addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp64m), DestReg),
+ ResultSlot);
+ break;
+ }
+ case X86::CT_SELECT_I386_FP80rr: {
+ // f80 is 80 bits (10 bytes), but stored with 12-byte alignment
+ unsigned StackObjectSize = 12;
+
+ // Check if both operands are simple memory loads
+ FPLoadMemOperands TrueMemOps =
+ getFPLoadMemOperands(TrueReg, MRI, X86::LD_Fp80m);
+ FPLoadMemOperands FalseMemOps =
+ getFPLoadMemOperands(FalseReg, MRI, X86::LD_Fp80m);
+
+ int ResultSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false);
+
+ if (TrueMemOps.IsValid && FalseMemOps.IsValid) {
+ // Optimized path: load directly from memory as integers
+ // Works for both frame index loads (stack parameters) and
+ // constant pool loads (constants)
+ emitCtSelectFromMemory(StackObjectSize / RegSizeInByte, TrueMemOps,
+ FalseMemOps, ResultSlot);
+
+ // Erase the original FP load instructions since we're not using them
+ if (MRI.hasOneUse(TrueReg)) {
+ if (MachineInstr *TrueDefMI = MRI.getVRegDef(TrueReg))
+ TrueDefMI->eraseFromParent();
+ }
+ if (MRI.hasOneUse(FalseReg)) {
+ if (MachineInstr *FalseDefMI = MRI.getVRegDef(FalseReg))
+ FalseDefMI->eraseFromParent();
+ }
+ } else {
+ // General path: spill FP registers to stack first
+ int TrueSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false);
+ int FalseSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false);
+
+ storeFpToSlot(X86::ST_FpP80m, TrueSlot, TrueReg);
+ storeFpToSlot(X86::ST_FpP80m, FalseSlot, FalseReg);
+
+ emitCtSelectWithPseudo(StackObjectSize / RegSizeInByte, TrueSlot,
+ FalseSlot, ResultSlot);
+ }
+
+ // Load result back as f80
+ addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp80m), DestReg),
+ ResultSlot);
+ break;
+ }
+ default:
+ llvm_unreachable("Invalid CT_SELECT opcode");
+ }
+
+ MI.eraseFromParent();
+
+ return BB;
+}
+
MachineBasicBlock *
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
@@ -38635,6 +39366,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case X86::CMOV_VK64:
return EmitLoweredSelect(MI, BB);
+ case X86::CT_SELECT_I386_GR8rr:
+ return emitCTSelectI386WithConditionMaterialization(
+ MI, BB, X86::CT_SELECT_I386_INT_GR8rr);
+
+ case X86::CT_SELECT_I386_GR16rr:
+ return emitCTSelectI386WithConditionMaterialization(
+ MI, BB, X86::CT_SELECT_I386_INT_GR16rr);
+
+ case X86::CT_SELECT_I386_GR32rr:
+ return emitCTSelectI386WithConditionMaterialization(
+ MI, BB, X86::CT_SELECT_I386_INT_GR32rr);
+
+ case X86::CT_SELECT_I386_FP32rr:
+ return emitCTSelectI386WithFpType(MI, BB, X86::CT_SELECT_I386_FP32rr);
+ case X86::CT_SELECT_I386_FP64rr:
+ return emitCTSelectI386WithFpType(MI, BB, X86::CT_SELECT_I386_FP64rr);
+ case X86::CT_SELECT_I386_FP80rr:
+ return emitCTSelectI386WithFpType(MI, BB, X86::CT_SELECT_I386_FP80rr);
+
case X86::FP80_ADDr:
case X86::FP80_ADDm32: {
// Change the floating point control register to use double extended
@@ -42653,7 +43403,7 @@ static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
return SDValue();
- Imm = llvm::rotl<uint8_t>(Imm, 4);
+ Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
DAG.getTargetConstant(Imm, DL, MVT::i8));
};
@@ -45699,16 +46449,10 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
}
case X86ISD::PCMPGT:
// icmp sgt(0, R) == ashr(R, BitWidth-1).
- if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode())) {
- // iff we only need the signbit then we can use R directly.
- if (OriginalDemandedBits.isSignMask())
- return TLO.CombineTo(Op, Op.getOperand(1));
- // otherwise we just need R's signbit for the comparison.
- APInt SignMask = APInt::getSignMask(BitWidth);
- if (SimplifyDemandedBits(Op.getOperand(1), SignMask, OriginalDemandedElts,
- Known, TLO, Depth + 1))
- return true;
- }
+ // iff we only need the sign bit then we can use R directly.
+ if (OriginalDemandedBits.isSignMask() &&
+ ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
+ return TLO.CombineTo(Op, Op.getOperand(1));
break;
case X86ISD::MOVMSK: {
SDValue Src = Op.getOperand(0);
@@ -48657,15 +49401,6 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
DL, DAG, Subtarget))
return V;
- // If the sign bit is known then BLENDV can be folded away.
- if (N->getOpcode() == X86ISD::BLENDV) {
- KnownBits KnownCond = DAG.computeKnownBits(Cond);
- if (KnownCond.isNegative())
- return LHS;
- if (KnownCond.isNonNegative())
- return RHS;
- }
-
if (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV) {
SmallVector<int, 64> CondMask;
if (createShuffleMaskFromVSELECT(CondMask, Cond,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index fc16053caa705..c8d8f19e5cced 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -118,6 +118,10 @@ namespace llvm {
/// X86 Select
SELECTS,
+ /// X86 Constant-time Select, implemented with CMOV instruction. This is
+ /// used to implement constant-time select.
+ CT_SELECT,
+
// Same as SETCC except it's materialized with a sbb and the value is all
// one's or all zero's.
SETCC_CARRY, // R = carry_bit ? ~0 : 0
@@ -1173,6 +1177,8 @@ namespace llvm {
///
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+ bool isSelectSupported(SelectSupportKind Kind) const override;
+
/// Replace the results of node with an illegal result
/// type with new values built out of custom code.
///
@@ -1803,6 +1809,7 @@ namespace llvm {
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerCT_SELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerConditionalBranch(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/llvm/lib/Target/X86/X86InstrCMovSetCC.td
index 77a9c7a1f585f..6081be4a30e26 100644
--- a/llvm/lib/Target/X86/X86InstrCMovSetCC.td
+++ b/llvm/lib/Target/X86/X86InstrCMovSetCC.td
@@ -106,6 +106,211 @@ let Predicates = [HasCMOV, HasNDD] in {
def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, timm:$cond, EFLAGS),
(CMOV64rm_ND GR64:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>;
}
+
+// Create pseudo instruction and do the pattern matching to them.
+// We use a machine pass to lower these pseudos into cmov, in order
+// to avoid backend optimizations
+let Uses = [EFLAGS], isNotDuplicable = 1, isPseudo = 1 in {
+
+ // Generates the rr and rm CT_SELECT pseudos for one GPR width. Both forms
+ // are gated on HasNativeCMOV: a real CMOV is required for this encoding.
+ multiclass CT_SELECT<X86TypeInfo t> {
+ // register-only
+ let isCommutable = 0, SchedRW = [WriteCMOV], Predicates = [HasNativeCMOV],
+ AsmString = "ctselect\\t$dst, $src1, $src2, $cond" in {
+ def rr : PseudoI<(outs t.RegClass:$dst),
+ (ins t.RegClass:$src1, t.RegClass:$src2, i8imm:$cond),
+ [(set t.RegClass:$dst, (X86ct_select t.RegClass:$src1, t.RegClass:$src2, timm:$cond, EFLAGS))]>;
+ }
+
+ // register-memory
+ let SchedRW = [WriteCMOV.Folded, WriteCMOV.ReadAfterFold], Predicates = [HasNativeCMOV],
+ AsmString = "ctselect\\t$dst, $src1, $src2, $cond" in {
+ def rm : PseudoI<(outs t.RegClass:$dst),
+ (ins t.RegClass:$src1, t.MemOperand:$src2, i8imm:$cond),
+ [(set t.RegClass:$dst, (X86ct_select t.RegClass:$src1, (t.LoadNode addr:$src2), timm:$cond, EFLAGS))]>;
+ }
+ }
+}
+
+let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in {
+ // Instantiate the 16/32/64-bit GPR pseudos; $dst is tied to $src1,
+ // matching the destructive two-operand form of CMOV.
+ let Constraints = "$dst = $src1" in {
+ defm CT_SELECT16 : CT_SELECT<Xi16>;
+ defm CT_SELECT32 : CT_SELECT<Xi32>;
+ defm CT_SELECT64 : CT_SELECT<Xi64>;
+ }
+}
+
+// CT_SELECT_VEC base class
+// Vector CT_SELECT pseudo: besides the result $dst it defines two scratch
+// outputs — $tmpx (vector mask temp) and $tmpg (scalar GPR mask temp) —
+// consumed by the expansion in X86InstrInfo::expandCtSelectVector.
+class CT_SELECT_VEC<RegisterClass VRc, RegisterClass GRc>
+ : PseudoI<
+ (outs VRc:$dst, VRc:$tmpx, GRc:$tmpg),
+ (ins VRc:$t, VRc:$f, i8imm:$cond),
+ []
+ > {
+ let Uses = [EFLAGS];
+ let isPseudo = 1;
+ let isNotDuplicable = 1;
+ let hasSideEffects = 1;
+ let AsmString = "ctselect\t$dst, $f, $t, $cond";
+ let SchedRW = [];
+}
+
+// Width-specific class aliases
+// Bind the vector register class per width; the scalar mask temp is always
+// a GR32.
+class CT_SELECT_VEC128 : CT_SELECT_VEC<VR128, GR32>;
+class CT_SELECT_VEC128X : CT_SELECT_VEC<VR128X, GR32>;
+class CT_SELECT_VEC256 : CT_SELECT_VEC<VR256, GR32>;
+class CT_SELECT_VEC512 : CT_SELECT_VEC<VR512, GR32>;
+
+
+//===----------------------------------------------------------------------===//
+// 128-bit pseudos (SSE2 baseline; we use PXOR/PAND/MOVD/PSHUFD in the expander)
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasSSE1] in {
+
+  // All outs are earlyclobber so the register allocator never assigns an
+  // output to a register that still holds one of the inputs.
+  // (Repaired list-archive mangling of "," + "@" into ", at ".)
+  def CT_SELECT_V4F32 : CT_SELECT_VEC128 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+}
+
+let Predicates = [HasSSE2] in {
+
+  // 128-bit pseudos. All outs are earlyclobber so the register allocator
+  // never assigns an output to a register still holding an input.
+  // (Repaired list-archive mangling of "," + "@" into ", at ".)
+  def CT_SELECT_V2F64 : CT_SELECT_VEC128 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CT_SELECT_V4I32 : CT_SELECT_VEC128 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CT_SELECT_V2I64 : CT_SELECT_VEC128 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CT_SELECT_V8I16 : CT_SELECT_VEC128 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CT_SELECT_V16I8 : CT_SELECT_VEC128 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+
+  // v8f16: only selected on builds where the type is legal.
+  def CT_SELECT_V8F16 : CT_SELECT_VEC128 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+}
+
+let Predicates = [HasAVX] in {
+
+  // AVX (VR128X) variants. All outs are earlyclobber so the register
+  // allocator never assigns an output to a register still holding an input.
+  // (Repaired list-archive mangling of "," + "@" into ", at ".)
+  def CT_SELECT_V4F32X : CT_SELECT_VEC128X {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CT_SELECT_V2F64X : CT_SELECT_VEC128X {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CT_SELECT_V4I32X : CT_SELECT_VEC128X {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CT_SELECT_V2I64X : CT_SELECT_VEC128X {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CT_SELECT_V8I16X : CT_SELECT_VEC128X {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CT_SELECT_V16I8X : CT_SELECT_VEC128X {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+
+  // v8f16: only selected on builds where the type is legal.
+  def CT_SELECT_V8F16X : CT_SELECT_VEC128X {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// 256-bit pseudos
+//===----------------------------------------------------------------------===//
+let Predicates = [HasAVX] in {
+
+  // 256-bit pseudos. All outs are earlyclobber so the register allocator
+  // never assigns an output to a register still holding an input.
+  // (Repaired list-archive mangling of "," + "@" into ", at ".)
+  def CT_SELECT_V8F32 : CT_SELECT_VEC256 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CT_SELECT_V4F64 : CT_SELECT_VEC256 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CT_SELECT_V8I32 : CT_SELECT_VEC256 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CT_SELECT_V4I64 : CT_SELECT_VEC256 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CT_SELECT_V16I16 : CT_SELECT_VEC256 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CT_SELECT_V32I8 : CT_SELECT_VEC256 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+
+  // v16f16: only selected on builds where the type is legal.
+  def CT_SELECT_V16F16 : CT_SELECT_VEC256 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Selection patterns: X86ct_select(...), EFLAGS -> CT_SELECT_V*
+//
+// NOTE:
+// * The SDNode carries Glue from CMP/TEST (due to SDNPInGlue).
+// * We list EFLAGS explicitly in the pattern (X86 style) to model the arch read.
+// * Temps (tmpx, tmpg) are not in the pattern; they're outs allocated by RA.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasSSE1] in {
+
+ // 128-bit float (bitwise-equivalent ops in expander)
+ // v4f32 is the only vector type reachable with bare SSE1.
+ def : Pat<(v4f32 (X86ct_select VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
+ (CT_SELECT_V4F32 VR128:$t, VR128:$f, timm:$cc)>;
+}
+
+let Predicates = [HasSSE2] in {
+
+ // 128-bit integer
+ // The condition code immediate is forwarded unchanged to the pseudo; the
+ // EFLAGS operand models the architectural flags read.
+ def : Pat<(v4i32 (X86ct_select VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
+ (CT_SELECT_V4I32 VR128:$t, VR128:$f, timm:$cc)>;
+ def : Pat<(v2i64 (X86ct_select VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
+ (CT_SELECT_V2I64 VR128:$t, VR128:$f, timm:$cc)>;
+ def : Pat<(v8i16 (X86ct_select VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
+ (CT_SELECT_V8I16 VR128:$t, VR128:$f, timm:$cc)>;
+ def : Pat<(v16i8 (X86ct_select VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
+ (CT_SELECT_V16I8 VR128:$t, VR128:$f, timm:$cc)>;
+ def : Pat<(v2f64 (X86ct_select VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
+ (CT_SELECT_V2F64 VR128:$t, VR128:$f, timm:$cc)>;
+
+ // 128-bit f16 (optional)
+ def : Pat<(v8f16 (X86ct_select VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
+ (CT_SELECT_V8F16 VR128:$t, VR128:$f, timm:$cc)>;
+}
+
+let Predicates = [HasAVX] in {
+
+ // 256-bit integer
+ // Same shape as the 128-bit patterns, routed to the VR256 pseudos.
+ def : Pat<(v8i32 (X86ct_select VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
+ (CT_SELECT_V8I32 VR256:$t, VR256:$f, timm:$cc)>;
+ def : Pat<(v4i64 (X86ct_select VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
+ (CT_SELECT_V4I64 VR256:$t, VR256:$f, timm:$cc)>;
+ def : Pat<(v16i16 (X86ct_select VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
+ (CT_SELECT_V16I16 VR256:$t, VR256:$f, timm:$cc)>;
+ def : Pat<(v32i8 (X86ct_select VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
+ (CT_SELECT_V32I8 VR256:$t, VR256:$f, timm:$cc)>;
+
+ // 256-bit float (bitwise-equivalent ops in expander)
+ def : Pat<(v8f32 (X86ct_select VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
+ (CT_SELECT_V8F32 VR256:$t, VR256:$f, timm:$cc)>;
+ def : Pat<(v4f64 (X86ct_select VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
+ (CT_SELECT_V4F64 VR256:$t, VR256:$f, timm:$cc)>;
+
+ // 256-bit f16 (optional)
+ def : Pat<(v16f16 (X86ct_select VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
+ (CT_SELECT_V16F16 VR256:$t, VR256:$f, timm:$cc)>;
+}
+
let Predicates = [HasCMOV, HasCF] in {
def : Pat<(X86cmov GR16:$src1, 0, timm:$cond, EFLAGS),
(CFCMOV16rr GR16:$src1, (inv_cond_XFORM timm:$cond))>;
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index f6fdc1cf59340..8b63c59720fcc 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -699,6 +699,87 @@ def : Pat<(v16f32 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
def : Pat<(v8f64 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
(CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>;
+// CT_SELECT
+// Enhanced CT_SELECT pseudos for i386 with temporary register allocation
+// These use a two-phase approach:
+// 1. Custom inserter materializes condition byte from EFLAGS
+// 2. Post-RA expansion generates constant-time instruction bundles
+
+let isPseudo = 1, isNotDuplicable = 1 in {
+  // Phase 1: Initial pseudos that consume EFLAGS (via custom inserter)
+  // These are matched by patterns and convert EFLAGS to condition byte
+  class CT_SELECT_I386_INITIAL<RegisterClass RC, ValueType VT>
+      : PseudoI<(outs RC:$dst),
+                (ins RC:$src1, RC:$src2, i8imm:$cond),
+                [(set RC:$dst, (VT(X86ct_select RC:$src1, RC:$src2, timm:$cond,
+                    EFLAGS)))]> {
+    let Uses = [EFLAGS];
+    let Defs = [EFLAGS];
+    let usesCustomInserter = 1;
+    let hasNoSchedulingInfo = 1;
+  }
+
+  // Phase 2: Internal pseudos with pre-materialized condition byte (post-RA
+  // expansion). These generate the actual constant-time instruction bundles.
+  // All outs are earlyclobber so RA never overlaps them with the inputs.
+  // (Repaired list-archive mangling of "," + "@" into ", at ".)
+  class CT_SELECT_I386_INTERNAL<RegisterClass RC, RegisterClass ByteRC>
+      : PseudoI<(outs RC:$dst, ByteRC:$tmp_byte, RC:$tmp_mask),
+                (ins RC:$src1, RC:$src2, ByteRC:$cond_byte), []> {
+    let hasNoSchedulingInfo = 1;
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_byte,@earlyclobber $tmp_mask";
+    let Defs = [EFLAGS]; // NEG instruction in post-RA expansion clobbers EFLAGS
+  }
+}
+
+// Phase 1 pseudos for non-CMOV targets (custom inserter materializes condition)
+let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in {
+ let Predicates = [NoNativeCMOV] in {
+ // One def per GPR width; i64 is covered by type legalization (see the
+ // pattern block below).
+ def CT_SELECT_I386_GR8rr : CT_SELECT_I386_INITIAL<GR8, i8>;
+ def CT_SELECT_I386_GR16rr : CT_SELECT_I386_INITIAL<GR16, i16>;
+ def CT_SELECT_I386_GR32rr : CT_SELECT_I386_INITIAL<GR32, i32>;
+ }
+}
+
+// Phase 2 pseudos (post-RA expansion with pre-materialized condition byte)
+let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in {
+ let Predicates = [NoNativeCMOV] in {
+ // The GR8 $cond_byte input is produced by the phase-1 custom inserter.
+ def CT_SELECT_I386_INT_GR8rr :
+ CT_SELECT_I386_INTERNAL<GR8, GR8>;
+ def CT_SELECT_I386_INT_GR16rr :
+ CT_SELECT_I386_INTERNAL<GR16, GR8>;
+ def CT_SELECT_I386_INT_GR32rr :
+ CT_SELECT_I386_INTERNAL<GR32, GR8>;
+ }
+}
+
+let hasSideEffects = 1,
+ ForceDisassemble = 1,
+ Constraints = "$dst = $src1" in {
+
+ // x87 FP variants, expanded by the custom inserter
+ // (emitCTSelectI386WithFpType), which spills to stack slots and selects
+ // with the GR32 integer pseudo.
+ let Predicates = [FPStackf32] in
+ def CT_SELECT_I386_FP32rr : CT_SELECT_I386_INITIAL<RFP32, f32>;
+
+ let Predicates = [FPStackf64] in
+ def CT_SELECT_I386_FP64rr : CT_SELECT_I386_INITIAL<RFP64, f64>;
+
+ // NOTE(review): f80 has no FPStack predicate — presumably legal on all
+ // x87 targets; confirm.
+ def CT_SELECT_I386_FP80rr : CT_SELECT_I386_INITIAL<RFP80, f80>;
+}
+
+// Pattern matching for non-native-CMOV CT_SELECT (routes to custom inserter for condition materialization)
+// NoNativeCMOV ensures these patterns are used when actual CMOV instruction is not available
+// even if canUseCMOV() is true (e.g., i386 with SSE which can emulate CMOV)
+let Predicates = [NoNativeCMOV] in {
+ // Each pattern feeds the corresponding phase-1 pseudo
+ // (CT_SELECT_I386_INITIAL) defined above.
+ def : Pat<(i8(X86ct_select GR8:$src1, GR8:$src2, timm:$cond, EFLAGS)),
+ (CT_SELECT_I386_GR8rr GR8:$src1, GR8:$src2, timm:$cond)>;
+
+ def : Pat<(i16(X86ct_select GR16:$src1, GR16:$src2, timm:$cond, EFLAGS)),
+ (CT_SELECT_I386_GR16rr GR16:$src1, GR16:$src2, timm:$cond)>;
+
+ def : Pat<(i32(X86ct_select GR32:$src1, GR32:$src2, timm:$cond, EFLAGS)),
+ (CT_SELECT_I386_GR32rr GR32:$src1, GR32:$src2, timm:$cond)>;
+
+ // i64 patterns handled automatically by type legalization
+}
+
//===----------------------------------------------------------------------===//
// Normal-Instructions-With-Lock-Prefix Pseudo Instructions
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86InstrFragments.td b/llvm/lib/Target/X86/X86InstrFragments.td
index adbb8b821700a..7ad92e3849c9c 100644
--- a/llvm/lib/Target/X86/X86InstrFragments.td
+++ b/llvm/lib/Target/X86/X86InstrFragments.td
@@ -28,6 +28,10 @@ def SDTX86Cmov : SDTypeProfile<1, 4,
[SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
+def SDTX86CtSelect : SDTypeProfile<1, 4,
+ [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
+ SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
+
// Unary and binary operator instructions that set EFLAGS as a side-effect.
def SDTUnaryArithWithFlags : SDTypeProfile<2, 1,
[SDTCisSameAs<0, 2>,
@@ -154,6 +158,7 @@ def X86ctest : SDNode<"X86ISD::CTEST", SDTX86Ccmp>;
def X86cload : SDNode<"X86ISD::CLOAD", SDTX86Cload, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def X86cstore : SDNode<"X86ISD::CSTORE", SDTX86Cstore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def X86ct_select: SDNode<"X86ISD::CT_SELECT", SDTX86CtSelect, [SDNPInGlue]>;
def X86cmov : SDNode<"X86ISD::CMOV", SDTX86Cmov>;
def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond,
[SDNPHasChain]>;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 2479a8dccfb00..d4a46048a1d20 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -474,6 +474,556 @@ bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op,
return false;
}
+// Opcode bundle describing how one CT_SELECT vector pseudo is expanded.
+// Filled in by getCtSelectInstructions() based on opcode + subtarget ISA.
+struct CtSelectInstructions {
+ unsigned PAndOpc; // vector AND opcode used by the expansion
+ unsigned PAndnOpc; // vector AND-NOT opcode
+ unsigned POrOpc; // vector OR opcode
+ unsigned BroadcastOpc; // shuffle opcode that splats the scalar mask lane
+ unsigned IntMoveOpc; // GPR -> vector register move for the mask
+ unsigned MoveOpc; // register-to-register vector copy
+ bool Use256; // expansion operates on 256-bit (YMM) registers
+ bool UseBlendInstr; // expander takes the blend path (re-checked vs SSE4.1)
+};
+
+// Pick the opcodes used to expand a CT_SELECT vector pseudo into a
+// constant-time AND/ANDN/OR (or blend) sequence, keyed on the pseudo's
+// opcode and the subtarget's ISA level. llvm_unreachable fires when the
+// pseudo was somehow selected on a subtarget that cannot implement it.
+static CtSelectInstructions
+getCtSelectInstructions(unsigned Opcode, const X86Subtarget &Subtarget) {
+  CtSelectInstructions Instructions = {};
+
+  switch (Opcode) {
+  case X86::CT_SELECT_V2F64:
+    if (!Subtarget.hasSSE2())
+      llvm_unreachable("Double precision vectors require SSE2");
+    Instructions.PAndOpc = X86::PANDrr;
+    Instructions.PAndnOpc = X86::PANDNrr;
+    Instructions.POrOpc = X86::PORrr;
+    Instructions.BroadcastOpc = X86::PSHUFDri;
+    Instructions.IntMoveOpc = X86::MOVDI2PDIrr;
+    Instructions.MoveOpc = X86::MOVAPDrr;
+    // The blend path is only taken by the expander when SSE4.1 is also
+    // present (it re-checks hasSSE41()).
+    Instructions.UseBlendInstr = true;
+    break;
+  case X86::CT_SELECT_V4F32:
+    if (Subtarget.hasSSE2()) {
+      // SSE4.1 and SSE2 previously had duplicate blocks differing only in
+      // UseBlendInstr; fold them into one.
+      Instructions.PAndOpc = X86::PANDrr;
+      Instructions.PAndnOpc = X86::PANDNrr;
+      Instructions.POrOpc = X86::PORrr;
+      Instructions.BroadcastOpc = X86::PSHUFDri;
+      Instructions.IntMoveOpc = X86::MOVDI2PDIrr;
+      Instructions.MoveOpc = X86::MOVAPSrr;
+      Instructions.UseBlendInstr = Subtarget.hasSSE41();
+    } else {
+      // SSE1 fallback: only float-typed bitwise ops and SHUFPS exist.
+      Instructions.PAndOpc = X86::ANDPSrr;
+      Instructions.PAndnOpc = X86::ANDNPSrr;
+      Instructions.POrOpc = X86::ORPSrr;
+      Instructions.BroadcastOpc = X86::SHUFPSrri;
+      Instructions.IntMoveOpc = X86::MOVSS2DIrr;
+      Instructions.MoveOpc = X86::MOVAPSrr;
+    }
+    break;
+  case X86::CT_SELECT_V4I32:
+  case X86::CT_SELECT_V2I64:
+  case X86::CT_SELECT_V8I16:
+  case X86::CT_SELECT_V16I8:
+  case X86::CT_SELECT_V8F16:
+    // The integer and f16 128-bit cases used identical opcode sets; merged.
+    if (!Subtarget.hasSSE2())
+      llvm_unreachable("128-bit integer/f16 vector CT_SELECT requires SSE2");
+    Instructions.PAndOpc = X86::PANDrr;
+    Instructions.PAndnOpc = X86::PANDNrr;
+    Instructions.POrOpc = X86::PORrr;
+    Instructions.BroadcastOpc = X86::PSHUFDri;
+    Instructions.IntMoveOpc = X86::MOVDI2PDIrr;
+    Instructions.MoveOpc = X86::MOVDQArr;
+    break;
+  case X86::CT_SELECT_V4F32X:
+  case X86::CT_SELECT_V4I32X:
+  case X86::CT_SELECT_V2F64X:
+  case X86::CT_SELECT_V2I64X:
+  case X86::CT_SELECT_V8I16X:
+  case X86::CT_SELECT_V16I8X:
+  case X86::CT_SELECT_V8F16X:
+    if (!Subtarget.hasAVX())
+      llvm_unreachable("AVX variants require AVX support");
+    Instructions.PAndOpc = X86::VPANDrr;
+    Instructions.PAndnOpc = X86::VPANDNrr;
+    Instructions.POrOpc = X86::VPORrr;
+    Instructions.BroadcastOpc = X86::VPSHUFDri;
+    Instructions.IntMoveOpc = X86::VMOVDI2PDIrr;
+    // Keep the move type-consistent with the element type.
+    Instructions.MoveOpc = (Opcode == X86::CT_SELECT_V4F32X) ? X86::VMOVAPSrr
+                           : (Opcode == X86::CT_SELECT_V2F64X)
+                               ? X86::VMOVAPDrr
+                               : X86::VMOVDQArr;
+    break;
+  case X86::CT_SELECT_V8F32:
+  case X86::CT_SELECT_V8I32:
+    if (!Subtarget.hasAVX())
+      llvm_unreachable("256-bit vectors require AVX");
+    Instructions.PAndOpc = X86::VPANDYrr;
+    Instructions.PAndnOpc = X86::VPANDNYrr;
+    Instructions.POrOpc = X86::VPORYrr;
+    Instructions.BroadcastOpc = X86::VPERMILPSYri;
+    Instructions.IntMoveOpc = X86::VMOVDI2PDIrr;
+    Instructions.MoveOpc =
+        (Opcode == X86::CT_SELECT_V8F32) ? X86::VMOVAPSYrr : X86::VMOVDQAYrr;
+    Instructions.Use256 = true;
+    break;
+  case X86::CT_SELECT_V4F64:
+  case X86::CT_SELECT_V4I64:
+    if (!Subtarget.hasAVX())
+      llvm_unreachable("256-bit vectors require AVX");
+    Instructions.PAndOpc = X86::VPANDYrr;
+    Instructions.PAndnOpc = X86::VPANDNYrr;
+    Instructions.POrOpc = X86::VPORYrr;
+    Instructions.BroadcastOpc = X86::VPERMILPDYri;
+    Instructions.IntMoveOpc = X86::VMOVDI2PDIrr;
+    Instructions.MoveOpc =
+        (Opcode == X86::CT_SELECT_V4F64) ? X86::VMOVAPDYrr : X86::VMOVDQAYrr;
+    Instructions.Use256 = true;
+    break;
+  case X86::CT_SELECT_V16I16:
+  case X86::CT_SELECT_V32I8:
+  case X86::CT_SELECT_V16F16:
+    if (!Subtarget.hasAVX())
+      llvm_unreachable("256-bit integer vectors require AVX");
+    // The previous hasAVX2()/hasAVX() branches were byte-identical; merged.
+    // NOTE(review): VPAND/VPANDN/VPOR on YMM require AVX2; an AVX1-only
+    // subtarget may need the VANDPS/VANDNPS/VORPS Y forms instead — confirm.
+    Instructions.PAndOpc = X86::VPANDYrr;
+    Instructions.PAndnOpc = X86::VPANDNYrr;
+    Instructions.POrOpc = X86::VPORYrr;
+    Instructions.BroadcastOpc = X86::VPERMILPSYri;
+    Instructions.IntMoveOpc = X86::VMOVDI2PDIrr;
+    Instructions.MoveOpc = X86::VMOVDQAYrr;
+    Instructions.Use256 = true;
+    break;
+  default:
+    llvm_unreachable("Unexpected CT_SELECT opcode");
+  }
+
+  return Instructions;
+}
+
+bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const {
+ unsigned Opcode = MI.getOpcode();
+ const DebugLoc &DL = MI.getDebugLoc();
+ auto Instruction = getCtSelectInstructions(Opcode, Subtarget);
+
+ MachineBasicBlock *MBB = MI.getParent();
+
+ // Operand layout matches the TableGen definition:
+ // (outs VR128:$dst, VR128:$tmpx, GR32:$tmpg),
+ // (ins VR128:$t, VR128:$f, i8imm:$cond)
+ Register Dst = MI.getOperand(0).getReg();
+ Register MaskReg = MI.getOperand(1).getReg(); // vector mask temp
+ Register TmpGPR = MI.getOperand(2).getReg(); // scalar mask temp (GPR32)
+ Register FalseVal = MI.getOperand(3).getReg(); // $t per ins list — NOTE(review): variable name says "false" but operand 3 is $t (true value); confirm intended swap
+ Register TrueVal = MI.getOperand(4).getReg(); // $f per ins list — NOTE(review): mirrors the swap above
+ X86::CondCode CC = X86::CondCode(MI.getOperand(5).getImm()); // condition
+
+ MachineInstr *FirstInstr = nullptr;
+ MachineInstr *LastInstr = nullptr;
+ auto recordInstr = [&](MachineInstrBuilder MIB) {
+ MachineInstr *NewMI = MIB.getInstr();
+ LastInstr = NewMI;
+ if (!FirstInstr)
+ FirstInstr = NewMI;
+ };
+
+ // Create scalar mask in tempGPR and broadcast to vector mask
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::MOV32ri), TmpGPR)
+ .addImm(0)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ auto SubReg = TRI->getSubReg(TmpGPR, X86::sub_8bit);
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::SETCCr))
+ .addReg(SubReg)
+ .addImm(CC)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ // Zero-extend byte to 32-bit register (movzbl %al, %eax)
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::MOVZX32rr8), TmpGPR)
+ .addReg(SubReg)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ if (Instruction.UseBlendInstr && Subtarget.hasSSE41()) {
+ // Shift left 31 bits to convert 1 -> 0x80000000, 0 -> 0x00000000 (shll $31,
+ // %eax)
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::SHL32ri), TmpGPR)
+ .addReg(TmpGPR)
+ .addImm(31));
+ } else {
+ // Negate to convert 1 -> 0xFFFFFFFF, 0 -> 0x00000000 (negl %eax)
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::NEG32r), TmpGPR)
+ .addReg(TmpGPR));
+ }
+
+ // Broadcast to TmpX (vector mask)
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::PXORrr), MaskReg)
+ .addReg(MaskReg)
+ .addReg(MaskReg)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ // Move scalar mask to vector register
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.IntMoveOpc), MaskReg)
+ .addReg(TmpGPR)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ if (Instruction.Use256) {
+ // Broadcast to 256-bit vector register
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.BroadcastOpc), MaskReg)
+ .addReg(MaskReg)
+ .addImm(0)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+ } else {
+ if (Subtarget.hasSSE2() || Subtarget.hasAVX()) {
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.BroadcastOpc), MaskReg)
+ .addReg(MaskReg)
+ .addImm(0x00)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+ } else {
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.BroadcastOpc), MaskReg)
+ .addReg(MaskReg)
+ .addReg(MaskReg)
+ .addImm(0x00)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+ }
+ }
+
+ if (Instruction.UseBlendInstr && Subtarget.hasSSE41()) {
+ // Use dedicated blend instructions for SSE4.1+
+ unsigned BlendOpc;
+ switch (Opcode) {
+ case X86::CT_SELECT_V4F32:
+ BlendOpc = X86::BLENDVPSrr0;
+ break;
+ case X86::CT_SELECT_V2F64:
+ BlendOpc = X86::BLENDVPDrr0;
+ break;
+ default:
+ // alias for pblendvb that takes xmm0 as implicit mask register
+ BlendOpc = X86::PBLENDVBrr0;
+ break;
+ }
+
+ // Check if XMM0 is used as one of source registers, if yes then save it
+ // in Dst register and update FalseVal and TrueVal to Dst register
+ bool DidSaveXMM0 = false;
+ Register SavedXMM0 = X86::XMM0;
+ if (FalseVal == X86::XMM0 || TrueVal == X86::XMM0) {
+ Register SrcXMM0 = (FalseVal == X86::XMM0) ? FalseVal : TrueVal;
+
+ // if XMM0 is one of the source registers, it will not match with Dst
+ // registers, so we need to move it to Dst register
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+ .addReg(SrcXMM0)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ // update FalseVal and TrueVal to Dst register
+ if (FalseVal == X86::XMM0)
+ FalseVal = Dst;
+ if (TrueVal == X86::XMM0)
+ TrueVal = Dst;
+
+ // update SavedXMM0 to Dst register
+ SavedXMM0 = Dst;
+
+ // set DidSaveXMM0 to true to indicate that we saved XMM0 into Dst
+ // register
+ DidSaveXMM0 = true;
+ } else if (MaskReg != X86::XMM0 && Dst != X86::XMM0) {
+
+      // if XMM0 is not allocated for any of the registers, we still need to
+      // save and restore it after using it as the mask register
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+ .addReg(X86::XMM0)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+ SavedXMM0 = Dst;
+ DidSaveXMM0 = true;
+ }
+
+ if (MaskReg != X86::XMM0) {
+ // BLENDV uses XMM0 as implicit mask register
+ // https://www.felixcloutier.com/x86/pblendvb
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), X86::XMM0)
+ .addReg(MaskReg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge));
+
+ // move FalseVal to mask (use MaskReg as the dst of the blend)
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::MOVAPSrr), MaskReg)
+ .addReg(FalseVal)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ // MaskReg := blend(MaskReg /*false*/, TrueVal /*true*/) ; mask in
+ // xmm0
+ recordInstr(BuildMI(*MBB, MI, DL, get(BlendOpc), MaskReg)
+ .addReg(MaskReg)
+ .addReg(TrueVal)
+ .addReg(X86::XMM0)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ // restore XMM0 from SavedXMM0 if we saved it into Dst
+ if (DidSaveXMM0) {
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), X86::XMM0)
+ .addReg(SavedXMM0)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+ }
+ // dst = result (now in MaskReg)
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+ .addReg(MaskReg)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+ } else {
+ // move FalseVal to Dst register since MaskReg is XMM0 and Dst is not
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+ .addReg(FalseVal)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ // Dst := blend(Dst /*false*/, TrueVal /*true*/) ; mask in
+ // xmm0
+ recordInstr(BuildMI(*MBB, MI, DL, get(BlendOpc), Dst)
+ .addReg(Dst)
+ .addReg(TrueVal)
+ .addReg(X86::XMM0)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+ }
+ } else {
+
+ // dst = mask
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+ .addReg(MaskReg)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ // mask &= true_val
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::PANDrr), MaskReg)
+ .addReg(MaskReg)
+ .addReg(TrueVal)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ // dst = ~mask & false_val
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::PANDNrr), Dst)
+ .addReg(Dst)
+ .addReg(FalseVal)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ // dst |= mask; (mask & t) | (~mask & f)
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::PORrr), Dst)
+ .addReg(Dst)
+ .addReg(MaskReg)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+ }
+
+ assert(FirstInstr && LastInstr && "Expected at least one expanded instruction");
+ auto BundleEnd = LastInstr->getIterator();
+ finalizeBundle(*MBB, FirstInstr->getIterator(), std::next(BundleEnd));
+
+ MI.eraseFromParent();
+
+ return true;
+}
+
+bool X86InstrInfo::expandCtSelectWithCMOV(MachineInstr &MI) const {
+ MachineBasicBlock *MBB = MI.getParent();
+ DebugLoc DL = MI.getDebugLoc();
+
+ // CT_SELECT pseudo has: (outs dst), (ins true_val, false_val, cond)
+ MachineOperand &OperandRes = MI.getOperand(0); // destination register
+ MachineOperand &OperandTrue = MI.getOperand(1); // true value
+ MachineOperand &OperandCond = MI.getOperand(3); // condition code
+
+ assert(OperandTrue.isReg() && OperandRes.isReg() && OperandCond.isImm() &&
+ "Invalid operand types");
+ assert(OperandTrue.getReg() == OperandRes.getReg() &&
+ "Result register different from True register");
+
+ assert(Subtarget.hasCMOV() && "target does not support CMOV instructions");
+
+ unsigned Opcode = 0;
+
+ switch (MI.getOpcode()) {
+ case X86::CT_SELECT16rr:
+ Opcode = X86::CMOV16rr;
+ break;
+ case X86::CT_SELECT32rr:
+ Opcode = X86::CMOV32rr;
+ break;
+ case X86::CT_SELECT64rr:
+ Opcode = X86::CMOV64rr;
+ break;
+ case X86::CT_SELECT16rm:
+ Opcode = X86::CMOV16rm;
+ break;
+ case X86::CT_SELECT32rm:
+ Opcode = X86::CMOV32rm;
+ break;
+ case X86::CT_SELECT64rm:
+ Opcode = X86::CMOV64rm;
+ break;
+ default:
+ llvm_unreachable("Invalid CT_SELECT opcode");
+ }
+
+ if (!Subtarget.hasCMOV()) {
+ llvm_unreachable("target does not support cmov");
+ }
+
+  // Build the CMOV instruction: copy all of CT_SELECT's operands (dst, true,
+  // false or memory reference, and condition code) onto the new instruction.
+ MachineInstrBuilder CmovBuilder = BuildMI(*MBB, MI, DL, get(Opcode));
+ for (unsigned i = 0u; i < MI.getNumOperands(); ++i) { // Copy
+ CmovBuilder.add(MI.getOperand(i));
+ }
+
+ // Remove the original CT_SELECT instruction
+ MI.eraseFromParent();
+ return true;
+}
+
+/// Expand i386-specific CT_SELECT pseudo instructions (post-RA, constant-time)
+/// These internal pseudos receive a pre-materialized condition byte from the
+/// custom inserter, avoiding EFLAGS corruption issues during i64 type legalization.
+bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const {
+ MachineBasicBlock *MBB = MI.getParent();
+ DebugLoc DL = MI.getDebugLoc();
+
+ // CT_SELECT_I386_INT_GRxxrr has operands: (outs dst, tmp_byte, tmp_mask),
+ // (ins src1, src2, cond_byte)
+ // Note: cond_byte is pre-materialized by custom inserter, not EFLAGS-dependent
+ Register DstReg = MI.getOperand(0).getReg();
+ Register TmpByteReg = MI.getOperand(1).getReg();
+ Register TmpMaskReg = MI.getOperand(2).getReg();
+ Register Src1Reg = MI.getOperand(3).getReg();
+ Register Src2Reg = MI.getOperand(4).getReg();
+ Register CondByteReg = MI.getOperand(5).getReg(); // Pre-materialized condition byte
+
+ // Determine instruction opcodes based on register width
+ unsigned MovZXOp, NegOp, MovOp, AndOp, NotOp, OrOp;
+ if (MI.getOpcode() == X86::CT_SELECT_I386_INT_GR8rr) {
+ MovZXOp = 0; // No zero-extend needed for GR8
+ NegOp = X86::NEG8r;
+ MovOp = X86::MOV8rr;
+ AndOp = X86::AND8rr;
+ NotOp = X86::NOT8r;
+ OrOp = X86::OR8rr;
+ } else if (MI.getOpcode() == X86::CT_SELECT_I386_INT_GR16rr) {
+ MovZXOp = X86::MOVZX16rr8;
+ NegOp = X86::NEG16r;
+ MovOp = X86::MOV16rr;
+ AndOp = X86::AND16rr;
+ NotOp = X86::NOT16r;
+ OrOp = X86::OR16rr;
+ } else { // X86::CT_SELECT_I386_INT_GR32rr
+ MovZXOp = X86::MOVZX32rr8;
+ NegOp = X86::NEG32r;
+ MovOp = X86::MOV32rr;
+ AndOp = X86::AND32rr;
+ NotOp = X86::NOT32r;
+ OrOp = X86::OR32rr;
+ }
+
+ // 7-instruction constant-time selection bundle (no SETCC inside):
+ // result = (true_val & mask) | (false_val & ~mask)
+ // The condition byte is already materialized, avoiding EFLAGS dependency
+
+ // Step 1: Copy pre-materialized condition byte to TmpByteReg
+ // This allows the bundle to work with allocated temporaries
+ auto I1 = BuildMI(*MBB, MI, DL, get(X86::MOV8rr), TmpByteReg)
+ .addReg(CondByteReg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
+ auto BundleStart = I1->getIterator();
+
+ // Step 2: Zero-extend condition byte to register width (0 or 1)
+ if (MI.getOpcode() != X86::CT_SELECT_I386_INT_GR8rr) {
+ BuildMI(*MBB, MI, DL, get(MovZXOp), TmpMaskReg)
+ .addReg(TmpByteReg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
+ }
+
+ // Step 3: Convert condition to bitmask (NEG: 1 -> 0xFFFF..., 0 -> 0x0000...)
+ Register MaskReg = (MI.getOpcode() == X86::CT_SELECT_I386_INT_GR8rr) ? TmpByteReg : TmpMaskReg;
+ BuildMI(*MBB, MI, DL, get(NegOp), MaskReg)
+ .addReg(MaskReg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+ // Step 4,5: Apply mask to true value - copy src1 to dest, then AND with mask
+ BuildMI(*MBB, MI, DL, get(MovOp), DstReg)
+ .addReg(Src1Reg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+ BuildMI(*MBB, MI, DL, get(AndOp), DstReg)
+ .addReg(DstReg)
+ .addReg(MaskReg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+ // Step 6: Create inverted mask inline (~mask)
+ BuildMI(*MBB, MI, DL, get(NotOp), MaskReg)
+ .addReg(MaskReg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+ // Step 7: Apply inverted mask to false value - reuse mask register directly
+ BuildMI(*MBB, MI, DL, get(AndOp), MaskReg)
+ .addReg(MaskReg)
+ .addReg(Src2Reg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+ // Step 8: Final result: (src1 & mask) | (src2 & ~mask)
+ auto LI = BuildMI(*MBB, MI, DL, get(OrOp), DstReg)
+ .addReg(DstReg)
+ .addReg(MaskReg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+ // Bundle all generated instructions for atomic execution before removing MI
+ auto BundleEnd = std::next(LI->getIterator());
+ if (BundleStart != BundleEnd) {
+ // Only bundle if we have multiple instructions
+ finalizeBundle(*MBB, BundleStart, BundleEnd);
+ }
+
+ // TODO: Optimization opportunity - The register allocator may choose callee-saved
+ // registers (e.g., %ebx, %esi) for TmpByteReg/TmpMaskReg, causing unnecessary
+ // save/restore overhead. Consider constraining these to caller-saved register
+ // classes (e.g., GR8_AL, GR32_CallSaved) in the TableGen definitions to improve
+ // constant-time performance by eliminating prologue/epilogue instructions.
+
+ // Remove the original pseudo instruction
+ MI.eraseFromParent();
+ return true;
+}
+
static bool isFrameLoadOpcode(int Opcode, TypeSize &MemBytes) {
switch (Opcode) {
default:
@@ -6426,6 +6976,43 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case X86::ADD64ri32_DB:
MIB->setDesc(get(X86::OR64ri32));
break;
+
+ case X86::CT_SELECT64rr:
+ case X86::CT_SELECT32rr:
+ case X86::CT_SELECT16rr:
+ case X86::CT_SELECT64rm:
+ case X86::CT_SELECT32rm:
+ case X86::CT_SELECT16rm:
+ // These CT_SELECT pseudos are only selected when CMOV is available
+ // Pattern matching ensures we use CT_SELECT_I386 when CMOV is not available
+ return expandCtSelectWithCMOV(MI);
+
+ // non-cmov CT_SELECT expansion (post-RA, constant-time)
+ // These are the internal pseudos with pre-materialized condition byte
+ case X86::CT_SELECT_I386_INT_GR8rr:
+ case X86::CT_SELECT_I386_INT_GR16rr:
+ case X86::CT_SELECT_I386_INT_GR32rr:
+ return expandCtSelectIntWithoutCMOV(MI);
+
+ case X86::CT_SELECT_V2F64:
+ case X86::CT_SELECT_V4F32:
+ case X86::CT_SELECT_V2I64:
+ case X86::CT_SELECT_V4I32:
+ case X86::CT_SELECT_V8I16:
+ case X86::CT_SELECT_V16I8:
+ case X86::CT_SELECT_V2F64X:
+ case X86::CT_SELECT_V4F32X:
+ case X86::CT_SELECT_V2I64X:
+ case X86::CT_SELECT_V4I32X:
+ case X86::CT_SELECT_V8I16X:
+ case X86::CT_SELECT_V16I8X:
+ case X86::CT_SELECT_V4I64:
+ case X86::CT_SELECT_V8I32:
+ case X86::CT_SELECT_V16I16:
+ case X86::CT_SELECT_V32I8:
+ case X86::CT_SELECT_V4F64:
+ case X86::CT_SELECT_V8F32:
+ return expandCtSelectVector(MI);
}
return false;
}
@@ -10612,27 +11199,39 @@ void X86InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
if (!ST.hasSSE1())
return;
- BuildMI(MBB, Iter, DL, get(X86::V_SET0), Reg);
+ // PXOR is safe to use because it doesn't affect flags.
+ BuildMI(MBB, Iter, DL, get(X86::PXORrr), Reg)
+ .addReg(Reg, RegState::Undef)
+ .addReg(Reg, RegState::Undef);
} else if (X86::VR256RegClass.contains(Reg)) {
// YMM#
if (!ST.hasAVX())
return;
- BuildMI(MBB, Iter, DL, get(X86::AVX_SET0), Reg);
+ // VPXOR is safe to use because it doesn't affect flags.
+ BuildMI(MBB, Iter, DL, get(X86::VPXORrr), Reg)
+ .addReg(Reg, RegState::Undef)
+ .addReg(Reg, RegState::Undef);
} else if (X86::VR512RegClass.contains(Reg)) {
// ZMM#
if (!ST.hasAVX512())
return;
- BuildMI(MBB, Iter, DL, get(X86::AVX512_512_SET0), Reg);
+ // VPXORY is safe to use because it doesn't affect flags.
+ BuildMI(MBB, Iter, DL, get(X86::VPXORYrr), Reg)
+ .addReg(Reg, RegState::Undef)
+ .addReg(Reg, RegState::Undef);
} else if (X86::VK1RegClass.contains(Reg) || X86::VK2RegClass.contains(Reg) ||
X86::VK4RegClass.contains(Reg) || X86::VK8RegClass.contains(Reg) ||
X86::VK16RegClass.contains(Reg)) {
if (!ST.hasVLX())
return;
- unsigned Op = ST.hasBWI() ? X86::KSET0Q : X86::KSET0W;
- BuildMI(MBB, Iter, DL, get(Op), Reg);
+ // KXOR is safe to use because it doesn't affect flags.
+ unsigned Op = ST.hasBWI() ? X86::KXORQkk : X86::KXORWkk;
+ BuildMI(MBB, Iter, DL, get(Op), Reg)
+ .addReg(Reg, RegState::Undef)
+ .addReg(Reg, RegState::Undef);
}
}
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index de8ccb44578a3..76f18803c2e3d 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -721,6 +721,12 @@ class X86InstrInfo final : public X86GenInstrInfo {
bool isFrameOperand(const MachineInstr &MI, unsigned int Op,
int &FrameIndex) const;
+ /// Expand the CT_SELECT pseudo-instructions.
+ bool expandCtSelectWithCMOV(MachineInstr &MI) const;
+ bool expandCtSelectIntWithoutCMOV(MachineInstr &MI) const;
+
+ bool expandCtSelectVector(MachineInstr &MI) const;
+
/// Returns true iff the routine could find two commutable operands in the
/// given machine instruction with 3 vector inputs.
/// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index 21e6bacbacee2..5fa0665668e43 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -53,6 +53,11 @@ def PreferNoLegacySetCC : Predicate<"Subtarget->hasZU() && "
def HasCF : Predicate<"Subtarget->hasCF()">;
def HasCMOV : Predicate<"Subtarget->canUseCMOV()">;
def NoCMOV : Predicate<"!Subtarget->canUseCMOV()">;
+// Predicates for native CMOV instruction (checks hasCMOV(), not canUseCMOV())
+// HasCMOV may be true even without native CMOV (e.g., via SSE emulation)
+// Use HasNativeCMOV/NoNativeCMOV for constant-time code that requires actual CMOV
+def HasNativeCMOV : Predicate<"Subtarget->hasCMOV()">;
+def NoNativeCMOV : Predicate<"!Subtarget->hasCMOV()">;
def HasNOPL : Predicate<"Subtarget->hasNOPL()">;
def HasMMX : Predicate<"Subtarget->hasMMX()">;
def HasSSE1 : Predicate<"Subtarget->hasSSE1()">;
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 5305b39cffefd..48bcdb41ba6e2 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -630,10 +630,10 @@ void X86PassConfig::addPreEmitPass2() {
// ObjC runtime functions present in the module.
const Function &F = MF.getFunction();
const Module *M = F.getParent();
- return M->getModuleFlag("kcfi") ||
+ return M->getModuleFlag("kcfi") || F.hasFnAttribute("ct-select") ||
(TT.isOSDarwin() &&
(M->getFunction("objc_retainAutoreleasedReturnValue") ||
- M->getFunction("objc_unsafeClaimAutoreleasedReturnValue")));
+ M->getFunction("objc_unsafeClaimAutoreleasedReturnValue")));
}));
// Analyzes and emits pseudos to support Win x64 Unwind V2. This pass must run
diff --git a/llvm/test/CodeGen/X86/ctselect-edge-cases.ll b/llvm/test/CodeGen/X86/ctselect-edge-cases.ll
new file mode 100644
index 0000000000000..0797265972a1f
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ctselect-edge-cases.ll
@@ -0,0 +1,409 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=X32
+
+; Test ct.select edge cases and corner cases
+
+; Test with very large integers
+define i128 @test_ctselect_i128(i1 %cond, i128 %a, i128 %b) {
+; X64-LABEL: test_ctselect_i128:
+; X64: # %bb.0:
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovneq %rsi, %rax
+; X64-NEXT: cmovneq %rdx, %r8
+; X64-NEXT: movq %r8, %rdx
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_i128:
+; X32: # %bb.0:
+; X32-NEXT: pushl %edi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: pushl %eax
+; X32-NEXT: .cfi_def_cfa_offset 16
+; X32-NEXT: .cfi_offset %esi, -12
+; X32-NEXT: .cfi_offset %edi, -8
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %esi
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %edi
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %edx
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %ecx, 12(%eax)
+; X32-NEXT: movl %edx, 8(%eax)
+; X32-NEXT: movl %edi, 4(%eax)
+; X32-NEXT: movl %esi, (%eax)
+; X32-NEXT: addl $4, %esp
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: popl %edi
+; X32-NEXT: .cfi_def_cfa_offset 4
+; X32-NEXT: retl $4
+ %result = call i128 @llvm.ct.select.i128(i1 %cond, i128 %a, i128 %b)
+ ret i128 %result
+}
+
+; Test with small integer types
+define i1 @test_ctselect_i1(i1 %cond, i1 %a, i1 %b) {
+; X64-LABEL: test_ctselect_i1:
+; X64: # %bb.0:
+; X64-NEXT: movl %edx, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %esi, %eax
+; X64-NEXT: # kill: def $al killed $al killed $eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_i1:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; X32-NEXT: # kill: def $al killed $al killed $eax
+; X32-NEXT: retl
+ %result = call i1 @llvm.ct.select.i1(i1 %cond, i1 %a, i1 %b)
+ ret i1 %result
+}
+
+; Test with extremal values
+define i32 @test_ctselect_extremal_values(i1 %cond) {
+; X64-LABEL: test_ctselect_extremal_values:
+; X64: # %bb.0:
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X64-NEXT: movl $-2147483648, %eax # imm = 0x80000000
+; X64-NEXT: cmovnel %ecx, %eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_extremal_values:
+; X32: # %bb.0:
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X32-NEXT: movl $-2147483648, %eax # imm = 0x80000000
+; X32-NEXT: cmovnel %ecx, %eax
+; X32-NEXT: retl
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 2147483647, i32 -2147483648)
+ ret i32 %result
+}
+
+; Test with floating point special values
+define float @test_ctselect_f32_special_values(i1 %cond) {
+; X64-LABEL: test_ctselect_f32_special_values:
+; X64: # %bb.0:
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: movl $2143289344, %eax # imm = 0x7FC00000
+; X64-NEXT: movl $2139095040, %ecx # imm = 0x7F800000
+; X64-NEXT: cmovnel %eax, %ecx
+; X64-NEXT: movd %ecx, %xmm0
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_f32_special_values:
+; X32: # %bb.0:
+; X32-NEXT: pushl %edi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: pushl %eax
+; X32-NEXT: .cfi_def_cfa_offset 16
+; X32-NEXT: .cfi_offset %esi, -12
+; X32-NEXT: .cfi_offset %edi, -8
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: sete %al
+; X32-NEXT: movl {{\.?LCPI[0-9]+_[0-9]+}}, %ecx
+; X32-NEXT: movl {{\.?LCPI[0-9]+_[0-9]+}}, %edx
+; X32-NEXT: movb %al, %ah
+; X32-NEXT: movzbl %ah, %edi
+; X32-NEXT: negl %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: andl %edi, %esi
+; X32-NEXT: notl %edi
+; X32-NEXT: andl %ecx, %edi
+; X32-NEXT: orl %edi, %esi
+; X32-NEXT: movl %esi, (%esp)
+; X32-NEXT: flds (%esp)
+; X32-NEXT: addl $4, %esp
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: popl %edi
+; X32-NEXT: .cfi_def_cfa_offset 4
+; X32-NEXT: retl
+ %result = call float @llvm.ct.select.f32(i1 %cond, float 0x7FF8000000000000, float 0x7FF0000000000000)
+ ret float %result
+}
+
+define double @test_ctselect_f64_special_values(i1 %cond) {
+; X64-LABEL: test_ctselect_f64_special_values:
+; X64: # %bb.0:
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; X64-NEXT: movabsq $9218868437227405312, %rcx # imm = 0x7FF0000000000000
+; X64-NEXT: cmovneq %rax, %rcx
+; X64-NEXT: movq %rcx, %xmm0
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_f64_special_values:
+; X32: # %bb.0:
+; X32-NEXT: pushl %edi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: subl $24, %esp
+; X32-NEXT: .cfi_def_cfa_offset 36
+; X32-NEXT: .cfi_offset %esi, -12
+; X32-NEXT: .cfi_offset %edi, -8
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}}
+; X32-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}}
+; X32-NEXT: sete %al
+; X32-NEXT: fxch %st(1)
+; X32-NEXT: fstpl {{[0-9]+}}(%esp)
+; X32-NEXT: fstpl (%esp)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl (%esp), %edx
+; X32-NEXT: movb %al, %ah
+; X32-NEXT: movzbl %ah, %edi
+; X32-NEXT: negl %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: andl %edi, %esi
+; X32-NEXT: notl %edi
+; X32-NEXT: andl %ecx, %edi
+; X32-NEXT: orl %edi, %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movb %al, %ah
+; X32-NEXT: movzbl %ah, %edi
+; X32-NEXT: negl %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: andl %edi, %esi
+; X32-NEXT: notl %edi
+; X32-NEXT: andl %ecx, %edi
+; X32-NEXT: orl %edi, %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X32-NEXT: fldl {{[0-9]+}}(%esp)
+; X32-NEXT: addl $24, %esp
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: popl %edi
+; X32-NEXT: .cfi_def_cfa_offset 4
+; X32-NEXT: retl
+ %result = call double @llvm.ct.select.f64(i1 %cond, double 0x7FF8000000000000, double 0x7FF0000000000000)
+ ret double %result
+}
+
+; Test with null pointers
+define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) {
+; X64-LABEL: test_ctselect_null_ptr:
+; X64: # %bb.0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovneq %rsi, %rax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_null_ptr:
+; X32: # %bb.0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+ %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %ptr, ptr null)
+ ret ptr %result
+}
+
+; Test with function pointers
+define ptr @test_ctselect_function_ptr(i1 %cond, ptr %func1, ptr %func2) {
+; X64-LABEL: test_ctselect_function_ptr:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovneq %rsi, %rax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_function_ptr:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+ %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %func1, ptr %func2)
+ ret ptr %result
+}
+
+; Test with volatile loads
+define i32 @test_ctselect_volatile_load(i1 %cond, ptr %p1, ptr %p2) {
+; X64-LABEL: test_ctselect_volatile_load:
+; X64: # %bb.0:
+; X64-NEXT: movl (%rsi), %ecx
+; X64-NEXT: movl (%rdx), %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %ecx, %eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_volatile_load:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl (%ecx), %ecx
+; X32-NEXT: movl (%eax), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel %ecx, %eax
+; X32-NEXT: retl
+ %a = load volatile i32, ptr %p1
+ %b = load volatile i32, ptr %p2
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+ ret i32 %result
+}
+
+; Test with atomic loads
+define i32 @test_ctselect_atomic_load(i1 %cond, ptr %p1, ptr %p2) {
+; X64-LABEL: test_ctselect_atomic_load:
+; X64: # %bb.0:
+; X64-NEXT: movl (%rsi), %ecx
+; X64-NEXT: movl (%rdx), %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %ecx, %eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_atomic_load:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl (%ecx), %ecx
+; X32-NEXT: movl (%eax), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel %ecx, %eax
+; X32-NEXT: retl
+ %a = load atomic i32, ptr %p1 acquire, align 4
+ %b = load atomic i32, ptr %p2 acquire, align 4
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+ ret i32 %result
+}
+
+; Test with condition from icmp on pointers
+define ptr @test_ctselect_ptr_cmp(ptr %p1, ptr %p2, ptr %a, ptr %b) {
+; X64-LABEL: test_ctselect_ptr_cmp:
+; X64: # %bb.0:
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: cmpq %rsi, %rdi
+; X64-NEXT: sete %cl
+; X64-NEXT: testb %cl, %cl
+; X64-NEXT: cmovneq %rdx, %rax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_ptr_cmp:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: sete %cl
+; X32-NEXT: testb %cl, %cl
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+ %cmp = icmp eq ptr %p1, %p2
+ %result = call ptr @llvm.ct.select.p0(i1 %cmp, ptr %a, ptr %b)
+ ret ptr %result
+}
+
+; Test with struct pointer types (struct types themselves may not be directly supported)
+%struct.pair = type { i32, i32 }
+
+define ptr @test_ctselect_struct_ptr(i1 %cond, ptr %a, ptr %b) {
+; X64-LABEL: test_ctselect_struct_ptr:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovneq %rsi, %rax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_struct_ptr:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+ %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b)
+ ret ptr %result
+}
+
+; Test with deeply nested conditions (stress test for instruction selection)
+define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
+; X64-LABEL: test_ctselect_deeply_nested:
+; X64: # %bb.0:
+; X64-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movl {{[0-9]+}}(%rsp), %r10d
+; X64-NEXT: movl {{[0-9]+}}(%rsp), %r11d
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %r8d, %r9d
+; X64-NEXT: testb $1, %sil
+; X64-NEXT: cmovnel %r9d, %r11d
+; X64-NEXT: testb $1, %dl
+; X64-NEXT: cmovnel %r11d, %r10d
+; X64-NEXT: testb $1, %cl
+; X64-NEXT: cmovnel %r10d, %eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_deeply_nested:
+; X32: # %bb.0:
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: .cfi_offset %esi, -8
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %esi
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel %esi, %edx
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel %edx, %ecx
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel %ecx, %eax
+; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 4
+; X32-NEXT: retl
+ %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b)
+ %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c)
+ %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d)
+ %sel4 = call i32 @llvm.ct.select.i32(i1 %c4, i32 %sel3, i32 %e)
+ ret i32 %sel4
+}
+
+; Test with misaligned loads
+define i32 @test_ctselect_misaligned_load(i1 %cond, ptr %p1, ptr %p2) {
+; X64-LABEL: test_ctselect_misaligned_load:
+; X64: # %bb.0:
+; X64-NEXT: movl (%rdx), %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel (%rsi), %eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_misaligned_load:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl (%eax), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel (%ecx), %eax
+; X32-NEXT: retl
+ %a = load i32, ptr %p1, align 1
+ %b = load i32, ptr %p2, align 1
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+ ret i32 %result
+}
+
+; Declare the intrinsics
+declare i1 @llvm.ct.select.i1(i1, i1, i1)
+declare i128 @llvm.ct.select.i128(i1, i128, i128)
+declare i32 @llvm.ct.select.i32(i1, i32, i32)
+declare float @llvm.ct.select.f32(i1, float, float)
+declare double @llvm.ct.select.f64(i1, double, double)
+declare ptr @llvm.ct.select.p0(i1, ptr, ptr)
diff --git a/llvm/test/CodeGen/X86/ctselect-i386-fp.ll b/llvm/test/CodeGen/X86/ctselect-i386-fp.ll
new file mode 100644
index 0000000000000..b88ec72a37925
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ctselect-i386-fp.ll
@@ -0,0 +1,722 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov | FileCheck %s --check-prefix=I386-NOCMOV
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=I386-CMOV
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov -verify-machineinstrs | FileCheck %s --check-prefix=I386-NOCMOV
+
+; Comprehensive CT_SELECT tests for i386 targets with floating-point types
+; - Without CMOV: constant-time implementation using bitwise mask selection on the FP bit patterns + existing post-RA CT_SELECT
+; - With CMOV: CMOV-based implementation
+; - Verifies security properties: no conditional branches, constant execution time
+; Strategy: FP values spilled to memory, their raw bit patterns selected with integer mask operations, result reloaded as FP
+
+; Test basic f32 functionality
+define float @test_ctselect_f32_basic(i1 %cond, float %a, float %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f32_basic:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: pushl %eax
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, (%esp)
+; I386-NOCMOV-NEXT: flds (%esp)
+; I386-NOCMOV-NEXT: addl $4, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f32_basic:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: pushl %eax
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, (%esp)
+; I386-CMOV-NEXT: flds (%esp)
+; I386-CMOV-NEXT: addl $4, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b)
+ ret float %result
+}
+
+; Test f32 with different condition codes
+define float @test_ctselect_f32_eq(float %x, float %y, float %a, float %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f32_eq:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: pushl %eax
+; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fucompp
+; I386-NOCMOV-NEXT: fnstsw %ax
+; I386-NOCMOV-NEXT: # kill: def $ah killed $ah killed $ax
+; I386-NOCMOV-NEXT: sahf
+; I386-NOCMOV-NEXT: setnp %al
+; I386-NOCMOV-NEXT: sete %cl
+; I386-NOCMOV-NEXT: testb %al, %cl
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, (%esp)
+; I386-NOCMOV-NEXT: flds (%esp)
+; I386-NOCMOV-NEXT: addl $4, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f32_eq:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: pushl %eax
+; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fucompi %st(1), %st
+; I386-CMOV-NEXT: fstp %st(0)
+; I386-CMOV-NEXT: setnp %al
+; I386-CMOV-NEXT: sete %cl
+; I386-CMOV-NEXT: testb %al, %cl
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, (%esp)
+; I386-CMOV-NEXT: flds (%esp)
+; I386-CMOV-NEXT: addl $4, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %cmp = fcmp oeq float %x, %y
+ %result = call float @llvm.ct.select.f32(i1 %cmp, float %a, float %b)
+ ret float %result
+}
+
+; Test basic f64 functionality
+define double @test_ctselect_f64_basic(i1 %cond, double %a, double %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f64_basic:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: subl $8, %esp
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, (%esp)
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fldl (%esp)
+; I386-NOCMOV-NEXT: addl $8, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f64_basic:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: subl $8, %esp
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, (%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fldl (%esp)
+; I386-CMOV-NEXT: addl $8, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b)
+ ret double %result
+}
+
+; Test basic x86_fp80 functionality
+define x86_fp80 @test_ctselect_f80_basic(i1 %cond, x86_fp80 %a, x86_fp80 %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f80_basic:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: subl $12, %esp
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, (%esp)
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fldt (%esp)
+; I386-NOCMOV-NEXT: addl $12, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f80_basic:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: subl $12, %esp
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, (%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fldt (%esp)
+; I386-CMOV-NEXT: addl $12, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %result = call x86_fp80 @llvm.ct.select.f80(i1 %cond, x86_fp80 %a, x86_fp80 %b)
+ ret x86_fp80 %result
+}
+
+; Test f32 with complex conditions
+define float @test_ctselect_f32_gt(float %x, float %y, float %a, float %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f32_gt:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: pushl %eax
+; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fucompp
+; I386-NOCMOV-NEXT: fnstsw %ax
+; I386-NOCMOV-NEXT: # kill: def $ah killed $ah killed $ax
+; I386-NOCMOV-NEXT: sahf
+; I386-NOCMOV-NEXT: seta %al
+; I386-NOCMOV-NEXT: testb %al, %al
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, (%esp)
+; I386-NOCMOV-NEXT: flds (%esp)
+; I386-NOCMOV-NEXT: addl $4, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f32_gt:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: pushl %eax
+; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fucompi %st(1), %st
+; I386-CMOV-NEXT: fstp %st(0)
+; I386-CMOV-NEXT: seta %al
+; I386-CMOV-NEXT: testb %al, %al
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, (%esp)
+; I386-CMOV-NEXT: flds (%esp)
+; I386-CMOV-NEXT: addl $4, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %cmp = fcmp ogt float %x, %y
+ %result = call float @llvm.ct.select.f32(i1 %cmp, float %a, float %b)
+ ret float %result
+}
+
+; Test constant-time properties: verify no branches in generated code
+define float @test_ctselect_f32_no_branches(i1 %cond, float %a, float %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f32_no_branches:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: pushl %eax
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, (%esp)
+; I386-NOCMOV-NEXT: flds (%esp)
+; I386-NOCMOV-NEXT: addl $4, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f32_no_branches:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: pushl %eax
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, (%esp)
+; I386-CMOV-NEXT: flds (%esp)
+; I386-CMOV-NEXT: addl $4, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b)
+ ret float %result
+}
+
+; Test that BUNDLE directives are present for constant-time guarantees
+define float @test_ctselect_f32_bundled(i1 %cond, float %a, float %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f32_bundled:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: pushl %eax
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, (%esp)
+; I386-NOCMOV-NEXT: flds (%esp)
+; I386-NOCMOV-NEXT: addl $4, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f32_bundled:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: pushl %eax
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, (%esp)
+; I386-CMOV-NEXT: flds (%esp)
+; I386-CMOV-NEXT: addl $4, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b)
+ ret float %result
+}
+
+; Test edge case: special FP bit patterns (+inf vs zero; 0x7F800000 is +inf, not a NaN)
+define float @test_ctselect_f32_nan(i1 %cond) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f32_nan:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: subl $12, %esp
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}}
+; I386-NOCMOV-NEXT: fldz
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: fxch %st(1)
+; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fstps (%esp)
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl (%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: addl $12, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f32_nan:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: subl $12, %esp
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}}
+; I386-CMOV-NEXT: fldz
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: fxch %st(1)
+; I386-CMOV-NEXT: fstps {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fstps (%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl (%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: addl $12, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %nan = bitcast i32 2139095040 to float ; 0x7F800000 = +inf (not a NaN, despite the variable name)
+ %zero = bitcast i32 0 to float
+ %result = call float @llvm.ct.select.f32(i1 %cond, float %nan, float %zero)
+ ret float %result
+}
+
+; Test memory alignment for f80
+define x86_fp80 @test_ctselect_f80_alignment(i1 %cond, x86_fp80 %a, x86_fp80 %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f80_alignment:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: subl $12, %esp
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, (%esp)
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fldt (%esp)
+; I386-NOCMOV-NEXT: addl $12, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f80_alignment:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: subl $12, %esp
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, (%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fldt (%esp)
+; I386-CMOV-NEXT: addl $12, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %result = call x86_fp80 @llvm.ct.select.f80(i1 %cond, x86_fp80 %a, x86_fp80 %b)
+ ret x86_fp80 %result
+}
+
+; Stress test: multiple CT_SELECT operations
+define float @test_ctselect_f32_multiple(i1 %cond1, i1 %cond2, float %a, float %b, float %c, float %d) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f32_multiple:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: subl $8, %esp
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, (%esp)
+; I386-NOCMOV-NEXT: flds (%esp)
+; I386-NOCMOV-NEXT: addl $8, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f32_multiple:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: subl $8, %esp
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, (%esp)
+; I386-CMOV-NEXT: flds (%esp)
+; I386-CMOV-NEXT: addl $8, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %sel1 = call float @llvm.ct.select.f32(i1 %cond1, float %a, float %b)
+ %sel2 = call float @llvm.ct.select.f32(i1 %cond2, float %sel1, float %c)
+ ret float %sel2
+}
+
+; Declare intrinsics
+declare float @llvm.ct.select.f32(i1, float, float)
+declare double @llvm.ct.select.f64(i1, double, double)
+declare x86_fp80 @llvm.ct.select.f80(i1, x86_fp80, x86_fp80)
diff --git a/llvm/test/CodeGen/X86/ctselect-i386-mmx.ll b/llvm/test/CodeGen/X86/ctselect-i386-mmx.ll
new file mode 100644
index 0000000000000..6851c5babeb2d
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ctselect-i386-mmx.ll
@@ -0,0 +1,428 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=-cmov,+mmx < %s | FileCheck %s --check-prefix=I386-NOCMOV
+; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+cmov,+mmx < %s | FileCheck %s --check-prefix=I386-CMOV
+; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=-cmov,+mmx -verify-machineinstrs < %s | FileCheck %s --check-prefix=I386-NOCMOV
+
+; Test constant-time selection with MMX intrinsics to exercise the VR64 CT_SELECT path.
+; Each test applies an MMX intrinsic to the selected <1 x i64> value so that it is allocated to a VR64 register.
+
+; Test MMX ct.select using paddd intrinsic to force VR64 allocation
+define <1 x i64> @test_mmx_ctselect_with_paddd(i32 %cond, i64 %a, i64 %b) {
+; I386-NOCMOV-LABEL: test_mmx_ctselect_with_paddd:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT: subl $20, %esp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 40
+; I386-NOCMOV-NEXT: .cfi_offset %esi, -20
+; I386-NOCMOV-NEXT: .cfi_offset %edi, -16
+; I386-NOCMOV-NEXT: .cfi_offset %ebx, -12
+; I386-NOCMOV-NEXT: .cfi_offset %ebp, -8
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
+; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: setne %bl
+; I386-NOCMOV-NEXT: testb %bl, %bl
+; I386-NOCMOV-NEXT: sete %bh
+; I386-NOCMOV-NEXT: movb %bh, %al
+; I386-NOCMOV-NEXT: movzbl %al, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %esi, %edi
+; I386-NOCMOV-NEXT: andl %ebp, %edi
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %ecx, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %edi
+; I386-NOCMOV-NEXT: testb %bl, %bl
+; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %esi
+; I386-NOCMOV-NEXT: negl %esi
+; I386-NOCMOV-NEXT: movl %edx, %ecx
+; I386-NOCMOV-NEXT: andl %esi, %ecx
+; I386-NOCMOV-NEXT: notl %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: orl %esi, %ecx
+; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0
+; I386-NOCMOV-NEXT: paddd %mm0, %mm0
+; I386-NOCMOV-NEXT: movq %mm0, (%esp)
+; I386-NOCMOV-NEXT: movl (%esp), %eax
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: addl $20, %esp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT: popl %ebp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 4
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_mmx_ctselect_with_paddd:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: subl $20, %esp
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 24
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: setne %dl
+; I386-CMOV-NEXT: testb %dl, %dl
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0
+; I386-CMOV-NEXT: paddd %mm0, %mm0
+; I386-CMOV-NEXT: movq %mm0, (%esp)
+; I386-CMOV-NEXT: movl (%esp), %eax
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: addl $20, %esp
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 4
+; I386-CMOV-NEXT: retl
+ %mmx_a = bitcast i64 %a to <1 x i64>
+ %mmx_b = bitcast i64 %b to <1 x i64>
+ %cmp = icmp ne i32 %cond, 0
+ %sel = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp, <1 x i64> %mmx_a, <1 x i64> %mmx_b)
+ %result = call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %sel, <1 x i64> %sel)
+ ret <1 x i64> %result
+}
+
+; Test MMX ct.select using psllw intrinsic
+define <1 x i64> @test_mmx_ctselect_with_psllw(i32 %cond, i64 %a, i64 %b) {
+; I386-NOCMOV-LABEL: test_mmx_ctselect_with_psllw:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT: subl $20, %esp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 40
+; I386-NOCMOV-NEXT: .cfi_offset %esi, -20
+; I386-NOCMOV-NEXT: .cfi_offset %edi, -16
+; I386-NOCMOV-NEXT: .cfi_offset %ebx, -12
+; I386-NOCMOV-NEXT: .cfi_offset %ebp, -8
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
+; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: setne %bl
+; I386-NOCMOV-NEXT: testb %bl, %bl
+; I386-NOCMOV-NEXT: sete %bh
+; I386-NOCMOV-NEXT: movb %bh, %al
+; I386-NOCMOV-NEXT: movzbl %al, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %esi, %edi
+; I386-NOCMOV-NEXT: andl %ebp, %edi
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %ecx, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %edi
+; I386-NOCMOV-NEXT: testb %bl, %bl
+; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %esi
+; I386-NOCMOV-NEXT: negl %esi
+; I386-NOCMOV-NEXT: movl %edx, %ecx
+; I386-NOCMOV-NEXT: andl %esi, %ecx
+; I386-NOCMOV-NEXT: notl %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: orl %esi, %ecx
+; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0
+; I386-NOCMOV-NEXT: psllw %mm0, %mm0
+; I386-NOCMOV-NEXT: movq %mm0, (%esp)
+; I386-NOCMOV-NEXT: movl (%esp), %eax
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: addl $20, %esp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT: popl %ebp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 4
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_mmx_ctselect_with_psllw:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: subl $20, %esp
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 24
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: setne %dl
+; I386-CMOV-NEXT: testb %dl, %dl
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0
+; I386-CMOV-NEXT: psllw %mm0, %mm0
+; I386-CMOV-NEXT: movq %mm0, (%esp)
+; I386-CMOV-NEXT: movl (%esp), %eax
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: addl $20, %esp
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 4
+; I386-CMOV-NEXT: retl
+ %mmx_a = bitcast i64 %a to <1 x i64>
+ %mmx_b = bitcast i64 %b to <1 x i64>
+ %cmp = icmp ne i32 %cond, 0
+ %sel = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp, <1 x i64> %mmx_a, <1 x i64> %mmx_b)
+ %result = call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> %sel, <1 x i64> %sel)
+ ret <1 x i64> %result
+}
+
+; Test nested MMX ct.selects with pand intrinsic
+define <1 x i64> @test_mmx_nested_ctselect_with_pand(i32 %cond1, i32 %cond2, i64 %a, i64 %b, i64 %c) {
+; I386-NOCMOV-LABEL: test_mmx_nested_ctselect_with_pand:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT: subl $20, %esp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 40
+; I386-NOCMOV-NEXT: .cfi_offset %esi, -20
+; I386-NOCMOV-NEXT: .cfi_offset %edi, -16
+; I386-NOCMOV-NEXT: .cfi_offset %ebx, -12
+; I386-NOCMOV-NEXT: .cfi_offset %ebp, -8
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
+; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: setne %bl
+; I386-NOCMOV-NEXT: testb %bl, %bl
+; I386-NOCMOV-NEXT: sete %bh
+; I386-NOCMOV-NEXT: movb %bh, %cl
+; I386-NOCMOV-NEXT: movzbl %cl, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %edx, %edi
+; I386-NOCMOV-NEXT: andl %ebp, %edi
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %eax, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %edi
+; I386-NOCMOV-NEXT: testb %bl, %bl
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: sete %dl
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-NOCMOV-NEXT: movb %dl, %dh
+; I386-NOCMOV-NEXT: movzbl %dh, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %esi, %ebx
+; I386-NOCMOV-NEXT: andl %ebp, %ebx
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %eax, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %ebx
+; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: setne %dl
+; I386-NOCMOV-NEXT: testb %dl, %dl
+; I386-NOCMOV-NEXT: sete %dh
+; I386-NOCMOV-NEXT: movb %dh, %al
+; I386-NOCMOV-NEXT: movzbl %al, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %ecx, %esi
+; I386-NOCMOV-NEXT: andl %ebp, %esi
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %ebx, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %esi
+; I386-NOCMOV-NEXT: testb %dl, %dl
+; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; I386-NOCMOV-NEXT: movb %al, %dl
+; I386-NOCMOV-NEXT: movzbl %dl, %esi
+; I386-NOCMOV-NEXT: negl %esi
+; I386-NOCMOV-NEXT: movl %ebx, %ecx
+; I386-NOCMOV-NEXT: andl %esi, %ecx
+; I386-NOCMOV-NEXT: notl %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: orl %esi, %ecx
+; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0
+; I386-NOCMOV-NEXT: pand %mm0, %mm0
+; I386-NOCMOV-NEXT: movq %mm0, (%esp)
+; I386-NOCMOV-NEXT: movl (%esp), %eax
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: addl $20, %esp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT: popl %ebp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 4
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_mmx_nested_ctselect_with_pand:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %ebx
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-CMOV-NEXT: subl $20, %esp
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 32
+; I386-CMOV-NEXT: .cfi_offset %esi, -12
+; I386-CMOV-NEXT: .cfi_offset %ebx, -8
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
+; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: setne %bl
+; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: setne %bh
+; I386-CMOV-NEXT: testb %bh, %bh
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %esi
+; I386-CMOV-NEXT: testb %bl, %bl
+; I386-CMOV-NEXT: cmovnel %esi, %edx
+; I386-CMOV-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel %ecx, %eax
+; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0
+; I386-CMOV-NEXT: pand %mm0, %mm0
+; I386-CMOV-NEXT: movq %mm0, (%esp)
+; I386-CMOV-NEXT: movl (%esp), %eax
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: addl $20, %esp
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-CMOV-NEXT: popl %ebx
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 4
+; I386-CMOV-NEXT: retl
+ %mmx_a = bitcast i64 %a to <1 x i64>
+ %mmx_b = bitcast i64 %b to <1 x i64>
+ %mmx_c = bitcast i64 %c to <1 x i64>
+ %cmp1 = icmp ne i32 %cond1, 0
+ %cmp2 = icmp ne i32 %cond2, 0
+ %sel1 = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp2, <1 x i64> %mmx_a, <1 x i64> %mmx_b)
+ %sel2 = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp1, <1 x i64> %sel1, <1 x i64> %mmx_c)
+ %result = call <1 x i64> @llvm.x86.mmx.pand(<1 x i64> %sel2, <1 x i64> %sel2)
+ ret <1 x i64> %result
+}
+
+; Test MMX ct.select with por intrinsic
+define <1 x i64> @test_mmx_ctselect_with_por(i32 %cond, i64 %a, i64 %b) {
+; I386-NOCMOV-LABEL: test_mmx_ctselect_with_por:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT: subl $20, %esp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 40
+; I386-NOCMOV-NEXT: .cfi_offset %esi, -20
+; I386-NOCMOV-NEXT: .cfi_offset %edi, -16
+; I386-NOCMOV-NEXT: .cfi_offset %ebx, -12
+; I386-NOCMOV-NEXT: .cfi_offset %ebp, -8
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
+; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: setne %bl
+; I386-NOCMOV-NEXT: testb %bl, %bl
+; I386-NOCMOV-NEXT: sete %bh
+; I386-NOCMOV-NEXT: movb %bh, %al
+; I386-NOCMOV-NEXT: movzbl %al, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %esi, %edi
+; I386-NOCMOV-NEXT: andl %ebp, %edi
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %ecx, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %edi
+; I386-NOCMOV-NEXT: testb %bl, %bl
+; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %esi
+; I386-NOCMOV-NEXT: negl %esi
+; I386-NOCMOV-NEXT: movl %edx, %ecx
+; I386-NOCMOV-NEXT: andl %esi, %ecx
+; I386-NOCMOV-NEXT: notl %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: orl %esi, %ecx
+; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0
+; I386-NOCMOV-NEXT: por %mm0, %mm0
+; I386-NOCMOV-NEXT: movq %mm0, (%esp)
+; I386-NOCMOV-NEXT: movl (%esp), %eax
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: addl $20, %esp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT: popl %ebp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 4
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_mmx_ctselect_with_por:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: subl $20, %esp
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 24
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: setne %dl
+; I386-CMOV-NEXT: testb %dl, %dl
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0
+; I386-CMOV-NEXT: por %mm0, %mm0
+; I386-CMOV-NEXT: movq %mm0, (%esp)
+; I386-CMOV-NEXT: movl (%esp), %eax
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: addl $20, %esp
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 4
+; I386-CMOV-NEXT: retl
+ %mmx_a = bitcast i64 %a to <1 x i64>
+ %mmx_b = bitcast i64 %b to <1 x i64>
+ %cmp = icmp ne i32 %cond, 0
+ %sel = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp, <1 x i64> %mmx_a, <1 x i64> %mmx_b)
+ %result = call <1 x i64> @llvm.x86.mmx.por(<1 x i64> %sel, <1 x i64> %sel)
+ ret <1 x i64> %result
+}
+
+; Declare MMX intrinsics
+declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.pand(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.por(<1 x i64>, <1 x i64>)
+
+; Declare constant-time selection intrinsic
+declare <1 x i64> @llvm.ct.select.v1i64(i1, <1 x i64>, <1 x i64>)
diff --git a/llvm/test/CodeGen/X86/ctselect-i386.ll b/llvm/test/CodeGen/X86/ctselect-i386.ll
new file mode 100644
index 0000000000000..d1cc559f0c1c1
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ctselect-i386.ll
@@ -0,0 +1,267 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov | FileCheck %s --check-prefix=I386-NOCMOV
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=I386-CMOV
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov -verify-machineinstrs | FileCheck %s --check-prefix=I386-NOCMOV
+
+; Comprehensive CT_SELECT tests for i386 targets with scalar integer types (i8/i16/i32)
+; - Without CMOV: constant-time mask-based implementation using post-RA expansion with bundled instructions
+; - With CMOV: CMOV-based implementation
+; - Verifies the security property: no conditional branches, so execution time is independent of the condition
+; All expansion happens post-RA for better optimization control and constant-time guarantees
+
+; Test basic i32 functionality
+define i32 @test_ctselect_i32_basic(i1 %cond, i32 %a, i32 %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_i32_basic:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %bl
+; I386-NOCMOV-NEXT: movb %bl, %bh
+; I386-NOCMOV-NEXT: movzbl %bh, %esi
+; I386-NOCMOV-NEXT: negl %esi
+; I386-NOCMOV-NEXT: movl %edx, %eax
+; I386-NOCMOV-NEXT: andl %esi, %eax
+; I386-NOCMOV-NEXT: notl %esi
+; I386-NOCMOV-NEXT: andl %ecx, %esi
+; I386-NOCMOV-NEXT: orl %esi, %eax
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_i32_basic:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: retl
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+ ret i32 %result
+}
+
+; Test i16 functionality
+define i16 @test_ctselect_i16_basic(i1 %cond, i16 %a, i16 %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_i16_basic:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %bl
+; I386-NOCMOV-NEXT: movb %bl, %bh
+; I386-NOCMOV-NEXT: movzbw %bh, %si
+; I386-NOCMOV-NEXT: negw %si
+; I386-NOCMOV-NEXT: movw %dx, %ax
+; I386-NOCMOV-NEXT: andw %si, %ax
+; I386-NOCMOV-NEXT: notw %si
+; I386-NOCMOV-NEXT: andw %cx, %si
+; I386-NOCMOV-NEXT: orw %si, %ax
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_i16_basic:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnew {{[0-9]+}}(%esp), %ax
+; I386-CMOV-NEXT: retl
+ %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b)
+ ret i16 %result
+}
+
+; Test i8 functionality
+define i8 @test_ctselect_i8_basic(i1 %cond, i8 %a, i8 %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_i8_basic:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %ah
+; I386-NOCMOV-NEXT: movb %ah, %ch
+; I386-NOCMOV-NEXT: negb %ch
+; I386-NOCMOV-NEXT: movb %dl, %al
+; I386-NOCMOV-NEXT: andb %ch, %al
+; I386-NOCMOV-NEXT: notb %ch
+; I386-NOCMOV-NEXT: andb %cl, %ch
+; I386-NOCMOV-NEXT: orb %ch, %al
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_i8_basic:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: # kill: def $al killed $al killed $eax
+; I386-CMOV-NEXT: retl
+ %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b)
+ ret i8 %result
+}
+
+; Test security property: constant-time execution for a cryptographic use case
+define i32 @test_crypto_key_select(i32 %secret_bit, i32 %key1, i32 %key2) nounwind {
+; I386-NOCMOV-LABEL: test_crypto_key_select:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: setne %al
+; I386-NOCMOV-NEXT: testb %al, %al
+; I386-NOCMOV-NEXT: sete %bl
+; I386-NOCMOV-NEXT: movb %bl, %bh
+; I386-NOCMOV-NEXT: movzbl %bh, %esi
+; I386-NOCMOV-NEXT: negl %esi
+; I386-NOCMOV-NEXT: movl %edx, %eax
+; I386-NOCMOV-NEXT: andl %esi, %eax
+; I386-NOCMOV-NEXT: notl %esi
+; I386-NOCMOV-NEXT: andl %ecx, %esi
+; I386-NOCMOV-NEXT: orl %esi, %eax
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_crypto_key_select:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: setne %cl
+; I386-CMOV-NEXT: testb %cl, %cl
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: retl
+ %cond = icmp ne i32 %secret_bit, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %key1, i32 %key2)
+ ret i32 %result
+}
+
+; Test that no conditional branches appear in the constant-time path
+define i32 @test_no_conditional_branches(i32 %secret, i32 %val1, i32 %val2) nounwind {
+; I386-NOCMOV-LABEL: test_no_conditional_branches:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: setne %al
+; I386-NOCMOV-NEXT: testb %al, %al
+; I386-NOCMOV-NEXT: sete %bl
+; I386-NOCMOV-NEXT: movb %bl, %bh
+; I386-NOCMOV-NEXT: movzbl %bh, %esi
+; I386-NOCMOV-NEXT: negl %esi
+; I386-NOCMOV-NEXT: movl %edx, %eax
+; I386-NOCMOV-NEXT: andl %esi, %eax
+; I386-NOCMOV-NEXT: notl %esi
+; I386-NOCMOV-NEXT: andl %ecx, %esi
+; I386-NOCMOV-NEXT: orl %esi, %eax
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_no_conditional_branches:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: setne %cl
+; I386-CMOV-NEXT: testb %cl, %cl
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: retl
+ %cond = icmp ne i32 %secret, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %val1, i32 %val2)
+ ret i32 %result
+}
+
+; Test with comparison condition
+define i32 @test_ctselect_i32_cmp(i32 %a, i32 %b, i32 %c) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_i32_cmp:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: cmpl %edx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: testb %al, %al
+; I386-NOCMOV-NEXT: sete %bl
+; I386-NOCMOV-NEXT: movb %bl, %bh
+; I386-NOCMOV-NEXT: movzbl %bh, %esi
+; I386-NOCMOV-NEXT: negl %esi
+; I386-NOCMOV-NEXT: movl %edx, %eax
+; I386-NOCMOV-NEXT: andl %esi, %eax
+; I386-NOCMOV-NEXT: notl %esi
+; I386-NOCMOV-NEXT: andl %ecx, %esi
+; I386-NOCMOV-NEXT: orl %esi, %eax
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_i32_cmp:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: cmpl %eax, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: sete %cl
+; I386-CMOV-NEXT: testb %cl, %cl
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: retl
+ %cond = icmp eq i32 %a, %c
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %b, i32 %c)
+ ret i32 %result
+}
+
+; Test nested selects
+define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_nested:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %bl
+; I386-NOCMOV-NEXT: movb %bl, %bh
+; I386-NOCMOV-NEXT: movzbl %bh, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %eax, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %dl
+; I386-NOCMOV-NEXT: movb %dl, %dh
+; I386-NOCMOV-NEXT: movzbl %dh, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %ecx, %eax
+; I386-NOCMOV-NEXT: andl %edi, %eax
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %esi, %edi
+; I386-NOCMOV-NEXT: orl %edi, %eax
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_nested:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel %ecx, %eax
+; I386-CMOV-NEXT: retl
+ %sel1 = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b)
+ %sel2 = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %sel1, i32 %c)
+ ret i32 %sel2
+}
+
+; Declare ct.select intrinsics
+declare i8 @llvm.ct.select.i8(i1, i8, i8)
+declare i16 @llvm.ct.select.i16(i1, i16, i16)
+declare i32 @llvm.ct.select.i32(i1, i32, i32)
diff --git a/llvm/test/CodeGen/X86/ctselect-optimization.ll b/llvm/test/CodeGen/X86/ctselect-optimization.ll
new file mode 100644
index 0000000000000..481d49971a937
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ctselect-optimization.ll
@@ -0,0 +1,304 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov | FileCheck %s
+
+; Test ct.select optimization patterns
+
+; Test smin(x, 0) pattern optimization
+define i32 @test_ctselect_smin_zero(i32 %x) {
+; CHECK-LABEL: test_ctselect_smin_zero:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: sets %cl
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %edi, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp slt i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0)
+ ret i32 %result
+}
+
+; Test smax(x, 0) pattern optimization
+define i32 @test_ctselect_smax_zero(i32 %x) {
+; CHECK-LABEL: test_ctselect_smax_zero:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: setg %cl
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %edi, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp sgt i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0)
+ ret i32 %result
+}
+
+; Test generic smin pattern
+define i32 @test_ctselect_smin_generic(i32 %x, i32 %y) {
+; CHECK-LABEL: test_ctselect_smin_generic:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: cmpl %esi, %edi
+; CHECK-NEXT: setl %cl
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %edi, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp slt i32 %x, %y
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y)
+ ret i32 %result
+}
+
+; Test generic smax pattern
+define i32 @test_ctselect_smax_generic(i32 %x, i32 %y) {
+; CHECK-LABEL: test_ctselect_smax_generic:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: cmpl %esi, %edi
+; CHECK-NEXT: setg %cl
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %edi, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp sgt i32 %x, %y
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y)
+ ret i32 %result
+}
+
+; Test umin pattern
+define i32 @test_ctselect_umin_generic(i32 %x, i32 %y) {
+; CHECK-LABEL: test_ctselect_umin_generic:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: cmpl %esi, %edi
+; CHECK-NEXT: setb %cl
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %edi, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp ult i32 %x, %y
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y)
+ ret i32 %result
+}
+
+; Test umax pattern
+define i32 @test_ctselect_umax_generic(i32 %x, i32 %y) {
+; CHECK-LABEL: test_ctselect_umax_generic:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: cmpl %esi, %edi
+; CHECK-NEXT: seta %cl
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %edi, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp ugt i32 %x, %y
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y)
+ ret i32 %result
+}
+
+; Test abs pattern
+define i32 @test_ctselect_abs(i32 %x) {
+; CHECK-LABEL: test_ctselect_abs:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: movl %edi, %ecx
+; CHECK-NEXT: negl %ecx
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: sets %dl
+; CHECK-NEXT: testb %dl, %dl
+; CHECK-NEXT: cmovnel %ecx, %eax
+; CHECK-NEXT: retq
+ %neg = sub i32 0, %x
+ %cmp = icmp slt i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %neg, i32 %x)
+ ret i32 %result
+}
+
+; Test nabs pattern (negative abs)
+define i32 @test_ctselect_nabs(i32 %x) {
+; CHECK-LABEL: test_ctselect_nabs:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: negl %eax
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: sets %cl
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %edi, %eax
+; CHECK-NEXT: retq
+ %neg = sub i32 0, %x
+ %cmp = icmp slt i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %neg)
+ ret i32 %result
+}
+
+; Test sign-mask pattern (x < 0 ? -1 : 0)
+define i32 @test_ctselect_sign_extend(i32 %x) {
+; CHECK-LABEL: test_ctselect_sign_extend:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: sets %cl
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: movl $-1, %ecx
+; CHECK-NEXT: cmovnel %ecx, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp slt i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 -1, i32 0)
+ ret i32 %result
+}
+
+; Test zero extension pattern (x != 0 ? 1 : 0)
+define i32 @test_ctselect_zero_extend(i32 %x) {
+; CHECK-LABEL: test_ctselect_zero_extend:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: setne %cl
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: movl $1, %ecx
+; CHECK-NEXT: cmovnel %ecx, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp ne i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 1, i32 0)
+ ret i32 %result
+}
+
+; Test mask generation pattern (identical IR to the sign-mask test above; kept for coverage)
+define i32 @test_ctselect_mask_generation(i32 %x) {
+; CHECK-LABEL: test_ctselect_mask_generation:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: sets %cl
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: movl $-1, %ecx
+; CHECK-NEXT: cmovnel %ecx, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp slt i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 -1, i32 0)
+ ret i32 %result
+}
+
+; Test constant folding with known condition
+define i32 @test_ctselect_constant_folding_true(i32 %a, i32 %b) {
+; CHECK-LABEL: test_ctselect_constant_folding_true:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: movb $1, %cl
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %edi, %eax
+; CHECK-NEXT: retq
+ %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b)
+ ret i32 %result
+}
+
+define i32 @test_ctselect_constant_folding_false(i32 %a, i32 %b) {
+; CHECK-LABEL: test_ctselect_constant_folding_false:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %edi, %eax
+; CHECK-NEXT: retq
+ %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b)
+ ret i32 %result
+}
+
+; Test with identical operands
+define i32 @test_ctselect_identical_operands(i1 %cond, i32 %x) {
+; CHECK-LABEL: test_ctselect_identical_operands:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: testb $1, %dil
+; CHECK-NEXT: cmovnel %esi, %eax
+; CHECK-NEXT: retq
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %x)
+ ret i32 %result
+}
+
+; Test with inverted condition
+define i32 @test_ctselect_inverted_condition(i32 %x, i32 %y, i32 %a, i32 %b) {
+; CHECK-LABEL: test_ctselect_inverted_condition:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: cmpl %esi, %edi
+; CHECK-NEXT: sete %dl
+; CHECK-NEXT: testb %dl, %dl
+; CHECK-NEXT: cmovnel %ecx, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp eq i32 %x, %y
+ %not_cmp = xor i1 %cmp, true
+ %result = call i32 @llvm.ct.select.i32(i1 %not_cmp, i32 %a, i32 %b)
+ ret i32 %result
+}
+
+; Test for 64-bit specific optimizations
+define i64 @test_ctselect_i64_smin_zero(i64 %x) {
+; CHECK-LABEL: test_ctselect_i64_smin_zero:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testq %rdi, %rdi
+; CHECK-NEXT: sets %cl
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovneq %rdi, %rax
+; CHECK-NEXT: retq
+ %cmp = icmp slt i64 %x, 0
+ %result = call i64 @llvm.ct.select.i64(i1 %cmp, i64 %x, i64 0)
+ ret i64 %result
+}
+
+; Test for floating point optimizations
+define float @test_ctselect_f32_zero_positive(float %x) {
+; CHECK-LABEL: test_ctselect_f32_zero_positive:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: ucomiss %xmm1, %xmm0
+; CHECK-NEXT: seta %cl
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %eax, %edx
+; CHECK-NEXT: movd %edx, %xmm0
+; CHECK-NEXT: retq
+ %cmp = fcmp ogt float %x, 0.0
+ %result = call float @llvm.ct.select.f32(i1 %cmp, float %x, float 0.0)
+ ret float %result
+}
+
+define double @test_ctselect_f64_zero_positive(double %x) {
+; CHECK-LABEL: test_ctselect_f64_zero_positive:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %xmm0, %rax
+; CHECK-NEXT: xorpd %xmm1, %xmm1
+; CHECK-NEXT: ucomisd %xmm1, %xmm0
+; CHECK-NEXT: seta %cl
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovneq %rax, %rdx
+; CHECK-NEXT: movq %rdx, %xmm0
+; CHECK-NEXT: retq
+ %cmp = fcmp ogt double %x, 0.0
+ %result = call double @llvm.ct.select.f64(i1 %cmp, double %x, double 0.0)
+ ret double %result
+}
+
+; Test chain of ct.select operations
+define i32 @test_ctselect_chain(i1 %c1, i1 %c2, i1 %c3, i32 %a, i32 %b, i32 %c, i32 %d) {
+; CHECK-LABEL: test_ctselect_chain:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: testb $1, %dil
+; CHECK-NEXT: cmovnel %ecx, %r8d
+; CHECK-NEXT: testb $1, %sil
+; CHECK-NEXT: cmovnel %r8d, %r9d
+; CHECK-NEXT: testb $1, %dl
+; CHECK-NEXT: cmovnel %r9d, %eax
+; CHECK-NEXT: retq
+ %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b)
+ %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c)
+ %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d)
+ ret i32 %sel3
+}
+
+; Declare the intrinsics
+declare i32 @llvm.ct.select.i32(i1, i32, i32)
+declare i64 @llvm.ct.select.i64(i1, i64, i64)
+declare float @llvm.ct.select.f32(i1, float, float)
+declare double @llvm.ct.select.f64(i1, double, double)
diff --git a/llvm/test/CodeGen/X86/ctselect-vector.ll b/llvm/test/CodeGen/X86/ctselect-vector.ll
new file mode 100644
index 0000000000000..2206e32cd6d34
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ctselect-vector.ll
@@ -0,0 +1,1274 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+
+; Test ct.select functionality for vector types
+
+; 128-bit vectors
+define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test_ctselect_v4i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v4i32:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %xmm3, %xmm3
+; AVX-NEXT: movd %eax, %xmm3
+; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX-NEXT: movdqa %xmm3, %xmm2
+; AVX-NEXT: pand %xmm0, %xmm3
+; AVX-NEXT: pandn %xmm1, %xmm2
+; AVX-NEXT: por %xmm3, %xmm2
+; AVX-NEXT: vmovaps %xmm2, %xmm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %xmm3, %xmm3
+; AVX2-NEXT: movd %eax, %xmm3
+; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX2-NEXT: movdqa %xmm3, %xmm2
+; AVX2-NEXT: pand %xmm0, %xmm3
+; AVX2-NEXT: pandn %xmm1, %xmm2
+; AVX2-NEXT: por %xmm3, %xmm2
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v4i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB0_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %xmm0, %xmm1
+; AVX512-NEXT: .LBB0_2:
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-NEXT: retq
+ %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b)
+ ret <4 x i32> %result
+}
+
+define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: test_ctselect_v4f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT: movaps %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v4f32:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %xmm3, %xmm3
+; AVX-NEXT: movd %eax, %xmm3
+; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX-NEXT: movdqa %xmm3, %xmm2
+; AVX-NEXT: pand %xmm0, %xmm3
+; AVX-NEXT: pandn %xmm1, %xmm2
+; AVX-NEXT: por %xmm3, %xmm2
+; AVX-NEXT: vmovaps %xmm2, %xmm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v4f32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %xmm3, %xmm3
+; AVX2-NEXT: movd %eax, %xmm3
+; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX2-NEXT: movdqa %xmm3, %xmm2
+; AVX2-NEXT: pand %xmm0, %xmm3
+; AVX2-NEXT: pandn %xmm1, %xmm2
+; AVX2-NEXT: por %xmm3, %xmm2
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v4f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB1_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %xmm0, %xmm1
+; AVX512-NEXT: .LBB1_2:
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-NEXT: retq
+ %result = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, <4 x float> %b)
+ ret <4 x float> %result
+}
+
+define <2 x i64> @test_ctselect_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) {
+; SSE2-LABEL: test_ctselect_v2i64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v2i64:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %xmm3, %xmm3
+; AVX-NEXT: movd %eax, %xmm3
+; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX-NEXT: movdqa %xmm3, %xmm2
+; AVX-NEXT: pand %xmm0, %xmm3
+; AVX-NEXT: pandn %xmm1, %xmm2
+; AVX-NEXT: por %xmm3, %xmm2
+; AVX-NEXT: vmovaps %xmm2, %xmm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %xmm3, %xmm3
+; AVX2-NEXT: movd %eax, %xmm3
+; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX2-NEXT: movdqa %xmm3, %xmm2
+; AVX2-NEXT: pand %xmm0, %xmm3
+; AVX2-NEXT: pandn %xmm1, %xmm2
+; AVX2-NEXT: por %xmm3, %xmm2
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v2i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB2_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %xmm0, %xmm1
+; AVX512-NEXT: .LBB2_2:
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-NEXT: retq
+ %result = call <2 x i64> @llvm.ct.select.v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b)
+ ret <2 x i64> %result
+}
+
+define <2 x double> @test_ctselect_v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) {
+; SSE2-LABEL: test_ctselect_v2f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT: movapd %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v2f64:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %xmm3, %xmm3
+; AVX-NEXT: movd %eax, %xmm3
+; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX-NEXT: movdqa %xmm3, %xmm2
+; AVX-NEXT: pand %xmm0, %xmm3
+; AVX-NEXT: pandn %xmm1, %xmm2
+; AVX-NEXT: por %xmm3, %xmm2
+; AVX-NEXT: vmovaps %xmm2, %xmm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v2f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %xmm3, %xmm3
+; AVX2-NEXT: movd %eax, %xmm3
+; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX2-NEXT: movdqa %xmm3, %xmm2
+; AVX2-NEXT: pand %xmm0, %xmm3
+; AVX2-NEXT: pandn %xmm1, %xmm2
+; AVX2-NEXT: por %xmm3, %xmm2
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v2f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB3_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovapd %xmm0, %xmm1
+; AVX512-NEXT: .LBB3_2:
+; AVX512-NEXT: vmovapd %xmm1, %xmm0
+; AVX512-NEXT: retq
+ %result = call <2 x double> @llvm.ct.select.v2f64(i1 %cond, <2 x double> %a, <2 x double> %b)
+ ret <2 x double> %result
+}
+
+; 256-bit vectors
+define <8 x i32> @test_ctselect_v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b) {
+; SSE2-LABEL: test_ctselect_v8i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: movd %eax, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: pand %xmm0, %xmm5
+; SSE2-NEXT: pandn %xmm2, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: movaps %xmm4, %xmm0
+; SSE2-NEXT: movaps %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v8i32:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm3, %ymm3
+; AVX-NEXT: vmovd %eax, %ymm3
+; AVX-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX-NEXT: vmovdqa %ymm3, %ymm2
+; AVX-NEXT: pand %ymm0, %ymm3
+; AVX-NEXT: pandn %ymm1, %ymm2
+; AVX-NEXT: por %ymm3, %ymm2
+; AVX-NEXT: vmovaps %ymm2, %ymm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v8i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm3, %ymm3
+; AVX2-NEXT: vmovd %eax, %ymm3
+; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vmovdqa %ymm3, %ymm2
+; AVX2-NEXT: pand %ymm0, %ymm3
+; AVX2-NEXT: pandn %ymm1, %ymm2
+; AVX2-NEXT: por %ymm3, %ymm2
+; AVX2-NEXT: vmovaps %ymm2, %ymm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v8i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB4_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %ymm0, %ymm1
+; AVX512-NEXT: .LBB4_2:
+; AVX512-NEXT: vmovaps %ymm1, %ymm0
+; AVX512-NEXT: retq
+ %result = call <8 x i32> @llvm.ct.select.v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b)
+ ret <8 x i32> %result
+}
+
+define <8 x float> @test_ctselect_v8f32(i1 %cond, <8 x float> %a, <8 x float> %b) {
+; SSE2-LABEL: test_ctselect_v8f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: movd %eax, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0]
+; SSE2-NEXT: movaps %xmm5, %xmm4
+; SSE2-NEXT: pand %xmm0, %xmm5
+; SSE2-NEXT: pandn %xmm2, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: movaps %xmm4, %xmm0
+; SSE2-NEXT: movaps %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v8f32:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm3, %ymm3
+; AVX-NEXT: vmovd %eax, %ymm3
+; AVX-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX-NEXT: vmovdqa %ymm3, %ymm2
+; AVX-NEXT: pand %ymm0, %ymm3
+; AVX-NEXT: pandn %ymm1, %ymm2
+; AVX-NEXT: por %ymm3, %ymm2
+; AVX-NEXT: vmovaps %ymm2, %ymm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v8f32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm3, %ymm3
+; AVX2-NEXT: vmovd %eax, %ymm3
+; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vmovdqa %ymm3, %ymm2
+; AVX2-NEXT: pand %ymm0, %ymm3
+; AVX2-NEXT: pandn %ymm1, %ymm2
+; AVX2-NEXT: por %ymm3, %ymm2
+; AVX2-NEXT: vmovaps %ymm2, %ymm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v8f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB5_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %ymm0, %ymm1
+; AVX512-NEXT: .LBB5_2:
+; AVX512-NEXT: vmovaps %ymm1, %ymm0
+; AVX512-NEXT: retq
+ %result = call <8 x float> @llvm.ct.select.v8f32(i1 %cond, <8 x float> %a, <8 x float> %b)
+ ret <8 x float> %result
+}
+
+define <4 x i64> @test_ctselect_v4i64(i1 %cond, <4 x i64> %a, <4 x i64> %b) {
+; SSE2-LABEL: test_ctselect_v4i64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: movd %eax, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: pand %xmm0, %xmm5
+; SSE2-NEXT: pandn %xmm2, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: movaps %xmm4, %xmm0
+; SSE2-NEXT: movaps %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v4i64:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm3, %ymm3
+; AVX-NEXT: vmovd %eax, %ymm3
+; AVX-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2]
+; AVX-NEXT: vmovdqa %ymm3, %ymm2
+; AVX-NEXT: pand %ymm0, %ymm3
+; AVX-NEXT: pandn %ymm1, %ymm2
+; AVX-NEXT: por %ymm3, %ymm2
+; AVX-NEXT: vmovaps %ymm2, %ymm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v4i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm3, %ymm3
+; AVX2-NEXT: vmovd %eax, %ymm3
+; AVX2-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2]
+; AVX2-NEXT: vmovdqa %ymm3, %ymm2
+; AVX2-NEXT: pand %ymm0, %ymm3
+; AVX2-NEXT: pandn %ymm1, %ymm2
+; AVX2-NEXT: por %ymm3, %ymm2
+; AVX2-NEXT: vmovaps %ymm2, %ymm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v4i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB6_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %ymm0, %ymm1
+; AVX512-NEXT: .LBB6_2:
+; AVX512-NEXT: vmovaps %ymm1, %ymm0
+; AVX512-NEXT: retq
+ %result = call <4 x i64> @llvm.ct.select.v4i64(i1 %cond, <4 x i64> %a, <4 x i64> %b)
+ ret <4 x i64> %result
+}
+
+define <4 x double> @test_ctselect_v4f64(i1 %cond, <4 x double> %a, <4 x double> %b) {
+; SSE2-LABEL: test_ctselect_v4f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: movd %eax, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0]
+; SSE2-NEXT: movapd %xmm5, %xmm4
+; SSE2-NEXT: pand %xmm0, %xmm5
+; SSE2-NEXT: pandn %xmm2, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movapd %xmm0, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: movaps %xmm4, %xmm0
+; SSE2-NEXT: movaps %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v4f64:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm3, %ymm3
+; AVX-NEXT: vmovd %eax, %ymm3
+; AVX-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2]
+; AVX-NEXT: vmovdqa %ymm3, %ymm2
+; AVX-NEXT: pand %ymm0, %ymm3
+; AVX-NEXT: pandn %ymm1, %ymm2
+; AVX-NEXT: por %ymm3, %ymm2
+; AVX-NEXT: vmovaps %ymm2, %ymm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v4f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm3, %ymm3
+; AVX2-NEXT: vmovd %eax, %ymm3
+; AVX2-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2]
+; AVX2-NEXT: vmovdqa %ymm3, %ymm2
+; AVX2-NEXT: pand %ymm0, %ymm3
+; AVX2-NEXT: pandn %ymm1, %ymm2
+; AVX2-NEXT: por %ymm3, %ymm2
+; AVX2-NEXT: vmovaps %ymm2, %ymm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v4f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB7_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovapd %ymm0, %ymm1
+; AVX512-NEXT: .LBB7_2:
+; AVX512-NEXT: vmovapd %ymm1, %ymm0
+; AVX512-NEXT: retq
+ %result = call <4 x double> @llvm.ct.select.v4f64(i1 %cond, <4 x double> %a, <4 x double> %b)
+ ret <4 x double> %result
+}
+
+; 512-bit vectors (no AVX512 RUN line; split into 128/256-bit ops on SSE2/AVX/AVX2)
+define <16 x i32> @test_ctselect_v16i32(i1 %cond, <16 x i32> %a, <16 x i32> %b) {
+; SSE2-LABEL: test_ctselect_v16i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: movd %eax, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm9, %xmm8
+; SSE2-NEXT: pand %xmm0, %xmm9
+; SSE2-NEXT: pandn %xmm4, %xmm8
+; SSE2-NEXT: por %xmm9, %xmm8
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm5, %xmm4
+; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm6, %xmm5
+; SSE2-NEXT: por %xmm0, %xmm5
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm7, %xmm6
+; SSE2-NEXT: por %xmm0, %xmm6
+; SSE2-NEXT: movaps %xmm8, %xmm0
+; SSE2-NEXT: movaps %xmm4, %xmm1
+; SSE2-NEXT: movaps %xmm5, %xmm2
+; SSE2-NEXT: movaps %xmm6, %xmm3
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v16i32:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm5, %ymm5
+; AVX-NEXT: vmovd %eax, %ymm5
+; AVX-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
+; AVX-NEXT: vmovdqa %ymm5, %ymm4
+; AVX-NEXT: pand %ymm0, %ymm5
+; AVX-NEXT: pandn %ymm2, %ymm4
+; AVX-NEXT: por %ymm5, %ymm4
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm0, %ymm0
+; AVX-NEXT: vmovd %eax, %ymm0
+; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX-NEXT: vmovdqa %ymm0, %ymm2
+; AVX-NEXT: pand %ymm1, %ymm0
+; AVX-NEXT: pandn %ymm3, %ymm2
+; AVX-NEXT: por %ymm0, %ymm2
+; AVX-NEXT: vmovaps %ymm4, %ymm0
+; AVX-NEXT: vmovaps %ymm2, %ymm1
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v16i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm5, %ymm5
+; AVX2-NEXT: vmovd %eax, %ymm5
+; AVX2-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vmovdqa %ymm5, %ymm4
+; AVX2-NEXT: pand %ymm0, %ymm5
+; AVX2-NEXT: pandn %ymm2, %ymm4
+; AVX2-NEXT: por %ymm5, %ymm4
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm0, %ymm0
+; AVX2-NEXT: vmovd %eax, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vmovdqa %ymm0, %ymm2
+; AVX2-NEXT: pand %ymm1, %ymm0
+; AVX2-NEXT: pandn %ymm3, %ymm2
+; AVX2-NEXT: por %ymm0, %ymm2
+; AVX2-NEXT: vmovaps %ymm4, %ymm0
+; AVX2-NEXT: vmovaps %ymm2, %ymm1
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v16i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB8_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %zmm0, %zmm1
+; AVX512-NEXT: .LBB8_2:
+; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: retq
+ %result = call <16 x i32> @llvm.ct.select.v16i32(i1 %cond, <16 x i32> %a, <16 x i32> %b)
+ ret <16 x i32> %result
+}
+
+define <16 x float> @test_ctselect_v16f32(i1 %cond, <16 x float> %a, <16 x float> %b) {
+; SSE2-LABEL: test_ctselect_v16f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: movd %eax, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0]
+; SSE2-NEXT: movaps %xmm9, %xmm8
+; SSE2-NEXT: pand %xmm0, %xmm9
+; SSE2-NEXT: pandn %xmm4, %xmm8
+; SSE2-NEXT: por %xmm9, %xmm8
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movaps %xmm0, %xmm4
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm5, %xmm4
+; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movaps %xmm0, %xmm5
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm6, %xmm5
+; SSE2-NEXT: por %xmm0, %xmm5
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movaps %xmm0, %xmm6
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm7, %xmm6
+; SSE2-NEXT: por %xmm0, %xmm6
+; SSE2-NEXT: movaps %xmm8, %xmm0
+; SSE2-NEXT: movaps %xmm4, %xmm1
+; SSE2-NEXT: movaps %xmm5, %xmm2
+; SSE2-NEXT: movaps %xmm6, %xmm3
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v16f32:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm5, %ymm5
+; AVX-NEXT: vmovd %eax, %ymm5
+; AVX-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
+; AVX-NEXT: vmovdqa %ymm5, %ymm4
+; AVX-NEXT: pand %ymm0, %ymm5
+; AVX-NEXT: pandn %ymm2, %ymm4
+; AVX-NEXT: por %ymm5, %ymm4
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm0, %ymm0
+; AVX-NEXT: vmovd %eax, %ymm0
+; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX-NEXT: vmovdqa %ymm0, %ymm2
+; AVX-NEXT: pand %ymm1, %ymm0
+; AVX-NEXT: pandn %ymm3, %ymm2
+; AVX-NEXT: por %ymm0, %ymm2
+; AVX-NEXT: vmovaps %ymm4, %ymm0
+; AVX-NEXT: vmovaps %ymm2, %ymm1
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v16f32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm5, %ymm5
+; AVX2-NEXT: vmovd %eax, %ymm5
+; AVX2-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vmovdqa %ymm5, %ymm4
+; AVX2-NEXT: pand %ymm0, %ymm5
+; AVX2-NEXT: pandn %ymm2, %ymm4
+; AVX2-NEXT: por %ymm5, %ymm4
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm0, %ymm0
+; AVX2-NEXT: vmovd %eax, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vmovdqa %ymm0, %ymm2
+; AVX2-NEXT: pand %ymm1, %ymm0
+; AVX2-NEXT: pandn %ymm3, %ymm2
+; AVX2-NEXT: por %ymm0, %ymm2
+; AVX2-NEXT: vmovaps %ymm4, %ymm0
+; AVX2-NEXT: vmovaps %ymm2, %ymm1
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v16f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB9_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %zmm0, %zmm1
+; AVX512-NEXT: .LBB9_2:
+; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: retq
+ %result = call <16 x float> @llvm.ct.select.v16f32(i1 %cond, <16 x float> %a, <16 x float> %b)
+ ret <16 x float> %result
+}
+
+define <8 x i64> @test_ctselect_v8i64(i1 %cond, <8 x i64> %a, <8 x i64> %b) {
+; SSE2-LABEL: test_ctselect_v8i64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: movd %eax, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm9, %xmm8
+; SSE2-NEXT: pand %xmm0, %xmm9
+; SSE2-NEXT: pandn %xmm4, %xmm8
+; SSE2-NEXT: por %xmm9, %xmm8
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm5, %xmm4
+; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm6, %xmm5
+; SSE2-NEXT: por %xmm0, %xmm5
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm7, %xmm6
+; SSE2-NEXT: por %xmm0, %xmm6
+; SSE2-NEXT: movaps %xmm8, %xmm0
+; SSE2-NEXT: movaps %xmm4, %xmm1
+; SSE2-NEXT: movaps %xmm5, %xmm2
+; SSE2-NEXT: movaps %xmm6, %xmm3
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v8i64:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm5, %ymm5
+; AVX-NEXT: vmovd %eax, %ymm5
+; AVX-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[0,0,2,2]
+; AVX-NEXT: vmovdqa %ymm5, %ymm4
+; AVX-NEXT: pand %ymm0, %ymm5
+; AVX-NEXT: pandn %ymm2, %ymm4
+; AVX-NEXT: por %ymm5, %ymm4
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm0, %ymm0
+; AVX-NEXT: vmovd %eax, %ymm0
+; AVX-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX-NEXT: vmovdqa %ymm0, %ymm2
+; AVX-NEXT: pand %ymm1, %ymm0
+; AVX-NEXT: pandn %ymm3, %ymm2
+; AVX-NEXT: por %ymm0, %ymm2
+; AVX-NEXT: vmovaps %ymm4, %ymm0
+; AVX-NEXT: vmovaps %ymm2, %ymm1
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v8i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm5, %ymm5
+; AVX2-NEXT: vmovd %eax, %ymm5
+; AVX2-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[0,0,2,2]
+; AVX2-NEXT: vmovdqa %ymm5, %ymm4
+; AVX2-NEXT: pand %ymm0, %ymm5
+; AVX2-NEXT: pandn %ymm2, %ymm4
+; AVX2-NEXT: por %ymm5, %ymm4
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm0, %ymm0
+; AVX2-NEXT: vmovd %eax, %ymm0
+; AVX2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX2-NEXT: vmovdqa %ymm0, %ymm2
+; AVX2-NEXT: pand %ymm1, %ymm0
+; AVX2-NEXT: pandn %ymm3, %ymm2
+; AVX2-NEXT: por %ymm0, %ymm2
+; AVX2-NEXT: vmovaps %ymm4, %ymm0
+; AVX2-NEXT: vmovaps %ymm2, %ymm1
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v8i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB10_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %zmm0, %zmm1
+; AVX512-NEXT: .LBB10_2:
+; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: retq
+ %result = call <8 x i64> @llvm.ct.select.v8i64(i1 %cond, <8 x i64> %a, <8 x i64> %b)
+ ret <8 x i64> %result
+}
+
+define <8 x double> @test_ctselect_v8f64(i1 %cond, <8 x double> %a, <8 x double> %b) {
+; SSE2-LABEL: test_ctselect_v8f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: movd %eax, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0]
+; SSE2-NEXT: movapd %xmm9, %xmm8
+; SSE2-NEXT: pand %xmm0, %xmm9
+; SSE2-NEXT: pandn %xmm4, %xmm8
+; SSE2-NEXT: por %xmm9, %xmm8
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movapd %xmm0, %xmm4
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm5, %xmm4
+; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movapd %xmm0, %xmm5
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm6, %xmm5
+; SSE2-NEXT: por %xmm0, %xmm5
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movapd %xmm0, %xmm6
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm7, %xmm6
+; SSE2-NEXT: por %xmm0, %xmm6
+; SSE2-NEXT: movaps %xmm8, %xmm0
+; SSE2-NEXT: movaps %xmm4, %xmm1
+; SSE2-NEXT: movaps %xmm5, %xmm2
+; SSE2-NEXT: movaps %xmm6, %xmm3
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v8f64:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm5, %ymm5
+; AVX-NEXT: vmovd %eax, %ymm5
+; AVX-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[0,0,2,2]
+; AVX-NEXT: vmovdqa %ymm5, %ymm4
+; AVX-NEXT: pand %ymm0, %ymm5
+; AVX-NEXT: pandn %ymm2, %ymm4
+; AVX-NEXT: por %ymm5, %ymm4
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm0, %ymm0
+; AVX-NEXT: vmovd %eax, %ymm0
+; AVX-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX-NEXT: vmovdqa %ymm0, %ymm2
+; AVX-NEXT: pand %ymm1, %ymm0
+; AVX-NEXT: pandn %ymm3, %ymm2
+; AVX-NEXT: por %ymm0, %ymm2
+; AVX-NEXT: vmovaps %ymm4, %ymm0
+; AVX-NEXT: vmovaps %ymm2, %ymm1
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v8f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm5, %ymm5
+; AVX2-NEXT: vmovd %eax, %ymm5
+; AVX2-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[0,0,2,2]
+; AVX2-NEXT: vmovdqa %ymm5, %ymm4
+; AVX2-NEXT: pand %ymm0, %ymm5
+; AVX2-NEXT: pandn %ymm2, %ymm4
+; AVX2-NEXT: por %ymm5, %ymm4
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm0, %ymm0
+; AVX2-NEXT: vmovd %eax, %ymm0
+; AVX2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX2-NEXT: vmovdqa %ymm0, %ymm2
+; AVX2-NEXT: pand %ymm1, %ymm0
+; AVX2-NEXT: pandn %ymm3, %ymm2
+; AVX2-NEXT: por %ymm0, %ymm2
+; AVX2-NEXT: vmovaps %ymm4, %ymm0
+; AVX2-NEXT: vmovaps %ymm2, %ymm1
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v8f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB11_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovapd %zmm0, %zmm1
+; AVX512-NEXT: .LBB11_2:
+; AVX512-NEXT: vmovapd %zmm1, %zmm0
+; AVX512-NEXT: retq
+ %result = call <8 x double> @llvm.ct.select.v8f64(i1 %cond, <8 x double> %a, <8 x double> %b)
+ ret <8 x double> %result
+}
+
+; Test with constant conditions for vector types
+define <4 x i32> @test_ctselect_v4i32_const_true(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test_ctselect_v4i32_const_true:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movb $1, %al
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v4i32_const_true:
+; AVX: # %bb.0:
+; AVX-NEXT: movb $1, %al
+; AVX-NEXT: testb %al, %al
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %xmm3, %xmm3
+; AVX-NEXT: movd %eax, %xmm3
+; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX-NEXT: movdqa %xmm3, %xmm2
+; AVX-NEXT: pand %xmm0, %xmm3
+; AVX-NEXT: pandn %xmm1, %xmm2
+; AVX-NEXT: por %xmm3, %xmm2
+; AVX-NEXT: vmovaps %xmm2, %xmm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v4i32_const_true:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movb $1, %al
+; AVX2-NEXT: testb %al, %al
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %xmm3, %xmm3
+; AVX2-NEXT: movd %eax, %xmm3
+; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX2-NEXT: movdqa %xmm3, %xmm2
+; AVX2-NEXT: pand %xmm0, %xmm3
+; AVX2-NEXT: pandn %xmm1, %xmm2
+; AVX2-NEXT: por %xmm3, %xmm2
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v4i32_const_true:
+; AVX512: # %bb.0:
+; AVX512-NEXT: retq
+ %result = call <4 x i32> @llvm.ct.select.v4i32(i1 true, <4 x i32> %a, <4 x i32> %b)
+ ret <4 x i32> %result
+}
+
+define <4 x i32> @test_ctselect_v4i32_const_false(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test_ctselect_v4i32_const_false:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v4i32_const_false:
+; AVX: # %bb.0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: testb %al, %al
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %xmm3, %xmm3
+; AVX-NEXT: movd %eax, %xmm3
+; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX-NEXT: movdqa %xmm3, %xmm2
+; AVX-NEXT: pand %xmm0, %xmm3
+; AVX-NEXT: pandn %xmm1, %xmm2
+; AVX-NEXT: por %xmm3, %xmm2
+; AVX-NEXT: vmovaps %xmm2, %xmm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v4i32_const_false:
+; AVX2: # %bb.0:
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: testb %al, %al
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %xmm3, %xmm3
+; AVX2-NEXT: movd %eax, %xmm3
+; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX2-NEXT: movdqa %xmm3, %xmm2
+; AVX2-NEXT: pand %xmm0, %xmm3
+; AVX2-NEXT: pandn %xmm1, %xmm2
+; AVX2-NEXT: por %xmm3, %xmm2
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v4i32_const_false:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-NEXT: retq
+ %result = call <4 x i32> @llvm.ct.select.v4i32(i1 false, <4 x i32> %a, <4 x i32> %b)
+ ret <4 x i32> %result
+}
+
+; Test with comparison conditions for vector types
+define <4 x i32> @test_ctselect_v4i32_icmp(i32 %x, i32 %y, <4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test_ctselect_v4i32_icmp:
+; SSE2: # %bb.0:
+; SSE2-NEXT: cmpl %esi, %edi
+; SSE2-NEXT: sete %al
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v4i32_icmp:
+; AVX: # %bb.0:
+; AVX-NEXT: cmpl %esi, %edi
+; AVX-NEXT: sete %al
+; AVX-NEXT: testb %al, %al
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %xmm3, %xmm3
+; AVX-NEXT: movd %eax, %xmm3
+; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX-NEXT: movdqa %xmm3, %xmm2
+; AVX-NEXT: pand %xmm0, %xmm3
+; AVX-NEXT: pandn %xmm1, %xmm2
+; AVX-NEXT: por %xmm3, %xmm2
+; AVX-NEXT: vmovaps %xmm2, %xmm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v4i32_icmp:
+; AVX2: # %bb.0:
+; AVX2-NEXT: cmpl %esi, %edi
+; AVX2-NEXT: sete %al
+; AVX2-NEXT: testb %al, %al
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %xmm3, %xmm3
+; AVX2-NEXT: movd %eax, %xmm3
+; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX2-NEXT: movdqa %xmm3, %xmm2
+; AVX2-NEXT: pand %xmm0, %xmm3
+; AVX2-NEXT: pandn %xmm1, %xmm2
+; AVX2-NEXT: por %xmm3, %xmm2
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v4i32_icmp:
+; AVX512: # %bb.0:
+; AVX512-NEXT: cmpl %esi, %edi
+; AVX512-NEXT: je .LBB14_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-NEXT: .LBB14_2:
+; AVX512-NEXT: retq
+ %cond = icmp eq i32 %x, %y
+ %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b)
+ ret <4 x i32> %result
+}
+
+; Declare the intrinsics
+declare <4 x i32> @llvm.ct.select.v4i32(i1, <4 x i32>, <4 x i32>)
+declare <4 x float> @llvm.ct.select.v4f32(i1, <4 x float>, <4 x float>)
+declare <2 x i64> @llvm.ct.select.v2i64(i1, <2 x i64>, <2 x i64>)
+declare <2 x double> @llvm.ct.select.v2f64(i1, <2 x double>, <2 x double>)
+declare <8 x i32> @llvm.ct.select.v8i32(i1, <8 x i32>, <8 x i32>)
+declare <8 x float> @llvm.ct.select.v8f32(i1, <8 x float>, <8 x float>)
+declare <4 x i64> @llvm.ct.select.v4i64(i1, <4 x i64>, <4 x i64>)
+declare <4 x double> @llvm.ct.select.v4f64(i1, <4 x double>, <4 x double>)
+declare <16 x i32> @llvm.ct.select.v16i32(i1, <16 x i32>, <16 x i32>)
+declare <16 x float> @llvm.ct.select.v16f32(i1, <16 x float>, <16 x float>)
+declare <8 x i64> @llvm.ct.select.v8i64(i1, <8 x i64>, <8 x i64>)
+declare <8 x double> @llvm.ct.select.v8f64(i1, <8 x double>, <8 x double>)
diff --git a/llvm/test/CodeGen/X86/ctselect.ll b/llvm/test/CodeGen/X86/ctselect.ll
index e1abae80cef4f..d76ae0365f28c 100644
--- a/llvm/test/CodeGen/X86/ctselect.ll
+++ b/llvm/test/CodeGen/X86/ctselect.ll
@@ -8,77 +8,122 @@
define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) {
; X64-LABEL: test_ctselect_i8:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: andb $1, %al
-; X64-NEXT: xorl %edx, %esi
-; X64-NEXT: negb %al
-; X64-NEXT: andb %sil, %al
-; X64-NEXT: xorb %dl, %al
+; X64-NEXT: movl %edx, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %esi, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_i8:
; X32: # %bb.0:
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: andb $1, %al
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: xorb %cl, %dl
-; X32-NEXT: negb %al
-; X32-NEXT: andb %dl, %al
-; X32-NEXT: xorb %cl, %al
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; X32-NEXT: # kill: def $al killed $al killed $eax
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_i8:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: andb $1, %al
; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: xorb %cl, %dl
-; X32-NOCMOV-NEXT: negb %al
-; X32-NOCMOV-NEXT: andb %dl, %al
-; X32-NOCMOV-NEXT: xorb %cl, %al
+; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: sete %ah
+; X32-NOCMOV-NEXT: movb %ah, %ch
+; X32-NOCMOV-NEXT: negb %ch
+; X32-NOCMOV-NEXT: movb %dl, %al
+; X32-NOCMOV-NEXT: andb %ch, %al
+; X32-NOCMOV-NEXT: notb %ch
+; X32-NOCMOV-NEXT: andb %cl, %ch
+; X32-NOCMOV-NEXT: orb %ch, %al
; X32-NOCMOV-NEXT: retl
%result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b)
ret i8 %result
}
+define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) {
+; X64-LABEL: test_ctselect_i16:
+; X64: # %bb.0:
+; X64-NEXT: movl %edx, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %esi, %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_i16:
+; X32: # %bb.0:
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnew {{[0-9]+}}(%esp), %ax
+; X32-NEXT: retl
+;
+; X32-NOCMOV-LABEL: test_ctselect_i16:
+; X32-NOCMOV: # %bb.0:
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
+; X32-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbw %bh, %si
+; X32-NOCMOV-NEXT: negw %si
+; X32-NOCMOV-NEXT: movw %dx, %ax
+; X32-NOCMOV-NEXT: andw %si, %ax
+; X32-NOCMOV-NEXT: notw %si
+; X32-NOCMOV-NEXT: andw %cx, %si
+; X32-NOCMOV-NEXT: orw %si, %ax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
+; X32-NOCMOV-NEXT: retl
+ %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b)
+ ret i16 %result
+}
+
define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) {
; X64-LABEL: test_ctselect_i32:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: xorl %edx, %esi
-; X64-NEXT: andl $1, %eax
-; X64-NEXT: negl %eax
-; X64-NEXT: andl %esi, %eax
-; X64-NEXT: xorl %edx, %eax
+; X64-NEXT: movl %edx, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %esi, %eax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_i32:
; X32: # %bb.0:
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: andb $1, %al
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: xorl %ecx, %edx
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: negl %eax
-; X32-NEXT: andl %edx, %eax
-; X32-NEXT: xorl %ecx, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_i32:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: andb $1, %al
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: xorl %ecx, %edx
-; X32-NOCMOV-NEXT: movzbl %al, %eax
-; X32-NOCMOV-NEXT: negl %eax
-; X32-NOCMOV-NEXT: andl %edx, %eax
-; X32-NOCMOV-NEXT: xorl %ecx, %eax
+; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edx, %eax
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ecx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
ret i32 %result
@@ -87,67 +132,66 @@ define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) {
define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) {
; X64-LABEL: test_ctselect_i64:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: xorq %rdx, %rsi
-; X64-NEXT: andl $1, %eax
-; X64-NEXT: negq %rax
-; X64-NEXT: andq %rsi, %rax
-; X64-NEXT: xorq %rdx, %rax
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovneq %rsi, %rax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_i64:
; X32: # %bb.0:
-; X32-NEXT: pushl %edi
-; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: pushl %esi
-; X32-NEXT: .cfi_def_cfa_offset 12
-; X32-NEXT: .cfi_offset %esi, -12
-; X32-NEXT: .cfi_offset %edi, -8
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: andb $1, %dl
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: xorl %esi, %eax
-; X32-NEXT: movzbl %dl, %edi
-; X32-NEXT: negl %edi
-; X32-NEXT: andl %edi, %eax
-; X32-NEXT: xorl %esi, %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: xorl %ecx, %edx
-; X32-NEXT: andl %edi, %edx
-; X32-NEXT: xorl %ecx, %edx
-; X32-NEXT: popl %esi
-; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: popl %edi
-; X32-NEXT: .cfi_def_cfa_offset 4
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %edx
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_i64:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: pushl %edi
+; X32-NOCMOV-NEXT: pushl %ebp
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
-; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: pushl %ebx
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
-; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
-; X32-NOCMOV-NEXT: .cfi_offset %edi, -8
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: andb $1, %dl
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: xorl %esi, %eax
-; X32-NOCMOV-NEXT: movzbl %dl, %edi
-; X32-NOCMOV-NEXT: negl %edi
-; X32-NOCMOV-NEXT: andl %edi, %eax
-; X32-NOCMOV-NEXT: xorl %esi, %eax
+; X32-NOCMOV-NEXT: pushl %edi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -20
+; X32-NOCMOV-NEXT: .cfi_offset %edi, -16
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebp, -8
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: xorl %ecx, %edx
-; X32-NOCMOV-NEXT: andl %edi, %edx
-; X32-NOCMOV-NEXT: xorl %ecx, %edx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
+; X32-NOCMOV-NEXT: testb $1, %bl
+; X32-NOCMOV-NEXT: sete %bh
+; X32-NOCMOV-NEXT: movb %bh, %cl
+; X32-NOCMOV-NEXT: movzbl %cl, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edx, %eax
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ebp, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
+; X32-NOCMOV-NEXT: testb $1, %bl
+; X32-NOCMOV-NEXT: sete %cl
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X32-NOCMOV-NEXT: movb %cl, %ch
+; X32-NOCMOV-NEXT: movzbl %ch, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edi, %edx
+; X32-NOCMOV-NEXT: andl %esi, %edx
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ebx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %edx
; X32-NOCMOV-NEXT: popl %esi
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
; X32-NOCMOV-NEXT: popl %edi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebp
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%result = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b)
@@ -157,59 +201,74 @@ define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) {
define float @test_ctselect_f32(i1 %cond, float %a, float %b) {
; X64-LABEL: test_ctselect_f32:
; X64: # %bb.0:
-; X64-NEXT: movd %xmm1, %eax
-; X64-NEXT: pxor %xmm1, %xmm0
-; X64-NEXT: movd %xmm0, %ecx
-; X64-NEXT: andl $1, %edi
-; X64-NEXT: negl %edi
-; X64-NEXT: andl %ecx, %edi
-; X64-NEXT: xorl %eax, %edi
-; X64-NEXT: movd %edi, %xmm0
+; X64-NEXT: movd %xmm0, %eax
+; X64-NEXT: movd %xmm1, %ecx
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %eax, %ecx
+; X64-NEXT: movd %ecx, %xmm0
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_f32:
; X32: # %bb.0:
-; X32-NEXT: subl $12, %esp
+; X32-NEXT: pushl %edi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: pushl %eax
; X32-NEXT: .cfi_def_cfa_offset 16
-; X32-NEXT: flds {{[0-9]+}}(%esp)
-; X32-NEXT: fstps {{[0-9]+}}(%esp)
-; X32-NEXT: flds {{[0-9]+}}(%esp)
-; X32-NEXT: fstps (%esp)
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: andb $1, %al
+; X32-NEXT: .cfi_offset %esi, -12
+; X32-NEXT: .cfi_offset %edi, -8
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: sete %al
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: negl %eax
-; X32-NEXT: movl (%esp), %edx
-; X32-NEXT: xorl %ecx, %edx
-; X32-NEXT: andl %eax, %edx
-; X32-NEXT: xorl %ecx, %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-NEXT: flds {{[0-9]+}}(%esp)
-; X32-NEXT: addl $12, %esp
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movb %al, %ah
+; X32-NEXT: movzbl %ah, %edi
+; X32-NEXT: negl %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: andl %edi, %esi
+; X32-NEXT: notl %edi
+; X32-NEXT: andl %ecx, %edi
+; X32-NEXT: orl %edi, %esi
+; X32-NEXT: movl %esi, (%esp)
+; X32-NEXT: flds (%esp)
+; X32-NEXT: addl $4, %esp
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: popl %edi
; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_f32:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: subl $12, %esp
+; X32-NOCMOV-NEXT: pushl %edi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: pushl %eax
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
-; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
-; X32-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp)
-; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
-; X32-NOCMOV-NEXT: fstps (%esp)
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: andb $1, %al
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %edi, -8
+; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: sete %al
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: movzbl %al, %eax
-; X32-NOCMOV-NEXT: negl %eax
-; X32-NOCMOV-NEXT: movl (%esp), %edx
-; X32-NOCMOV-NEXT: xorl %ecx, %edx
-; X32-NOCMOV-NEXT: andl %eax, %edx
-; X32-NOCMOV-NEXT: xorl %ecx, %edx
-; X32-NOCMOV-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
-; X32-NOCMOV-NEXT: addl $12, %esp
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT: movb %al, %ah
+; X32-NOCMOV-NEXT: movzbl %ah, %edi
+; X32-NOCMOV-NEXT: negl %edi
+; X32-NOCMOV-NEXT: movl %edx, %esi
+; X32-NOCMOV-NEXT: andl %edi, %esi
+; X32-NOCMOV-NEXT: notl %edi
+; X32-NOCMOV-NEXT: andl %ecx, %edi
+; X32-NOCMOV-NEXT: orl %edi, %esi
+; X32-NOCMOV-NEXT: movl %esi, (%esp)
+; X32-NOCMOV-NEXT: flds (%esp)
+; X32-NOCMOV-NEXT: addl $4, %esp
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %edi
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b)
@@ -219,72 +278,96 @@ define float @test_ctselect_f32(i1 %cond, float %a, float %b) {
define double @test_ctselect_f64(i1 %cond, double %a, double %b) {
; X64-LABEL: test_ctselect_f64:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-NEXT: movq %xmm1, %rax
-; X64-NEXT: pxor %xmm1, %xmm0
-; X64-NEXT: movq %xmm0, %rcx
-; X64-NEXT: andl $1, %edi
-; X64-NEXT: negq %rdi
-; X64-NEXT: andq %rcx, %rdi
-; X64-NEXT: xorq %rax, %rdi
-; X64-NEXT: movq %rdi, %xmm0
+; X64-NEXT: movq %xmm0, %rax
+; X64-NEXT: movq %xmm1, %rcx
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovneq %rax, %rcx
+; X64-NEXT: movq %rcx, %xmm0
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_f64:
; X32: # %bb.0:
-; X32-NEXT: pushl %esi
+; X32-NEXT: pushl %edi
; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 12
; X32-NEXT: subl $8, %esp
-; X32-NEXT: .cfi_def_cfa_offset 16
-; X32-NEXT: .cfi_offset %esi, -8
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: .cfi_def_cfa_offset 20
+; X32-NEXT: .cfi_offset %esi, -12
+; X32-NEXT: .cfi_offset %edi, -8
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: sete %al
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: xorl %edx, %esi
-; X32-NEXT: andl $1, %ecx
-; X32-NEXT: negl %ecx
-; X32-NEXT: andl %ecx, %esi
-; X32-NEXT: xorl %edx, %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X32-NEXT: movb %al, %ah
+; X32-NEXT: movzbl %ah, %edi
+; X32-NEXT: negl %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: andl %edi, %esi
+; X32-NEXT: notl %edi
+; X32-NEXT: andl %ecx, %edi
+; X32-NEXT: orl %edi, %esi
+; X32-NEXT: movl %esi, (%esp)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: xorl %eax, %edx
-; X32-NEXT: andl %ecx, %edx
-; X32-NEXT: xorl %eax, %edx
-; X32-NEXT: movl %edx, (%esp)
+; X32-NEXT: movb %al, %ah
+; X32-NEXT: movzbl %ah, %edi
+; X32-NEXT: negl %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: andl %edi, %esi
+; X32-NEXT: notl %edi
+; X32-NEXT: andl %ecx, %edi
+; X32-NEXT: orl %edi, %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X32-NEXT: fldl (%esp)
; X32-NEXT: addl $8, %esp
-; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: .cfi_def_cfa_offset 12
; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: popl %edi
; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_f64:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: pushl %edi
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
; X32-NOCMOV-NEXT: subl $8, %esp
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
-; X32-NOCMOV-NEXT: .cfi_offset %esi, -8
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %edi, -8
+; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: sete %al
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NOCMOV-NEXT: xorl %edx, %esi
-; X32-NOCMOV-NEXT: andl $1, %ecx
-; X32-NOCMOV-NEXT: negl %ecx
-; X32-NOCMOV-NEXT: andl %ecx, %esi
-; X32-NOCMOV-NEXT: xorl %edx, %esi
-; X32-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: movb %al, %ah
+; X32-NOCMOV-NEXT: movzbl %ah, %edi
+; X32-NOCMOV-NEXT: negl %edi
+; X32-NOCMOV-NEXT: movl %edx, %esi
+; X32-NOCMOV-NEXT: andl %edi, %esi
+; X32-NOCMOV-NEXT: notl %edi
+; X32-NOCMOV-NEXT: andl %ecx, %edi
+; X32-NOCMOV-NEXT: orl %edi, %esi
+; X32-NOCMOV-NEXT: movl %esi, (%esp)
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: xorl %eax, %edx
-; X32-NOCMOV-NEXT: andl %ecx, %edx
-; X32-NOCMOV-NEXT: xorl %eax, %edx
-; X32-NOCMOV-NEXT: movl %edx, (%esp)
+; X32-NOCMOV-NEXT: movb %al, %ah
+; X32-NOCMOV-NEXT: movzbl %ah, %edi
+; X32-NOCMOV-NEXT: negl %edi
+; X32-NOCMOV-NEXT: movl %edx, %esi
+; X32-NOCMOV-NEXT: andl %edi, %esi
+; X32-NOCMOV-NEXT: notl %edi
+; X32-NOCMOV-NEXT: andl %ecx, %edi
+; X32-NOCMOV-NEXT: orl %edi, %esi
+; X32-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X32-NOCMOV-NEXT: fldl (%esp)
; X32-NOCMOV-NEXT: addl $8, %esp
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %edi
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b)
@@ -294,38 +377,42 @@ define double @test_ctselect_f64(i1 %cond, double %a, double %b) {
define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) {
; X64-LABEL: test_ctselect_ptr:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: xorq %rdx, %rsi
-; X64-NEXT: andl $1, %eax
-; X64-NEXT: negq %rax
-; X64-NEXT: andq %rsi, %rax
-; X64-NEXT: xorq %rdx, %rax
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovneq %rsi, %rax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_ptr:
; X32: # %bb.0:
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: andb $1, %al
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: xorl %ecx, %edx
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: negl %eax
-; X32-NEXT: andl %edx, %eax
-; X32-NEXT: xorl %ecx, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_ptr:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: andb $1, %al
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: xorl %ecx, %edx
-; X32-NOCMOV-NEXT: movzbl %al, %eax
-; X32-NOCMOV-NEXT: negl %eax
-; X32-NOCMOV-NEXT: andl %edx, %eax
-; X32-NOCMOV-NEXT: xorl %ecx, %eax
+; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edx, %eax
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ecx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b)
ret ptr %result
@@ -335,17 +422,45 @@ define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) {
define i32 @test_ctselect_const_true(i32 %a, i32 %b) {
; X64-LABEL: test_ctselect_const_true:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: movb $1, %cl
+; X64-NEXT: testb %cl, %cl
+; X64-NEXT: cmovnel %edi, %eax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_const_true:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movb $1, %cl
+; X32-NEXT: testb %cl, %cl
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_const_true:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT: movb $1, %al
+; X32-NOCMOV-NEXT: testb %al, %al
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edx, %eax
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ecx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b)
ret i32 %result
@@ -355,16 +470,44 @@ define i32 @test_ctselect_const_false(i32 %a, i32 %b) {
; X64-LABEL: test_ctselect_const_false:
; X64: # %bb.0:
; X64-NEXT: movl %esi, %eax
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: testb %cl, %cl
+; X64-NEXT: cmovnel %edi, %eax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_const_false:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: xorl %ecx, %ecx
+; X32-NEXT: testb %cl, %cl
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_const_false:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT: xorl %eax, %eax
+; X32-NOCMOV-NEXT: testb %al, %al
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edx, %eax
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ecx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b)
ret i32 %result
@@ -374,1151 +517,429 @@ define i32 @test_ctselect_const_false(i32 %a, i32 %b) {
define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) {
; X64-LABEL: test_ctselect_icmp_eq:
; X64: # %bb.0:
-; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: movl %ecx, %eax
; X64-NEXT: cmpl %esi, %edi
-; X64-NEXT: sete %al
-; X64-NEXT: xorl %ecx, %edx
-; X64-NEXT: negl %eax
-; X64-NEXT: andl %edx, %eax
-; X64-NEXT: xorl %ecx, %eax
+; X64-NEXT: sete %cl
+; X64-NEXT: testb %cl, %cl
+; X64-NEXT: cmovnel %edx, %eax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_icmp_eq:
; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: xorl %eax, %eax
-; X32-NEXT: cmpl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: sete %al
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: xorl %ecx, %edx
-; X32-NEXT: negl %eax
-; X32-NEXT: andl %edx, %eax
-; X32-NEXT: xorl %ecx, %eax
+; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: sete %cl
+; X32-NEXT: testb %cl, %cl
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_icmp_eq:
; X32-NOCMOV: # %bb.0:
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: xorl %eax, %eax
-; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %eax
; X32-NOCMOV-NEXT: sete %al
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: xorl %ecx, %edx
-; X32-NOCMOV-NEXT: negl %eax
-; X32-NOCMOV-NEXT: andl %edx, %eax
-; X32-NOCMOV-NEXT: xorl %ecx, %eax
+; X32-NOCMOV-NEXT: testb %al, %al
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edx, %eax
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ecx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%cond = icmp eq i32 %x, %y
%result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
ret i32 %result
}
-define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) {
-; X64-LABEL: test_ctselect_icmp_ult:
+define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) {
+; X64-LABEL: test_ctselect_icmp_ne:
; X64: # %bb.0:
-; X64-NEXT: xorl %ecx, %edx
-; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: movl %ecx, %eax
; X64-NEXT: cmpl %esi, %edi
-; X64-NEXT: sbbl %eax, %eax
-; X64-NEXT: andl %edx, %eax
-; X64-NEXT: xorl %ecx, %eax
+; X64-NEXT: setne %cl
+; X64-NEXT: testb %cl, %cl
+; X64-NEXT: cmovnel %edx, %eax
; X64-NEXT: retq
;
-; X32-LABEL: test_ctselect_icmp_ult:
+; X32-LABEL: test_ctselect_icmp_ne:
; X32: # %bb.0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: xorl %edx, %edx
-; X32-NEXT: cmpl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: sbbl %edx, %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: xorl %ecx, %eax
-; X32-NEXT: andl %edx, %eax
-; X32-NEXT: xorl %ecx, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: setne %cl
+; X32-NEXT: testb %cl, %cl
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
-; X32-NOCMOV-LABEL: test_ctselect_icmp_ult:
+; X32-NOCMOV-LABEL: test_ctselect_icmp_ne:
; X32-NOCMOV: # %bb.0:
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: xorl %edx, %edx
; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: sbbl %edx, %edx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: xorl %ecx, %eax
-; X32-NOCMOV-NEXT: andl %edx, %eax
-; X32-NOCMOV-NEXT: xorl %ecx, %eax
+; X32-NOCMOV-NEXT: setne %al
+; X32-NOCMOV-NEXT: testb %al, %al
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edx, %eax
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ecx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
- %cond = icmp ult i32 %x, %y
+ %cond = icmp ne i32 %x, %y
%result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
ret i32 %result
}
-define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) {
-; X64-LABEL: test_ctselect_fcmp_oeq:
+define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) {
+; X64-LABEL: test_ctselect_icmp_slt:
; X64: # %bb.0:
-; X64-NEXT: cmpeqss %xmm1, %xmm0
-; X64-NEXT: xorps %xmm3, %xmm2
-; X64-NEXT: andps %xmm2, %xmm0
-; X64-NEXT: xorps %xmm3, %xmm0
+; X64-NEXT: movl %ecx, %eax
+; X64-NEXT: cmpl %esi, %edi
+; X64-NEXT: setl %cl
+; X64-NEXT: testb %cl, %cl
+; X64-NEXT: cmovnel %edx, %eax
; X64-NEXT: retq
;
-; X32-LABEL: test_ctselect_fcmp_oeq:
+; X32-LABEL: test_ctselect_icmp_slt:
; X32: # %bb.0:
-; X32-NEXT: subl $12, %esp
-; X32-NEXT: .cfi_def_cfa_offset 16
-; X32-NEXT: flds {{[0-9]+}}(%esp)
-; X32-NEXT: fstps {{[0-9]+}}(%esp)
-; X32-NEXT: flds {{[0-9]+}}(%esp)
-; X32-NEXT: fstps (%esp)
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: flds {{[0-9]+}}(%esp)
-; X32-NEXT: flds {{[0-9]+}}(%esp)
-; X32-NEXT: fucompi %st(1), %st
-; X32-NEXT: fstp %st(0)
-; X32-NEXT: setnp %cl
-; X32-NEXT: sete %dl
-; X32-NEXT: andb %cl, %dl
-; X32-NEXT: movzbl %dl, %ecx
-; X32-NEXT: negl %ecx
-; X32-NEXT: movl (%esp), %edx
-; X32-NEXT: xorl %eax, %edx
-; X32-NEXT: andl %ecx, %edx
-; X32-NEXT: xorl %eax, %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-NEXT: flds {{[0-9]+}}(%esp)
-; X32-NEXT: addl $12, %esp
-; X32-NEXT: .cfi_def_cfa_offset 4
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: setl %cl
+; X32-NEXT: testb %cl, %cl
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
-; X32-NOCMOV-LABEL: test_ctselect_fcmp_oeq:
+; X32-NOCMOV-LABEL: test_ctselect_icmp_slt:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: subl $12, %esp
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
-; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
-; X32-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp)
-; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
-; X32-NOCMOV-NEXT: fstps (%esp)
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
-; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
-; X32-NOCMOV-NEXT: fucompp
-; X32-NOCMOV-NEXT: fnstsw %ax
-; X32-NOCMOV-NEXT: # kill: def $ah killed $ah killed $ax
-; X32-NOCMOV-NEXT: sahf
-; X32-NOCMOV-NEXT: setnp %al
-; X32-NOCMOV-NEXT: sete %dl
-; X32-NOCMOV-NEXT: andb %al, %dl
-; X32-NOCMOV-NEXT: movzbl %dl, %eax
-; X32-NOCMOV-NEXT: negl %eax
-; X32-NOCMOV-NEXT: movl (%esp), %edx
-; X32-NOCMOV-NEXT: xorl %ecx, %edx
-; X32-NOCMOV-NEXT: andl %eax, %edx
-; X32-NOCMOV-NEXT: xorl %ecx, %edx
-; X32-NOCMOV-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
-; X32-NOCMOV-NEXT: addl $12, %esp
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT: setl %al
+; X32-NOCMOV-NEXT: testb %al, %al
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edx, %eax
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ecx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
- %cond = fcmp oeq float %x, %y
- %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b)
- ret float %result
+ %cond = icmp slt i32 %x, %y
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+ ret i32 %result
}
-; Test with memory operands
-define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) {
-; X64-LABEL: test_ctselect_load:
+define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) {
+; X64-LABEL: test_ctselect_icmp_ult:
; X64: # %bb.0:
-; X64-NEXT: movl (%rdx), %ecx
-; X64-NEXT: movl (%rsi), %eax
-; X64-NEXT: xorl %ecx, %eax
-; X64-NEXT: andl $1, %edi
-; X64-NEXT: negl %edi
-; X64-NEXT: andl %edi, %eax
-; X64-NEXT: xorl %ecx, %eax
+; X64-NEXT: movl %ecx, %eax
+; X64-NEXT: cmpl %esi, %edi
+; X64-NEXT: setb %cl
+; X64-NEXT: testb %cl, %cl
+; X64-NEXT: cmovnel %edx, %eax
; X64-NEXT: retq
;
-; X32-LABEL: test_ctselect_load:
+; X32-LABEL: test_ctselect_icmp_ult:
; X32: # %bb.0:
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: andb $1, %al
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl (%edx), %edx
-; X32-NEXT: movl (%ecx), %ecx
-; X32-NEXT: xorl %edx, %ecx
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: negl %eax
-; X32-NEXT: andl %ecx, %eax
-; X32-NEXT: xorl %edx, %eax
+; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: setb %cl
+; X32-NEXT: testb %cl, %cl
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
-; X32-NOCMOV-LABEL: test_ctselect_load:
+; X32-NOCMOV-LABEL: test_ctselect_icmp_ult:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: andb $1, %al
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: movl (%edx), %edx
-; X32-NOCMOV-NEXT: movl (%ecx), %ecx
-; X32-NOCMOV-NEXT: xorl %edx, %ecx
-; X32-NOCMOV-NEXT: movzbl %al, %eax
-; X32-NOCMOV-NEXT: negl %eax
-; X32-NOCMOV-NEXT: andl %ecx, %eax
-; X32-NOCMOV-NEXT: xorl %edx, %eax
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT: setb %al
+; X32-NOCMOV-NEXT: testb %al, %al
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edx, %eax
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ecx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
- %a = load i32, ptr %p1
- %b = load i32, ptr %p2
+ %cond = icmp ult i32 %x, %y
%result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
ret i32 %result
}
-; Test nested ct_select calls
-define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) {
-; X64-LABEL: test_ctselect_nested:
+define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) {
+; X64-LABEL: test_ctselect_fcmp_oeq:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: xorl %ecx, %edx
-; X64-NEXT: andl $1, %esi
-; X64-NEXT: negl %esi
-; X64-NEXT: andl %edx, %esi
-; X64-NEXT: xorl %r8d, %ecx
-; X64-NEXT: xorl %esi, %ecx
-; X64-NEXT: andl $1, %eax
-; X64-NEXT: negl %eax
-; X64-NEXT: andl %ecx, %eax
-; X64-NEXT: xorl %r8d, %eax
+; X64-NEXT: movd %xmm2, %eax
+; X64-NEXT: movd %xmm3, %ecx
+; X64-NEXT: ucomiss %xmm1, %xmm0
+; X64-NEXT: setnp %dl
+; X64-NEXT: sete %sil
+; X64-NEXT: testb %dl, %sil
+; X64-NEXT: cmovnel %eax, %ecx
+; X64-NEXT: movd %ecx, %xmm0
; X64-NEXT: retq
;
-; X32-LABEL: test_ctselect_nested:
+; X32-LABEL: test_ctselect_fcmp_oeq:
; X32: # %bb.0:
; X32-NEXT: pushl %edi
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: pushl %esi
; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: pushl %eax
+; X32-NEXT: .cfi_def_cfa_offset 16
; X32-NEXT: .cfi_offset %esi, -12
; X32-NEXT: .cfi_offset %edi, -8
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: andb $1, %al
-; X32-NEXT: movb {{[0-9]+}}(%esp), %ah
-; X32-NEXT: andb $1, %ah
+; X32-NEXT: flds {{[0-9]+}}(%esp)
+; X32-NEXT: flds {{[0-9]+}}(%esp)
+; X32-NEXT: fucompi %st(1), %st
+; X32-NEXT: fstp %st(0)
+; X32-NEXT: setnp %al
+; X32-NEXT: sete %cl
+; X32-NEXT: testb %al, %cl
+; X32-NEXT: sete %al
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: xorl %edx, %esi
+; X32-NEXT: movb %al, %ah
; X32-NEXT: movzbl %ah, %edi
; X32-NEXT: negl %edi
-; X32-NEXT: andl %esi, %edi
-; X32-NEXT: xorl %ecx, %edx
-; X32-NEXT: xorl %edi, %edx
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: negl %eax
-; X32-NEXT: andl %edx, %eax
-; X32-NEXT: xorl %ecx, %eax
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: andl %edi, %esi
+; X32-NEXT: notl %edi
+; X32-NEXT: andl %ecx, %edi
+; X32-NEXT: orl %edi, %esi
+; X32-NEXT: movl %esi, (%esp)
+; X32-NEXT: flds (%esp)
+; X32-NEXT: addl $4, %esp
+; X32-NEXT: .cfi_def_cfa_offset 12
; X32-NEXT: popl %esi
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: popl %edi
; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
-; X32-NOCMOV-LABEL: test_ctselect_nested:
+; X32-NOCMOV-LABEL: test_ctselect_fcmp_oeq:
; X32-NOCMOV: # %bb.0:
; X32-NOCMOV-NEXT: pushl %edi
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
; X32-NOCMOV-NEXT: pushl %esi
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: pushl %eax
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
; X32-NOCMOV-NEXT: .cfi_offset %edi, -8
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: andb $1, %al
-; X32-NOCMOV-NEXT: movb {{[0-9]+}}(%esp), %ah
-; X32-NOCMOV-NEXT: andb $1, %ah
+; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: fucompp
+; X32-NOCMOV-NEXT: fnstsw %ax
+; X32-NOCMOV-NEXT: # kill: def $ah killed $ah killed $ax
+; X32-NOCMOV-NEXT: sahf
+; X32-NOCMOV-NEXT: setnp %al
+; X32-NOCMOV-NEXT: sete %cl
+; X32-NOCMOV-NEXT: testb %al, %cl
+; X32-NOCMOV-NEXT: sete %al
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NOCMOV-NEXT: xorl %edx, %esi
+; X32-NOCMOV-NEXT: movb %al, %ah
; X32-NOCMOV-NEXT: movzbl %ah, %edi
; X32-NOCMOV-NEXT: negl %edi
-; X32-NOCMOV-NEXT: andl %esi, %edi
-; X32-NOCMOV-NEXT: xorl %ecx, %edx
-; X32-NOCMOV-NEXT: xorl %edi, %edx
-; X32-NOCMOV-NEXT: movzbl %al, %eax
-; X32-NOCMOV-NEXT: negl %eax
-; X32-NOCMOV-NEXT: andl %edx, %eax
-; X32-NOCMOV-NEXT: xorl %ecx, %eax
+; X32-NOCMOV-NEXT: movl %edx, %esi
+; X32-NOCMOV-NEXT: andl %edi, %esi
+; X32-NOCMOV-NEXT: notl %edi
+; X32-NOCMOV-NEXT: andl %ecx, %edi
+; X32-NOCMOV-NEXT: orl %edi, %esi
+; X32-NOCMOV-NEXT: movl %esi, (%esp)
+; X32-NOCMOV-NEXT: flds (%esp)
+; X32-NOCMOV-NEXT: addl $4, %esp
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
; X32-NOCMOV-NEXT: popl %esi
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
; X32-NOCMOV-NEXT: popl %edi
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
- %inner = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b)
- %result = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %inner, i32 %c)
- ret i32 %result
-}
-
-; Test nested CT_SELECT pattern with AND merging on i1 values
-; Pattern: ct_select C0, (ct_select C1, X, Y), Y -> ct_select (C0 & C1), X, Y
-; This optimization only applies when selecting between i1 values (boolean logic)
-define i32 @test_ctselect_nested_and_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
-; X64-LABEL: test_ctselect_nested_and_i1_to_i32:
-; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: andl %esi, %eax
-; X64-NEXT: xorl %ecx, %edx
-; X64-NEXT: andl $1, %eax
-; X64-NEXT: negl %eax
-; X64-NEXT: andl %edx, %eax
-; X64-NEXT: xorl %ecx, %eax
-; X64-NEXT: retq
-;
-; X32-LABEL: test_ctselect_nested_and_i1_to_i32:
-; X32: # %bb.0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: andb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $1, %al
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: xorl %ecx, %edx
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: negl %eax
-; X32-NEXT: andl %edx, %eax
-; X32-NEXT: xorl %ecx, %eax
-; X32-NEXT: retl
-;
-; X32-NOCMOV-LABEL: test_ctselect_nested_and_i1_to_i32:
-; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: andb {{[0-9]+}}(%esp), %al
-; X32-NOCMOV-NEXT: andb $1, %al
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: xorl %ecx, %edx
-; X32-NOCMOV-NEXT: movzbl %al, %eax
-; X32-NOCMOV-NEXT: negl %eax
-; X32-NOCMOV-NEXT: andl %edx, %eax
-; X32-NOCMOV-NEXT: xorl %ecx, %eax
-; X32-NOCMOV-NEXT: retl
- %inner = call i1 @llvm.ct.select.i1(i1 %c1, i1 true, i1 false)
- %cond = call i1 @llvm.ct.select.i1(i1 %c0, i1 %inner, i1 false)
- %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %y)
- ret i32 %result
-}
-
-; Test nested CT_SELECT pattern with OR merging on i1 values
-; Pattern: ct_select C0, X, (ct_select C1, X, Y) -> ct_select (C0 | C1), X, Y
-; This optimization only applies when selecting between i1 values (boolean logic)
-define i32 @test_ctselect_nested_or_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
-; X64-LABEL: test_ctselect_nested_or_i1_to_i32:
-; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: orl %esi, %eax
-; X64-NEXT: xorl %ecx, %edx
-; X64-NEXT: andl $1, %eax
-; X64-NEXT: negl %eax
-; X64-NEXT: andl %edx, %eax
-; X64-NEXT: xorl %ecx, %eax
-; X64-NEXT: retq
-;
-; X32-LABEL: test_ctselect_nested_or_i1_to_i32:
-; X32: # %bb.0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: orb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $1, %al
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: xorl %ecx, %edx
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: negl %eax
-; X32-NEXT: andl %edx, %eax
-; X32-NEXT: xorl %ecx, %eax
-; X32-NEXT: retl
-;
-; X32-NOCMOV-LABEL: test_ctselect_nested_or_i1_to_i32:
-; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: orb {{[0-9]+}}(%esp), %al
-; X32-NOCMOV-NEXT: andb $1, %al
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: xorl %ecx, %edx
-; X32-NOCMOV-NEXT: movzbl %al, %eax
-; X32-NOCMOV-NEXT: negl %eax
-; X32-NOCMOV-NEXT: andl %edx, %eax
-; X32-NOCMOV-NEXT: xorl %ecx, %eax
-; X32-NOCMOV-NEXT: retl
- %inner = call i1 @llvm.ct.select.i1(i1 %c1, i1 true, i1 false)
- %cond = call i1 @llvm.ct.select.i1(i1 %c0, i1 true, i1 %inner)
- %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %y)
- ret i32 %result
+ %cond = fcmp oeq float %x, %y
+ %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b)
+ ret float %result
}
-; Test double nested CT_SELECT with recursive AND merging
-; Pattern: ct_select C0, (ct_select C1, (ct_select C2, X, Y), Y), Y
-; -> ct_select C0, (ct_select (C1 & C2), X, Y), Y
-; -> ct_select (C0 & (C1 & C2)), X, Y
-; This tests that the optimization can be applied recursively
-define i32 @test_ctselect_double_nested_and_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x, i32 %y) {
-; X64-LABEL: test_ctselect_double_nested_and_i1:
+; Test with memory operands
+define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) {
+; X64-LABEL: test_ctselect_load:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: andl %esi, %eax
-; X64-NEXT: andl %edx, %eax
-; X64-NEXT: xorl %r8d, %ecx
-; X64-NEXT: andl $1, %eax
-; X64-NEXT: negl %eax
-; X64-NEXT: andl %ecx, %eax
-; X64-NEXT: xorl %r8d, %eax
+; X64-NEXT: movl (%rdx), %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel (%rsi), %eax
; X64-NEXT: retq
;
-; X32-LABEL: test_ctselect_double_nested_and_i1:
+; X32-LABEL: test_ctselect_load:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: andb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb {{[0-9]+}}(%esp), %al
-; X32-NEXT: andb $1, %al
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: xorl %ecx, %edx
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: negl %eax
-; X32-NEXT: andl %edx, %eax
-; X32-NEXT: xorl %ecx, %eax
-; X32-NEXT: retl
-;
-; X32-NOCMOV-LABEL: test_ctselect_double_nested_and_i1:
-; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: andb {{[0-9]+}}(%esp), %al
-; X32-NOCMOV-NEXT: andb {{[0-9]+}}(%esp), %al
-; X32-NOCMOV-NEXT: andb $1, %al
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: xorl %ecx, %edx
-; X32-NOCMOV-NEXT: movzbl %al, %eax
-; X32-NOCMOV-NEXT: negl %eax
-; X32-NOCMOV-NEXT: andl %edx, %eax
-; X32-NOCMOV-NEXT: xorl %ecx, %eax
-; X32-NOCMOV-NEXT: retl
- %inner2 = call i1 @llvm.ct.select.i1(i1 %c2, i1 true, i1 false)
- %inner1 = call i1 @llvm.ct.select.i1(i1 %c1, i1 %inner2, i1 false)
- %cond = call i1 @llvm.ct.select.i1(i1 %c0, i1 %inner1, i1 false)
- %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %y)
- ret i32 %result
-}
-
-; Vector CT_SELECT Tests
-; ============================================================================
-
-; Test vector CT_SELECT with v4i32 (128-bit vector with single i1 mask)
-; NOW CONSTANT-TIME: Uses bitwise XOR/AND operations instead of branches!
-define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) {
-; X64-LABEL: test_ctselect_v4i32:
-; X64: # %bb.0:
-; X64-NEXT: pxor %xmm1, %xmm0
-; X64-NEXT: movd %edi, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; X64-NEXT: pslld $31, %xmm2
-; X64-NEXT: psrad $31, %xmm2
-; X64-NEXT: pand %xmm2, %xmm0
-; X64-NEXT: pxor %xmm1, %xmm0
-; X64-NEXT: retq
-;
-; X32-LABEL: test_ctselect_v4i32:
-; X32: # %bb.0:
-; X32-NEXT: pushl %ebp
-; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: pushl %ebx
-; X32-NEXT: .cfi_def_cfa_offset 12
-; X32-NEXT: pushl %edi
-; X32-NEXT: .cfi_def_cfa_offset 16
-; X32-NEXT: pushl %esi
-; X32-NEXT: .cfi_def_cfa_offset 20
-; X32-NEXT: .cfi_offset %esi, -20
-; X32-NEXT: .cfi_offset %edi, -16
-; X32-NEXT: .cfi_offset %ebx, -12
-; X32-NEXT: .cfi_offset %ebp, -8
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: xorl %ebx, %edx
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: andl $1, %edi
-; X32-NEXT: negl %edi
-; X32-NEXT: andl %edi, %edx
-; X32-NEXT: xorl %ebx, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: xorl %ebp, %ebx
-; X32-NEXT: andl %edi, %ebx
-; X32-NEXT: xorl %ebp, %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X32-NEXT: xorl %esi, %ebp
-; X32-NEXT: andl %edi, %ebp
-; X32-NEXT: xorl %esi, %ebp
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: xorl %ecx, %esi
-; X32-NEXT: andl %edi, %esi
-; X32-NEXT: xorl %ecx, %esi
-; X32-NEXT: movl %esi, 12(%eax)
-; X32-NEXT: movl %ebp, 8(%eax)
-; X32-NEXT: movl %ebx, 4(%eax)
-; X32-NEXT: movl %edx, (%eax)
-; X32-NEXT: popl %esi
-; X32-NEXT: .cfi_def_cfa_offset 16
-; X32-NEXT: popl %edi
-; X32-NEXT: .cfi_def_cfa_offset 12
-; X32-NEXT: popl %ebx
-; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: popl %ebp
-; X32-NEXT: .cfi_def_cfa_offset 4
-; X32-NEXT: retl $4
+; X32-NEXT: movl (%eax), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel (%ecx), %eax
+; X32-NEXT: retl
;
-; X32-NOCMOV-LABEL: test_ctselect_v4i32:
+; X32-NOCMOV-LABEL: test_ctselect_load:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: pushl %ebp
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
; X32-NOCMOV-NEXT: pushl %ebx
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
-; X32-NOCMOV-NEXT: pushl %edi
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
-; X32-NOCMOV-NEXT: pushl %esi
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 20
-; X32-NOCMOV-NEXT: .cfi_offset %esi, -20
-; X32-NOCMOV-NEXT: .cfi_offset %edi, -16
-; X32-NOCMOV-NEXT: .cfi_offset %ebx, -12
-; X32-NOCMOV-NEXT: .cfi_offset %ebp, -8
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: xorl %ebx, %edx
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %edi
-; X32-NOCMOV-NEXT: andl $1, %edi
-; X32-NOCMOV-NEXT: negl %edi
-; X32-NOCMOV-NEXT: andl %edi, %edx
-; X32-NOCMOV-NEXT: xorl %ebx, %edx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NOCMOV-NEXT: xorl %ebp, %ebx
-; X32-NOCMOV-NEXT: andl %edi, %ebx
-; X32-NOCMOV-NEXT: xorl %ebp, %ebx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X32-NOCMOV-NEXT: xorl %esi, %ebp
-; X32-NOCMOV-NEXT: andl %edi, %ebp
-; X32-NOCMOV-NEXT: xorl %esi, %ebp
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NOCMOV-NEXT: xorl %ecx, %esi
-; X32-NOCMOV-NEXT: andl %edi, %esi
-; X32-NOCMOV-NEXT: xorl %ecx, %esi
-; X32-NOCMOV-NEXT: movl %esi, 12(%eax)
-; X32-NOCMOV-NEXT: movl %ebp, 8(%eax)
-; X32-NOCMOV-NEXT: movl %ebx, 4(%eax)
-; X32-NOCMOV-NEXT: movl %edx, (%eax)
-; X32-NOCMOV-NEXT: popl %esi
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
-; X32-NOCMOV-NEXT: popl %edi
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
-; X32-NOCMOV-NEXT: popl %ebx
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
-; X32-NOCMOV-NEXT: popl %ebp
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
-; X32-NOCMOV-NEXT: retl $4
- %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b)
- ret <4 x i32> %result
-}
-define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) {
-; X64-LABEL: test_ctselect_v4f32:
-; X64: # %bb.0:
-; X64-NEXT: pxor %xmm1, %xmm0
-; X64-NEXT: movd %edi, %xmm2
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; X64-NEXT: pslld $31, %xmm2
-; X64-NEXT: psrad $31, %xmm2
-; X64-NEXT: pand %xmm2, %xmm0
-; X64-NEXT: pxor %xmm1, %xmm0
-; X64-NEXT: retq
-;
-; X32-LABEL: test_ctselect_v4f32:
-; X32: # %bb.0:
-; X32-NEXT: pushl %ebp
-; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: pushl %ebx
-; X32-NEXT: .cfi_def_cfa_offset 12
-; X32-NEXT: pushl %edi
-; X32-NEXT: .cfi_def_cfa_offset 16
-; X32-NEXT: pushl %esi
-; X32-NEXT: .cfi_def_cfa_offset 20
-; X32-NEXT: .cfi_offset %esi, -20
-; X32-NEXT: .cfi_offset %edi, -16
-; X32-NEXT: .cfi_offset %ebx, -12
-; X32-NEXT: .cfi_offset %ebp, -8
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: xorl %ebx, %edx
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: andl $1, %edi
-; X32-NEXT: negl %edi
-; X32-NEXT: andl %edi, %edx
-; X32-NEXT: xorl %ebx, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: xorl %ebp, %ebx
-; X32-NEXT: andl %edi, %ebx
-; X32-NEXT: xorl %ebp, %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X32-NEXT: xorl %esi, %ebp
-; X32-NEXT: andl %edi, %ebp
-; X32-NEXT: xorl %esi, %ebp
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: xorl %ecx, %esi
-; X32-NEXT: andl %edi, %esi
-; X32-NEXT: xorl %ecx, %esi
-; X32-NEXT: movl %esi, 12(%eax)
-; X32-NEXT: movl %ebp, 8(%eax)
-; X32-NEXT: movl %ebx, 4(%eax)
-; X32-NEXT: movl %edx, (%eax)
-; X32-NEXT: popl %esi
-; X32-NEXT: .cfi_def_cfa_offset 16
-; X32-NEXT: popl %edi
-; X32-NEXT: .cfi_def_cfa_offset 12
-; X32-NEXT: popl %ebx
-; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: popl %ebp
-; X32-NEXT: .cfi_def_cfa_offset 4
-; X32-NEXT: retl $4
-;
-; X32-NOCMOV-LABEL: test_ctselect_v4f32:
-; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: pushl %ebp
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
-; X32-NOCMOV-NEXT: pushl %ebx
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
-; X32-NOCMOV-NEXT: pushl %edi
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
; X32-NOCMOV-NEXT: pushl %esi
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 20
-; X32-NOCMOV-NEXT: .cfi_offset %esi, -20
-; X32-NOCMOV-NEXT: .cfi_offset %edi, -16
-; X32-NOCMOV-NEXT: .cfi_offset %ebx, -12
-; X32-NOCMOV-NEXT: .cfi_offset %ebp, -8
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: xorl %ebx, %edx
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %edi
-; X32-NOCMOV-NEXT: andl $1, %edi
-; X32-NOCMOV-NEXT: negl %edi
-; X32-NOCMOV-NEXT: andl %edi, %edx
-; X32-NOCMOV-NEXT: xorl %ebx, %edx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NOCMOV-NEXT: xorl %ebp, %ebx
-; X32-NOCMOV-NEXT: andl %edi, %ebx
-; X32-NOCMOV-NEXT: xorl %ebp, %ebx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X32-NOCMOV-NEXT: xorl %esi, %ebp
-; X32-NOCMOV-NEXT: andl %edi, %ebp
-; X32-NOCMOV-NEXT: xorl %esi, %ebp
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NOCMOV-NEXT: xorl %ecx, %esi
-; X32-NOCMOV-NEXT: andl %edi, %esi
-; X32-NOCMOV-NEXT: xorl %ecx, %esi
-; X32-NOCMOV-NEXT: movl %esi, 12(%eax)
-; X32-NOCMOV-NEXT: movl %ebp, 8(%eax)
-; X32-NOCMOV-NEXT: movl %ebx, 4(%eax)
-; X32-NOCMOV-NEXT: movl %edx, (%eax)
-; X32-NOCMOV-NEXT: popl %esi
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
-; X32-NOCMOV-NEXT: popl %edi
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
-; X32-NOCMOV-NEXT: popl %ebx
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
-; X32-NOCMOV-NEXT: popl %ebp
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
-; X32-NOCMOV-NEXT: retl $4
- %result = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, <4 x float> %b)
- ret <4 x float> %result
-}
-
-define <8 x i32> @test_ctselect_v8i32_avx(i1 %cond, <8 x i32> %a, <8 x i32> %b) {
-; X64-LABEL: test_ctselect_v8i32_avx:
-; X64: # %bb.0:
-; X64-NEXT: movd %edi, %xmm4
-; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
-; X64-NEXT: pslld $31, %xmm4
-; X64-NEXT: psrad $31, %xmm4
-; X64-NEXT: movdqa %xmm4, %xmm5
-; X64-NEXT: pandn %xmm2, %xmm5
-; X64-NEXT: pand %xmm4, %xmm0
-; X64-NEXT: por %xmm5, %xmm0
-; X64-NEXT: pand %xmm4, %xmm1
-; X64-NEXT: pandn %xmm3, %xmm4
-; X64-NEXT: por %xmm4, %xmm1
-; X64-NEXT: retq
-;
-; X32-LABEL: test_ctselect_v8i32_avx:
-; X32: # %bb.0:
-; X32-NEXT: pushl %ebp
-; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: pushl %ebx
-; X32-NEXT: .cfi_def_cfa_offset 12
-; X32-NEXT: pushl %edi
-; X32-NEXT: .cfi_def_cfa_offset 16
-; X32-NEXT: pushl %esi
-; X32-NEXT: .cfi_def_cfa_offset 20
-; X32-NEXT: subl $8, %esp
-; X32-NEXT: .cfi_def_cfa_offset 28
-; X32-NEXT: .cfi_offset %esi, -20
-; X32-NEXT: .cfi_offset %edi, -16
-; X32-NEXT: .cfi_offset %ebx, -12
-; X32-NEXT: .cfi_offset %ebp, -8
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: xorl %eax, %ecx
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: andl $1, %edx
-; X32-NEXT: negl %edx
-; X32-NEXT: andl %edx, %ecx
-; X32-NEXT: xorl %eax, %ecx
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: xorl %esi, %eax
-; X32-NEXT: andl %edx, %eax
-; X32-NEXT: xorl %esi, %eax
-; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: xorl %ebx, %esi
-; X32-NEXT: andl %edx, %esi
-; X32-NEXT: xorl %ebx, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: xorl %ebp, %ebx
-; X32-NEXT: andl %edx, %ebx
-; X32-NEXT: xorl %ebp, %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X32-NEXT: xorl %edi, %ebp
-; X32-NEXT: andl %edx, %ebp
-; X32-NEXT: xorl %edi, %ebp
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: xorl %eax, %edi
-; X32-NEXT: andl %edx, %edi
-; X32-NEXT: xorl %eax, %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: xorl %eax, %ecx
-; X32-NEXT: andl %edx, %ecx
-; X32-NEXT: xorl %eax, %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: xorl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: andl %edx, %eax
-; X32-NEXT: xorl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %eax, 28(%edx)
-; X32-NEXT: movl %ecx, 24(%edx)
-; X32-NEXT: movl %edi, 20(%edx)
-; X32-NEXT: movl %ebp, 16(%edx)
-; X32-NEXT: movl %ebx, 12(%edx)
-; X32-NEXT: movl %esi, 8(%edx)
-; X32-NEXT: movl (%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 4(%edx)
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, (%edx)
-; X32-NEXT: movl %edx, %eax
-; X32-NEXT: addl $8, %esp
-; X32-NEXT: .cfi_def_cfa_offset 20
-; X32-NEXT: popl %esi
-; X32-NEXT: .cfi_def_cfa_offset 16
-; X32-NEXT: popl %edi
-; X32-NEXT: .cfi_def_cfa_offset 12
-; X32-NEXT: popl %ebx
-; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: popl %ebp
-; X32-NEXT: .cfi_def_cfa_offset 4
-; X32-NEXT: retl $4
-;
-; X32-NOCMOV-LABEL: test_ctselect_v8i32_avx:
-; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: pushl %ebp
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
-; X32-NOCMOV-NEXT: pushl %ebx
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
-; X32-NOCMOV-NEXT: pushl %edi
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
-; X32-NOCMOV-NEXT: pushl %esi
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 20
-; X32-NOCMOV-NEXT: subl $8, %esp
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 28
-; X32-NOCMOV-NEXT: .cfi_offset %esi, -20
-; X32-NOCMOV-NEXT: .cfi_offset %edi, -16
-; X32-NOCMOV-NEXT: .cfi_offset %ebx, -12
-; X32-NOCMOV-NEXT: .cfi_offset %ebp, -8
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: xorl %eax, %ecx
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: andl $1, %edx
-; X32-NOCMOV-NEXT: negl %edx
-; X32-NOCMOV-NEXT: andl %edx, %ecx
-; X32-NOCMOV-NEXT: xorl %eax, %ecx
-; X32-NOCMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: xorl %esi, %eax
-; X32-NOCMOV-NEXT: andl %edx, %eax
-; X32-NOCMOV-NEXT: xorl %esi, %eax
-; X32-NOCMOV-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NOCMOV-NEXT: xorl %ebx, %esi
-; X32-NOCMOV-NEXT: andl %edx, %esi
-; X32-NOCMOV-NEXT: xorl %ebx, %esi
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NOCMOV-NEXT: xorl %ebp, %ebx
-; X32-NOCMOV-NEXT: andl %edx, %ebx
-; X32-NOCMOV-NEXT: xorl %ebp, %ebx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X32-NOCMOV-NEXT: xorl %edi, %ebp
-; X32-NOCMOV-NEXT: andl %edx, %ebp
-; X32-NOCMOV-NEXT: xorl %edi, %ebp
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NOCMOV-NEXT: xorl %eax, %edi
-; X32-NOCMOV-NEXT: andl %edx, %edi
-; X32-NOCMOV-NEXT: xorl %eax, %edi
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: xorl %eax, %ecx
-; X32-NOCMOV-NEXT: andl %edx, %ecx
-; X32-NOCMOV-NEXT: xorl %eax, %ecx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: xorl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: andl %edx, %eax
-; X32-NOCMOV-NEXT: xorl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: movl %eax, 28(%edx)
-; X32-NOCMOV-NEXT: movl %ecx, 24(%edx)
-; X32-NOCMOV-NEXT: movl %edi, 20(%edx)
-; X32-NOCMOV-NEXT: movl %ebp, 16(%edx)
-; X32-NOCMOV-NEXT: movl %ebx, 12(%edx)
-; X32-NOCMOV-NEXT: movl %esi, 8(%edx)
-; X32-NOCMOV-NEXT: movl (%esp), %eax # 4-byte Reload
-; X32-NOCMOV-NEXT: movl %eax, 4(%edx)
-; X32-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NOCMOV-NEXT: movl %eax, (%edx)
+; X32-NOCMOV-NEXT: movl (%ecx), %ecx
+; X32-NOCMOV-NEXT: movl (%eax), %edx
+; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %esi
+; X32-NOCMOV-NEXT: negl %esi
; X32-NOCMOV-NEXT: movl %edx, %eax
-; X32-NOCMOV-NEXT: addl $8, %esp
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ecx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
; X32-NOCMOV-NEXT: popl %esi
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
-; X32-NOCMOV-NEXT: popl %edi
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
-; X32-NOCMOV-NEXT: popl %ebx
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
-; X32-NOCMOV-NEXT: popl %ebp
+; X32-NOCMOV-NEXT: popl %ebx
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
-; X32-NOCMOV-NEXT: retl $4
- %result = call <8 x i32> @llvm.ct.select.v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b)
- ret <8 x i32> %result
+; X32-NOCMOV-NEXT: retl
+ %a = load i32, ptr %p1
+ %b = load i32, ptr %p2
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+ ret i32 %result
}
-define <8 x float> @test_ctselect_v8f32(i1 %cond, <8 x float> %a, <8 x float> %b) {
-; X64-LABEL: test_ctselect_v8f32:
+; Test nested ctselect calls
+define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) {
+; X64-LABEL: test_ctselect_nested:
; X64: # %bb.0:
-; X64-NEXT: movd %edi, %xmm4
-; X64-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
-; X64-NEXT: pslld $31, %xmm4
-; X64-NEXT: psrad $31, %xmm4
-; X64-NEXT: movdqa %xmm4, %xmm5
-; X64-NEXT: pandn %xmm2, %xmm5
-; X64-NEXT: pand %xmm4, %xmm0
-; X64-NEXT: por %xmm5, %xmm0
-; X64-NEXT: pand %xmm4, %xmm1
-; X64-NEXT: pandn %xmm3, %xmm4
-; X64-NEXT: por %xmm4, %xmm1
+; X64-NEXT: movl %r8d, %eax
+; X64-NEXT: testb $1, %sil
+; X64-NEXT: cmovnel %edx, %ecx
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %ecx, %eax
; X64-NEXT: retq
;
-; X32-LABEL: test_ctselect_v8f32:
+; X32-LABEL: test_ctselect_nested:
; X32: # %bb.0:
-; X32-NEXT: pushl %ebp
-; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: pushl %ebx
-; X32-NEXT: .cfi_def_cfa_offset 12
-; X32-NEXT: pushl %edi
-; X32-NEXT: .cfi_def_cfa_offset 16
-; X32-NEXT: pushl %esi
-; X32-NEXT: .cfi_def_cfa_offset 20
-; X32-NEXT: subl $8, %esp
-; X32-NEXT: .cfi_def_cfa_offset 28
-; X32-NEXT: .cfi_offset %esi, -20
-; X32-NEXT: .cfi_offset %edi, -16
-; X32-NEXT: .cfi_offset %ebx, -12
-; X32-NEXT: .cfi_offset %ebp, -8
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: xorl %eax, %ecx
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: andl $1, %edx
-; X32-NEXT: negl %edx
-; X32-NEXT: andl %edx, %ecx
-; X32-NEXT: xorl %eax, %ecx
-; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: xorl %esi, %eax
-; X32-NEXT: andl %edx, %eax
-; X32-NEXT: xorl %esi, %eax
-; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: xorl %ebx, %esi
-; X32-NEXT: andl %edx, %esi
-; X32-NEXT: xorl %ebx, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: xorl %ebp, %ebx
-; X32-NEXT: andl %edx, %ebx
-; X32-NEXT: xorl %ebp, %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X32-NEXT: xorl %edi, %ebp
-; X32-NEXT: andl %edx, %ebp
-; X32-NEXT: xorl %edi, %ebp
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: xorl %eax, %edi
-; X32-NEXT: andl %edx, %edi
-; X32-NEXT: xorl %eax, %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: xorl %eax, %ecx
-; X32-NEXT: andl %edx, %ecx
-; X32-NEXT: xorl %eax, %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: xorl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: andl %edx, %eax
-; X32-NEXT: xorl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %eax, 28(%edx)
-; X32-NEXT: movl %ecx, 24(%edx)
-; X32-NEXT: movl %edi, 20(%edx)
-; X32-NEXT: movl %ebp, 16(%edx)
-; X32-NEXT: movl %ebx, 12(%edx)
-; X32-NEXT: movl %esi, 8(%edx)
-; X32-NEXT: movl (%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 4(%edx)
-; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, (%edx)
-; X32-NEXT: movl %edx, %eax
-; X32-NEXT: addl $8, %esp
-; X32-NEXT: .cfi_def_cfa_offset 20
-; X32-NEXT: popl %esi
-; X32-NEXT: .cfi_def_cfa_offset 16
-; X32-NEXT: popl %edi
-; X32-NEXT: .cfi_def_cfa_offset 12
-; X32-NEXT: popl %ebx
-; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: popl %ebp
-; X32-NEXT: .cfi_def_cfa_offset 4
-; X32-NEXT: retl $4
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel %ecx, %eax
+; X32-NEXT: retl
;
-; X32-NOCMOV-LABEL: test_ctselect_v8f32:
+; X32-NOCMOV-LABEL: test_ctselect_nested:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: pushl %ebp
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
; X32-NOCMOV-NEXT: pushl %ebx
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
; X32-NOCMOV-NEXT: pushl %edi
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
; X32-NOCMOV-NEXT: pushl %esi
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 20
-; X32-NOCMOV-NEXT: subl $8, %esp
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 28
-; X32-NOCMOV-NEXT: .cfi_offset %esi, -20
-; X32-NOCMOV-NEXT: .cfi_offset %edi, -16
-; X32-NOCMOV-NEXT: .cfi_offset %ebx, -12
-; X32-NOCMOV-NEXT: .cfi_offset %ebp, -8
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: xorl %eax, %ecx
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: andl $1, %edx
-; X32-NOCMOV-NEXT: negl %edx
-; X32-NOCMOV-NEXT: andl %edx, %ecx
-; X32-NOCMOV-NEXT: xorl %eax, %ecx
-; X32-NOCMOV-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: xorl %esi, %eax
-; X32-NOCMOV-NEXT: andl %edx, %eax
-; X32-NOCMOV-NEXT: xorl %esi, %eax
-; X32-NOCMOV-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NOCMOV-NEXT: xorl %ebx, %esi
-; X32-NOCMOV-NEXT: andl %edx, %esi
-; X32-NOCMOV-NEXT: xorl %ebx, %esi
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NOCMOV-NEXT: xorl %ebp, %ebx
-; X32-NOCMOV-NEXT: andl %edx, %ebx
-; X32-NOCMOV-NEXT: xorl %ebp, %ebx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X32-NOCMOV-NEXT: xorl %edi, %ebp
-; X32-NOCMOV-NEXT: andl %edx, %ebp
-; X32-NOCMOV-NEXT: xorl %edi, %ebp
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NOCMOV-NEXT: xorl %eax, %edi
-; X32-NOCMOV-NEXT: andl %edx, %edi
-; X32-NOCMOV-NEXT: xorl %eax, %edi
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -16
+; X32-NOCMOV-NEXT: .cfi_offset %edi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: xorl %eax, %ecx
-; X32-NOCMOV-NEXT: andl %edx, %ecx
-; X32-NOCMOV-NEXT: xorl %eax, %ecx
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: xorl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: andl %edx, %eax
-; X32-NOCMOV-NEXT: xorl {{[0-9]+}}(%esp), %eax
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: movl %eax, 28(%edx)
-; X32-NOCMOV-NEXT: movl %ecx, 24(%edx)
-; X32-NOCMOV-NEXT: movl %edi, 20(%edx)
-; X32-NOCMOV-NEXT: movl %ebp, 16(%edx)
-; X32-NOCMOV-NEXT: movl %ebx, 12(%edx)
-; X32-NOCMOV-NEXT: movl %esi, 8(%edx)
-; X32-NOCMOV-NEXT: movl (%esp), %eax # 4-byte Reload
-; X32-NOCMOV-NEXT: movl %eax, 4(%edx)
-; X32-NOCMOV-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X32-NOCMOV-NEXT: movl %eax, (%edx)
-; X32-NOCMOV-NEXT: movl %edx, %eax
-; X32-NOCMOV-NEXT: addl $8, %esp
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %edi
+; X32-NOCMOV-NEXT: negl %edi
+; X32-NOCMOV-NEXT: movl %edx, %esi
+; X32-NOCMOV-NEXT: andl %edi, %esi
+; X32-NOCMOV-NEXT: notl %edi
+; X32-NOCMOV-NEXT: andl %eax, %edi
+; X32-NOCMOV-NEXT: orl %edi, %esi
+; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: sete %dl
+; X32-NOCMOV-NEXT: movb %dl, %dh
+; X32-NOCMOV-NEXT: movzbl %dh, %edi
+; X32-NOCMOV-NEXT: negl %edi
+; X32-NOCMOV-NEXT: movl %ecx, %eax
+; X32-NOCMOV-NEXT: andl %edi, %eax
+; X32-NOCMOV-NEXT: notl %edi
+; X32-NOCMOV-NEXT: andl %esi, %edi
+; X32-NOCMOV-NEXT: orl %edi, %eax
; X32-NOCMOV-NEXT: popl %esi
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
-; X32-NOCMOV-NEXT: popl %edi
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
-; X32-NOCMOV-NEXT: popl %ebx
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
-; X32-NOCMOV-NEXT: popl %ebp
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
-; X32-NOCMOV-NEXT: retl $4
- %result = call <8 x float> @llvm.ct.select.v8f32(i1 %cond, <8 x float> %a, <8 x float> %b)
- ret <8 x float> %result
-}
-
-define float @test_ctselect_f32_nan_inf(i1 %cond) {
-; X64-LABEL: test_ctselect_f32_nan_inf:
-; X64: # %bb.0:
-; X64-NEXT: andl $1, %edi
-; X64-NEXT: negl %edi
-; X64-NEXT: andl $4194304, %edi # imm = 0x400000
-; X64-NEXT: orl $2139095040, %edi # imm = 0x7F800000
-; X64-NEXT: movd %edi, %xmm0
-; X64-NEXT: retq
-;
-; X32-LABEL: test_ctselect_f32_nan_inf:
-; X32: # %bb.0:
-; X32-NEXT: pushl %eax
-; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: andb $1, %al
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: negl %eax
-; X32-NEXT: andl $4194304, %eax # imm = 0x400000
-; X32-NEXT: orl $2139095040, %eax # imm = 0x7F800000
-; X32-NEXT: movl %eax, (%esp)
-; X32-NEXT: flds (%esp)
-; X32-NEXT: popl %eax
-; X32-NEXT: .cfi_def_cfa_offset 4
-; X32-NEXT: retl
-;
-; X32-NOCMOV-LABEL: test_ctselect_f32_nan_inf:
-; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: pushl %eax
+; X32-NOCMOV-NEXT: popl %edi
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: andb $1, %al
-; X32-NOCMOV-NEXT: movzbl %al, %eax
-; X32-NOCMOV-NEXT: negl %eax
-; X32-NOCMOV-NEXT: andl $4194304, %eax # imm = 0x400000
-; X32-NOCMOV-NEXT: orl $2139095040, %eax # imm = 0x7F800000
-; X32-NOCMOV-NEXT: movl %eax, (%esp)
-; X32-NOCMOV-NEXT: flds (%esp)
-; X32-NOCMOV-NEXT: popl %eax
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
-; X32-NOCMOV-NEXT: retl
- %result = call float @llvm.ct.select.f32(i1 %cond, float 0x7FF8000000000000, float 0x7FF0000000000000)
- ret float %result
-}
-
-define double @test_ctselect_f64_nan_inf(i1 %cond) {
-; X64-LABEL: test_ctselect_f64_nan_inf:
-; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-NEXT: andl $1, %edi
-; X64-NEXT: negq %rdi
-; X64-NEXT: movabsq $2251799813685248, %rax # imm = 0x8000000000000
-; X64-NEXT: andq %rdi, %rax
-; X64-NEXT: movabsq $9218868437227405312, %rcx # imm = 0x7FF0000000000000
-; X64-NEXT: orq %rax, %rcx
-; X64-NEXT: movq %rcx, %xmm0
-; X64-NEXT: retq
-;
-; X32-LABEL: test_ctselect_f64_nan_inf:
-; X32: # %bb.0:
-; X32-NEXT: subl $12, %esp
-; X32-NEXT: .cfi_def_cfa_offset 16
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: andl $1, %eax
-; X32-NEXT: negl %eax
-; X32-NEXT: andl $524288, %eax # imm = 0x80000
-; X32-NEXT: orl $2146435072, %eax # imm = 0x7FF00000
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X32-NEXT: movl $0, (%esp)
-; X32-NEXT: fldl (%esp)
-; X32-NEXT: addl $12, %esp
-; X32-NEXT: .cfi_def_cfa_offset 4
-; X32-NEXT: retl
-;
-; X32-NOCMOV-LABEL: test_ctselect_f64_nan_inf:
-; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: subl $12, %esp
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: andl $1, %eax
-; X32-NOCMOV-NEXT: negl %eax
-; X32-NOCMOV-NEXT: andl $524288, %eax # imm = 0x80000
-; X32-NOCMOV-NEXT: orl $2146435072, %eax # imm = 0x7FF00000
-; X32-NOCMOV-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X32-NOCMOV-NEXT: movl $0, (%esp)
-; X32-NOCMOV-NEXT: fldl (%esp)
-; X32-NOCMOV-NEXT: addl $12, %esp
+; X32-NOCMOV-NEXT: popl %ebx
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
- %result = call double @llvm.ct.select.f64(i1 %cond, double 0x7FF8000000000000, double 0x7FF0000000000000)
- ret double %result
+ %inner = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b)
+ %result = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %inner, i32 %c)
+ ret i32 %result
}
; Declare the intrinsics
-declare i1 @llvm.ct.select.i1(i1, i1, i1)
declare i8 @llvm.ct.select.i8(i1, i8, i8)
declare i16 @llvm.ct.select.i16(i1, i16, i16)
declare i32 @llvm.ct.select.i32(i1, i32, i32)
@@ -1526,13 +947,3 @@ declare i64 @llvm.ct.select.i64(i1, i64, i64)
declare float @llvm.ct.select.f32(i1, float, float)
declare double @llvm.ct.select.f64(i1, double, double)
declare ptr @llvm.ct.select.p0(i1, ptr, ptr)
-
-; Vector intrinsics
-declare <4 x i32> @llvm.ct.select.v4i32(i1, <4 x i32>, <4 x i32>)
-declare <2 x i64> @llvm.ct.select.v2i64(i1, <2 x i64>, <2 x i64>)
-declare <8 x i16> @llvm.ct.select.v8i16(i1, <8 x i16>, <8 x i16>)
-declare <16 x i8> @llvm.ct.select.v16i8(i1, <16 x i8>, <16 x i8>)
-declare <4 x float> @llvm.ct.select.v4f32(i1, <4 x float>, <4 x float>)
-declare <2 x double> @llvm.ct.select.v2f64(i1, <2 x double>, <2 x double>)
-declare <8 x i32> @llvm.ct.select.v8i32(i1, <8 x i32>, <8 x i32>)
-declare <8 x float> @llvm.ct.select.v8f32(i1, <8 x float>, <8 x float>)
diff --git a/nasty-fix-constant.patch b/nasty-fix-constant.patch
new file mode 100644
index 0000000000000..07314e7f6985e
--- /dev/null
+++ b/nasty-fix-constant.patch
@@ -0,0 +1,2994 @@
+diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+index 81f992678626..fb25ab82a452 100644
+--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
++++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+@@ -4369,14 +4369,39 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
+ Node->getFlags()));
+ } else {
+ assert(VT.isInteger());
+- EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
+- auto [Tmp2Lo, Tmp2Hi] = DAG.SplitScalar(Tmp2, dl, HalfVT, HalfVT);
+- auto [Tmp3Lo, Tmp3Hi] = DAG.SplitScalar(Tmp3, dl, HalfVT, HalfVT);
+- SDValue ResLo =
+- DAG.getCTSelect(dl, HalfVT, Tmp1, Tmp2Lo, Tmp3Lo, Node->getFlags());
+- SDValue ResHi =
+- DAG.getCTSelect(dl, HalfVT, Tmp1, Tmp2Hi, Tmp3Hi, Node->getFlags());
+- Tmp1 = DAG.getNode(ISD::BUILD_PAIR, dl, VT, ResLo, ResHi);
++ // Expand scalar integer CT_SELECT to constant-time bitwise operations:
++ // Mask = 0 - (Cond & 1) // all-ones or all-zeros
++ // Result = F ^ ((T ^ F) & Mask)
++ //
++ // By expanding here (during legalization) rather than in
++ // SelectionDAGBuilder, the SETCC feeding the condition has already been
++ // legalized. This prevents visitSIGN_EXTEND in the post-legalization
++ // DAGCombiner from matching sext(setcc) -> select(setcc, -1, 0), which
++ // would convert the constant-time pattern back into a data-dependent
++ // conditional move.
++ //
++ // Note: We cannot use SIGN_EXTEND here because type legalization has
++ // already promoted the i1 condition to the target's SetCC type (e.g.
++ // i32 on MIPS). SIGN_EXTEND(i32, i32) would be a no-op, leaving the
++ // mask as 0/1 instead of 0/-1. Instead, we isolate the low bit and
++ // negate to create a proper all-bits mask. This handles all boolean
++ // content types (ZeroOrOne, ZeroOrNegativeOne, Undefined).
++ SDValue T = Tmp2;
++ SDValue F = Tmp3;
++ // Widen the condition to match VT if needed. Type legalization may
++ // promote the i1 condition to a narrower type than VT (e.g. i32
++ // SetCC result with i64 operands on MIPS64). ANY_EXTEND is safe
++ // because we immediately mask to the low bit.
++ SDValue Cond = Tmp1;
++ if (Cond.getValueType() != VT)
++ Cond = DAG.getNode(ISD::ANY_EXTEND, dl, VT, Cond);
++ SDValue One = DAG.getConstant(1, dl, VT);
++ SDValue Bit = DAG.getNode(ISD::AND, dl, VT, Cond, One);
++ SDValue Zero = DAG.getConstant(0, dl, VT);
++ SDValue Mask = DAG.getNode(ISD::SUB, dl, VT, Zero, Bit);
++ SDValue XorTF = DAG.getNode(ISD::XOR, dl, VT, T, F);
++ SDValue MaskedDiff = DAG.getNode(ISD::AND, dl, VT, XorTF, Mask);
++ Tmp1 = DAG.getNode(ISD::XOR, dl, VT, F, MaskedDiff);
+ Tmp1->setFlags(Node->getFlags());
+ }
+ Results.push_back(Tmp1);
+diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+index 156d82e96b2a..1c68822563ed 100644
+--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
++++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+@@ -6872,9 +6872,41 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
+ // assert if Cond type is Vector
+ assert(!CondVT.isVector() && "Vector type cond not supported yet");
+
+- // Handle scalar types
+- if (TLI.isOperationLegalOrCustom(ISD::CT_SELECT, VT) &&
+- !CondVT.isVector()) {
++ // Decide whether to create a CT_SELECT DAG node or use the inline
++ // fallback expansion. CT_SELECT nodes are protected by visitCT_SELECT
++ // in DAGCombiner from unsafe folds (e.g. sext(setcc) -> select) that
++ // break constant-time guarantees.
++ //
++ // We create CT_SELECT when:
++ // 1. Target has Legal/Custom support for this type.
++ // 2. Scalar integer types — type legalization splits wide types (e.g.
++ // i64 on 32-bit targets) before operation legalization expands to
++ // AND/OR/XOR.
++ // 3. Scalar float types where the integer equivalent is legal — the
++ // expansion bitcasts to integer for bitwise ops.
++ //
++ // We use the inline fallback when:
++ // - Vector types without target support (Expand) — the legalization
++ // expansion uses getSplatBuildVector + SIGN_EXTEND of vector i1
++ // which not all targets support.
++ // - Float types where the integer equivalent is illegal (e.g. f64 on
++ // i386 maps to i64, which is illegal). The expansion creates new
++ // nodes during operation legalization that can't be further
++ // type-legalized. The inline fallback runs before type legalization
++ // so the i64 ops get properly split.
++ bool CreateNode;
++ if (TLI.isOperationLegalOrCustom(ISD::CT_SELECT, VT)) {
++ CreateNode = true;
++ } else if (VT.isVector()) {
++ CreateNode = false;
++ } else if (VT.isFloatingPoint()) {
++ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
++ CreateNode = TLI.isTypeLegal(IntVT);
++ } else {
++ CreateNode = true; // Scalar integer — always safe
++ }
++
++ if (CreateNode) {
+ SDValue Result = DAG.getNode(ISD::CT_SELECT, DL, VT, Cond, A, B);
+ setValue(&I, Result);
+ return;
+diff --git a/llvm/test/CodeGen/Mips/ctselect-fallback-edge-cases.ll b/llvm/test/CodeGen/Mips/ctselect-fallback-edge-cases.ll
+index f1831a625d4a..401a742c27ea 100644
+--- a/llvm/test/CodeGen/Mips/ctselect-fallback-edge-cases.ll
++++ b/llvm/test/CodeGen/Mips/ctselect-fallback-edge-cases.ll
+@@ -8,22 +8,24 @@
+ define i1 @test_ctselect_i1(i1 %cond, i1 %a, i1 %b) {
+ ; M32-LABEL: test_ctselect_i1:
+ ; M32: # %bb.0:
+-; M32-NEXT: xori $2, $4, 1
+-; M32-NEXT: and $1, $4, $5
+-; M32-NEXT: and $2, $2, $6
++; M32-NEXT: andi $2, $4, 1
++; M32-NEXT: xor $1, $5, $6
++; M32-NEXT: negu $2, $2
++; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $1, $2
++; M32-NEXT: xor $2, $6, $1
+ ;
+ ; M64-LABEL: test_ctselect_i1:
+ ; M64: # %bb.0:
+-; M64-NEXT: sll $2, $4, 0
+-; M64-NEXT: sll $1, $6, 0
+-; M64-NEXT: xori $2, $2, 1
+-; M64-NEXT: and $1, $2, $1
+-; M64-NEXT: and $2, $4, $5
++; M64-NEXT: sll $1, $4, 0
++; M64-NEXT: xor $2, $5, $6
++; M64-NEXT: andi $1, $1, 1
+ ; M64-NEXT: sll $2, $2, 0
++; M64-NEXT: negu $1, $1
++; M64-NEXT: and $1, $2, $1
++; M64-NEXT: sll $2, $6, 0
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $2, $1
++; M64-NEXT: xor $2, $2, $1
+ %result = call i1 @llvm.ct.select.i1(i1 %cond, i1 %a, i1 %b)
+ ret i1 %result
+ }
+@@ -32,30 +34,18 @@ define i1 @test_ctselect_i1(i1 %cond, i1 %a, i1 %b) {
+ define i32 @test_ctselect_extremal_values(i1 %cond) {
+ ; M32-LABEL: test_ctselect_extremal_values:
+ ; M32: # %bb.0:
+-; M32-NEXT: lui $3, 32767
+ ; M32-NEXT: andi $1, $4, 1
+-; M32-NEXT: negu $2, $1
+-; M32-NEXT: ori $3, $3, 65535
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: and $2, $2, $3
+-; M32-NEXT: lui $3, 32768
+-; M32-NEXT: and $1, $1, $3
++; M32-NEXT: lui $2, 32768
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: subu $2, $2, $1
+ ;
+ ; M64-LABEL: test_ctselect_extremal_values:
+ ; M64: # %bb.0:
+ ; M64-NEXT: sll $1, $4, 0
+-; M64-NEXT: lui $3, 32767
++; M64-NEXT: lui $2, 32768
+ ; M64-NEXT: andi $1, $1, 1
+-; M64-NEXT: ori $3, $3, 65535
+-; M64-NEXT: negu $2, $1
+-; M64-NEXT: addiu $1, $1, -1
+-; M64-NEXT: and $2, $2, $3
+-; M64-NEXT: lui $3, 32768
+-; M64-NEXT: and $1, $1, $3
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $2, $1
++; M64-NEXT: subu $2, $2, $1
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 2147483647, i32 -2147483648)
+ ret i32 %result
+ }
+@@ -67,14 +57,14 @@ define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) {
+ ; M32-NEXT: andi $1, $4, 1
+ ; M32-NEXT: negu $1, $1
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: and $2, $1, $5
++; M32-NEXT: and $2, $5, $1
+ ;
+ ; M64-LABEL: test_ctselect_null_ptr:
+ ; M64: # %bb.0:
+ ; M64-NEXT: andi $1, $4, 1
+ ; M64-NEXT: dnegu $1, $1
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: and $2, $1, $5
++; M64-NEXT: and $2, $5, $1
+ %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %ptr, ptr null)
+ ret ptr %result
+ }
+@@ -83,23 +73,21 @@ define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) {
+ define ptr @test_ctselect_function_ptr(i1 %cond, ptr %func1, ptr %func2) {
+ ; M32-LABEL: test_ctselect_function_ptr:
+ ; M32: # %bb.0:
+-; M32-NEXT: andi $1, $4, 1
+-; M32-NEXT: negu $2, $1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: and $2, $2, $5
+-; M32-NEXT: and $1, $1, $6
++; M32-NEXT: andi $2, $4, 1
++; M32-NEXT: xor $1, $5, $6
++; M32-NEXT: negu $2, $2
++; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: xor $2, $6, $1
+ ;
+ ; M64-LABEL: test_ctselect_function_ptr:
+ ; M64: # %bb.0:
+-; M64-NEXT: andi $1, $4, 1
+-; M64-NEXT: dnegu $2, $1
+-; M64-NEXT: daddiu $1, $1, -1
+-; M64-NEXT: and $2, $2, $5
+-; M64-NEXT: and $1, $1, $6
++; M64-NEXT: andi $2, $4, 1
++; M64-NEXT: xor $1, $5, $6
++; M64-NEXT: dnegu $2, $2
++; M64-NEXT: and $1, $1, $2
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $2, $1
++; M64-NEXT: xor $2, $6, $1
+ %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %func1, ptr %func2)
+ ret ptr %result
+ }
+@@ -108,26 +96,25 @@ define ptr @test_ctselect_function_ptr(i1 %cond, ptr %func1, ptr %func2) {
+ define ptr @test_ctselect_ptr_cmp(ptr %p1, ptr %p2, ptr %a, ptr %b) {
+ ; M32-LABEL: test_ctselect_ptr_cmp:
+ ; M32: # %bb.0:
+-; M32-NEXT: xor $1, $4, $5
+-; M32-NEXT: sltu $1, $zero, $1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: and $2, $1, $6
+-; M32-NEXT: not $1, $1
+-; M32-NEXT: and $1, $1, $7
++; M32-NEXT: xor $2, $4, $5
++; M32-NEXT: xor $1, $6, $7
++; M32-NEXT: sltiu $2, $2, 1
++; M32-NEXT: negu $2, $2
++; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: xor $2, $7, $1
+ ;
+ ; M64-LABEL: test_ctselect_ptr_cmp:
+ ; M64: # %bb.0:
+-; M64-NEXT: xor $1, $4, $5
+-; M64-NEXT: daddiu $3, $zero, -1
+-; M64-NEXT: daddiu $2, $zero, -1
+-; M64-NEXT: movn $3, $zero, $1
+-; M64-NEXT: xor $2, $3, $2
+-; M64-NEXT: and $1, $3, $6
+-; M64-NEXT: and $2, $2, $7
++; M64-NEXT: xor $2, $4, $5
++; M64-NEXT: xor $1, $6, $7
++; M64-NEXT: sltiu $2, $2, 1
++; M64-NEXT: dsll $2, $2, 32
++; M64-NEXT: dsrl $2, $2, 32
++; M64-NEXT: dnegu $2, $2
++; M64-NEXT: and $1, $1, $2
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $1, $2
++; M64-NEXT: xor $2, $7, $1
+ %cmp = icmp eq ptr %p1, %p2
+ %result = call ptr @llvm.ct.select.p0(i1 %cmp, ptr %a, ptr %b)
+ ret ptr %result
+@@ -139,23 +126,21 @@ define ptr @test_ctselect_ptr_cmp(ptr %p1, ptr %p2, ptr %a, ptr %b) {
+ define ptr @test_ctselect_struct_ptr(i1 %cond, ptr %a, ptr %b) {
+ ; M32-LABEL: test_ctselect_struct_ptr:
+ ; M32: # %bb.0:
+-; M32-NEXT: andi $1, $4, 1
+-; M32-NEXT: negu $2, $1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: and $2, $2, $5
+-; M32-NEXT: and $1, $1, $6
++; M32-NEXT: andi $2, $4, 1
++; M32-NEXT: xor $1, $5, $6
++; M32-NEXT: negu $2, $2
++; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: xor $2, $6, $1
+ ;
+ ; M64-LABEL: test_ctselect_struct_ptr:
+ ; M64: # %bb.0:
+-; M64-NEXT: andi $1, $4, 1
+-; M64-NEXT: dnegu $2, $1
+-; M64-NEXT: daddiu $1, $1, -1
+-; M64-NEXT: and $2, $2, $5
+-; M64-NEXT: and $1, $1, $6
++; M64-NEXT: andi $2, $4, 1
++; M64-NEXT: xor $1, $5, $6
++; M64-NEXT: dnegu $2, $2
++; M64-NEXT: and $1, $1, $2
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $2, $1
++; M64-NEXT: xor $2, $6, $1
+ %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b)
+ ret ptr %result
+ }
+@@ -164,73 +149,65 @@ define ptr @test_ctselect_struct_ptr(i1 %cond, ptr %a, ptr %b) {
+ define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
+ ; M32-LABEL: test_ctselect_deeply_nested:
+ ; M32: # %bb.0:
+-; M32-NEXT: andi $1, $4, 1
+-; M32-NEXT: lw $3, 16($sp)
+-; M32-NEXT: lw $9, 32($sp)
+-; M32-NEXT: lw $8, 28($sp)
+-; M32-NEXT: negu $2, $1
+-; M32-NEXT: addiu $1, $1, -1
++; M32-NEXT: lw $1, 20($sp)
++; M32-NEXT: lw $2, 16($sp)
++; M32-NEXT: andi $3, $4, 1
++; M32-NEXT: andi $4, $6, 1
++; M32-NEXT: lw $6, 28($sp)
++; M32-NEXT: negu $3, $3
++; M32-NEXT: xor $2, $2, $1
+ ; M32-NEXT: and $2, $2, $3
+-; M32-NEXT: lw $3, 20($sp)
+-; M32-NEXT: and $1, $1, $3
+ ; M32-NEXT: andi $3, $5, 1
+-; M32-NEXT: or $1, $2, $1
+-; M32-NEXT: andi $2, $6, 1
+-; M32-NEXT: andi $6, $7, 1
+-; M32-NEXT: negu $4, $3
+-; M32-NEXT: addiu $3, $3, -1
+-; M32-NEXT: addiu $7, $6, -1
+-; M32-NEXT: and $1, $4, $1
+-; M32-NEXT: addiu $5, $2, -1
+-; M32-NEXT: negu $2, $2
+-; M32-NEXT: negu $6, $6
+-; M32-NEXT: and $4, $7, $9
+-; M32-NEXT: lw $7, 24($sp)
+-; M32-NEXT: and $5, $5, $8
+-; M32-NEXT: and $3, $3, $7
+-; M32-NEXT: or $1, $1, $3
+-; M32-NEXT: and $1, $2, $1
+-; M32-NEXT: or $1, $1, $5
+-; M32-NEXT: and $1, $6, $1
++; M32-NEXT: lw $5, 32($sp)
++; M32-NEXT: xor $1, $1, $2
++; M32-NEXT: lw $2, 24($sp)
++; M32-NEXT: negu $3, $3
++; M32-NEXT: xor $1, $1, $2
++; M32-NEXT: and $1, $1, $3
++; M32-NEXT: andi $3, $7, 1
++; M32-NEXT: xor $1, $2, $1
++; M32-NEXT: negu $2, $4
++; M32-NEXT: negu $3, $3
++; M32-NEXT: xor $1, $1, $6
++; M32-NEXT: and $1, $1, $2
++; M32-NEXT: xor $1, $6, $1
++; M32-NEXT: xor $1, $1, $5
++; M32-NEXT: and $1, $1, $3
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $1, $4
++; M32-NEXT: xor $2, $5, $1
+ ;
+ ; M64-LABEL: test_ctselect_deeply_nested:
+ ; M64: # %bb.0:
+ ; M64-NEXT: sll $1, $4, 0
+-; M64-NEXT: sll $3, $8, 0
+-; M64-NEXT: sll $4, $5, 0
+-; M64-NEXT: lw $8, 0($sp)
++; M64-NEXT: xor $2, $8, $9
++; M64-NEXT: sll $5, $5, 0
++; M64-NEXT: sll $3, $6, 0
++; M64-NEXT: sll $6, $11, 0
++; M64-NEXT: sll $4, $7, 0
++; M64-NEXT: lw $7, 0($sp)
+ ; M64-NEXT: andi $1, $1, 1
++; M64-NEXT: sll $2, $2, 0
++; M64-NEXT: andi $5, $5, 1
++; M64-NEXT: andi $3, $3, 1
+ ; M64-NEXT: andi $4, $4, 1
+-; M64-NEXT: negu $2, $1
+-; M64-NEXT: addiu $1, $1, -1
+-; M64-NEXT: negu $5, $4
+-; M64-NEXT: addiu $4, $4, -1
+-; M64-NEXT: and $2, $2, $3
+-; M64-NEXT: sll $3, $9, 0
+-; M64-NEXT: and $1, $1, $3
+-; M64-NEXT: sll $3, $11, 0
+-; M64-NEXT: or $1, $2, $1
+-; M64-NEXT: sll $2, $6, 0
+-; M64-NEXT: sll $6, $7, 0
+-; M64-NEXT: andi $2, $2, 1
+-; M64-NEXT: and $1, $5, $1
+-; M64-NEXT: andi $6, $6, 1
+-; M64-NEXT: addiu $5, $2, -1
+-; M64-NEXT: negu $2, $2
+-; M64-NEXT: addiu $7, $6, -1
+-; M64-NEXT: negu $6, $6
+-; M64-NEXT: and $3, $5, $3
+-; M64-NEXT: sll $5, $10, 0
+-; M64-NEXT: and $7, $7, $8
+-; M64-NEXT: and $4, $4, $5
+-; M64-NEXT: or $1, $1, $4
++; M64-NEXT: negu $1, $1
++; M64-NEXT: negu $5, $5
++; M64-NEXT: negu $4, $4
+ ; M64-NEXT: and $1, $2, $1
+-; M64-NEXT: or $1, $1, $3
+-; M64-NEXT: and $1, $6, $1
++; M64-NEXT: sll $2, $9, 0
++; M64-NEXT: xor $1, $2, $1
++; M64-NEXT: sll $2, $10, 0
++; M64-NEXT: xor $1, $1, $2
++; M64-NEXT: and $1, $1, $5
++; M64-NEXT: xor $1, $2, $1
++; M64-NEXT: negu $2, $3
++; M64-NEXT: xor $1, $1, $6
++; M64-NEXT: and $1, $1, $2
++; M64-NEXT: xor $1, $6, $1
++; M64-NEXT: xor $1, $1, $7
++; M64-NEXT: and $1, $1, $4
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $1, $7
++; M64-NEXT: xor $2, $7, $1
+ %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b)
+ %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c)
+ %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d)
+diff --git a/llvm/test/CodeGen/Mips/ctselect-fallback-patterns.ll b/llvm/test/CodeGen/Mips/ctselect-fallback-patterns.ll
+index 2e65e586ce5f..a1c5d524c693 100644
+--- a/llvm/test/CodeGen/Mips/ctselect-fallback-patterns.ll
++++ b/llvm/test/CodeGen/Mips/ctselect-fallback-patterns.ll
+@@ -6,16 +6,18 @@
+ define i32 @test_ctselect_smin_zero(i32 %x) {
+ ; M32-LABEL: test_ctselect_smin_zero:
+ ; M32: # %bb.0:
+-; M32-NEXT: sra $1, $4, 31
++; M32-NEXT: slti $1, $4, 0
++; M32-NEXT: negu $1, $1
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: and $2, $1, $4
++; M32-NEXT: and $2, $4, $1
+ ;
+ ; M64-LABEL: test_ctselect_smin_zero:
+ ; M64: # %bb.0:
+ ; M64-NEXT: sll $1, $4, 0
+-; M64-NEXT: sra $2, $1, 31
++; M64-NEXT: slti $2, $1, 0
++; M64-NEXT: negu $2, $2
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: and $2, $2, $1
++; M64-NEXT: and $2, $1, $2
+ %cmp = icmp slt i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0)
+ ret i32 %result
+@@ -25,17 +27,18 @@ define i32 @test_ctselect_smin_zero(i32 %x) {
+ define i32 @test_ctselect_smax_zero(i32 %x) {
+ ; M32-LABEL: test_ctselect_smax_zero:
+ ; M32: # %bb.0:
+-; M32-NEXT: slti $1, $4, 1
+-; M32-NEXT: movn $4, $zero, $1
++; M32-NEXT: slt $1, $zero, $4
++; M32-NEXT: negu $1, $1
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: move $2, $4
++; M32-NEXT: and $2, $4, $1
+ ;
+ ; M64-LABEL: test_ctselect_smax_zero:
+ ; M64: # %bb.0:
+-; M64-NEXT: sll $2, $4, 0
+-; M64-NEXT: slti $1, $2, 1
++; M64-NEXT: sll $1, $4, 0
++; M64-NEXT: slt $2, $zero, $1
++; M64-NEXT: negu $2, $2
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: movn $2, $zero, $1
++; M64-NEXT: and $2, $1, $2
+ %cmp = icmp sgt i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0)
+ ret i32 %result
+@@ -45,27 +48,23 @@ define i32 @test_ctselect_smax_zero(i32 %x) {
+ define i32 @test_ctselect_smin_generic(i32 %x, i32 %y) {
+ ; M32-LABEL: test_ctselect_smin_generic:
+ ; M32: # %bb.0:
+-; M32-NEXT: slt $1, $4, $5
+-; M32-NEXT: xori $1, $1, 1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: and $2, $1, $4
+-; M32-NEXT: not $1, $1
+-; M32-NEXT: and $1, $1, $5
++; M32-NEXT: slt $2, $4, $5
++; M32-NEXT: xor $1, $4, $5
++; M32-NEXT: negu $2, $2
++; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: xor $2, $5, $1
+ ;
+ ; M64-LABEL: test_ctselect_smin_generic:
+ ; M64: # %bb.0:
+ ; M64-NEXT: sll $1, $5, 0
+ ; M64-NEXT: sll $2, $4, 0
+-; M64-NEXT: slt $3, $2, $1
+-; M64-NEXT: xori $3, $3, 1
+-; M64-NEXT: addiu $3, $3, -1
++; M64-NEXT: xor $3, $2, $1
++; M64-NEXT: slt $2, $2, $1
++; M64-NEXT: negu $2, $2
+ ; M64-NEXT: and $2, $3, $2
+-; M64-NEXT: not $3, $3
+-; M64-NEXT: and $1, $3, $1
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $2, $1
++; M64-NEXT: xor $2, $1, $2
+ %cmp = icmp slt i32 %x, %y
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y)
+ ret i32 %result
+@@ -75,27 +74,23 @@ define i32 @test_ctselect_smin_generic(i32 %x, i32 %y) {
+ define i32 @test_ctselect_smax_generic(i32 %x, i32 %y) {
+ ; M32-LABEL: test_ctselect_smax_generic:
+ ; M32: # %bb.0:
+-; M32-NEXT: slt $1, $5, $4
+-; M32-NEXT: xori $1, $1, 1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: and $2, $1, $4
+-; M32-NEXT: not $1, $1
+-; M32-NEXT: and $1, $1, $5
++; M32-NEXT: slt $2, $5, $4
++; M32-NEXT: xor $1, $4, $5
++; M32-NEXT: negu $2, $2
++; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: xor $2, $5, $1
+ ;
+ ; M64-LABEL: test_ctselect_smax_generic:
+ ; M64: # %bb.0:
+-; M64-NEXT: sll $1, $4, 0
+-; M64-NEXT: sll $2, $5, 0
+-; M64-NEXT: slt $3, $2, $1
+-; M64-NEXT: xori $3, $3, 1
+-; M64-NEXT: addiu $3, $3, -1
+-; M64-NEXT: and $1, $3, $1
+-; M64-NEXT: not $3, $3
++; M64-NEXT: sll $1, $5, 0
++; M64-NEXT: sll $2, $4, 0
++; M64-NEXT: xor $3, $2, $1
++; M64-NEXT: slt $2, $1, $2
++; M64-NEXT: negu $2, $2
+ ; M64-NEXT: and $2, $3, $2
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $1, $2
++; M64-NEXT: xor $2, $1, $2
+ %cmp = icmp sgt i32 %x, %y
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y)
+ ret i32 %result
+@@ -105,27 +100,23 @@ define i32 @test_ctselect_smax_generic(i32 %x, i32 %y) {
+ define i32 @test_ctselect_umin_generic(i32 %x, i32 %y) {
+ ; M32-LABEL: test_ctselect_umin_generic:
+ ; M32: # %bb.0:
+-; M32-NEXT: sltu $1, $4, $5
+-; M32-NEXT: xori $1, $1, 1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: and $2, $1, $4
+-; M32-NEXT: not $1, $1
+-; M32-NEXT: and $1, $1, $5
++; M32-NEXT: sltu $2, $4, $5
++; M32-NEXT: xor $1, $4, $5
++; M32-NEXT: negu $2, $2
++; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: xor $2, $5, $1
+ ;
+ ; M64-LABEL: test_ctselect_umin_generic:
+ ; M64: # %bb.0:
+ ; M64-NEXT: sll $1, $5, 0
+ ; M64-NEXT: sll $2, $4, 0
+-; M64-NEXT: sltu $3, $2, $1
+-; M64-NEXT: xori $3, $3, 1
+-; M64-NEXT: addiu $3, $3, -1
++; M64-NEXT: xor $3, $2, $1
++; M64-NEXT: sltu $2, $2, $1
++; M64-NEXT: negu $2, $2
+ ; M64-NEXT: and $2, $3, $2
+-; M64-NEXT: not $3, $3
+-; M64-NEXT: and $1, $3, $1
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $2, $1
++; M64-NEXT: xor $2, $1, $2
+ %cmp = icmp ult i32 %x, %y
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y)
+ ret i32 %result
+@@ -135,27 +126,23 @@ define i32 @test_ctselect_umin_generic(i32 %x, i32 %y) {
+ define i32 @test_ctselect_umax_generic(i32 %x, i32 %y) {
+ ; M32-LABEL: test_ctselect_umax_generic:
+ ; M32: # %bb.0:
+-; M32-NEXT: sltu $1, $5, $4
+-; M32-NEXT: xori $1, $1, 1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: and $2, $1, $4
+-; M32-NEXT: not $1, $1
+-; M32-NEXT: and $1, $1, $5
++; M32-NEXT: sltu $2, $5, $4
++; M32-NEXT: xor $1, $4, $5
++; M32-NEXT: negu $2, $2
++; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: xor $2, $5, $1
+ ;
+ ; M64-LABEL: test_ctselect_umax_generic:
+ ; M64: # %bb.0:
+-; M64-NEXT: sll $1, $4, 0
+-; M64-NEXT: sll $2, $5, 0
+-; M64-NEXT: sltu $3, $2, $1
+-; M64-NEXT: xori $3, $3, 1
+-; M64-NEXT: addiu $3, $3, -1
+-; M64-NEXT: and $1, $3, $1
+-; M64-NEXT: not $3, $3
++; M64-NEXT: sll $1, $5, 0
++; M64-NEXT: sll $2, $4, 0
++; M64-NEXT: xor $3, $2, $1
++; M64-NEXT: sltu $2, $1, $2
++; M64-NEXT: negu $2, $2
+ ; M64-NEXT: and $2, $3, $2
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $1, $2
++; M64-NEXT: xor $2, $1, $2
+ %cmp = icmp ugt i32 %x, %y
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y)
+ ret i32 %result
+@@ -165,24 +152,24 @@ define i32 @test_ctselect_umax_generic(i32 %x, i32 %y) {
+ define i32 @test_ctselect_abs(i32 %x) {
+ ; M32-LABEL: test_ctselect_abs:
+ ; M32: # %bb.0:
+-; M32-NEXT: negu $1, $4
+-; M32-NEXT: sra $2, $4, 31
++; M32-NEXT: slti $1, $4, 0
++; M32-NEXT: negu $2, $4
++; M32-NEXT: negu $1, $1
++; M32-NEXT: xor $2, $2, $4
+ ; M32-NEXT: and $1, $2, $1
+-; M32-NEXT: not $2, $2
+-; M32-NEXT: and $2, $2, $4
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $1, $2
++; M32-NEXT: xor $2, $4, $1
+ ;
+ ; M64-LABEL: test_ctselect_abs:
+ ; M64: # %bb.0:
+ ; M64-NEXT: sll $1, $4, 0
+-; M64-NEXT: negu $2, $1
+-; M64-NEXT: sra $3, $1, 31
++; M64-NEXT: slti $2, $1, 0
++; M64-NEXT: negu $3, $1
++; M64-NEXT: negu $2, $2
++; M64-NEXT: xor $3, $3, $1
+ ; M64-NEXT: and $2, $3, $2
+-; M64-NEXT: not $3, $3
+-; M64-NEXT: and $1, $3, $1
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $2, $1
++; M64-NEXT: xor $2, $1, $2
+ %neg = sub i32 0, %x
+ %cmp = icmp slt i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %neg, i32 %x)
+@@ -193,24 +180,24 @@ define i32 @test_ctselect_abs(i32 %x) {
+ define i32 @test_ctselect_nabs(i32 %x) {
+ ; M32-LABEL: test_ctselect_nabs:
+ ; M32: # %bb.0:
+-; M32-NEXT: sra $1, $4, 31
+-; M32-NEXT: negu $3, $4
+-; M32-NEXT: and $2, $1, $4
+-; M32-NEXT: not $1, $1
+-; M32-NEXT: and $1, $1, $3
++; M32-NEXT: slti $1, $4, 0
++; M32-NEXT: negu $2, $4
++; M32-NEXT: negu $1, $1
++; M32-NEXT: xor $3, $4, $2
++; M32-NEXT: and $1, $3, $1
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: xor $2, $2, $1
+ ;
+ ; M64-LABEL: test_ctselect_nabs:
+ ; M64: # %bb.0:
+ ; M64-NEXT: sll $1, $4, 0
+-; M64-NEXT: sra $2, $1, 31
+-; M64-NEXT: and $3, $2, $1
+-; M64-NEXT: negu $1, $1
+-; M64-NEXT: not $2, $2
+-; M64-NEXT: and $1, $2, $1
++; M64-NEXT: slti $2, $1, 0
++; M64-NEXT: negu $3, $1
++; M64-NEXT: negu $2, $2
++; M64-NEXT: xor $1, $1, $3
++; M64-NEXT: and $1, $1, $2
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $3, $1
++; M64-NEXT: xor $2, $3, $1
+ %neg = sub i32 0, %x
+ %cmp = icmp slt i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %neg)
+@@ -221,14 +208,16 @@ define i32 @test_ctselect_nabs(i32 %x) {
+ define i32 @test_ctselect_sign_extend(i32 %x) {
+ ; M32-LABEL: test_ctselect_sign_extend:
+ ; M32: # %bb.0:
++; M32-NEXT: slti $1, $4, 0
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: sra $2, $4, 31
++; M32-NEXT: negu $2, $1
+ ;
+ ; M64-LABEL: test_ctselect_sign_extend:
+ ; M64: # %bb.0:
+ ; M64-NEXT: sll $1, $4, 0
++; M64-NEXT: slti $1, $1, 0
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: sra $2, $1, 31
++; M64-NEXT: negu $2, $1
+ %cmp = icmp slt i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 -1, i32 0)
+ ret i32 %result
+@@ -270,13 +259,12 @@ define i32 @test_ctselect_constant_folding_false(i32 %a, i32 %b) {
+ ; M32-LABEL: test_ctselect_constant_folding_false:
+ ; M32: # %bb.0:
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $zero, $5
++; M32-NEXT: move $2, $5
+ ;
+ ; M64-LABEL: test_ctselect_constant_folding_false:
+ ; M64: # %bb.0:
+-; M64-NEXT: sll $1, $5, 0
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $zero, $1
++; M64-NEXT: sll $2, $5, 0
+ %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b)
+ ret i32 %result
+ }
+@@ -285,25 +273,13 @@ define i32 @test_ctselect_constant_folding_false(i32 %a, i32 %b) {
+ define i32 @test_ctselect_identical_operands(i1 %cond, i32 %x) {
+ ; M32-LABEL: test_ctselect_identical_operands:
+ ; M32: # %bb.0:
+-; M32-NEXT: andi $1, $4, 1
+-; M32-NEXT: negu $2, $1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: and $2, $2, $5
+-; M32-NEXT: and $1, $1, $5
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: move $2, $5
+ ;
+ ; M64-LABEL: test_ctselect_identical_operands:
+ ; M64: # %bb.0:
+-; M64-NEXT: sll $1, $4, 0
+-; M64-NEXT: sll $3, $5, 0
+-; M64-NEXT: andi $1, $1, 1
+-; M64-NEXT: negu $2, $1
+-; M64-NEXT: addiu $1, $1, -1
+-; M64-NEXT: and $2, $2, $3
+-; M64-NEXT: and $1, $1, $3
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $2, $1
++; M64-NEXT: sll $2, $5, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %x)
+ ret i32 %result
+ }
+@@ -312,29 +288,27 @@ define i32 @test_ctselect_identical_operands(i1 %cond, i32 %x) {
+ define i32 @test_ctselect_inverted_condition(i32 %x, i32 %y, i32 %a, i32 %b) {
+ ; M32-LABEL: test_ctselect_inverted_condition:
+ ; M32: # %bb.0:
+-; M32-NEXT: xor $1, $4, $5
+-; M32-NEXT: sltiu $1, $1, 1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: and $2, $1, $6
+-; M32-NEXT: not $1, $1
+-; M32-NEXT: and $1, $1, $7
++; M32-NEXT: xor $2, $4, $5
++; M32-NEXT: xor $1, $7, $6
++; M32-NEXT: sltiu $2, $2, 1
++; M32-NEXT: negu $2, $2
++; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: xor $2, $6, $1
+ ;
+ ; M64-LABEL: test_ctselect_inverted_condition:
+ ; M64: # %bb.0:
+ ; M64-NEXT: sll $1, $5, 0
+ ; M64-NEXT: sll $2, $4, 0
+-; M64-NEXT: sll $3, $7, 0
+ ; M64-NEXT: xor $1, $2, $1
+-; M64-NEXT: sll $2, $6, 0
++; M64-NEXT: xor $2, $7, $6
+ ; M64-NEXT: sltiu $1, $1, 1
+-; M64-NEXT: addiu $1, $1, -1
+-; M64-NEXT: and $2, $1, $2
+-; M64-NEXT: not $1, $1
+-; M64-NEXT: and $1, $1, $3
++; M64-NEXT: sll $2, $2, 0
++; M64-NEXT: negu $1, $1
++; M64-NEXT: and $1, $2, $1
++; M64-NEXT: sll $2, $6, 0
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $2, $1
++; M64-NEXT: xor $2, $2, $1
+ %cmp = icmp eq i32 %x, %y
+ %not_cmp = xor i1 %cmp, true
+ %result = call i32 @llvm.ct.select.i32(i1 %not_cmp, i32 %a, i32 %b)
+@@ -345,57 +319,51 @@ define i32 @test_ctselect_inverted_condition(i32 %x, i32 %y, i32 %a, i32 %b) {
+ define i32 @test_ctselect_chain(i1 %c1, i1 %c2, i1 %c3, i32 %a, i32 %b, i32 %c, i32 %d) {
+ ; M32-LABEL: test_ctselect_chain:
+ ; M32: # %bb.0:
+-; M32-NEXT: andi $1, $4, 1
++; M32-NEXT: lw $1, 16($sp)
++; M32-NEXT: andi $3, $4, 1
++; M32-NEXT: negu $3, $3
++; M32-NEXT: xor $2, $7, $1
++; M32-NEXT: and $2, $2, $3
+ ; M32-NEXT: andi $3, $5, 1
+-; M32-NEXT: lw $5, 16($sp)
+-; M32-NEXT: negu $2, $1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: negu $4, $3
+-; M32-NEXT: addiu $3, $3, -1
+-; M32-NEXT: and $1, $1, $5
+-; M32-NEXT: and $2, $2, $7
+-; M32-NEXT: lw $5, 24($sp)
+-; M32-NEXT: or $1, $2, $1
++; M32-NEXT: xor $1, $1, $2
++; M32-NEXT: lw $2, 20($sp)
++; M32-NEXT: negu $3, $3
++; M32-NEXT: xor $1, $1, $2
++; M32-NEXT: and $1, $1, $3
++; M32-NEXT: lw $3, 24($sp)
++; M32-NEXT: xor $1, $2, $1
+ ; M32-NEXT: andi $2, $6, 1
+-; M32-NEXT: and $1, $4, $1
+-; M32-NEXT: addiu $4, $2, -1
++; M32-NEXT: xor $1, $1, $3
+ ; M32-NEXT: negu $2, $2
+-; M32-NEXT: and $4, $4, $5
+-; M32-NEXT: lw $5, 20($sp)
+-; M32-NEXT: and $3, $3, $5
+-; M32-NEXT: or $1, $1, $3
+-; M32-NEXT: and $1, $2, $1
++; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $1, $4
++; M32-NEXT: xor $2, $3, $1
+ ;
+ ; M64-LABEL: test_ctselect_chain:
+ ; M64: # %bb.0:
+ ; M64-NEXT: sll $1, $4, 0
+-; M64-NEXT: sll $3, $7, 0
+-; M64-NEXT: sll $4, $5, 0
++; M64-NEXT: xor $2, $7, $8
++; M64-NEXT: sll $3, $5, 0
+ ; M64-NEXT: andi $1, $1, 1
+-; M64-NEXT: andi $4, $4, 1
+-; M64-NEXT: negu $2, $1
+-; M64-NEXT: addiu $1, $1, -1
+-; M64-NEXT: negu $5, $4
+-; M64-NEXT: addiu $4, $4, -1
+-; M64-NEXT: and $2, $2, $3
+-; M64-NEXT: sll $3, $8, 0
+-; M64-NEXT: and $1, $1, $3
+-; M64-NEXT: sll $3, $6, 0
+-; M64-NEXT: sll $6, $10, 0
+-; M64-NEXT: or $1, $2, $1
++; M64-NEXT: sll $2, $2, 0
+ ; M64-NEXT: andi $3, $3, 1
+-; M64-NEXT: and $1, $5, $1
+-; M64-NEXT: sll $5, $9, 0
+-; M64-NEXT: addiu $2, $3, -1
++; M64-NEXT: negu $1, $1
+ ; M64-NEXT: negu $3, $3
+-; M64-NEXT: and $4, $4, $5
+-; M64-NEXT: and $2, $2, $6
+-; M64-NEXT: or $1, $1, $4
+-; M64-NEXT: and $1, $3, $1
++; M64-NEXT: and $1, $2, $1
++; M64-NEXT: sll $2, $8, 0
++; M64-NEXT: xor $1, $2, $1
++; M64-NEXT: sll $2, $9, 0
++; M64-NEXT: xor $1, $1, $2
++; M64-NEXT: and $1, $1, $3
++; M64-NEXT: sll $3, $6, 0
++; M64-NEXT: xor $1, $2, $1
++; M64-NEXT: andi $2, $3, 1
++; M64-NEXT: sll $3, $10, 0
++; M64-NEXT: xor $1, $1, $3
++; M64-NEXT: negu $2, $2
++; M64-NEXT: and $1, $1, $2
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $1, $2
++; M64-NEXT: xor $2, $3, $1
+ %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b)
+ %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c)
+ %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d)
+@@ -406,16 +374,17 @@ define i32 @test_ctselect_chain(i1 %c1, i1 %c2, i1 %c3, i32 %a, i32 %b, i32 %c,
+ define i64 @test_ctselect_i64_smin_zero(i64 %x) {
+ ; M32-LABEL: test_ctselect_i64_smin_zero:
+ ; M32: # %bb.0:
+-; M32-NEXT: sra $1, $5, 31
+-; M32-NEXT: and $2, $1, $4
++; M32-NEXT: slti $1, $5, 0
++; M32-NEXT: negu $1, $1
++; M32-NEXT: and $2, $4, $1
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: and $3, $1, $5
++; M32-NEXT: and $3, $5, $1
+ ;
+ ; M64-LABEL: test_ctselect_i64_smin_zero:
+ ; M64: # %bb.0:
+ ; M64-NEXT: dsra $1, $4, 63
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: and $2, $1, $4
++; M64-NEXT: and $2, $4, $1
+ %cmp = icmp slt i64 %x, 0
+ %result = call i64 @llvm.ct.select.i64(i1 %cmp, i64 %x, i64 0)
+ ret i64 %result
+diff --git a/llvm/test/CodeGen/Mips/ctselect-fallback-vector.ll b/llvm/test/CodeGen/Mips/ctselect-fallback-vector.ll
+index 6222f6052e12..302e06b0a733 100644
+--- a/llvm/test/CodeGen/Mips/ctselect-fallback-vector.ll
++++ b/llvm/test/CodeGen/Mips/ctselect-fallback-vector.ll
+@@ -6,21 +6,19 @@
+ define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) {
+ ; MIPS64-MSA-LABEL: test_ctselect_v4i32:
+ ; MIPS64-MSA: # %bb.0:
+-; MIPS64-MSA-NEXT: insert.d $w2[0], $7
++; MIPS64-MSA-NEXT: insert.d $w0[0], $7
++; MIPS64-MSA-NEXT: insert.d $w1[0], $5
+ ; MIPS64-MSA-NEXT: sll $1, $4, 0
+-; MIPS64-MSA-NEXT: ldi.b $w0, -1
+-; MIPS64-MSA-NEXT: fill.w $w1, $1
+-; MIPS64-MSA-NEXT: insert.d $w2[1], $8
+-; MIPS64-MSA-NEXT: slli.w $w1, $w1, 31
+-; MIPS64-MSA-NEXT: srai.w $w1, $w1, 31
+-; MIPS64-MSA-NEXT: shf.w $w2, $w2, 177
+-; MIPS64-MSA-NEXT: xor.v $w0, $w1, $w0
+-; MIPS64-MSA-NEXT: and.v $w0, $w0, $w2
+-; MIPS64-MSA-NEXT: insert.d $w2[0], $5
+-; MIPS64-MSA-NEXT: insert.d $w2[1], $6
+-; MIPS64-MSA-NEXT: shf.w $w2, $w2, 177
++; MIPS64-MSA-NEXT: fill.w $w2, $1
++; MIPS64-MSA-NEXT: insert.d $w0[1], $8
++; MIPS64-MSA-NEXT: insert.d $w1[1], $6
++; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177
++; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177
+ ; MIPS64-MSA-NEXT: and.v $w1, $w1, $w2
+-; MIPS64-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS64-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177
+ ; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0]
+ ; MIPS64-MSA-NEXT: jr $ra
+@@ -30,26 +28,24 @@ define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) {
+ ; MIPS32-MSA: # %bb.0:
+ ; MIPS32-MSA-NEXT: lw $2, 24($sp)
+ ; MIPS32-MSA-NEXT: lw $1, 28($sp)
++; MIPS32-MSA-NEXT: insert.w $w1[0], $6
+ ; MIPS32-MSA-NEXT: fill.w $w2, $4
+-; MIPS32-MSA-NEXT: ldi.b $w1, -1
+ ; MIPS32-MSA-NEXT: insert.w $w0[0], $2
++; MIPS32-MSA-NEXT: insert.w $w1[1], $7
+ ; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31
+ ; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31
+ ; MIPS32-MSA-NEXT: insert.w $w0[1], $1
+ ; MIPS32-MSA-NEXT: lw $1, 32($sp)
+-; MIPS32-MSA-NEXT: xor.v $w1, $w2, $w1
+ ; MIPS32-MSA-NEXT: insert.w $w0[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 36($sp)
+ ; MIPS32-MSA-NEXT: insert.w $w0[3], $1
+ ; MIPS32-MSA-NEXT: lw $1, 16($sp)
+-; MIPS32-MSA-NEXT: and.v $w0, $w1, $w0
+-; MIPS32-MSA-NEXT: insert.w $w1[0], $6
+-; MIPS32-MSA-NEXT: insert.w $w1[1], $7
+ ; MIPS32-MSA-NEXT: insert.w $w1[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 20($sp)
+ ; MIPS32-MSA-NEXT: insert.w $w1[3], $1
+-; MIPS32-MSA-NEXT: and.v $w1, $w2, $w1
+-; MIPS32-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS32-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS32-MSA-NEXT: and.v $w1, $w1, $w2
++; MIPS32-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS32-MSA-NEXT: copy_s.w $2, $w0[0]
+ ; MIPS32-MSA-NEXT: copy_s.w $3, $w0[1]
+ ; MIPS32-MSA-NEXT: copy_s.w $4, $w0[2]
+@@ -63,21 +59,19 @@ define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) {
+ define <8 x i16> @test_ctselect_v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b) {
+ ; MIPS64-MSA-LABEL: test_ctselect_v8i16:
+ ; MIPS64-MSA: # %bb.0:
+-; MIPS64-MSA-NEXT: insert.d $w2[0], $7
++; MIPS64-MSA-NEXT: insert.d $w0[0], $7
++; MIPS64-MSA-NEXT: insert.d $w1[0], $5
+ ; MIPS64-MSA-NEXT: sll $1, $4, 0
+-; MIPS64-MSA-NEXT: ldi.b $w0, -1
+-; MIPS64-MSA-NEXT: fill.h $w1, $1
+-; MIPS64-MSA-NEXT: insert.d $w2[1], $8
+-; MIPS64-MSA-NEXT: slli.h $w1, $w1, 15
+-; MIPS64-MSA-NEXT: srai.h $w1, $w1, 15
+-; MIPS64-MSA-NEXT: shf.h $w2, $w2, 27
+-; MIPS64-MSA-NEXT: xor.v $w0, $w1, $w0
+-; MIPS64-MSA-NEXT: and.v $w0, $w0, $w2
+-; MIPS64-MSA-NEXT: insert.d $w2[0], $5
+-; MIPS64-MSA-NEXT: insert.d $w2[1], $6
+-; MIPS64-MSA-NEXT: shf.h $w2, $w2, 27
++; MIPS64-MSA-NEXT: fill.h $w2, $1
++; MIPS64-MSA-NEXT: insert.d $w0[1], $8
++; MIPS64-MSA-NEXT: insert.d $w1[1], $6
++; MIPS64-MSA-NEXT: slli.h $w2, $w2, 15
++; MIPS64-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS64-MSA-NEXT: srai.h $w2, $w2, 15
++; MIPS64-MSA-NEXT: shf.h $w0, $w0, 27
++; MIPS64-MSA-NEXT: shf.h $w1, $w1, 27
+ ; MIPS64-MSA-NEXT: and.v $w1, $w1, $w2
+-; MIPS64-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS64-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS64-MSA-NEXT: shf.h $w0, $w0, 27
+ ; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0]
+ ; MIPS64-MSA-NEXT: jr $ra
+@@ -87,28 +81,26 @@ define <8 x i16> @test_ctselect_v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b) {
+ ; MIPS32-MSA: # %bb.0:
+ ; MIPS32-MSA-NEXT: lw $2, 24($sp)
+ ; MIPS32-MSA-NEXT: lw $1, 28($sp)
+-; MIPS32-MSA-NEXT: fill.h $w1, $4
+-; MIPS32-MSA-NEXT: ldi.b $w0, -1
+-; MIPS32-MSA-NEXT: insert.w $w2[0], $2
+-; MIPS32-MSA-NEXT: slli.h $w1, $w1, 15
+-; MIPS32-MSA-NEXT: srai.h $w1, $w1, 15
+-; MIPS32-MSA-NEXT: insert.w $w2[1], $1
++; MIPS32-MSA-NEXT: insert.w $w1[0], $6
++; MIPS32-MSA-NEXT: fill.h $w2, $4
++; MIPS32-MSA-NEXT: insert.w $w0[0], $2
++; MIPS32-MSA-NEXT: insert.w $w1[1], $7
++; MIPS32-MSA-NEXT: slli.h $w2, $w2, 15
++; MIPS32-MSA-NEXT: srai.h $w2, $w2, 15
++; MIPS32-MSA-NEXT: insert.w $w0[1], $1
+ ; MIPS32-MSA-NEXT: lw $1, 32($sp)
+-; MIPS32-MSA-NEXT: xor.v $w0, $w1, $w0
+-; MIPS32-MSA-NEXT: insert.w $w2[2], $1
++; MIPS32-MSA-NEXT: insert.w $w0[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 36($sp)
+-; MIPS32-MSA-NEXT: insert.w $w2[3], $1
++; MIPS32-MSA-NEXT: insert.w $w0[3], $1
+ ; MIPS32-MSA-NEXT: lw $1, 16($sp)
+-; MIPS32-MSA-NEXT: shf.h $w2, $w2, 177
+-; MIPS32-MSA-NEXT: and.v $w0, $w0, $w2
+-; MIPS32-MSA-NEXT: insert.w $w2[0], $6
+-; MIPS32-MSA-NEXT: insert.w $w2[1], $7
+-; MIPS32-MSA-NEXT: insert.w $w2[2], $1
++; MIPS32-MSA-NEXT: insert.w $w1[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 20($sp)
+-; MIPS32-MSA-NEXT: insert.w $w2[3], $1
+-; MIPS32-MSA-NEXT: shf.h $w2, $w2, 177
++; MIPS32-MSA-NEXT: insert.w $w1[3], $1
++; MIPS32-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS32-MSA-NEXT: shf.h $w0, $w0, 177
++; MIPS32-MSA-NEXT: shf.h $w1, $w1, 177
+ ; MIPS32-MSA-NEXT: and.v $w1, $w1, $w2
+-; MIPS32-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS32-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS32-MSA-NEXT: shf.h $w0, $w0, 177
+ ; MIPS32-MSA-NEXT: copy_s.w $2, $w0[0]
+ ; MIPS32-MSA-NEXT: copy_s.w $3, $w0[1]
+@@ -123,22 +115,21 @@ define <8 x i16> @test_ctselect_v8i16(i1 %cond, <8 x i16> %a, <8 x i16> %b) {
+ define <16 x i8> @test_ctselect_v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) {
+ ; MIPS64-MSA-LABEL: test_ctselect_v16i8:
+ ; MIPS64-MSA: # %bb.0:
+-; MIPS64-MSA-NEXT: insert.d $w0[0], $5
+-; MIPS64-MSA-NEXT: insert.d $w1[0], $7
++; MIPS64-MSA-NEXT: insert.d $w0[0], $7
++; MIPS64-MSA-NEXT: insert.d $w1[0], $5
+ ; MIPS64-MSA-NEXT: sll $1, $4, 0
+ ; MIPS64-MSA-NEXT: fill.b $w2, $1
+-; MIPS64-MSA-NEXT: insert.d $w0[1], $6
+-; MIPS64-MSA-NEXT: insert.d $w1[1], $8
++; MIPS64-MSA-NEXT: insert.d $w0[1], $8
++; MIPS64-MSA-NEXT: insert.d $w1[1], $6
+ ; MIPS64-MSA-NEXT: slli.b $w2, $w2, 7
++; MIPS64-MSA-NEXT: xor.v $w1, $w1, $w0
+ ; MIPS64-MSA-NEXT: shf.b $w0, $w0, 27
+-; MIPS64-MSA-NEXT: shf.b $w1, $w1, 27
+ ; MIPS64-MSA-NEXT: srai.b $w2, $w2, 7
++; MIPS64-MSA-NEXT: shf.b $w1, $w1, 27
+ ; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177
+ ; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177
+-; MIPS64-MSA-NEXT: and.v $w0, $w2, $w0
+-; MIPS64-MSA-NEXT: xori.b $w2, $w2, 255
+-; MIPS64-MSA-NEXT: and.v $w1, $w2, $w1
+-; MIPS64-MSA-NEXT: or.v $w0, $w0, $w1
++; MIPS64-MSA-NEXT: and.v $w1, $w1, $w2
++; MIPS64-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS64-MSA-NEXT: shf.b $w0, $w0, 27
+ ; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177
+ ; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0]
+@@ -147,29 +138,28 @@ define <16 x i8> @test_ctselect_v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) {
+ ;
+ ; MIPS32-MSA-LABEL: test_ctselect_v16i8:
+ ; MIPS32-MSA: # %bb.0:
+-; MIPS32-MSA-NEXT: insert.w $w0[0], $6
+-; MIPS32-MSA-NEXT: lw $1, 16($sp)
+ ; MIPS32-MSA-NEXT: lw $2, 24($sp)
++; MIPS32-MSA-NEXT: lw $1, 28($sp)
++; MIPS32-MSA-NEXT: insert.w $w1[0], $6
+ ; MIPS32-MSA-NEXT: fill.b $w2, $4
+-; MIPS32-MSA-NEXT: insert.w $w0[1], $7
+-; MIPS32-MSA-NEXT: insert.w $w1[0], $2
++; MIPS32-MSA-NEXT: insert.w $w0[0], $2
++; MIPS32-MSA-NEXT: insert.w $w1[1], $7
+ ; MIPS32-MSA-NEXT: slli.b $w2, $w2, 7
+ ; MIPS32-MSA-NEXT: srai.b $w2, $w2, 7
++; MIPS32-MSA-NEXT: insert.w $w0[1], $1
++; MIPS32-MSA-NEXT: lw $1, 32($sp)
+ ; MIPS32-MSA-NEXT: insert.w $w0[2], $1
+-; MIPS32-MSA-NEXT: lw $1, 20($sp)
++; MIPS32-MSA-NEXT: lw $1, 36($sp)
+ ; MIPS32-MSA-NEXT: insert.w $w0[3], $1
+-; MIPS32-MSA-NEXT: lw $1, 28($sp)
+-; MIPS32-MSA-NEXT: insert.w $w1[1], $1
+-; MIPS32-MSA-NEXT: lw $1, 32($sp)
+-; MIPS32-MSA-NEXT: shf.b $w0, $w0, 27
++; MIPS32-MSA-NEXT: lw $1, 16($sp)
+ ; MIPS32-MSA-NEXT: insert.w $w1[2], $1
+-; MIPS32-MSA-NEXT: lw $1, 36($sp)
+-; MIPS32-MSA-NEXT: and.v $w0, $w2, $w0
+-; MIPS32-MSA-NEXT: xori.b $w2, $w2, 255
++; MIPS32-MSA-NEXT: lw $1, 20($sp)
+ ; MIPS32-MSA-NEXT: insert.w $w1[3], $1
++; MIPS32-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS32-MSA-NEXT: shf.b $w0, $w0, 27
+ ; MIPS32-MSA-NEXT: shf.b $w1, $w1, 27
+-; MIPS32-MSA-NEXT: and.v $w1, $w2, $w1
+-; MIPS32-MSA-NEXT: or.v $w0, $w0, $w1
++; MIPS32-MSA-NEXT: and.v $w1, $w1, $w2
++; MIPS32-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS32-MSA-NEXT: shf.b $w0, $w0, 27
+ ; MIPS32-MSA-NEXT: copy_s.w $2, $w0[0]
+ ; MIPS32-MSA-NEXT: copy_s.w $3, $w0[1]
+@@ -184,18 +174,16 @@ define <16 x i8> @test_ctselect_v16i8(i1 %cond, <16 x i8> %a, <16 x i8> %b) {
+ define <2 x i64> @test_ctselect_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) {
+ ; MIPS64-MSA-LABEL: test_ctselect_v2i64:
+ ; MIPS64-MSA: # %bb.0:
+-; MIPS64-MSA-NEXT: fill.d $w2, $4
+ ; MIPS64-MSA-NEXT: insert.d $w0[0], $7
+-; MIPS64-MSA-NEXT: ldi.b $w1, -1
+-; MIPS64-MSA-NEXT: slli.d $w2, $w2, 63
+-; MIPS64-MSA-NEXT: insert.d $w0[1], $8
+-; MIPS64-MSA-NEXT: srai.d $w2, $w2, 63
+-; MIPS64-MSA-NEXT: xor.v $w1, $w2, $w1
+-; MIPS64-MSA-NEXT: and.v $w0, $w1, $w0
+ ; MIPS64-MSA-NEXT: insert.d $w1[0], $5
++; MIPS64-MSA-NEXT: fill.d $w2, $4
++; MIPS64-MSA-NEXT: insert.d $w0[1], $8
+ ; MIPS64-MSA-NEXT: insert.d $w1[1], $6
+-; MIPS64-MSA-NEXT: and.v $w1, $w2, $w1
+-; MIPS64-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS64-MSA-NEXT: slli.d $w2, $w2, 63
++; MIPS64-MSA-NEXT: srai.d $w2, $w2, 63
++; MIPS64-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS64-MSA-NEXT: and.v $w1, $w1, $w2
++; MIPS64-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0]
+ ; MIPS64-MSA-NEXT: jr $ra
+ ; MIPS64-MSA-NEXT: copy_s.d $3, $w0[1]
+@@ -214,31 +202,28 @@ define <2 x i64> @test_ctselect_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) {
+ ; MIPS32-MSA-NEXT: and $sp, $sp, $1
+ ; MIPS32-MSA-NEXT: lw $2, 56($fp)
+ ; MIPS32-MSA-NEXT: lw $1, 60($fp)
++; MIPS32-MSA-NEXT: insert.w $w1[0], $6
+ ; MIPS32-MSA-NEXT: sw $4, 12($sp)
+ ; MIPS32-MSA-NEXT: sw $4, 4($sp)
+-; MIPS32-MSA-NEXT: ldi.b $w0, -1
+-; MIPS32-MSA-NEXT: ld.d $w1, 0($sp)
+-; MIPS32-MSA-NEXT: shf.w $w0, $w0, 177
+-; MIPS32-MSA-NEXT: insert.w $w2[0], $2
+-; MIPS32-MSA-NEXT: slli.d $w1, $w1, 63
+-; MIPS32-MSA-NEXT: insert.w $w2[1], $1
++; MIPS32-MSA-NEXT: ld.d $w2, 0($sp)
++; MIPS32-MSA-NEXT: insert.w $w0[0], $2
++; MIPS32-MSA-NEXT: insert.w $w1[1], $7
++; MIPS32-MSA-NEXT: slli.d $w2, $w2, 63
++; MIPS32-MSA-NEXT: insert.w $w0[1], $1
+ ; MIPS32-MSA-NEXT: lw $1, 64($fp)
+-; MIPS32-MSA-NEXT: srai.d $w1, $w1, 63
+-; MIPS32-MSA-NEXT: xor.v $w0, $w1, $w0
+-; MIPS32-MSA-NEXT: insert.w $w2[2], $1
++; MIPS32-MSA-NEXT: srai.d $w2, $w2, 63
++; MIPS32-MSA-NEXT: insert.w $w0[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 68($fp)
+-; MIPS32-MSA-NEXT: insert.w $w2[3], $1
++; MIPS32-MSA-NEXT: insert.w $w0[3], $1
+ ; MIPS32-MSA-NEXT: lw $1, 48($fp)
+-; MIPS32-MSA-NEXT: shf.w $w2, $w2, 177
+-; MIPS32-MSA-NEXT: and.v $w0, $w0, $w2
+-; MIPS32-MSA-NEXT: insert.w $w2[0], $6
+-; MIPS32-MSA-NEXT: insert.w $w2[1], $7
+-; MIPS32-MSA-NEXT: insert.w $w2[2], $1
++; MIPS32-MSA-NEXT: insert.w $w1[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 52($fp)
+-; MIPS32-MSA-NEXT: insert.w $w2[3], $1
+-; MIPS32-MSA-NEXT: shf.w $w2, $w2, 177
++; MIPS32-MSA-NEXT: insert.w $w1[3], $1
++; MIPS32-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS32-MSA-NEXT: shf.w $w0, $w0, 177
++; MIPS32-MSA-NEXT: shf.w $w1, $w1, 177
+ ; MIPS32-MSA-NEXT: and.v $w1, $w1, $w2
+-; MIPS32-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS32-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS32-MSA-NEXT: shf.w $w0, $w0, 177
+ ; MIPS32-MSA-NEXT: copy_s.w $2, $w0[0]
+ ; MIPS32-MSA-NEXT: copy_s.w $3, $w0[1]
+@@ -257,21 +242,19 @@ define <2 x i64> @test_ctselect_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) {
+ define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) {
+ ; MIPS64-MSA-LABEL: test_ctselect_v4f32:
+ ; MIPS64-MSA: # %bb.0:
+-; MIPS64-MSA-NEXT: insert.d $w2[0], $7
++; MIPS64-MSA-NEXT: insert.d $w0[0], $7
++; MIPS64-MSA-NEXT: insert.d $w1[0], $5
+ ; MIPS64-MSA-NEXT: sll $1, $4, 0
+-; MIPS64-MSA-NEXT: ldi.b $w0, -1
+-; MIPS64-MSA-NEXT: fill.w $w1, $1
+-; MIPS64-MSA-NEXT: insert.d $w2[1], $8
+-; MIPS64-MSA-NEXT: slli.w $w1, $w1, 31
+-; MIPS64-MSA-NEXT: srai.w $w1, $w1, 31
+-; MIPS64-MSA-NEXT: shf.w $w2, $w2, 177
+-; MIPS64-MSA-NEXT: xor.v $w0, $w1, $w0
+-; MIPS64-MSA-NEXT: and.v $w0, $w0, $w2
+-; MIPS64-MSA-NEXT: insert.d $w2[0], $5
+-; MIPS64-MSA-NEXT: insert.d $w2[1], $6
+-; MIPS64-MSA-NEXT: shf.w $w2, $w2, 177
++; MIPS64-MSA-NEXT: fill.w $w2, $1
++; MIPS64-MSA-NEXT: insert.d $w0[1], $8
++; MIPS64-MSA-NEXT: insert.d $w1[1], $6
++; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177
++; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177
+ ; MIPS64-MSA-NEXT: and.v $w1, $w1, $w2
+-; MIPS64-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS64-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177
+ ; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0]
+ ; MIPS64-MSA-NEXT: jr $ra
+@@ -281,26 +264,24 @@ define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b
+ ; MIPS32-MSA: # %bb.0:
+ ; MIPS32-MSA-NEXT: lw $2, 24($sp)
+ ; MIPS32-MSA-NEXT: lw $1, 28($sp)
++; MIPS32-MSA-NEXT: insert.w $w1[0], $6
+ ; MIPS32-MSA-NEXT: fill.w $w2, $5
+-; MIPS32-MSA-NEXT: ldi.b $w1, -1
+ ; MIPS32-MSA-NEXT: insert.w $w0[0], $2
++; MIPS32-MSA-NEXT: insert.w $w1[1], $7
+ ; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31
+ ; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31
+ ; MIPS32-MSA-NEXT: insert.w $w0[1], $1
+ ; MIPS32-MSA-NEXT: lw $1, 32($sp)
+-; MIPS32-MSA-NEXT: xor.v $w1, $w2, $w1
+ ; MIPS32-MSA-NEXT: insert.w $w0[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 36($sp)
+ ; MIPS32-MSA-NEXT: insert.w $w0[3], $1
+ ; MIPS32-MSA-NEXT: lw $1, 16($sp)
+-; MIPS32-MSA-NEXT: and.v $w0, $w1, $w0
+-; MIPS32-MSA-NEXT: insert.w $w1[0], $6
+-; MIPS32-MSA-NEXT: insert.w $w1[1], $7
+ ; MIPS32-MSA-NEXT: insert.w $w1[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 20($sp)
+ ; MIPS32-MSA-NEXT: insert.w $w1[3], $1
+-; MIPS32-MSA-NEXT: and.v $w1, $w2, $w1
+-; MIPS32-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS32-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS32-MSA-NEXT: and.v $w1, $w1, $w2
++; MIPS32-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS32-MSA-NEXT: jr $ra
+ ; MIPS32-MSA-NEXT: st.w $w0, 0($4)
+ %result = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, <4 x float> %b)
+@@ -311,18 +292,16 @@ define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b
+ define <2 x double> @test_ctselect_v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) {
+ ; MIPS64-MSA-LABEL: test_ctselect_v2f64:
+ ; MIPS64-MSA: # %bb.0:
+-; MIPS64-MSA-NEXT: fill.d $w2, $4
+ ; MIPS64-MSA-NEXT: insert.d $w0[0], $7
+-; MIPS64-MSA-NEXT: ldi.b $w1, -1
+-; MIPS64-MSA-NEXT: slli.d $w2, $w2, 63
+-; MIPS64-MSA-NEXT: insert.d $w0[1], $8
+-; MIPS64-MSA-NEXT: srai.d $w2, $w2, 63
+-; MIPS64-MSA-NEXT: xor.v $w1, $w2, $w1
+-; MIPS64-MSA-NEXT: and.v $w0, $w1, $w0
+ ; MIPS64-MSA-NEXT: insert.d $w1[0], $5
++; MIPS64-MSA-NEXT: fill.d $w2, $4
++; MIPS64-MSA-NEXT: insert.d $w0[1], $8
+ ; MIPS64-MSA-NEXT: insert.d $w1[1], $6
+-; MIPS64-MSA-NEXT: and.v $w1, $w2, $w1
+-; MIPS64-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS64-MSA-NEXT: slli.d $w2, $w2, 63
++; MIPS64-MSA-NEXT: srai.d $w2, $w2, 63
++; MIPS64-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS64-MSA-NEXT: and.v $w1, $w1, $w2
++; MIPS64-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0]
+ ; MIPS64-MSA-NEXT: jr $ra
+ ; MIPS64-MSA-NEXT: copy_s.d $3, $w0[1]
+@@ -341,31 +320,28 @@ define <2 x double> @test_ctselect_v2f64(i1 %cond, <2 x double> %a, <2 x double>
+ ; MIPS32-MSA-NEXT: and $sp, $sp, $1
+ ; MIPS32-MSA-NEXT: lw $2, 56($fp)
+ ; MIPS32-MSA-NEXT: lw $1, 60($fp)
++; MIPS32-MSA-NEXT: insert.w $w1[0], $6
+ ; MIPS32-MSA-NEXT: sw $5, 12($sp)
+ ; MIPS32-MSA-NEXT: sw $5, 4($sp)
+-; MIPS32-MSA-NEXT: ldi.b $w0, -1
+-; MIPS32-MSA-NEXT: ld.d $w1, 0($sp)
+-; MIPS32-MSA-NEXT: shf.w $w0, $w0, 177
+-; MIPS32-MSA-NEXT: insert.w $w2[0], $2
+-; MIPS32-MSA-NEXT: slli.d $w1, $w1, 63
+-; MIPS32-MSA-NEXT: insert.w $w2[1], $1
++; MIPS32-MSA-NEXT: ld.d $w2, 0($sp)
++; MIPS32-MSA-NEXT: insert.w $w0[0], $2
++; MIPS32-MSA-NEXT: insert.w $w1[1], $7
++; MIPS32-MSA-NEXT: slli.d $w2, $w2, 63
++; MIPS32-MSA-NEXT: insert.w $w0[1], $1
+ ; MIPS32-MSA-NEXT: lw $1, 64($fp)
+-; MIPS32-MSA-NEXT: srai.d $w1, $w1, 63
+-; MIPS32-MSA-NEXT: xor.v $w0, $w1, $w0
+-; MIPS32-MSA-NEXT: insert.w $w2[2], $1
++; MIPS32-MSA-NEXT: srai.d $w2, $w2, 63
++; MIPS32-MSA-NEXT: insert.w $w0[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 68($fp)
+-; MIPS32-MSA-NEXT: insert.w $w2[3], $1
++; MIPS32-MSA-NEXT: insert.w $w0[3], $1
+ ; MIPS32-MSA-NEXT: lw $1, 48($fp)
+-; MIPS32-MSA-NEXT: shf.w $w2, $w2, 177
+-; MIPS32-MSA-NEXT: and.v $w0, $w0, $w2
+-; MIPS32-MSA-NEXT: insert.w $w2[0], $6
+-; MIPS32-MSA-NEXT: insert.w $w2[1], $7
+-; MIPS32-MSA-NEXT: insert.w $w2[2], $1
++; MIPS32-MSA-NEXT: insert.w $w1[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 52($fp)
+-; MIPS32-MSA-NEXT: insert.w $w2[3], $1
+-; MIPS32-MSA-NEXT: shf.w $w2, $w2, 177
++; MIPS32-MSA-NEXT: insert.w $w1[3], $1
++; MIPS32-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS32-MSA-NEXT: shf.w $w0, $w0, 177
++; MIPS32-MSA-NEXT: shf.w $w1, $w1, 177
+ ; MIPS32-MSA-NEXT: and.v $w1, $w1, $w2
+-; MIPS32-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS32-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS32-MSA-NEXT: st.d $w0, 0($4)
+ ; MIPS32-MSA-NEXT: move $sp, $fp
+ ; MIPS32-MSA-NEXT: lw $fp, 24($sp) # 4-byte Folded Reload
+@@ -381,16 +357,14 @@ define <4 x i32> @test_ctselect_v4i32_aligned_load(i1 %cond, ptr %p1, ptr %p2) {
+ ; MIPS64-MSA-LABEL: test_ctselect_v4i32_aligned_load:
+ ; MIPS64-MSA: # %bb.0:
+ ; MIPS64-MSA-NEXT: sll $1, $4, 0
++; MIPS64-MSA-NEXT: ld.w $w0, 0($6)
+ ; MIPS64-MSA-NEXT: ld.w $w1, 0($5)
+-; MIPS64-MSA-NEXT: ldi.b $w2, -1
+-; MIPS64-MSA-NEXT: fill.w $w0, $1
+-; MIPS64-MSA-NEXT: slli.w $w0, $w0, 31
+-; MIPS64-MSA-NEXT: srai.w $w0, $w0, 31
+-; MIPS64-MSA-NEXT: and.v $w1, $w0, $w1
+-; MIPS64-MSA-NEXT: xor.v $w0, $w0, $w2
+-; MIPS64-MSA-NEXT: ld.w $w2, 0($6)
+-; MIPS64-MSA-NEXT: and.v $w0, $w0, $w2
+-; MIPS64-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS64-MSA-NEXT: fill.w $w2, $1
++; MIPS64-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: and.v $w1, $w1, $w2
++; MIPS64-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177
+ ; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0]
+ ; MIPS64-MSA-NEXT: jr $ra
+@@ -398,16 +372,14 @@ define <4 x i32> @test_ctselect_v4i32_aligned_load(i1 %cond, ptr %p1, ptr %p2) {
+ ;
+ ; MIPS32-MSA-LABEL: test_ctselect_v4i32_aligned_load:
+ ; MIPS32-MSA: # %bb.0:
+-; MIPS32-MSA-NEXT: fill.w $w0, $4
++; MIPS32-MSA-NEXT: fill.w $w2, $4
++; MIPS32-MSA-NEXT: ld.w $w0, 0($6)
+ ; MIPS32-MSA-NEXT: ld.w $w1, 0($5)
+-; MIPS32-MSA-NEXT: ldi.b $w2, -1
+-; MIPS32-MSA-NEXT: slli.w $w0, $w0, 31
+-; MIPS32-MSA-NEXT: srai.w $w0, $w0, 31
+-; MIPS32-MSA-NEXT: and.v $w1, $w0, $w1
+-; MIPS32-MSA-NEXT: xor.v $w0, $w0, $w2
+-; MIPS32-MSA-NEXT: ld.w $w2, 0($6)
+-; MIPS32-MSA-NEXT: and.v $w0, $w0, $w2
+-; MIPS32-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31
++; MIPS32-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31
++; MIPS32-MSA-NEXT: and.v $w1, $w1, $w2
++; MIPS32-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS32-MSA-NEXT: copy_s.w $2, $w0[0]
+ ; MIPS32-MSA-NEXT: copy_s.w $3, $w0[1]
+ ; MIPS32-MSA-NEXT: copy_s.w $4, $w0[2]
+@@ -424,16 +396,14 @@ define <4 x i32> @test_ctselect_v4i32_unaligned_load(i1 %cond, ptr %p1, ptr %p2)
+ ; MIPS64-MSA-LABEL: test_ctselect_v4i32_unaligned_load:
+ ; MIPS64-MSA: # %bb.0:
+ ; MIPS64-MSA-NEXT: sll $1, $4, 0
++; MIPS64-MSA-NEXT: ld.w $w0, 0($6)
+ ; MIPS64-MSA-NEXT: ld.w $w1, 0($5)
+-; MIPS64-MSA-NEXT: ldi.b $w2, -1
+-; MIPS64-MSA-NEXT: fill.w $w0, $1
+-; MIPS64-MSA-NEXT: slli.w $w0, $w0, 31
+-; MIPS64-MSA-NEXT: srai.w $w0, $w0, 31
+-; MIPS64-MSA-NEXT: and.v $w1, $w0, $w1
+-; MIPS64-MSA-NEXT: xor.v $w0, $w0, $w2
+-; MIPS64-MSA-NEXT: ld.w $w2, 0($6)
+-; MIPS64-MSA-NEXT: and.v $w0, $w0, $w2
+-; MIPS64-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS64-MSA-NEXT: fill.w $w2, $1
++; MIPS64-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: and.v $w1, $w1, $w2
++; MIPS64-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177
+ ; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0]
+ ; MIPS64-MSA-NEXT: jr $ra
+@@ -441,16 +411,14 @@ define <4 x i32> @test_ctselect_v4i32_unaligned_load(i1 %cond, ptr %p1, ptr %p2)
+ ;
+ ; MIPS32-MSA-LABEL: test_ctselect_v4i32_unaligned_load:
+ ; MIPS32-MSA: # %bb.0:
+-; MIPS32-MSA-NEXT: fill.w $w0, $4
++; MIPS32-MSA-NEXT: fill.w $w2, $4
++; MIPS32-MSA-NEXT: ld.w $w0, 0($6)
+ ; MIPS32-MSA-NEXT: ld.w $w1, 0($5)
+-; MIPS32-MSA-NEXT: ldi.b $w2, -1
+-; MIPS32-MSA-NEXT: slli.w $w0, $w0, 31
+-; MIPS32-MSA-NEXT: srai.w $w0, $w0, 31
+-; MIPS32-MSA-NEXT: and.v $w1, $w0, $w1
+-; MIPS32-MSA-NEXT: xor.v $w0, $w0, $w2
+-; MIPS32-MSA-NEXT: ld.w $w2, 0($6)
+-; MIPS32-MSA-NEXT: and.v $w0, $w0, $w2
+-; MIPS32-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31
++; MIPS32-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31
++; MIPS32-MSA-NEXT: and.v $w1, $w1, $w2
++; MIPS32-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS32-MSA-NEXT: copy_s.w $2, $w0[0]
+ ; MIPS32-MSA-NEXT: copy_s.w $3, $w0[1]
+ ; MIPS32-MSA-NEXT: copy_s.w $4, $w0[2]
+@@ -466,21 +434,19 @@ define <4 x i32> @test_ctselect_v4i32_unaligned_load(i1 %cond, ptr %p1, ptr %p2)
+ define void @test_ctselect_v4i32_store(i1 %cond, <4 x i32> %a, <4 x i32> %b, ptr %out) {
+ ; MIPS64-MSA-LABEL: test_ctselect_v4i32_store:
+ ; MIPS64-MSA: # %bb.0:
+-; MIPS64-MSA-NEXT: insert.d $w2[0], $7
++; MIPS64-MSA-NEXT: insert.d $w0[0], $7
++; MIPS64-MSA-NEXT: insert.d $w1[0], $5
+ ; MIPS64-MSA-NEXT: sll $1, $4, 0
+-; MIPS64-MSA-NEXT: ldi.b $w0, -1
+-; MIPS64-MSA-NEXT: fill.w $w1, $1
+-; MIPS64-MSA-NEXT: insert.d $w2[1], $8
+-; MIPS64-MSA-NEXT: slli.w $w1, $w1, 31
+-; MIPS64-MSA-NEXT: srai.w $w1, $w1, 31
+-; MIPS64-MSA-NEXT: shf.w $w2, $w2, 177
+-; MIPS64-MSA-NEXT: xor.v $w0, $w1, $w0
+-; MIPS64-MSA-NEXT: and.v $w0, $w0, $w2
+-; MIPS64-MSA-NEXT: insert.d $w2[0], $5
+-; MIPS64-MSA-NEXT: insert.d $w2[1], $6
+-; MIPS64-MSA-NEXT: shf.w $w2, $w2, 177
++; MIPS64-MSA-NEXT: fill.w $w2, $1
++; MIPS64-MSA-NEXT: insert.d $w0[1], $8
++; MIPS64-MSA-NEXT: insert.d $w1[1], $6
++; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177
++; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177
+ ; MIPS64-MSA-NEXT: and.v $w1, $w1, $w2
+-; MIPS64-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS64-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS64-MSA-NEXT: jr $ra
+ ; MIPS64-MSA-NEXT: st.w $w0, 0($9)
+ ;
+@@ -488,27 +454,25 @@ define void @test_ctselect_v4i32_store(i1 %cond, <4 x i32> %a, <4 x i32> %b, ptr
+ ; MIPS32-MSA: # %bb.0:
+ ; MIPS32-MSA-NEXT: lw $2, 24($sp)
+ ; MIPS32-MSA-NEXT: lw $1, 28($sp)
++; MIPS32-MSA-NEXT: insert.w $w1[0], $6
+ ; MIPS32-MSA-NEXT: fill.w $w2, $4
+-; MIPS32-MSA-NEXT: ldi.b $w1, -1
+ ; MIPS32-MSA-NEXT: insert.w $w0[0], $2
++; MIPS32-MSA-NEXT: insert.w $w1[1], $7
+ ; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31
+ ; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31
+ ; MIPS32-MSA-NEXT: insert.w $w0[1], $1
+ ; MIPS32-MSA-NEXT: lw $1, 32($sp)
+-; MIPS32-MSA-NEXT: xor.v $w1, $w2, $w1
+ ; MIPS32-MSA-NEXT: insert.w $w0[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 36($sp)
+ ; MIPS32-MSA-NEXT: insert.w $w0[3], $1
+ ; MIPS32-MSA-NEXT: lw $1, 16($sp)
+-; MIPS32-MSA-NEXT: and.v $w0, $w1, $w0
+-; MIPS32-MSA-NEXT: insert.w $w1[0], $6
+-; MIPS32-MSA-NEXT: insert.w $w1[1], $7
+ ; MIPS32-MSA-NEXT: insert.w $w1[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 20($sp)
+ ; MIPS32-MSA-NEXT: insert.w $w1[3], $1
+ ; MIPS32-MSA-NEXT: lw $1, 40($sp)
+-; MIPS32-MSA-NEXT: and.v $w1, $w2, $w1
+-; MIPS32-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS32-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS32-MSA-NEXT: and.v $w1, $w1, $w2
++; MIPS32-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS32-MSA-NEXT: jr $ra
+ ; MIPS32-MSA-NEXT: st.w $w0, 0($1)
+ %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b)
+@@ -521,31 +485,28 @@ define <4 x i32> @test_ctselect_v4i32_chain(i1 %cond1, i1 %cond2, <4 x i32> %a,
+ ; MIPS64-MSA-LABEL: test_ctselect_v4i32_chain:
+ ; MIPS64-MSA: # %bb.0:
+ ; MIPS64-MSA-NEXT: insert.d $w0[0], $8
++; MIPS64-MSA-NEXT: insert.d $w1[0], $6
+ ; MIPS64-MSA-NEXT: sll $1, $4, 0
+-; MIPS64-MSA-NEXT: ldi.b $w1, -1
+ ; MIPS64-MSA-NEXT: fill.w $w2, $1
+ ; MIPS64-MSA-NEXT: sll $1, $5, 0
+ ; MIPS64-MSA-NEXT: insert.d $w0[1], $9
++; MIPS64-MSA-NEXT: insert.d $w1[1], $7
+ ; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: xor.v $w1, $w1, $w0
+ ; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31
+ ; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177
+-; MIPS64-MSA-NEXT: xor.v $w3, $w2, $w1
+-; MIPS64-MSA-NEXT: and.v $w0, $w3, $w0
+-; MIPS64-MSA-NEXT: insert.d $w3[0], $6
+-; MIPS64-MSA-NEXT: insert.d $w3[1], $7
+-; MIPS64-MSA-NEXT: shf.w $w3, $w3, 177
+-; MIPS64-MSA-NEXT: and.v $w2, $w2, $w3
+-; MIPS64-MSA-NEXT: or.v $w0, $w2, $w0
++; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177
++; MIPS64-MSA-NEXT: and.v $w1, $w1, $w2
+ ; MIPS64-MSA-NEXT: fill.w $w2, $1
++; MIPS64-MSA-NEXT: xor.v $w0, $w0, $w1
++; MIPS64-MSA-NEXT: insert.d $w1[0], $10
+ ; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: insert.d $w1[1], $11
+ ; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31
+-; MIPS64-MSA-NEXT: and.v $w0, $w2, $w0
+-; MIPS64-MSA-NEXT: xor.v $w1, $w2, $w1
+-; MIPS64-MSA-NEXT: insert.d $w2[0], $10
+-; MIPS64-MSA-NEXT: insert.d $w2[1], $11
+-; MIPS64-MSA-NEXT: shf.w $w2, $w2, 177
+-; MIPS64-MSA-NEXT: and.v $w1, $w1, $w2
+-; MIPS64-MSA-NEXT: or.v $w0, $w0, $w1
++; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177
++; MIPS64-MSA-NEXT: xor.v $w0, $w0, $w1
++; MIPS64-MSA-NEXT: and.v $w0, $w0, $w2
++; MIPS64-MSA-NEXT: xor.v $w0, $w1, $w0
+ ; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177
+ ; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0]
+ ; MIPS64-MSA-NEXT: jr $ra
+@@ -555,41 +516,38 @@ define <4 x i32> @test_ctselect_v4i32_chain(i1 %cond1, i1 %cond2, <4 x i32> %a,
+ ; MIPS32-MSA: # %bb.0:
+ ; MIPS32-MSA-NEXT: lw $2, 24($sp)
+ ; MIPS32-MSA-NEXT: lw $1, 28($sp)
++; MIPS32-MSA-NEXT: insert.w $w1[0], $6
+ ; MIPS32-MSA-NEXT: fill.w $w2, $4
+-; MIPS32-MSA-NEXT: ldi.b $w1, -1
+ ; MIPS32-MSA-NEXT: insert.w $w0[0], $2
++; MIPS32-MSA-NEXT: insert.w $w1[1], $7
+ ; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31
+ ; MIPS32-MSA-NEXT: lw $2, 40($sp)
+ ; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31
+ ; MIPS32-MSA-NEXT: insert.w $w0[1], $1
+ ; MIPS32-MSA-NEXT: lw $1, 32($sp)
+-; MIPS32-MSA-NEXT: xor.v $w3, $w2, $w1
+ ; MIPS32-MSA-NEXT: insert.w $w0[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 36($sp)
+ ; MIPS32-MSA-NEXT: insert.w $w0[3], $1
+ ; MIPS32-MSA-NEXT: lw $1, 16($sp)
+-; MIPS32-MSA-NEXT: and.v $w0, $w3, $w0
+-; MIPS32-MSA-NEXT: insert.w $w3[0], $6
+-; MIPS32-MSA-NEXT: insert.w $w3[1], $7
+-; MIPS32-MSA-NEXT: insert.w $w3[2], $1
++; MIPS32-MSA-NEXT: insert.w $w1[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 20($sp)
+-; MIPS32-MSA-NEXT: insert.w $w3[3], $1
++; MIPS32-MSA-NEXT: insert.w $w1[3], $1
+ ; MIPS32-MSA-NEXT: lw $1, 44($sp)
+-; MIPS32-MSA-NEXT: and.v $w2, $w2, $w3
+-; MIPS32-MSA-NEXT: or.v $w0, $w2, $w0
++; MIPS32-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS32-MSA-NEXT: and.v $w1, $w1, $w2
+ ; MIPS32-MSA-NEXT: fill.w $w2, $5
++; MIPS32-MSA-NEXT: xor.v $w0, $w0, $w1
++; MIPS32-MSA-NEXT: insert.w $w1[0], $2
+ ; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31
+-; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31
+-; MIPS32-MSA-NEXT: and.v $w0, $w2, $w0
+-; MIPS32-MSA-NEXT: xor.v $w1, $w2, $w1
+-; MIPS32-MSA-NEXT: insert.w $w2[0], $2
+-; MIPS32-MSA-NEXT: insert.w $w2[1], $1
++; MIPS32-MSA-NEXT: insert.w $w1[1], $1
+ ; MIPS32-MSA-NEXT: lw $1, 48($sp)
+-; MIPS32-MSA-NEXT: insert.w $w2[2], $1
++; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31
++; MIPS32-MSA-NEXT: insert.w $w1[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 52($sp)
+-; MIPS32-MSA-NEXT: insert.w $w2[3], $1
+-; MIPS32-MSA-NEXT: and.v $w1, $w1, $w2
+-; MIPS32-MSA-NEXT: or.v $w0, $w0, $w1
++; MIPS32-MSA-NEXT: insert.w $w1[3], $1
++; MIPS32-MSA-NEXT: xor.v $w0, $w0, $w1
++; MIPS32-MSA-NEXT: and.v $w0, $w0, $w2
++; MIPS32-MSA-NEXT: xor.v $w0, $w1, $w0
+ ; MIPS32-MSA-NEXT: copy_s.w $2, $w0[0]
+ ; MIPS32-MSA-NEXT: copy_s.w $3, $w0[1]
+ ; MIPS32-MSA-NEXT: copy_s.w $4, $w0[2]
+@@ -607,20 +565,18 @@ define <4 x float> @test_ctselect_v4f32_arithmetic(i1 %cond, <4 x float> %x, <4
+ ; MIPS64-MSA-NEXT: insert.d $w0[0], $7
+ ; MIPS64-MSA-NEXT: insert.d $w1[0], $5
+ ; MIPS64-MSA-NEXT: sll $1, $4, 0
+-; MIPS64-MSA-NEXT: fill.w $w3, $1
+ ; MIPS64-MSA-NEXT: insert.d $w0[1], $8
+ ; MIPS64-MSA-NEXT: insert.d $w1[1], $6
+-; MIPS64-MSA-NEXT: slli.w $w3, $w3, 31
+ ; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177
+ ; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177
+-; MIPS64-MSA-NEXT: srai.w $w3, $w3, 31
+ ; MIPS64-MSA-NEXT: fadd.w $w2, $w1, $w0
+ ; MIPS64-MSA-NEXT: fsub.w $w0, $w1, $w0
+-; MIPS64-MSA-NEXT: ldi.b $w1, -1
+-; MIPS64-MSA-NEXT: xor.v $w1, $w3, $w1
+-; MIPS64-MSA-NEXT: and.v $w2, $w3, $w2
+-; MIPS64-MSA-NEXT: and.v $w0, $w1, $w0
+-; MIPS64-MSA-NEXT: or.v $w0, $w2, $w0
++; MIPS64-MSA-NEXT: xor.v $w1, $w2, $w0
++; MIPS64-MSA-NEXT: fill.w $w2, $1
++; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: and.v $w1, $w1, $w2
++; MIPS64-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177
+ ; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0]
+ ; MIPS64-MSA-NEXT: jr $ra
+@@ -631,11 +587,8 @@ define <4 x float> @test_ctselect_v4f32_arithmetic(i1 %cond, <4 x float> %x, <4
+ ; MIPS32-MSA-NEXT: lw $2, 24($sp)
+ ; MIPS32-MSA-NEXT: lw $1, 28($sp)
+ ; MIPS32-MSA-NEXT: insert.w $w1[0], $6
+-; MIPS32-MSA-NEXT: fill.w $w3, $5
+ ; MIPS32-MSA-NEXT: insert.w $w0[0], $2
+ ; MIPS32-MSA-NEXT: insert.w $w1[1], $7
+-; MIPS32-MSA-NEXT: slli.w $w3, $w3, 31
+-; MIPS32-MSA-NEXT: srai.w $w3, $w3, 31
+ ; MIPS32-MSA-NEXT: insert.w $w0[1], $1
+ ; MIPS32-MSA-NEXT: lw $1, 32($sp)
+ ; MIPS32-MSA-NEXT: insert.w $w0[2], $1
+@@ -647,11 +600,12 @@ define <4 x float> @test_ctselect_v4f32_arithmetic(i1 %cond, <4 x float> %x, <4
+ ; MIPS32-MSA-NEXT: insert.w $w1[3], $1
+ ; MIPS32-MSA-NEXT: fadd.w $w2, $w1, $w0
+ ; MIPS32-MSA-NEXT: fsub.w $w0, $w1, $w0
+-; MIPS32-MSA-NEXT: ldi.b $w1, -1
+-; MIPS32-MSA-NEXT: xor.v $w1, $w3, $w1
+-; MIPS32-MSA-NEXT: and.v $w2, $w3, $w2
+-; MIPS32-MSA-NEXT: and.v $w0, $w1, $w0
+-; MIPS32-MSA-NEXT: or.v $w0, $w2, $w0
++; MIPS32-MSA-NEXT: xor.v $w1, $w2, $w0
++; MIPS32-MSA-NEXT: fill.w $w2, $5
++; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31
++; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31
++; MIPS32-MSA-NEXT: and.v $w1, $w1, $w2
++; MIPS32-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS32-MSA-NEXT: jr $ra
+ ; MIPS32-MSA-NEXT: st.w $w0, 0($4)
+ %sum = fadd <4 x float> %x, %y
+@@ -664,36 +618,32 @@ define <4 x float> @test_ctselect_v4f32_arithmetic(i1 %cond, <4 x float> %x, <4
+ define void @test_ctselect_v4i32_mixed(i1 %cond, ptr %p1, ptr %p2, ptr %out) {
+ ; MIPS64-MSA-LABEL: test_ctselect_v4i32_mixed:
+ ; MIPS64-MSA: # %bb.0:
++; MIPS64-MSA-NEXT: ld.w $w0, 0($6)
++; MIPS64-MSA-NEXT: ld.w $w1, 0($5)
+ ; MIPS64-MSA-NEXT: sll $1, $4, 0
+-; MIPS64-MSA-NEXT: ld.w $w0, 0($5)
+-; MIPS64-MSA-NEXT: ldi.b $w2, -1
+-; MIPS64-MSA-NEXT: fill.w $w1, $1
+-; MIPS64-MSA-NEXT: addvi.w $w0, $w0, 1
+-; MIPS64-MSA-NEXT: slli.w $w1, $w1, 31
+-; MIPS64-MSA-NEXT: srai.w $w1, $w1, 31
+-; MIPS64-MSA-NEXT: and.v $w0, $w1, $w0
+-; MIPS64-MSA-NEXT: xor.v $w1, $w1, $w2
+-; MIPS64-MSA-NEXT: ld.w $w2, 0($6)
+-; MIPS64-MSA-NEXT: addvi.w $w2, $w2, 2
++; MIPS64-MSA-NEXT: fill.w $w2, $1
++; MIPS64-MSA-NEXT: addvi.w $w0, $w0, 2
++; MIPS64-MSA-NEXT: addvi.w $w1, $w1, 1
++; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31
+ ; MIPS64-MSA-NEXT: and.v $w1, $w1, $w2
+-; MIPS64-MSA-NEXT: or.v $w0, $w0, $w1
++; MIPS64-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS64-MSA-NEXT: jr $ra
+ ; MIPS64-MSA-NEXT: st.w $w0, 0($7)
+ ;
+ ; MIPS32-MSA-LABEL: test_ctselect_v4i32_mixed:
+ ; MIPS32-MSA: # %bb.0:
+-; MIPS32-MSA-NEXT: ld.w $w0, 0($5)
+-; MIPS32-MSA-NEXT: fill.w $w1, $4
+-; MIPS32-MSA-NEXT: ldi.b $w2, -1
+-; MIPS32-MSA-NEXT: slli.w $w1, $w1, 31
+-; MIPS32-MSA-NEXT: addvi.w $w0, $w0, 1
+-; MIPS32-MSA-NEXT: srai.w $w1, $w1, 31
+-; MIPS32-MSA-NEXT: and.v $w0, $w1, $w0
+-; MIPS32-MSA-NEXT: xor.v $w1, $w1, $w2
+-; MIPS32-MSA-NEXT: ld.w $w2, 0($6)
+-; MIPS32-MSA-NEXT: addvi.w $w2, $w2, 2
++; MIPS32-MSA-NEXT: ld.w $w0, 0($6)
++; MIPS32-MSA-NEXT: ld.w $w1, 0($5)
++; MIPS32-MSA-NEXT: fill.w $w2, $4
++; MIPS32-MSA-NEXT: addvi.w $w0, $w0, 2
++; MIPS32-MSA-NEXT: addvi.w $w1, $w1, 1
++; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31
++; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31
++; MIPS32-MSA-NEXT: xor.v $w1, $w1, $w0
+ ; MIPS32-MSA-NEXT: and.v $w1, $w1, $w2
+-; MIPS32-MSA-NEXT: or.v $w0, $w0, $w1
++; MIPS32-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS32-MSA-NEXT: jr $ra
+ ; MIPS32-MSA-NEXT: st.w $w0, 0($7)
+ %a = load <4 x i32>, ptr %p1, align 16
+@@ -709,21 +659,19 @@ define void @test_ctselect_v4i32_mixed(i1 %cond, ptr %p1, ptr %p2, ptr %out) {
+ define <4 x i32> @test_ctselect_v4i32_args(i1 %cond, <4 x i32> %a, <4 x i32> %b) nounwind {
+ ; MIPS64-MSA-LABEL: test_ctselect_v4i32_args:
+ ; MIPS64-MSA: # %bb.0:
+-; MIPS64-MSA-NEXT: insert.d $w2[0], $7
++; MIPS64-MSA-NEXT: insert.d $w0[0], $7
++; MIPS64-MSA-NEXT: insert.d $w1[0], $5
+ ; MIPS64-MSA-NEXT: sll $1, $4, 0
+-; MIPS64-MSA-NEXT: ldi.b $w0, -1
+-; MIPS64-MSA-NEXT: fill.w $w1, $1
+-; MIPS64-MSA-NEXT: insert.d $w2[1], $8
+-; MIPS64-MSA-NEXT: slli.w $w1, $w1, 31
+-; MIPS64-MSA-NEXT: srai.w $w1, $w1, 31
+-; MIPS64-MSA-NEXT: shf.w $w2, $w2, 177
+-; MIPS64-MSA-NEXT: xor.v $w0, $w1, $w0
+-; MIPS64-MSA-NEXT: and.v $w0, $w0, $w2
+-; MIPS64-MSA-NEXT: insert.d $w2[0], $5
+-; MIPS64-MSA-NEXT: insert.d $w2[1], $6
+-; MIPS64-MSA-NEXT: shf.w $w2, $w2, 177
++; MIPS64-MSA-NEXT: fill.w $w2, $1
++; MIPS64-MSA-NEXT: insert.d $w0[1], $8
++; MIPS64-MSA-NEXT: insert.d $w1[1], $6
++; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177
++; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177
+ ; MIPS64-MSA-NEXT: and.v $w1, $w1, $w2
+-; MIPS64-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS64-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177
+ ; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0]
+ ; MIPS64-MSA-NEXT: jr $ra
+@@ -733,26 +681,24 @@ define <4 x i32> @test_ctselect_v4i32_args(i1 %cond, <4 x i32> %a, <4 x i32> %b)
+ ; MIPS32-MSA: # %bb.0:
+ ; MIPS32-MSA-NEXT: lw $2, 24($sp)
+ ; MIPS32-MSA-NEXT: lw $1, 28($sp)
++; MIPS32-MSA-NEXT: insert.w $w1[0], $6
+ ; MIPS32-MSA-NEXT: fill.w $w2, $4
+-; MIPS32-MSA-NEXT: ldi.b $w1, -1
+ ; MIPS32-MSA-NEXT: insert.w $w0[0], $2
++; MIPS32-MSA-NEXT: insert.w $w1[1], $7
+ ; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31
+ ; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31
+ ; MIPS32-MSA-NEXT: insert.w $w0[1], $1
+ ; MIPS32-MSA-NEXT: lw $1, 32($sp)
+-; MIPS32-MSA-NEXT: xor.v $w1, $w2, $w1
+ ; MIPS32-MSA-NEXT: insert.w $w0[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 36($sp)
+ ; MIPS32-MSA-NEXT: insert.w $w0[3], $1
+ ; MIPS32-MSA-NEXT: lw $1, 16($sp)
+-; MIPS32-MSA-NEXT: and.v $w0, $w1, $w0
+-; MIPS32-MSA-NEXT: insert.w $w1[0], $6
+-; MIPS32-MSA-NEXT: insert.w $w1[1], $7
+ ; MIPS32-MSA-NEXT: insert.w $w1[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 20($sp)
+ ; MIPS32-MSA-NEXT: insert.w $w1[3], $1
+-; MIPS32-MSA-NEXT: and.v $w1, $w2, $w1
+-; MIPS32-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS32-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS32-MSA-NEXT: and.v $w1, $w1, $w2
++; MIPS32-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS32-MSA-NEXT: copy_s.w $2, $w0[0]
+ ; MIPS32-MSA-NEXT: copy_s.w $3, $w0[1]
+ ; MIPS32-MSA-NEXT: copy_s.w $4, $w0[2]
+@@ -766,21 +712,19 @@ define <4 x i32> @test_ctselect_v4i32_args(i1 %cond, <4 x i32> %a, <4 x i32> %b)
+ define <4 x i32> @test_ctselect_v4i32_multi_use(i1 %cond, <4 x i32> %a, <4 x i32> %b) {
+ ; MIPS64-MSA-LABEL: test_ctselect_v4i32_multi_use:
+ ; MIPS64-MSA: # %bb.0:
+-; MIPS64-MSA-NEXT: insert.d $w2[0], $7
++; MIPS64-MSA-NEXT: insert.d $w0[0], $7
++; MIPS64-MSA-NEXT: insert.d $w1[0], $5
+ ; MIPS64-MSA-NEXT: sll $1, $4, 0
+-; MIPS64-MSA-NEXT: ldi.b $w0, -1
+-; MIPS64-MSA-NEXT: fill.w $w1, $1
+-; MIPS64-MSA-NEXT: insert.d $w2[1], $8
+-; MIPS64-MSA-NEXT: slli.w $w1, $w1, 31
+-; MIPS64-MSA-NEXT: srai.w $w1, $w1, 31
+-; MIPS64-MSA-NEXT: shf.w $w2, $w2, 177
+-; MIPS64-MSA-NEXT: xor.v $w0, $w1, $w0
+-; MIPS64-MSA-NEXT: and.v $w0, $w0, $w2
+-; MIPS64-MSA-NEXT: insert.d $w2[0], $5
+-; MIPS64-MSA-NEXT: insert.d $w2[1], $6
+-; MIPS64-MSA-NEXT: shf.w $w2, $w2, 177
++; MIPS64-MSA-NEXT: fill.w $w2, $1
++; MIPS64-MSA-NEXT: insert.d $w0[1], $8
++; MIPS64-MSA-NEXT: insert.d $w1[1], $6
++; MIPS64-MSA-NEXT: slli.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS64-MSA-NEXT: srai.w $w2, $w2, 31
++; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177
++; MIPS64-MSA-NEXT: shf.w $w1, $w1, 177
+ ; MIPS64-MSA-NEXT: and.v $w1, $w1, $w2
+-; MIPS64-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS64-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS64-MSA-NEXT: addv.w $w0, $w0, $w0
+ ; MIPS64-MSA-NEXT: shf.w $w0, $w0, 177
+ ; MIPS64-MSA-NEXT: copy_s.d $2, $w0[0]
+@@ -791,26 +735,24 @@ define <4 x i32> @test_ctselect_v4i32_multi_use(i1 %cond, <4 x i32> %a, <4 x i32
+ ; MIPS32-MSA: # %bb.0:
+ ; MIPS32-MSA-NEXT: lw $2, 24($sp)
+ ; MIPS32-MSA-NEXT: lw $1, 28($sp)
++; MIPS32-MSA-NEXT: insert.w $w1[0], $6
+ ; MIPS32-MSA-NEXT: fill.w $w2, $4
+-; MIPS32-MSA-NEXT: ldi.b $w1, -1
+ ; MIPS32-MSA-NEXT: insert.w $w0[0], $2
++; MIPS32-MSA-NEXT: insert.w $w1[1], $7
+ ; MIPS32-MSA-NEXT: slli.w $w2, $w2, 31
+ ; MIPS32-MSA-NEXT: srai.w $w2, $w2, 31
+ ; MIPS32-MSA-NEXT: insert.w $w0[1], $1
+ ; MIPS32-MSA-NEXT: lw $1, 32($sp)
+-; MIPS32-MSA-NEXT: xor.v $w1, $w2, $w1
+ ; MIPS32-MSA-NEXT: insert.w $w0[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 36($sp)
+ ; MIPS32-MSA-NEXT: insert.w $w0[3], $1
+ ; MIPS32-MSA-NEXT: lw $1, 16($sp)
+-; MIPS32-MSA-NEXT: and.v $w0, $w1, $w0
+-; MIPS32-MSA-NEXT: insert.w $w1[0], $6
+-; MIPS32-MSA-NEXT: insert.w $w1[1], $7
+ ; MIPS32-MSA-NEXT: insert.w $w1[2], $1
+ ; MIPS32-MSA-NEXT: lw $1, 20($sp)
+ ; MIPS32-MSA-NEXT: insert.w $w1[3], $1
+-; MIPS32-MSA-NEXT: and.v $w1, $w2, $w1
+-; MIPS32-MSA-NEXT: or.v $w0, $w1, $w0
++; MIPS32-MSA-NEXT: xor.v $w1, $w1, $w0
++; MIPS32-MSA-NEXT: and.v $w1, $w1, $w2
++; MIPS32-MSA-NEXT: xor.v $w0, $w0, $w1
+ ; MIPS32-MSA-NEXT: addv.w $w0, $w0, $w0
+ ; MIPS32-MSA-NEXT: copy_s.w $2, $w0[0]
+ ; MIPS32-MSA-NEXT: copy_s.w $3, $w0[1]
+diff --git a/llvm/test/CodeGen/Mips/ctselect-fallback.ll b/llvm/test/CodeGen/Mips/ctselect-fallback.ll
+index d89d7fc69871..6a61412367f7 100644
+--- a/llvm/test/CodeGen/Mips/ctselect-fallback.ll
++++ b/llvm/test/CodeGen/Mips/ctselect-fallback.ll
+@@ -11,7 +11,7 @@ define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) {
+ ; M32-NEXT: negu $2, $2
+ ; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: xor $2, $1, $6
++; M32-NEXT: xor $2, $6, $1
+ ;
+ ; M64-LABEL: test_ctselect_i8:
+ ; M64: # %bb.0:
+@@ -23,7 +23,7 @@ define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) {
+ ; M64-NEXT: and $1, $2, $1
+ ; M64-NEXT: sll $2, $6, 0
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: xor $2, $1, $2
++; M64-NEXT: xor $2, $2, $1
+ %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b)
+ ret i8 %result
+ }
+@@ -36,7 +36,7 @@ define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) {
+ ; M32-NEXT: negu $2, $2
+ ; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: xor $2, $1, $6
++; M32-NEXT: xor $2, $6, $1
+ ;
+ ; M64-LABEL: test_ctselect_i16:
+ ; M64: # %bb.0:
+@@ -48,7 +48,7 @@ define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) {
+ ; M64-NEXT: and $1, $2, $1
+ ; M64-NEXT: sll $2, $6, 0
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: xor $2, $1, $2
++; M64-NEXT: xor $2, $2, $1
+ %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b)
+ ret i16 %result
+ }
+@@ -56,26 +56,24 @@ define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) {
+ define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) {
+ ; M32-LABEL: test_ctselect_i32:
+ ; M32: # %bb.0:
+-; M32-NEXT: andi $1, $4, 1
+-; M32-NEXT: negu $2, $1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: and $2, $2, $5
+-; M32-NEXT: and $1, $1, $6
++; M32-NEXT: andi $2, $4, 1
++; M32-NEXT: xor $1, $5, $6
++; M32-NEXT: negu $2, $2
++; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: xor $2, $6, $1
+ ;
+ ; M64-LABEL: test_ctselect_i32:
+ ; M64: # %bb.0:
+ ; M64-NEXT: sll $1, $4, 0
+-; M64-NEXT: sll $3, $5, 0
++; M64-NEXT: xor $2, $5, $6
+ ; M64-NEXT: andi $1, $1, 1
+-; M64-NEXT: negu $2, $1
+-; M64-NEXT: addiu $1, $1, -1
+-; M64-NEXT: and $2, $2, $3
+-; M64-NEXT: sll $3, $6, 0
+-; M64-NEXT: and $1, $1, $3
++; M64-NEXT: sll $2, $2, 0
++; M64-NEXT: negu $1, $1
++; M64-NEXT: and $1, $2, $1
++; M64-NEXT: sll $2, $6, 0
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $2, $1
++; M64-NEXT: xor $2, $2, $1
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+ ret i32 %result
+ }
+@@ -88,22 +86,21 @@ define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) {
+ ; M32-NEXT: negu $3, $3
+ ; M32-NEXT: xor $2, $6, $1
+ ; M32-NEXT: and $2, $2, $3
+-; M32-NEXT: xor $2, $2, $1
++; M32-NEXT: xor $2, $1, $2
+ ; M32-NEXT: lw $1, 20($sp)
+ ; M32-NEXT: xor $4, $7, $1
+ ; M32-NEXT: and $3, $4, $3
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: xor $3, $3, $1
++; M32-NEXT: xor $3, $1, $3
+ ;
+ ; M64-LABEL: test_ctselect_i64:
+ ; M64: # %bb.0:
+-; M64-NEXT: andi $1, $4, 1
+-; M64-NEXT: dnegu $2, $1
+-; M64-NEXT: daddiu $1, $1, -1
+-; M64-NEXT: and $2, $2, $5
+-; M64-NEXT: and $1, $1, $6
++; M64-NEXT: andi $2, $4, 1
++; M64-NEXT: xor $1, $5, $6
++; M64-NEXT: dnegu $2, $2
++; M64-NEXT: and $1, $1, $2
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $2, $1
++; M64-NEXT: xor $2, $6, $1
+ %result = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b)
+ ret i64 %result
+ }
+@@ -111,23 +108,21 @@ define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) {
+ define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) {
+ ; M32-LABEL: test_ctselect_ptr:
+ ; M32: # %bb.0:
+-; M32-NEXT: andi $1, $4, 1
+-; M32-NEXT: negu $2, $1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: and $2, $2, $5
+-; M32-NEXT: and $1, $1, $6
++; M32-NEXT: andi $2, $4, 1
++; M32-NEXT: xor $1, $5, $6
++; M32-NEXT: negu $2, $2
++; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: xor $2, $6, $1
+ ;
+ ; M64-LABEL: test_ctselect_ptr:
+ ; M64: # %bb.0:
+-; M64-NEXT: andi $1, $4, 1
+-; M64-NEXT: dnegu $2, $1
+-; M64-NEXT: daddiu $1, $1, -1
+-; M64-NEXT: and $2, $2, $5
+-; M64-NEXT: and $1, $1, $6
++; M64-NEXT: andi $2, $4, 1
++; M64-NEXT: xor $1, $5, $6
++; M64-NEXT: dnegu $2, $2
++; M64-NEXT: and $1, $1, $2
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $2, $1
++; M64-NEXT: xor $2, $6, $1
+ %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b)
+ ret ptr %result
+ }
+@@ -151,13 +146,12 @@ define i32 @test_ctselect_const_false(i32 %a, i32 %b) {
+ ; M32-LABEL: test_ctselect_const_false:
+ ; M32: # %bb.0:
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $zero, $5
++; M32-NEXT: move $2, $5
+ ;
+ ; M64-LABEL: test_ctselect_const_false:
+ ; M64: # %bb.0:
+-; M64-NEXT: sll $1, $5, 0
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $zero, $1
++; M64-NEXT: sll $2, $5, 0
+ %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b)
+ ret i32 %result
+ }
+@@ -166,29 +160,27 @@ define i32 @test_ctselect_const_false(i32 %a, i32 %b) {
+ define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) {
+ ; M32-LABEL: test_ctselect_icmp_eq:
+ ; M32: # %bb.0:
+-; M32-NEXT: xor $1, $4, $5
+-; M32-NEXT: sltu $1, $zero, $1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: and $2, $1, $6
+-; M32-NEXT: not $1, $1
+-; M32-NEXT: and $1, $1, $7
++; M32-NEXT: xor $2, $4, $5
++; M32-NEXT: xor $1, $6, $7
++; M32-NEXT: sltiu $2, $2, 1
++; M32-NEXT: negu $2, $2
++; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: xor $2, $7, $1
+ ;
+ ; M64-LABEL: test_ctselect_icmp_eq:
+ ; M64: # %bb.0:
+ ; M64-NEXT: sll $1, $5, 0
+ ; M64-NEXT: sll $2, $4, 0
+-; M64-NEXT: sll $3, $7, 0
+ ; M64-NEXT: xor $1, $2, $1
+-; M64-NEXT: sll $2, $6, 0
+-; M64-NEXT: sltu $1, $zero, $1
+-; M64-NEXT: addiu $1, $1, -1
+-; M64-NEXT: and $2, $1, $2
+-; M64-NEXT: not $1, $1
+-; M64-NEXT: and $1, $1, $3
++; M64-NEXT: xor $2, $6, $7
++; M64-NEXT: sltiu $1, $1, 1
++; M64-NEXT: sll $2, $2, 0
++; M64-NEXT: negu $1, $1
++; M64-NEXT: and $1, $2, $1
++; M64-NEXT: sll $2, $7, 0
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $2, $1
++; M64-NEXT: xor $2, $2, $1
+ %cond = icmp eq i32 %x, %y
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+ ret i32 %result
+@@ -197,29 +189,27 @@ define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) {
+ define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) {
+ ; M32-LABEL: test_ctselect_icmp_ne:
+ ; M32: # %bb.0:
+-; M32-NEXT: xor $1, $4, $5
+-; M32-NEXT: sltiu $1, $1, 1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: and $2, $1, $6
+-; M32-NEXT: not $1, $1
+-; M32-NEXT: and $1, $1, $7
++; M32-NEXT: xor $2, $4, $5
++; M32-NEXT: xor $1, $6, $7
++; M32-NEXT: sltu $2, $zero, $2
++; M32-NEXT: negu $2, $2
++; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: xor $2, $7, $1
+ ;
+ ; M64-LABEL: test_ctselect_icmp_ne:
+ ; M64: # %bb.0:
+ ; M64-NEXT: sll $1, $5, 0
+ ; M64-NEXT: sll $2, $4, 0
+-; M64-NEXT: sll $3, $7, 0
+ ; M64-NEXT: xor $1, $2, $1
+-; M64-NEXT: sll $2, $6, 0
+-; M64-NEXT: sltiu $1, $1, 1
+-; M64-NEXT: addiu $1, $1, -1
+-; M64-NEXT: and $2, $1, $2
+-; M64-NEXT: not $1, $1
+-; M64-NEXT: and $1, $1, $3
++; M64-NEXT: xor $2, $6, $7
++; M64-NEXT: sltu $1, $zero, $1
++; M64-NEXT: sll $2, $2, 0
++; M64-NEXT: negu $1, $1
++; M64-NEXT: and $1, $2, $1
++; M64-NEXT: sll $2, $7, 0
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $2, $1
++; M64-NEXT: xor $2, $2, $1
+ %cond = icmp ne i32 %x, %y
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+ ret i32 %result
+@@ -228,29 +218,25 @@ define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) {
+ define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) {
+ ; M32-LABEL: test_ctselect_icmp_slt:
+ ; M32: # %bb.0:
+-; M32-NEXT: slt $1, $4, $5
+-; M32-NEXT: xori $1, $1, 1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: and $2, $1, $6
+-; M32-NEXT: not $1, $1
+-; M32-NEXT: and $1, $1, $7
++; M32-NEXT: slt $2, $4, $5
++; M32-NEXT: xor $1, $6, $7
++; M32-NEXT: negu $2, $2
++; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: xor $2, $7, $1
+ ;
+ ; M64-LABEL: test_ctselect_icmp_slt:
+ ; M64: # %bb.0:
+ ; M64-NEXT: sll $1, $5, 0
+ ; M64-NEXT: sll $2, $4, 0
+-; M64-NEXT: sll $3, $7, 0
+ ; M64-NEXT: slt $1, $2, $1
+-; M64-NEXT: sll $2, $6, 0
+-; M64-NEXT: xori $1, $1, 1
+-; M64-NEXT: addiu $1, $1, -1
+-; M64-NEXT: and $2, $1, $2
+-; M64-NEXT: not $1, $1
+-; M64-NEXT: and $1, $1, $3
++; M64-NEXT: xor $2, $6, $7
++; M64-NEXT: negu $1, $1
++; M64-NEXT: sll $2, $2, 0
++; M64-NEXT: and $1, $2, $1
++; M64-NEXT: sll $2, $7, 0
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $2, $1
++; M64-NEXT: xor $2, $2, $1
+ %cond = icmp slt i32 %x, %y
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+ ret i32 %result
+@@ -259,29 +245,25 @@ define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) {
+ define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) {
+ ; M32-LABEL: test_ctselect_icmp_ult:
+ ; M32: # %bb.0:
+-; M32-NEXT: sltu $1, $4, $5
+-; M32-NEXT: xori $1, $1, 1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: and $2, $1, $6
+-; M32-NEXT: not $1, $1
+-; M32-NEXT: and $1, $1, $7
++; M32-NEXT: sltu $2, $4, $5
++; M32-NEXT: xor $1, $6, $7
++; M32-NEXT: negu $2, $2
++; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: xor $2, $7, $1
+ ;
+ ; M64-LABEL: test_ctselect_icmp_ult:
+ ; M64: # %bb.0:
+ ; M64-NEXT: sll $1, $5, 0
+ ; M64-NEXT: sll $2, $4, 0
+-; M64-NEXT: sll $3, $7, 0
+ ; M64-NEXT: sltu $1, $2, $1
+-; M64-NEXT: sll $2, $6, 0
+-; M64-NEXT: xori $1, $1, 1
+-; M64-NEXT: addiu $1, $1, -1
+-; M64-NEXT: and $2, $1, $2
+-; M64-NEXT: not $1, $1
+-; M64-NEXT: and $1, $1, $3
++; M64-NEXT: xor $2, $6, $7
++; M64-NEXT: negu $1, $1
++; M64-NEXT: sll $2, $2, 0
++; M64-NEXT: and $1, $2, $1
++; M64-NEXT: sll $2, $7, 0
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $2, $1
++; M64-NEXT: xor $2, $2, $1
+ %cond = icmp ult i32 %x, %y
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+ ret i32 %result
+@@ -291,28 +273,26 @@ define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) {
+ define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) {
+ ; M32-LABEL: test_ctselect_load:
+ ; M32: # %bb.0:
+-; M32-NEXT: andi $1, $4, 1
++; M32-NEXT: lw $2, 0($6)
+ ; M32-NEXT: lw $3, 0($5)
+-; M32-NEXT: negu $2, $1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: and $2, $2, $3
+-; M32-NEXT: lw $3, 0($6)
+-; M32-NEXT: and $1, $1, $3
++; M32-NEXT: andi $1, $4, 1
++; M32-NEXT: negu $1, $1
++; M32-NEXT: xor $3, $3, $2
++; M32-NEXT: and $1, $3, $1
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: xor $2, $2, $1
+ ;
+ ; M64-LABEL: test_ctselect_load:
+ ; M64: # %bb.0:
+-; M64-NEXT: sll $1, $4, 0
+-; M64-NEXT: lw $3, 0($5)
+-; M64-NEXT: andi $1, $1, 1
+-; M64-NEXT: negu $2, $1
+-; M64-NEXT: addiu $1, $1, -1
++; M64-NEXT: sll $3, $4, 0
++; M64-NEXT: lw $1, 0($6)
++; M64-NEXT: lw $2, 0($5)
++; M64-NEXT: andi $3, $3, 1
++; M64-NEXT: xor $2, $2, $1
++; M64-NEXT: negu $3, $3
+ ; M64-NEXT: and $2, $2, $3
+-; M64-NEXT: lw $3, 0($6)
+-; M64-NEXT: and $1, $1, $3
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $2, $1
++; M64-NEXT: xor $2, $1, $2
+ %a = load i32, ptr %p1
+ %b = load i32, ptr %p2
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+@@ -323,41 +303,37 @@ define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) {
+ define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) {
+ ; M32-LABEL: test_ctselect_nested:
+ ; M32: # %bb.0:
+-; M32-NEXT: andi $1, $5, 1
++; M32-NEXT: andi $2, $5, 1
++; M32-NEXT: xor $1, $6, $7
+ ; M32-NEXT: andi $3, $4, 1
+-; M32-NEXT: negu $2, $1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: negu $4, $3
+-; M32-NEXT: and $2, $2, $6
+-; M32-NEXT: and $1, $1, $7
+-; M32-NEXT: or $1, $2, $1
+-; M32-NEXT: addiu $2, $3, -1
+-; M32-NEXT: lw $3, 16($sp)
+-; M32-NEXT: and $1, $4, $1
+-; M32-NEXT: and $2, $2, $3
++; M32-NEXT: negu $2, $2
++; M32-NEXT: negu $3, $3
++; M32-NEXT: and $1, $1, $2
++; M32-NEXT: lw $2, 16($sp)
++; M32-NEXT: xor $1, $7, $1
++; M32-NEXT: xor $1, $1, $2
++; M32-NEXT: and $1, $1, $3
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $1, $2
++; M32-NEXT: xor $2, $2, $1
+ ;
+ ; M64-LABEL: test_ctselect_nested:
+ ; M64: # %bb.0:
+ ; M64-NEXT: sll $1, $5, 0
+-; M64-NEXT: sll $3, $6, 0
+-; M64-NEXT: sll $4, $4, 0
++; M64-NEXT: xor $2, $6, $7
++; M64-NEXT: sll $3, $4, 0
+ ; M64-NEXT: andi $1, $1, 1
+-; M64-NEXT: andi $4, $4, 1
+-; M64-NEXT: negu $2, $1
+-; M64-NEXT: addiu $1, $1, -1
+-; M64-NEXT: negu $5, $4
+-; M64-NEXT: and $2, $2, $3
+-; M64-NEXT: sll $3, $7, 0
+-; M64-NEXT: and $1, $1, $3
+-; M64-NEXT: addiu $3, $4, -1
+-; M64-NEXT: or $1, $2, $1
++; M64-NEXT: sll $2, $2, 0
++; M64-NEXT: andi $3, $3, 1
++; M64-NEXT: negu $1, $1
++; M64-NEXT: negu $3, $3
++; M64-NEXT: and $1, $2, $1
++; M64-NEXT: sll $2, $7, 0
++; M64-NEXT: xor $1, $2, $1
+ ; M64-NEXT: sll $2, $8, 0
+-; M64-NEXT: and $1, $5, $1
+-; M64-NEXT: and $2, $3, $2
++; M64-NEXT: xor $1, $1, $2
++; M64-NEXT: and $1, $1, $3
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $1, $2
++; M64-NEXT: xor $2, $2, $1
+ %inner = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b)
+ %result = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %inner, i32 %c)
+ ret i32 %result
+diff --git a/llvm/test/CodeGen/Mips/ctselect-side-effects.ll b/llvm/test/CodeGen/Mips/ctselect-side-effects.ll
+index 6cfa07afdd51..069100e2d2a7 100644
+--- a/llvm/test/CodeGen/Mips/ctselect-side-effects.ll
++++ b/llvm/test/CodeGen/Mips/ctselect-side-effects.ll
+@@ -38,26 +38,24 @@ define i32 @test_constant_fold() {
+ define i32 @test_protected_no_branch(i1 %cond, i32 %a, i32 %b) {
+ ; M32-LABEL: test_protected_no_branch:
+ ; M32: # %bb.0:
+-; M32-NEXT: andi $1, $4, 1
+-; M32-NEXT: negu $2, $1
+-; M32-NEXT: addiu $1, $1, -1
+-; M32-NEXT: and $2, $2, $5
+-; M32-NEXT: and $1, $1, $6
++; M32-NEXT: andi $2, $4, 1
++; M32-NEXT: xor $1, $5, $6
++; M32-NEXT: negu $2, $2
++; M32-NEXT: and $1, $1, $2
+ ; M32-NEXT: jr $ra
+-; M32-NEXT: or $2, $2, $1
++; M32-NEXT: xor $2, $6, $1
+ ;
+ ; M64-LABEL: test_protected_no_branch:
+ ; M64: # %bb.0:
+ ; M64-NEXT: sll $1, $4, 0
+-; M64-NEXT: sll $3, $5, 0
++; M64-NEXT: xor $2, $5, $6
+ ; M64-NEXT: andi $1, $1, 1
+-; M64-NEXT: negu $2, $1
+-; M64-NEXT: addiu $1, $1, -1
+-; M64-NEXT: and $2, $2, $3
+-; M64-NEXT: sll $3, $6, 0
+-; M64-NEXT: and $1, $1, $3
++; M64-NEXT: sll $2, $2, 0
++; M64-NEXT: negu $1, $1
++; M64-NEXT: and $1, $2, $1
++; M64-NEXT: sll $2, $6, 0
+ ; M64-NEXT: jr $ra
+-; M64-NEXT: or $2, $2, $1
++; M64-NEXT: xor $2, $2, $1
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+ ret i32 %result
+ }
+diff --git a/llvm/test/CodeGen/RISCV/ctselect-fallback.ll b/llvm/test/CodeGen/RISCV/ctselect-fallback.ll
+index d4617c7e75da..ee8072703ee3 100644
+--- a/llvm/test/CodeGen/RISCV/ctselect-fallback.ll
++++ b/llvm/test/CodeGen/RISCV/ctselect-fallback.ll
+@@ -101,8 +101,6 @@ define i32 @test_ctselect_const_true(i32 %a, i32 %b) {
+ ;
+ ; RV32-LABEL: test_ctselect_const_true:
+ ; RV32: # %bb.0:
+-; RV32-NEXT: xor a0, a0, a1
+-; RV32-NEXT: xor a0, a1, a0
+ ; RV32-NEXT: ret
+ %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b)
+ ret i32 %result
+@@ -208,7 +206,7 @@ define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) {
+ define i32 @test_ctselect_nested_and_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
+ ; RV64-LABEL: test_ctselect_nested_and_i1_to_i32:
+ ; RV64: # %bb.0:
+-; RV64-NEXT: and a0, a1, a0
++; RV64-NEXT: and a0, a0, a1
+ ; RV64-NEXT: xor a2, a2, a3
+ ; RV64-NEXT: slli a0, a0, 63
+ ; RV64-NEXT: srai a0, a0, 63
+@@ -218,7 +216,7 @@ define i32 @test_ctselect_nested_and_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
+ ;
+ ; RV32-LABEL: test_ctselect_nested_and_i1_to_i32:
+ ; RV32: # %bb.0:
+-; RV32-NEXT: and a0, a1, a0
++; RV32-NEXT: and a0, a0, a1
+ ; RV32-NEXT: xor a2, a2, a3
+ ; RV32-NEXT: slli a0, a0, 31
+ ; RV32-NEXT: srai a0, a0, 31
+@@ -265,8 +263,8 @@ define i32 @test_ctselect_nested_or_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
+ define i32 @test_ctselect_double_nested_and_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x, i32 %y) {
+ ; RV64-LABEL: test_ctselect_double_nested_and_i1:
+ ; RV64: # %bb.0:
+-; RV64-NEXT: and a1, a2, a1
+-; RV64-NEXT: and a0, a1, a0
++; RV64-NEXT: and a0, a0, a1
++; RV64-NEXT: and a0, a0, a2
+ ; RV64-NEXT: xor a3, a3, a4
+ ; RV64-NEXT: slli a0, a0, 63
+ ; RV64-NEXT: srai a0, a0, 63
+@@ -276,8 +274,8 @@ define i32 @test_ctselect_double_nested_and_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x, i
+ ;
+ ; RV32-LABEL: test_ctselect_double_nested_and_i1:
+ ; RV32: # %bb.0:
+-; RV32-NEXT: and a1, a2, a1
+-; RV32-NEXT: and a0, a1, a0
++; RV32-NEXT: and a0, a0, a1
++; RV32-NEXT: and a0, a0, a2
+ ; RV32-NEXT: xor a3, a3, a4
+ ; RV32-NEXT: slli a0, a0, 31
+ ; RV32-NEXT: srai a0, a0, 31
+@@ -295,7 +293,7 @@ define i32 @test_ctselect_double_nested_and_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x, i
+ define i32 @test_ctselect_double_nested_mixed_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x, i32 %y, i32 %z) {
+ ; RV64-LABEL: test_ctselect_double_nested_mixed_i1:
+ ; RV64: # %bb.0:
+-; RV64-NEXT: and a0, a1, a0
++; RV64-NEXT: and a0, a0, a1
+ ; RV64-NEXT: xor a3, a3, a4
+ ; RV64-NEXT: or a0, a0, a2
+ ; RV64-NEXT: slli a0, a0, 63
+@@ -309,7 +307,7 @@ define i32 @test_ctselect_double_nested_mixed_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x,
+ ;
+ ; RV32-LABEL: test_ctselect_double_nested_mixed_i1:
+ ; RV32: # %bb.0:
+-; RV32-NEXT: and a0, a1, a0
++; RV32-NEXT: and a0, a0, a1
+ ; RV32-NEXT: xor a3, a3, a4
+ ; RV32-NEXT: or a0, a0, a2
+ ; RV32-NEXT: slli a0, a0, 31
+@@ -382,7 +380,7 @@ define float @test_ctselect_f32_nan_inf(i1 %cond) {
+ ; RV32-NEXT: srai a0, a0, 31
+ ; RV32-NEXT: and a0, a0, a1
+ ; RV32-NEXT: lui a1, 522240
+-; RV32-NEXT: xor a0, a0, a1
++; RV32-NEXT: or a0, a0, a1
+ ; RV32-NEXT: ret
+ %result = call float @llvm.ct.select.f32(i1 %cond, float 0x7FF8000000000000, float 0x7FF0000000000000)
+ ret float %result
+@@ -398,7 +396,7 @@ define double @test_ctselect_f64_nan_inf(i1 %cond) {
+ ; RV64-NEXT: and a0, a0, a1
+ ; RV64-NEXT: li a1, 2047
+ ; RV64-NEXT: slli a1, a1, 52
+-; RV64-NEXT: xor a0, a0, a1
++; RV64-NEXT: or a0, a0, a1
+ ; RV64-NEXT: ret
+ ;
+ ; RV32-LABEL: test_ctselect_f64_nan_inf:
+diff --git a/llvm/test/CodeGen/X86/ctselect.ll b/llvm/test/CodeGen/X86/ctselect.ll
+index bf65e04721df..e1abae80cef4 100644
+--- a/llvm/test/CodeGen/X86/ctselect.ll
++++ b/llvm/test/CodeGen/X86/ctselect.ll
+@@ -9,8 +9,8 @@ define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) {
+ ; X64-LABEL: test_ctselect_i8:
+ ; X64: # %bb.0:
+ ; X64-NEXT: movl %edi, %eax
+-; X64-NEXT: xorl %edx, %esi
+ ; X64-NEXT: andb $1, %al
++; X64-NEXT: xorl %edx, %esi
+ ; X64-NEXT: negb %al
+ ; X64-NEXT: andb %sil, %al
+ ; X64-NEXT: xorb %dl, %al
+@@ -20,10 +20,10 @@ define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) {
+ ; X32-LABEL: test_ctselect_i8:
+ ; X32: # %bb.0:
+ ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
++; X32-NEXT: andb $1, %al
+ ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+ ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+ ; X32-NEXT: xorb %cl, %dl
+-; X32-NEXT: andb $1, %al
+ ; X32-NEXT: negb %al
+ ; X32-NEXT: andb %dl, %al
+ ; X32-NEXT: xorb %cl, %al
+@@ -32,10 +32,10 @@ define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) {
+ ; X32-NOCMOV-LABEL: test_ctselect_i8:
+ ; X32-NOCMOV: # %bb.0:
+ ; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
++; X32-NOCMOV-NEXT: andb $1, %al
+ ; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+ ; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+ ; X32-NOCMOV-NEXT: xorb %cl, %dl
+-; X32-NOCMOV-NEXT: andb $1, %al
+ ; X32-NOCMOV-NEXT: negb %al
+ ; X32-NOCMOV-NEXT: andb %dl, %al
+ ; X32-NOCMOV-NEXT: xorb %cl, %al
+@@ -58,10 +58,11 @@ define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) {
+ ; X32-LABEL: test_ctselect_i32:
+ ; X32: # %bb.0:
+ ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
++; X32-NEXT: andb $1, %al
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+ ; X32-NEXT: xorl %ecx, %edx
+-; X32-NEXT: andl $1, %eax
++; X32-NEXT: movzbl %al, %eax
+ ; X32-NEXT: negl %eax
+ ; X32-NEXT: andl %edx, %eax
+ ; X32-NEXT: xorl %ecx, %eax
+@@ -70,10 +71,11 @@ define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) {
+ ; X32-NOCMOV-LABEL: test_ctselect_i32:
+ ; X32-NOCMOV: # %bb.0:
+ ; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
++; X32-NOCMOV-NEXT: andb $1, %al
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+ ; X32-NOCMOV-NEXT: xorl %ecx, %edx
+-; X32-NOCMOV-NEXT: andl $1, %eax
++; X32-NOCMOV-NEXT: movzbl %al, %eax
+ ; X32-NOCMOV-NEXT: negl %eax
+ ; X32-NOCMOV-NEXT: andl %edx, %eax
+ ; X32-NOCMOV-NEXT: xorl %ecx, %eax
+@@ -95,45 +97,57 @@ define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) {
+ ;
+ ; X32-LABEL: test_ctselect_i64:
+ ; X32: # %bb.0:
+-; X32-NEXT: pushl %esi
++; X32-NEXT: pushl %edi
+ ; X32-NEXT: .cfi_def_cfa_offset 8
+-; X32-NEXT: .cfi_offset %esi, -8
+-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %esi
+-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
++; X32-NEXT: pushl %esi
++; X32-NEXT: .cfi_def_cfa_offset 12
++; X32-NEXT: .cfi_offset %esi, -12
++; X32-NEXT: .cfi_offset %edi, -8
++; X32-NEXT: movzbl {{[0-9]+}}(%esp), %edx
++; X32-NEXT: andb $1, %dl
++; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+-; X32-NEXT: xorl %edx, %eax
+-; X32-NEXT: andl $1, %esi
+-; X32-NEXT: negl %esi
+-; X32-NEXT: andl %esi, %eax
+-; X32-NEXT: xorl %edx, %eax
++; X32-NEXT: xorl %esi, %eax
++; X32-NEXT: movzbl %dl, %edi
++; X32-NEXT: negl %edi
++; X32-NEXT: andl %edi, %eax
++; X32-NEXT: xorl %esi, %eax
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+ ; X32-NEXT: xorl %ecx, %edx
+-; X32-NEXT: andl %esi, %edx
++; X32-NEXT: andl %edi, %edx
+ ; X32-NEXT: xorl %ecx, %edx
+ ; X32-NEXT: popl %esi
++; X32-NEXT: .cfi_def_cfa_offset 8
++; X32-NEXT: popl %edi
+ ; X32-NEXT: .cfi_def_cfa_offset 4
+ ; X32-NEXT: retl
+ ;
+ ; X32-NOCMOV-LABEL: test_ctselect_i64:
+ ; X32-NOCMOV: # %bb.0:
+-; X32-NOCMOV-NEXT: pushl %esi
++; X32-NOCMOV-NEXT: pushl %edi
+ ; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+-; X32-NOCMOV-NEXT: .cfi_offset %esi, -8
+-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %esi
+-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
++; X32-NOCMOV-NEXT: pushl %esi
++; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
++; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
++; X32-NOCMOV-NEXT: .cfi_offset %edi, -8
++; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %edx
++; X32-NOCMOV-NEXT: andb $1, %dl
++; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+-; X32-NOCMOV-NEXT: xorl %edx, %eax
+-; X32-NOCMOV-NEXT: andl $1, %esi
+-; X32-NOCMOV-NEXT: negl %esi
+-; X32-NOCMOV-NEXT: andl %esi, %eax
+-; X32-NOCMOV-NEXT: xorl %edx, %eax
++; X32-NOCMOV-NEXT: xorl %esi, %eax
++; X32-NOCMOV-NEXT: movzbl %dl, %edi
++; X32-NOCMOV-NEXT: negl %edi
++; X32-NOCMOV-NEXT: andl %edi, %eax
++; X32-NOCMOV-NEXT: xorl %esi, %eax
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+ ; X32-NOCMOV-NEXT: xorl %ecx, %edx
+-; X32-NOCMOV-NEXT: andl %esi, %edx
++; X32-NOCMOV-NEXT: andl %edi, %edx
+ ; X32-NOCMOV-NEXT: xorl %ecx, %edx
+ ; X32-NOCMOV-NEXT: popl %esi
++; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
++; X32-NOCMOV-NEXT: popl %edi
+ ; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
+ ; X32-NOCMOV-NEXT: retl
+ %result = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b)
+@@ -155,37 +169,47 @@ define float @test_ctselect_f32(i1 %cond, float %a, float %b) {
+ ;
+ ; X32-LABEL: test_ctselect_f32:
+ ; X32: # %bb.0:
+-; X32-NEXT: pushl %eax
+-; X32-NEXT: .cfi_def_cfa_offset 8
++; X32-NEXT: subl $12, %esp
++; X32-NEXT: .cfi_def_cfa_offset 16
++; X32-NEXT: flds {{[0-9]+}}(%esp)
++; X32-NEXT: fstps {{[0-9]+}}(%esp)
++; X32-NEXT: flds {{[0-9]+}}(%esp)
++; X32-NEXT: fstps (%esp)
+ ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
++; X32-NEXT: andb $1, %al
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+-; X32-NEXT: xorl %ecx, %edx
+-; X32-NEXT: andl $1, %eax
++; X32-NEXT: movzbl %al, %eax
+ ; X32-NEXT: negl %eax
+-; X32-NEXT: andl %edx, %eax
+-; X32-NEXT: xorl %ecx, %eax
+-; X32-NEXT: movl %eax, (%esp)
+-; X32-NEXT: flds (%esp)
+-; X32-NEXT: popl %eax
++; X32-NEXT: movl (%esp), %edx
++; X32-NEXT: xorl %ecx, %edx
++; X32-NEXT: andl %eax, %edx
++; X32-NEXT: xorl %ecx, %edx
++; X32-NEXT: movl %edx, {{[0-9]+}}(%esp)
++; X32-NEXT: flds {{[0-9]+}}(%esp)
++; X32-NEXT: addl $12, %esp
+ ; X32-NEXT: .cfi_def_cfa_offset 4
+ ; X32-NEXT: retl
+ ;
+ ; X32-NOCMOV-LABEL: test_ctselect_f32:
+ ; X32-NOCMOV: # %bb.0:
+-; X32-NOCMOV-NEXT: pushl %eax
+-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
++; X32-NOCMOV-NEXT: subl $12, %esp
++; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
++; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
++; X32-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp)
++; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
++; X32-NOCMOV-NEXT: fstps (%esp)
+ ; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
++; X32-NOCMOV-NEXT: andb $1, %al
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+-; X32-NOCMOV-NEXT: xorl %ecx, %edx
+-; X32-NOCMOV-NEXT: andl $1, %eax
++; X32-NOCMOV-NEXT: movzbl %al, %eax
+ ; X32-NOCMOV-NEXT: negl %eax
+-; X32-NOCMOV-NEXT: andl %edx, %eax
+-; X32-NOCMOV-NEXT: xorl %ecx, %eax
+-; X32-NOCMOV-NEXT: movl %eax, (%esp)
+-; X32-NOCMOV-NEXT: flds (%esp)
+-; X32-NOCMOV-NEXT: popl %eax
++; X32-NOCMOV-NEXT: movl (%esp), %edx
++; X32-NOCMOV-NEXT: xorl %ecx, %edx
++; X32-NOCMOV-NEXT: andl %eax, %edx
++; X32-NOCMOV-NEXT: xorl %ecx, %edx
++; X32-NOCMOV-NEXT: movl %edx, {{[0-9]+}}(%esp)
++; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
++; X32-NOCMOV-NEXT: addl $12, %esp
+ ; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
+ ; X32-NOCMOV-NEXT: retl
+ %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b)
+@@ -281,10 +305,11 @@ define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) {
+ ; X32-LABEL: test_ctselect_ptr:
+ ; X32: # %bb.0:
+ ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
++; X32-NEXT: andb $1, %al
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+ ; X32-NEXT: xorl %ecx, %edx
+-; X32-NEXT: andl $1, %eax
++; X32-NEXT: movzbl %al, %eax
+ ; X32-NEXT: negl %eax
+ ; X32-NEXT: andl %edx, %eax
+ ; X32-NEXT: xorl %ecx, %eax
+@@ -293,10 +318,11 @@ define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) {
+ ; X32-NOCMOV-LABEL: test_ctselect_ptr:
+ ; X32-NOCMOV: # %bb.0:
+ ; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
++; X32-NOCMOV-NEXT: andb $1, %al
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+ ; X32-NOCMOV-NEXT: xorl %ecx, %edx
+-; X32-NOCMOV-NEXT: andl $1, %eax
++; X32-NOCMOV-NEXT: movzbl %al, %eax
+ ; X32-NOCMOV-NEXT: negl %eax
+ ; X32-NOCMOV-NEXT: andl %edx, %eax
+ ; X32-NOCMOV-NEXT: xorl %ecx, %eax
+@@ -310,24 +336,16 @@ define i32 @test_ctselect_const_true(i32 %a, i32 %b) {
+ ; X64-LABEL: test_ctselect_const_true:
+ ; X64: # %bb.0:
+ ; X64-NEXT: movl %edi, %eax
+-; X64-NEXT: xorl %esi, %eax
+-; X64-NEXT: xorl %esi, %eax
+ ; X64-NEXT: retq
+ ;
+ ; X32-LABEL: test_ctselect_const_true:
+ ; X32: # %bb.0:
+-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+-; X32-NEXT: xorl %ecx, %eax
+-; X32-NEXT: xorl %ecx, %eax
+ ; X32-NEXT: retl
+ ;
+ ; X32-NOCMOV-LABEL: test_ctselect_const_true:
+ ; X32-NOCMOV: # %bb.0:
+-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+-; X32-NOCMOV-NEXT: xorl %ecx, %eax
+-; X32-NOCMOV-NEXT: xorl %ecx, %eax
+ ; X32-NOCMOV-NEXT: retl
+ %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b)
+ ret i32 %result
+@@ -341,14 +359,12 @@ define i32 @test_ctselect_const_false(i32 %a, i32 %b) {
+ ;
+ ; X32-LABEL: test_ctselect_const_false:
+ ; X32: # %bb.0:
+-; X32-NEXT: xorl %eax, %eax
+-; X32-NEXT: xorl {{[0-9]+}}(%esp), %eax
++; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+ ; X32-NEXT: retl
+ ;
+ ; X32-NOCMOV-LABEL: test_ctselect_const_false:
+ ; X32-NOCMOV: # %bb.0:
+-; X32-NOCMOV-NEXT: xorl %eax, %eax
+-; X32-NOCMOV-NEXT: xorl {{[0-9]+}}(%esp), %eax
++; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+ ; X32-NOCMOV-NEXT: retl
+ %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b)
+ ret i32 %result
+@@ -443,19 +459,20 @@ define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) {
+ define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) {
+ ; X64-LABEL: test_ctselect_fcmp_oeq:
+ ; X64: # %bb.0:
+-; X64-NEXT: movd %xmm3, %eax
+ ; X64-NEXT: cmpeqss %xmm1, %xmm0
+-; X64-NEXT: pxor %xmm3, %xmm2
+-; X64-NEXT: pand %xmm0, %xmm2
+-; X64-NEXT: movd %xmm2, %ecx
+-; X64-NEXT: xorl %eax, %ecx
+-; X64-NEXT: movd %ecx, %xmm0
++; X64-NEXT: xorps %xmm3, %xmm2
++; X64-NEXT: andps %xmm2, %xmm0
++; X64-NEXT: xorps %xmm3, %xmm0
+ ; X64-NEXT: retq
+ ;
+ ; X32-LABEL: test_ctselect_fcmp_oeq:
+ ; X32: # %bb.0:
+-; X32-NEXT: pushl %eax
+-; X32-NEXT: .cfi_def_cfa_offset 8
++; X32-NEXT: subl $12, %esp
++; X32-NEXT: .cfi_def_cfa_offset 16
++; X32-NEXT: flds {{[0-9]+}}(%esp)
++; X32-NEXT: fstps {{[0-9]+}}(%esp)
++; X32-NEXT: flds {{[0-9]+}}(%esp)
++; X32-NEXT: fstps (%esp)
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+ ; X32-NEXT: flds {{[0-9]+}}(%esp)
+ ; X32-NEXT: flds {{[0-9]+}}(%esp)
+@@ -466,20 +483,24 @@ define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) {
+ ; X32-NEXT: andb %cl, %dl
+ ; X32-NEXT: movzbl %dl, %ecx
+ ; X32-NEXT: negl %ecx
+-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
++; X32-NEXT: movl (%esp), %edx
+ ; X32-NEXT: xorl %eax, %edx
+ ; X32-NEXT: andl %ecx, %edx
+ ; X32-NEXT: xorl %eax, %edx
+-; X32-NEXT: movl %edx, (%esp)
+-; X32-NEXT: flds (%esp)
+-; X32-NEXT: popl %eax
++; X32-NEXT: movl %edx, {{[0-9]+}}(%esp)
++; X32-NEXT: flds {{[0-9]+}}(%esp)
++; X32-NEXT: addl $12, %esp
+ ; X32-NEXT: .cfi_def_cfa_offset 4
+ ; X32-NEXT: retl
+ ;
+ ; X32-NOCMOV-LABEL: test_ctselect_fcmp_oeq:
+ ; X32-NOCMOV: # %bb.0:
+-; X32-NOCMOV-NEXT: pushl %eax
+-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
++; X32-NOCMOV-NEXT: subl $12, %esp
++; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
++; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
++; X32-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp)
++; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
++; X32-NOCMOV-NEXT: fstps (%esp)
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+ ; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
+ ; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
+@@ -492,13 +513,13 @@ define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) {
+ ; X32-NOCMOV-NEXT: andb %al, %dl
+ ; X32-NOCMOV-NEXT: movzbl %dl, %eax
+ ; X32-NOCMOV-NEXT: negl %eax
+-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
++; X32-NOCMOV-NEXT: movl (%esp), %edx
+ ; X32-NOCMOV-NEXT: xorl %ecx, %edx
+ ; X32-NOCMOV-NEXT: andl %eax, %edx
+ ; X32-NOCMOV-NEXT: xorl %ecx, %edx
+-; X32-NOCMOV-NEXT: movl %edx, (%esp)
+-; X32-NOCMOV-NEXT: flds (%esp)
+-; X32-NOCMOV-NEXT: popl %eax
++; X32-NOCMOV-NEXT: movl %edx, {{[0-9]+}}(%esp)
++; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
++; X32-NOCMOV-NEXT: addl $12, %esp
+ ; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
+ ; X32-NOCMOV-NEXT: retl
+ %cond = fcmp oeq float %x, %y
+@@ -522,12 +543,13 @@ define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) {
+ ; X32-LABEL: test_ctselect_load:
+ ; X32: # %bb.0:
+ ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
++; X32-NEXT: andb $1, %al
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+ ; X32-NEXT: movl (%edx), %edx
+ ; X32-NEXT: movl (%ecx), %ecx
+ ; X32-NEXT: xorl %edx, %ecx
+-; X32-NEXT: andl $1, %eax
++; X32-NEXT: movzbl %al, %eax
+ ; X32-NEXT: negl %eax
+ ; X32-NEXT: andl %ecx, %eax
+ ; X32-NEXT: xorl %edx, %eax
+@@ -536,12 +558,13 @@ define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) {
+ ; X32-NOCMOV-LABEL: test_ctselect_load:
+ ; X32-NOCMOV: # %bb.0:
+ ; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
++; X32-NOCMOV-NEXT: andb $1, %al
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+ ; X32-NOCMOV-NEXT: movl (%edx), %edx
+ ; X32-NOCMOV-NEXT: movl (%ecx), %ecx
+ ; X32-NOCMOV-NEXT: xorl %edx, %ecx
+-; X32-NOCMOV-NEXT: andl $1, %eax
++; X32-NOCMOV-NEXT: movzbl %al, %eax
+ ; X32-NOCMOV-NEXT: negl %eax
+ ; X32-NOCMOV-NEXT: andl %ecx, %eax
+ ; X32-NOCMOV-NEXT: xorl %edx, %eax
+@@ -578,17 +601,19 @@ define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) {
+ ; X32-NEXT: .cfi_offset %esi, -12
+ ; X32-NEXT: .cfi_offset %edi, -8
+ ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
++; X32-NEXT: andb $1, %al
++; X32-NEXT: movb {{[0-9]+}}(%esp), %ah
++; X32-NEXT: andb $1, %ah
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %esi
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+-; X32-NEXT: xorl %edx, %edi
+-; X32-NEXT: andl $1, %esi
+-; X32-NEXT: negl %esi
+-; X32-NEXT: andl %edi, %esi
++; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
++; X32-NEXT: xorl %edx, %esi
++; X32-NEXT: movzbl %ah, %edi
++; X32-NEXT: negl %edi
++; X32-NEXT: andl %esi, %edi
+ ; X32-NEXT: xorl %ecx, %edx
+-; X32-NEXT: xorl %esi, %edx
+-; X32-NEXT: andl $1, %eax
++; X32-NEXT: xorl %edi, %edx
++; X32-NEXT: movzbl %al, %eax
+ ; X32-NEXT: negl %eax
+ ; X32-NEXT: andl %edx, %eax
+ ; X32-NEXT: xorl %ecx, %eax
+@@ -607,17 +632,19 @@ define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) {
+ ; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+ ; X32-NOCMOV-NEXT: .cfi_offset %edi, -8
+ ; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
++; X32-NOCMOV-NEXT: andb $1, %al
++; X32-NOCMOV-NEXT: movb {{[0-9]+}}(%esp), %ah
++; X32-NOCMOV-NEXT: andb $1, %ah
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %esi
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi
+-; X32-NOCMOV-NEXT: xorl %edx, %edi
+-; X32-NOCMOV-NEXT: andl $1, %esi
+-; X32-NOCMOV-NEXT: negl %esi
+-; X32-NOCMOV-NEXT: andl %edi, %esi
++; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
++; X32-NOCMOV-NEXT: xorl %edx, %esi
++; X32-NOCMOV-NEXT: movzbl %ah, %edi
++; X32-NOCMOV-NEXT: negl %edi
++; X32-NOCMOV-NEXT: andl %esi, %edi
+ ; X32-NOCMOV-NEXT: xorl %ecx, %edx
+-; X32-NOCMOV-NEXT: xorl %esi, %edx
+-; X32-NOCMOV-NEXT: andl $1, %eax
++; X32-NOCMOV-NEXT: xorl %edi, %edx
++; X32-NOCMOV-NEXT: movzbl %al, %eax
+ ; X32-NOCMOV-NEXT: negl %eax
+ ; X32-NOCMOV-NEXT: andl %edx, %eax
+ ; X32-NOCMOV-NEXT: xorl %ecx, %eax
+@@ -651,10 +678,10 @@ define i32 @test_ctselect_nested_and_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+ ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+ ; X32-NEXT: andb {{[0-9]+}}(%esp), %al
+-; X32-NEXT: movzbl %al, %eax
++; X32-NEXT: andb $1, %al
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+ ; X32-NEXT: xorl %ecx, %edx
+-; X32-NEXT: andl $1, %eax
++; X32-NEXT: movzbl %al, %eax
+ ; X32-NEXT: negl %eax
+ ; X32-NEXT: andl %edx, %eax
+ ; X32-NEXT: xorl %ecx, %eax
+@@ -665,10 +692,10 @@ define i32 @test_ctselect_nested_and_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+ ; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+ ; X32-NOCMOV-NEXT: andb {{[0-9]+}}(%esp), %al
+-; X32-NOCMOV-NEXT: movzbl %al, %eax
++; X32-NOCMOV-NEXT: andb $1, %al
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+ ; X32-NOCMOV-NEXT: xorl %ecx, %edx
+-; X32-NOCMOV-NEXT: andl $1, %eax
++; X32-NOCMOV-NEXT: movzbl %al, %eax
+ ; X32-NOCMOV-NEXT: negl %eax
+ ; X32-NOCMOV-NEXT: andl %edx, %eax
+ ; X32-NOCMOV-NEXT: xorl %ecx, %eax
+@@ -699,10 +726,10 @@ define i32 @test_ctselect_nested_or_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+ ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+ ; X32-NEXT: orb {{[0-9]+}}(%esp), %al
+-; X32-NEXT: movzbl %al, %eax
++; X32-NEXT: andb $1, %al
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+ ; X32-NEXT: xorl %ecx, %edx
+-; X32-NEXT: andl $1, %eax
++; X32-NEXT: movzbl %al, %eax
+ ; X32-NEXT: negl %eax
+ ; X32-NEXT: andl %edx, %eax
+ ; X32-NEXT: xorl %ecx, %eax
+@@ -713,10 +740,10 @@ define i32 @test_ctselect_nested_or_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+ ; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+ ; X32-NOCMOV-NEXT: orb {{[0-9]+}}(%esp), %al
+-; X32-NOCMOV-NEXT: movzbl %al, %eax
++; X32-NOCMOV-NEXT: andb $1, %al
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+ ; X32-NOCMOV-NEXT: xorl %ecx, %edx
+-; X32-NOCMOV-NEXT: andl $1, %eax
++; X32-NOCMOV-NEXT: movzbl %al, %eax
+ ; X32-NOCMOV-NEXT: negl %eax
+ ; X32-NOCMOV-NEXT: andl %edx, %eax
+ ; X32-NOCMOV-NEXT: xorl %ecx, %eax
+@@ -735,9 +762,9 @@ define i32 @test_ctselect_nested_or_i1_to_i32(i1 %c0, i1 %c1, i32 %x, i32 %y) {
+ define i32 @test_ctselect_double_nested_and_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x, i32 %y) {
+ ; X64-LABEL: test_ctselect_double_nested_and_i1:
+ ; X64: # %bb.0:
+-; X64-NEXT: movl %esi, %eax
++; X64-NEXT: movl %edi, %eax
++; X64-NEXT: andl %esi, %eax
+ ; X64-NEXT: andl %edx, %eax
+-; X64-NEXT: andl %edi, %eax
+ ; X64-NEXT: xorl %r8d, %ecx
+ ; X64-NEXT: andl $1, %eax
+ ; X64-NEXT: negl %eax
+@@ -751,10 +778,10 @@ define i32 @test_ctselect_double_nested_and_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x, i
+ ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+ ; X32-NEXT: andb {{[0-9]+}}(%esp), %al
+ ; X32-NEXT: andb {{[0-9]+}}(%esp), %al
+-; X32-NEXT: movzbl %al, %eax
++; X32-NEXT: andb $1, %al
+ ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+ ; X32-NEXT: xorl %ecx, %edx
+-; X32-NEXT: andl $1, %eax
++; X32-NEXT: movzbl %al, %eax
+ ; X32-NEXT: negl %eax
+ ; X32-NEXT: andl %edx, %eax
+ ; X32-NEXT: xorl %ecx, %eax
+@@ -766,10 +793,10 @@ define i32 @test_ctselect_double_nested_and_i1(i1 %c0, i1 %c1, i1 %c2, i32 %x, i
+ ; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+ ; X32-NOCMOV-NEXT: andb {{[0-9]+}}(%esp), %al
+ ; X32-NOCMOV-NEXT: andb {{[0-9]+}}(%esp), %al
+-; X32-NOCMOV-NEXT: movzbl %al, %eax
++; X32-NOCMOV-NEXT: andb $1, %al
+ ; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+ ; X32-NOCMOV-NEXT: xorl %ecx, %edx
+-; X32-NOCMOV-NEXT: andl $1, %eax
++; X32-NOCMOV-NEXT: movzbl %al, %eax
+ ; X32-NOCMOV-NEXT: negl %eax
+ ; X32-NOCMOV-NEXT: andl %edx, %eax
+ ; X32-NOCMOV-NEXT: xorl %ecx, %eax
+@@ -1403,7 +1430,7 @@ define float @test_ctselect_f32_nan_inf(i1 %cond) {
+ ; X64-NEXT: andl $1, %edi
+ ; X64-NEXT: negl %edi
+ ; X64-NEXT: andl $4194304, %edi # imm = 0x400000
+-; X64-NEXT: xorl $2139095040, %edi # imm = 0x7F800000
++; X64-NEXT: orl $2139095040, %edi # imm = 0x7F800000
+ ; X64-NEXT: movd %edi, %xmm0
+ ; X64-NEXT: retq
+ ;
+@@ -1412,10 +1439,11 @@ define float @test_ctselect_f32_nan_inf(i1 %cond) {
+ ; X32-NEXT: pushl %eax
+ ; X32-NEXT: .cfi_def_cfa_offset 8
+ ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+-; X32-NEXT: andl $1, %eax
++; X32-NEXT: andb $1, %al
++; X32-NEXT: movzbl %al, %eax
+ ; X32-NEXT: negl %eax
+ ; X32-NEXT: andl $4194304, %eax # imm = 0x400000
+-; X32-NEXT: xorl $2139095040, %eax # imm = 0x7F800000
++; X32-NEXT: orl $2139095040, %eax # imm = 0x7F800000
+ ; X32-NEXT: movl %eax, (%esp)
+ ; X32-NEXT: flds (%esp)
+ ; X32-NEXT: popl %eax
+@@ -1427,10 +1455,11 @@ define float @test_ctselect_f32_nan_inf(i1 %cond) {
+ ; X32-NOCMOV-NEXT: pushl %eax
+ ; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+ ; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+-; X32-NOCMOV-NEXT: andl $1, %eax
++; X32-NOCMOV-NEXT: andb $1, %al
++; X32-NOCMOV-NEXT: movzbl %al, %eax
+ ; X32-NOCMOV-NEXT: negl %eax
+ ; X32-NOCMOV-NEXT: andl $4194304, %eax # imm = 0x400000
+-; X32-NOCMOV-NEXT: xorl $2139095040, %eax # imm = 0x7F800000
++; X32-NOCMOV-NEXT: orl $2139095040, %eax # imm = 0x7F800000
+ ; X32-NOCMOV-NEXT: movl %eax, (%esp)
+ ; X32-NOCMOV-NEXT: flds (%esp)
+ ; X32-NOCMOV-NEXT: popl %eax
+@@ -1449,7 +1478,7 @@ define double @test_ctselect_f64_nan_inf(i1 %cond) {
+ ; X64-NEXT: movabsq $2251799813685248, %rax # imm = 0x8000000000000
+ ; X64-NEXT: andq %rdi, %rax
+ ; X64-NEXT: movabsq $9218868437227405312, %rcx # imm = 0x7FF0000000000000
+-; X64-NEXT: xorq %rax, %rcx
++; X64-NEXT: orq %rax, %rcx
+ ; X64-NEXT: movq %rcx, %xmm0
+ ; X64-NEXT: retq
+ ;
>From 257f7d254671bc21d342c28d4660b6ba7a23ea56 Mon Sep 17 00:00:00 2001
From: wizardengineer <juliuswoosebert at gmail.com>
Date: Wed, 5 Nov 2025 23:56:12 -0500
Subject: [PATCH 2/2] [LLVM][X86] Add f80 support for ct.select
Add special handling for x86_fp80 types in CTSELECT lowering by splitting
them into three 32-bit chunks, performing constant-time selection on each
chunk, and reassembling the result. This fixes crashes when compiling
tests with f80 types.
Also updated ctselect.ll to match current generic fallback implementation.
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 63 ++
llvm/lib/Target/X86/X86InstrInfo.cpp | 919 +++++++++++-----------
llvm/lib/Target/X86/X86InstrInfo.h | 21 +-
llvm/test/CodeGen/X86/ctselect-i386-fp.ll | 272 +++----
4 files changed, 663 insertions(+), 612 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 401c1953323f4..7a3bb3c648fbb 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -26170,6 +26170,69 @@ SDValue X86TargetLowering::LowerCT_SELECT(SDValue Op, SelectionDAG &DAG) const {
return DAG.getBitcast(VT, CtSelect);
}
+ // Handle f80 types by splitting into three 32-bit chunks
+ if (VT == MVT::f80) {
+ SDValue Chain = DAG.getEntryNode();
+
+ // Create temporary stack slots for input f80 values
+ SDValue TrueSlot = DAG.CreateStackTemporary(MVT::f80);
+ SDValue FalseSlot = DAG.CreateStackTemporary(MVT::f80);
+
+ // Store f80 values to memory
+ SDValue StoreTrueF80 =
+ DAG.getStore(Chain, DL, TrueOp, TrueSlot, MachinePointerInfo());
+ SDValue StoreFalseF80 =
+ DAG.getStore(Chain, DL, FalseOp, FalseSlot, MachinePointerInfo());
+
+ // Load i32 parts from memory (3 chunks for 96-bit f80 storage)
+ SDValue TruePart0 =
+ DAG.getLoad(MVT::i32, DL, StoreTrueF80, TrueSlot, MachinePointerInfo());
+ SDValue TruePart1Ptr =
+ DAG.getMemBasePlusOffset(TrueSlot, TypeSize::getFixed(4), DL);
+ SDValue TruePart1 = DAG.getLoad(MVT::i32, DL, StoreTrueF80, TruePart1Ptr,
+ MachinePointerInfo());
+ SDValue TruePart2Ptr =
+ DAG.getMemBasePlusOffset(TrueSlot, TypeSize::getFixed(8), DL);
+ SDValue TruePart2 = DAG.getLoad(MVT::i32, DL, StoreTrueF80, TruePart2Ptr,
+ MachinePointerInfo());
+
+ SDValue FalsePart0 = DAG.getLoad(MVT::i32, DL, StoreFalseF80, FalseSlot,
+ MachinePointerInfo());
+ SDValue FalsePart1Ptr =
+ DAG.getMemBasePlusOffset(FalseSlot, TypeSize::getFixed(4), DL);
+ SDValue FalsePart1 = DAG.getLoad(MVT::i32, DL, StoreFalseF80, FalsePart1Ptr,
+ MachinePointerInfo());
+ SDValue FalsePart2Ptr =
+ DAG.getMemBasePlusOffset(FalseSlot, TypeSize::getFixed(8), DL);
+ SDValue FalsePart2 = DAG.getLoad(MVT::i32, DL, StoreFalseF80, FalsePart2Ptr,
+ MachinePointerInfo());
+
+ // Perform CT_SELECT on each 32-bit chunk
+ SDValue Part0Ops[] = {FalsePart0, TruePart0, CC, ProcessedCond};
+ SDValue Part0Select = DAG.getNode(X86ISD::CT_SELECT, DL, MVT::i32, Part0Ops);
+ SDValue Part1Ops[] = {FalsePart1, TruePart1, CC, ProcessedCond};
+ SDValue Part1Select = DAG.getNode(X86ISD::CT_SELECT, DL, MVT::i32, Part1Ops);
+ SDValue Part2Ops[] = {FalsePart2, TruePart2, CC, ProcessedCond};
+ SDValue Part2Select = DAG.getNode(X86ISD::CT_SELECT, DL, MVT::i32, Part2Ops);
+
+ // Create result stack slot and store the selected parts
+ SDValue ResultSlot = DAG.CreateStackTemporary(MVT::f80);
+ SDValue StorePart0 =
+ DAG.getStore(Chain, DL, Part0Select, ResultSlot, MachinePointerInfo());
+ SDValue ResPart1Ptr =
+ DAG.getMemBasePlusOffset(ResultSlot, TypeSize::getFixed(4), DL);
+ SDValue StorePart1 = DAG.getStore(StorePart0, DL, Part1Select, ResPart1Ptr,
+ MachinePointerInfo());
+ SDValue ResPart2Ptr =
+ DAG.getMemBasePlusOffset(ResultSlot, TypeSize::getFixed(8), DL);
+ SDValue StorePart2 = DAG.getStore(StorePart1, DL, Part2Select, ResPart2Ptr,
+ MachinePointerInfo());
+
+ // Load complete f80 result from memory
+ return DAG.getLoad(MVT::f80, DL, StorePart2, ResultSlot,
+ MachinePointerInfo());
+ }
+
// Create final CT_SELECT node
SDValue Ops[] = {FalseOp, TrueOp, CC, ProcessedCond};
return DAG.getNode(X86ISD::CT_SELECT, DL, Op.getValueType(), Ops,
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index d4a46048a1d20..f98501da82104 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -689,8 +689,7 @@ bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const {
.addImm(31));
} else {
// Negate to convert 1 -> 0xFFFFFFFF, 0 -> 0x00000000 (negl %eax)
- recordInstr(BuildMI(*MBB, MI, DL, get(X86::NEG32r), TmpGPR)
- .addReg(TmpGPR));
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::NEG32r), TmpGPR).addReg(TmpGPR));
}
// Broadcast to TmpX (vector mask)
@@ -847,7 +846,8 @@ bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const {
.setMIFlags(MachineInstr::MIFlag::NoMerge));
}
- assert(FirstInstr && LastInstr && "Expected at least one expanded instruction");
+ assert(FirstInstr && LastInstr &&
+ "Expected at least one expanded instruction");
auto BundleEnd = LastInstr->getIterator();
finalizeBundle(*MBB, FirstInstr->getIterator(), std::next(BundleEnd));
@@ -915,25 +915,28 @@ bool X86InstrInfo::expandCtSelectWithCMOV(MachineInstr &MI) const {
/// Expand i386-specific CT_SELECT pseudo instructions (post-RA, constant-time)
/// These internal pseudos receive a pre-materialized condition byte from the
-/// custom inserter, avoiding EFLAGS corruption issues during i64 type legalization.
+/// custom inserter, avoiding EFLAGS corruption issues during i64 type
+/// legalization.
bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const {
MachineBasicBlock *MBB = MI.getParent();
DebugLoc DL = MI.getDebugLoc();
// CT_SELECT_I386_INT_GRxxrr has operands: (outs dst, tmp_byte, tmp_mask),
// (ins src1, src2, cond_byte)
- // Note: cond_byte is pre-materialized by custom inserter, not EFLAGS-dependent
+ // Note: cond_byte is pre-materialized by custom inserter, not
+ // EFLAGS-dependent
Register DstReg = MI.getOperand(0).getReg();
Register TmpByteReg = MI.getOperand(1).getReg();
Register TmpMaskReg = MI.getOperand(2).getReg();
Register Src1Reg = MI.getOperand(3).getReg();
Register Src2Reg = MI.getOperand(4).getReg();
- Register CondByteReg = MI.getOperand(5).getReg(); // Pre-materialized condition byte
+ Register CondByteReg =
+ MI.getOperand(5).getReg(); // Pre-materialized condition byte
// Determine instruction opcodes based on register width
unsigned MovZXOp, NegOp, MovOp, AndOp, NotOp, OrOp;
if (MI.getOpcode() == X86::CT_SELECT_I386_INT_GR8rr) {
- MovZXOp = 0; // No zero-extend needed for GR8
+ MovZXOp = 0; // No zero-extend needed for GR8
NegOp = X86::NEG8r;
MovOp = X86::MOV8rr;
AndOp = X86::AND8rr;
@@ -962,8 +965,8 @@ bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const {
// Step 1: Copy pre-materialized condition byte to TmpByteReg
// This allows the bundle to work with allocated temporaries
auto I1 = BuildMI(*MBB, MI, DL, get(X86::MOV8rr), TmpByteReg)
- .addReg(CondByteReg)
- .setMIFlag(MachineInstr::MIFlag::NoMerge);
+ .addReg(CondByteReg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
auto BundleStart = I1->getIterator();
// Step 2: Zero-extend condition byte to register width (0 or 1)
@@ -974,7 +977,9 @@ bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const {
}
// Step 3: Convert condition to bitmask (NEG: 1 -> 0xFFFF..., 0 -> 0x0000...)
- Register MaskReg = (MI.getOpcode() == X86::CT_SELECT_I386_INT_GR8rr) ? TmpByteReg : TmpMaskReg;
+ Register MaskReg = (MI.getOpcode() == X86::CT_SELECT_I386_INT_GR8rr)
+ ? TmpByteReg
+ : TmpMaskReg;
BuildMI(*MBB, MI, DL, get(NegOp), MaskReg)
.addReg(MaskReg)
.setMIFlag(MachineInstr::MIFlag::NoMerge);
@@ -1002,9 +1007,9 @@ bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const {
// Step 8: Final result: (src1 & mask) | (src2 & ~mask)
auto LI = BuildMI(*MBB, MI, DL, get(OrOp), DstReg)
- .addReg(DstReg)
- .addReg(MaskReg)
- .setMIFlag(MachineInstr::MIFlag::NoMerge);
+ .addReg(DstReg)
+ .addReg(MaskReg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
// Bundle all generated instructions for atomic execution before removing MI
auto BundleEnd = std::next(LI->getIterator());
@@ -1013,11 +1018,12 @@ bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const {
finalizeBundle(*MBB, BundleStart, BundleEnd);
}
- // TODO: Optimization opportunity - The register allocator may choose callee-saved
- // registers (e.g., %ebx, %esi) for TmpByteReg/TmpMaskReg, causing unnecessary
- // save/restore overhead. Consider constraining these to caller-saved register
- // classes (e.g., GR8_AL, GR32_CallSaved) in the TableGen definitions to improve
- // constant-time performance by eliminating prologue/epilogue instructions.
+ // TODO: Optimization opportunity - The register allocator may choose
+ // callee-saved registers (e.g., %ebx, %esi) for TmpByteReg/TmpMaskReg,
+ // causing unnecessary save/restore overhead. Consider constraining these to
+ // caller-saved register classes (e.g., GR8_AL, GR32_CallSaved) in the
+ // TableGen definitions to improve constant-time performance by eliminating
+ // prologue/epilogue instructions.
// Remove the original pseudo instruction
MI.eraseFromParent();
@@ -1305,8 +1311,7 @@ static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) {
return isPICBase;
}
-bool X86InstrInfo::isReMaterializableImpl(
- const MachineInstr &MI) const {
+bool X86InstrInfo::isReMaterializableImpl(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
default:
// This function should only be called for opcodes with the ReMaterializable
@@ -1823,32 +1828,32 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
switch (MIOpc) {
default:
llvm_unreachable("Unreachable!");
- CASE_NF(SHL8ri)
- CASE_NF(SHL16ri) {
- unsigned ShAmt = MI.getOperand(2).getImm();
- MIB.addReg(0)
- .addImm(1LL << ShAmt)
- .addReg(InRegLEA, RegState::Kill)
- .addImm(0)
- .addReg(0);
- break;
- }
- CASE_NF(INC8r)
- CASE_NF(INC16r)
+ CASE_NF(SHL8ri)
+ CASE_NF(SHL16ri) {
+ unsigned ShAmt = MI.getOperand(2).getImm();
+ MIB.addReg(0)
+ .addImm(1LL << ShAmt)
+ .addReg(InRegLEA, RegState::Kill)
+ .addImm(0)
+ .addReg(0);
+ break;
+ }
+ CASE_NF(INC8r)
+ CASE_NF(INC16r)
addRegOffset(MIB, InRegLEA, true, 1);
break;
- CASE_NF(DEC8r)
- CASE_NF(DEC16r)
+ CASE_NF(DEC8r)
+ CASE_NF(DEC16r)
addRegOffset(MIB, InRegLEA, true, -1);
break;
- CASE_NF(ADD8ri)
- CASE_NF(ADD16ri)
+ CASE_NF(ADD8ri)
+ CASE_NF(ADD16ri)
case X86::ADD8ri_DB:
case X86::ADD16ri_DB:
addRegOffset(MIB, InRegLEA, true, MI.getOperand(2).getImm());
break;
- CASE_NF(ADD8rr)
- CASE_NF(ADD16rr)
+ CASE_NF(ADD8rr)
+ CASE_NF(ADD16rr)
case X86::ADD8rr_DB:
case X86::ADD16rr_DB: {
Src2 = MI.getOperand(2).getReg();
@@ -1986,128 +1991,129 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI,
switch (MIOpc) {
default:
llvm_unreachable("Unreachable!");
- CASE_NF(SHL64ri) {
- assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
- unsigned ShAmt = getTruncatedShiftCount(MI, 2);
- if (!isTruncatedShiftCountForLEA(ShAmt))
- return nullptr;
-
- // LEA can't handle RSP.
- if (Src.getReg().isVirtual() && !MF.getRegInfo().constrainRegClass(
- Src.getReg(), &X86::GR64_NOSPRegClass))
- return nullptr;
+ CASE_NF(SHL64ri) {
+ assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
+ unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+ if (!isTruncatedShiftCountForLEA(ShAmt))
+ return nullptr;
- NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
- .add(Dest)
- .addReg(0)
- .addImm(1LL << ShAmt)
- .add(Src)
- .addImm(0)
- .addReg(0);
- break;
- }
- CASE_NF(SHL32ri) {
- assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
- unsigned ShAmt = getTruncatedShiftCount(MI, 2);
- if (!isTruncatedShiftCountForLEA(ShAmt))
- return nullptr;
+ // LEA can't handle RSP.
+ if (Src.getReg().isVirtual() &&
+ !MF.getRegInfo().constrainRegClass(Src.getReg(),
+ &X86::GR64_NOSPRegClass))
+ return nullptr;
- unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
+ NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
+ .add(Dest)
+ .addReg(0)
+ .addImm(1LL << ShAmt)
+ .add(Src)
+ .addImm(0)
+ .addReg(0);
+ break;
+ }
+ CASE_NF(SHL32ri) {
+ assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
+ unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+ if (!isTruncatedShiftCountForLEA(ShAmt))
+ return nullptr;
- // LEA can't handle ESP.
- bool isKill;
- MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
- if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
- isKill, ImplicitOp, LV, LIS))
- return nullptr;
+ unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
- MachineInstrBuilder MIB =
- BuildMI(MF, MI.getDebugLoc(), get(Opc))
- .add(Dest)
- .addReg(0)
- .addImm(1LL << ShAmt)
- .addReg(SrcReg, getKillRegState(isKill), SrcSubReg)
- .addImm(0)
- .addReg(0);
- if (ImplicitOp.getReg() != 0)
- MIB.add(ImplicitOp);
- NewMI = MIB;
+ // LEA can't handle ESP.
+ bool isKill;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
+ isKill, ImplicitOp, LV, LIS))
+ return nullptr;
- // Add kills if classifyLEAReg created a new register.
- if (LV && SrcReg != Src.getReg())
- LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
- break;
- }
- CASE_NF(SHL8ri)
+ MachineInstrBuilder MIB =
+ BuildMI(MF, MI.getDebugLoc(), get(Opc))
+ .add(Dest)
+ .addReg(0)
+ .addImm(1LL << ShAmt)
+ .addReg(SrcReg, getKillRegState(isKill), SrcSubReg)
+ .addImm(0)
+ .addReg(0);
+ if (ImplicitOp.getReg() != 0)
+ MIB.add(ImplicitOp);
+ NewMI = MIB;
+
+ // Add kills if classifyLEAReg created a new register.
+ if (LV && SrcReg != Src.getReg())
+ LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
+ break;
+ }
+ CASE_NF(SHL8ri)
Is8BitOp = true;
[[fallthrough]];
- CASE_NF(SHL16ri) {
- assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
- unsigned ShAmt = getTruncatedShiftCount(MI, 2);
- if (!isTruncatedShiftCountForLEA(ShAmt))
- return nullptr;
- return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
- }
- CASE_NF(INC64r)
- CASE_NF(INC32r) {
- assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
- unsigned Opc = (MIOpc == X86::INC64r || MIOpc == X86::INC64r_NF)
- ? X86::LEA64r
- : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
- bool isKill;
- MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
- if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
- isKill, ImplicitOp, LV, LIS))
- return nullptr;
-
- MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
- .add(Dest)
- .addReg(SrcReg, getKillRegState(isKill));
- if (ImplicitOp.getReg() != 0)
- MIB.add(ImplicitOp);
+ CASE_NF(SHL16ri) {
+ assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
+ unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+ if (!isTruncatedShiftCountForLEA(ShAmt))
+ return nullptr;
+ return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
+ }
+ CASE_NF(INC64r)
+ CASE_NF(INC32r) {
+ assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
+ unsigned Opc = (MIOpc == X86::INC64r || MIOpc == X86::INC64r_NF)
+ ? X86::LEA64r
+ : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
+ bool isKill;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
+ isKill, ImplicitOp, LV, LIS))
+ return nullptr;
- NewMI = addOffset(MIB, 1);
+ MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
+ .add(Dest)
+ .addReg(SrcReg, getKillRegState(isKill));
+ if (ImplicitOp.getReg() != 0)
+ MIB.add(ImplicitOp);
- // Add kills if classifyLEAReg created a new register.
- if (LV && SrcReg != Src.getReg())
- LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
- break;
- }
- CASE_NF(DEC64r)
- CASE_NF(DEC32r) {
- assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
- unsigned Opc = (MIOpc == X86::DEC64r || MIOpc == X86::DEC64r_NF)
- ? X86::LEA64r
- : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
+ NewMI = addOffset(MIB, 1);
- bool isKill;
- MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
- if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
- isKill, ImplicitOp, LV, LIS))
- return nullptr;
+ // Add kills if classifyLEAReg created a new register.
+ if (LV && SrcReg != Src.getReg())
+ LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
+ break;
+ }
+ CASE_NF(DEC64r)
+ CASE_NF(DEC32r) {
+ assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
+ unsigned Opc = (MIOpc == X86::DEC64r || MIOpc == X86::DEC64r_NF)
+ ? X86::LEA64r
+ : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
+
+ bool isKill;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
+ isKill, ImplicitOp, LV, LIS))
+ return nullptr;
- MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
- .add(Dest)
- .addReg(SrcReg, getKillRegState(isKill));
- if (ImplicitOp.getReg() != 0)
- MIB.add(ImplicitOp);
+ MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
+ .add(Dest)
+ .addReg(SrcReg, getKillRegState(isKill));
+ if (ImplicitOp.getReg() != 0)
+ MIB.add(ImplicitOp);
- NewMI = addOffset(MIB, -1);
+ NewMI = addOffset(MIB, -1);
- // Add kills if classifyLEAReg created a new register.
- if (LV && SrcReg != Src.getReg())
- LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
- break;
- }
- CASE_NF(DEC8r)
- CASE_NF(INC8r)
+ // Add kills if classifyLEAReg created a new register.
+ if (LV && SrcReg != Src.getReg())
+ LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
+ break;
+ }
+ CASE_NF(DEC8r)
+ CASE_NF(INC8r)
Is8BitOp = true;
[[fallthrough]];
- CASE_NF(DEC16r)
- CASE_NF(INC16r)
+ CASE_NF(DEC16r)
+ CASE_NF(INC16r)
return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
- CASE_NF(ADD64rr)
- CASE_NF(ADD32rr)
+ CASE_NF(ADD64rr)
+ CASE_NF(ADD32rr)
case X86::ADD64rr_DB:
case X86::ADD32rr_DB: {
assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
@@ -2158,21 +2164,21 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI,
NumRegOperands = 3;
break;
}
- CASE_NF(ADD8rr)
+ CASE_NF(ADD8rr)
case X86::ADD8rr_DB:
Is8BitOp = true;
[[fallthrough]];
- CASE_NF(ADD16rr)
+ CASE_NF(ADD16rr)
case X86::ADD16rr_DB:
return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
- CASE_NF(ADD64ri32)
+ CASE_NF(ADD64ri32)
case X86::ADD64ri32_DB:
assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
NewMI = addOffset(
BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src),
MI.getOperand(2));
break;
- CASE_NF(ADD32ri)
+ CASE_NF(ADD32ri)
case X86::ADD32ri_DB: {
assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
@@ -2197,62 +2203,62 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI,
LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
break;
}
- CASE_NF(ADD8ri)
+ CASE_NF(ADD8ri)
case X86::ADD8ri_DB:
Is8BitOp = true;
[[fallthrough]];
- CASE_NF(ADD16ri)
+ CASE_NF(ADD16ri)
case X86::ADD16ri_DB:
return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
- CASE_NF(SUB8ri)
- CASE_NF(SUB16ri)
+ CASE_NF(SUB8ri)
+ CASE_NF(SUB16ri)
/// FIXME: Support these similar to ADD8ri/ADD16ri*.
return nullptr;
- CASE_NF(SUB32ri) {
- if (!MI.getOperand(2).isImm())
- return nullptr;
- int64_t Imm = MI.getOperand(2).getImm();
- if (!isInt<32>(-Imm))
- return nullptr;
+ CASE_NF(SUB32ri) {
+ if (!MI.getOperand(2).isImm())
+ return nullptr;
+ int64_t Imm = MI.getOperand(2).getImm();
+ if (!isInt<32>(-Imm))
+ return nullptr;
- assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
- unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
+ assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
+ unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
- bool isKill;
- MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
- if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, SrcSubReg,
- isKill, ImplicitOp, LV, LIS))
- return nullptr;
+ bool isKill;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, SrcSubReg,
+ isKill, ImplicitOp, LV, LIS))
+ return nullptr;
- MachineInstrBuilder MIB =
- BuildMI(MF, MI.getDebugLoc(), get(Opc))
- .add(Dest)
- .addReg(SrcReg, getKillRegState(isKill), SrcSubReg);
- if (ImplicitOp.getReg() != 0)
- MIB.add(ImplicitOp);
+ MachineInstrBuilder MIB =
+ BuildMI(MF, MI.getDebugLoc(), get(Opc))
+ .add(Dest)
+ .addReg(SrcReg, getKillRegState(isKill), SrcSubReg);
+ if (ImplicitOp.getReg() != 0)
+ MIB.add(ImplicitOp);
- NewMI = addOffset(MIB, -Imm);
+ NewMI = addOffset(MIB, -Imm);
- // Add kills if classifyLEAReg created a new register.
- if (LV && SrcReg != Src.getReg())
- LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
- break;
- }
+ // Add kills if classifyLEAReg created a new register.
+ if (LV && SrcReg != Src.getReg())
+ LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
+ break;
+ }
- CASE_NF(SUB64ri32) {
- if (!MI.getOperand(2).isImm())
- return nullptr;
- int64_t Imm = MI.getOperand(2).getImm();
- if (!isInt<32>(-Imm))
- return nullptr;
+ CASE_NF(SUB64ri32) {
+ if (!MI.getOperand(2).isImm())
+ return nullptr;
+ int64_t Imm = MI.getOperand(2).getImm();
+ if (!isInt<32>(-Imm))
+ return nullptr;
- assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!");
+ assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!");
- MachineInstrBuilder MIB =
- BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src);
- NewMI = addOffset(MIB, -Imm);
- break;
- }
+ MachineInstrBuilder MIB =
+ BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src);
+ NewMI = addOffset(MIB, -Imm);
+ break;
+ }
case X86::VMOVDQU8Z128rmk:
case X86::VMOVDQU8Z256rmk:
@@ -2852,17 +2858,17 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
case X86::OP##_ND:
switch (Opc) {
- // SHLD B, C, I <-> SHRD C, B, (BitWidth - I)
- CASE_ND(SHRD16rri8)
- CASE_ND(SHLD16rri8)
- CASE_ND(SHRD32rri8)
- CASE_ND(SHLD32rri8)
- CASE_ND(SHRD64rri8)
- CASE_ND(SHLD64rri8) {
- unsigned Size;
- switch (Opc) {
- default:
- llvm_unreachable("Unreachable!");
+ // SHLD B, C, I <-> SHRD C, B, (BitWidth - I)
+ CASE_ND(SHRD16rri8)
+ CASE_ND(SHLD16rri8)
+ CASE_ND(SHRD32rri8)
+ CASE_ND(SHLD32rri8)
+ CASE_ND(SHRD64rri8)
+ CASE_ND(SHLD64rri8) {
+ unsigned Size;
+ switch (Opc) {
+ default:
+ llvm_unreachable("Unreachable!");
#define FROM_TO_SIZE(A, B, S) \
case X86::A: \
Opc = X86::B; \
@@ -2881,16 +2887,16 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
Size = S; \
break;
- FROM_TO_SIZE(SHRD16rri8, SHLD16rri8, 16)
- FROM_TO_SIZE(SHRD32rri8, SHLD32rri8, 32)
- FROM_TO_SIZE(SHRD64rri8, SHLD64rri8, 64)
+ FROM_TO_SIZE(SHRD16rri8, SHLD16rri8, 16)
+ FROM_TO_SIZE(SHRD32rri8, SHLD32rri8, 32)
+ FROM_TO_SIZE(SHRD64rri8, SHLD64rri8, 64)
#undef FROM_TO_SIZE
+ }
+ WorkingMI = CloneIfNew(MI);
+ WorkingMI->setDesc(get(Opc));
+ WorkingMI->getOperand(3).setImm(Size - MI.getOperand(3).getImm());
+ break;
}
- WorkingMI = CloneIfNew(MI);
- WorkingMI->setDesc(get(Opc));
- WorkingMI->getOperand(3).setImm(Size - MI.getOperand(3).getImm());
- break;
- }
case X86::PFSUBrr:
case X86::PFSUBRrr:
// PFSUB x, y: x = x - y
@@ -3174,15 +3180,16 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
WorkingMI = CloneIfNew(MI);
WorkingMI->setDesc(get(Opc));
break;
- CASE_ND(CMOV16rr)
- CASE_ND(CMOV32rr)
- CASE_ND(CMOV64rr) {
- WorkingMI = CloneIfNew(MI);
- unsigned OpNo = MI.getDesc().getNumOperands() - 1;
- X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm());
- WorkingMI->getOperand(OpNo).setImm(X86::GetOppositeBranchCondition(CC));
- break;
- }
+ CASE_ND(CMOV16rr)
+ CASE_ND(CMOV32rr)
+ CASE_ND(CMOV64rr) {
+ WorkingMI = CloneIfNew(MI);
+ unsigned OpNo = MI.getDesc().getNumOperands() - 1;
+ X86::CondCode CC =
+ static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm());
+ WorkingMI->getOperand(OpNo).setImm(X86::GetOppositeBranchCondition(CC));
+ break;
+ }
case X86::VPTERNLOGDZrri:
case X86::VPTERNLOGDZrmi:
case X86::VPTERNLOGDZ128rri:
@@ -5391,29 +5398,29 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
CmpMask = CmpValue = 0;
}
return true;
- // A SUB can be used to perform comparison.
- CASE_ND(SUB64rm)
- CASE_ND(SUB32rm)
- CASE_ND(SUB16rm)
- CASE_ND(SUB8rm)
+ // A SUB can be used to perform comparison.
+ CASE_ND(SUB64rm)
+ CASE_ND(SUB32rm)
+ CASE_ND(SUB16rm)
+ CASE_ND(SUB8rm)
SrcReg = MI.getOperand(1).getReg();
SrcReg2 = 0;
CmpMask = 0;
CmpValue = 0;
return true;
- CASE_ND(SUB64rr)
- CASE_ND(SUB32rr)
- CASE_ND(SUB16rr)
- CASE_ND(SUB8rr)
+ CASE_ND(SUB64rr)
+ CASE_ND(SUB32rr)
+ CASE_ND(SUB16rr)
+ CASE_ND(SUB8rr)
SrcReg = MI.getOperand(1).getReg();
SrcReg2 = MI.getOperand(2).getReg();
CmpMask = 0;
CmpValue = 0;
return true;
- CASE_ND(SUB64ri32)
- CASE_ND(SUB32ri)
- CASE_ND(SUB16ri)
- CASE_ND(SUB8ri)
+ CASE_ND(SUB64ri32)
+ CASE_ND(SUB32ri)
+ CASE_ND(SUB16ri)
+ CASE_ND(SUB8ri)
SrcReg = MI.getOperand(1).getReg();
SrcReg2 = 0;
if (MI.getOperand(2).isImm()) {
@@ -5468,27 +5475,27 @@ bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI,
case X86::CMP32rr:
case X86::CMP16rr:
case X86::CMP8rr:
- CASE_ND(SUB64rr)
- CASE_ND(SUB32rr)
- CASE_ND(SUB16rr)
- CASE_ND(SUB8rr) {
- Register OISrcReg;
- Register OISrcReg2;
- int64_t OIMask;
- int64_t OIValue;
- if (!analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) ||
- OIMask != ImmMask || OIValue != ImmValue)
+ CASE_ND(SUB64rr)
+ CASE_ND(SUB32rr)
+ CASE_ND(SUB16rr)
+ CASE_ND(SUB8rr) {
+ Register OISrcReg;
+ Register OISrcReg2;
+ int64_t OIMask;
+ int64_t OIValue;
+ if (!analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) ||
+ OIMask != ImmMask || OIValue != ImmValue)
+ return false;
+ if (SrcReg == OISrcReg && SrcReg2 == OISrcReg2) {
+ *IsSwapped = false;
+ return true;
+ }
+ if (SrcReg == OISrcReg2 && SrcReg2 == OISrcReg) {
+ *IsSwapped = true;
+ return true;
+ }
return false;
- if (SrcReg == OISrcReg && SrcReg2 == OISrcReg2) {
- *IsSwapped = false;
- return true;
}
- if (SrcReg == OISrcReg2 && SrcReg2 == OISrcReg) {
- *IsSwapped = true;
- return true;
- }
- return false;
- }
case X86::CMP64ri32:
case X86::CMP32ri:
case X86::CMP16ri:
@@ -5497,10 +5504,10 @@ bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI,
case X86::TEST32ri:
case X86::TEST16ri:
case X86::TEST8ri:
- CASE_ND(SUB64ri32)
- CASE_ND(SUB32ri)
- CASE_ND(SUB16ri)
- CASE_ND(SUB8ri)
+ CASE_ND(SUB64ri32)
+ CASE_ND(SUB32ri)
+ CASE_ND(SUB16ri)
+ CASE_ND(SUB8ri)
case X86::TEST64rr:
case X86::TEST32rr:
case X86::TEST16rr:
@@ -5557,98 +5564,98 @@ inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag,
default:
return false;
- // The shift instructions only modify ZF if their shift count is non-zero.
- // N.B.: The processor truncates the shift count depending on the encoding.
- CASE_ND(SAR8ri)
- CASE_ND(SAR16ri)
- CASE_ND(SAR32ri)
- CASE_ND(SAR64ri)
- CASE_ND(SHR8ri)
- CASE_ND(SHR16ri)
- CASE_ND(SHR32ri)
- CASE_ND(SHR64ri)
+ // The shift instructions only modify ZF if their shift count is non-zero.
+ // N.B.: The processor truncates the shift count depending on the encoding.
+ CASE_ND(SAR8ri)
+ CASE_ND(SAR16ri)
+ CASE_ND(SAR32ri)
+ CASE_ND(SAR64ri)
+ CASE_ND(SHR8ri)
+ CASE_ND(SHR16ri)
+ CASE_ND(SHR32ri)
+ CASE_ND(SHR64ri)
return getTruncatedShiftCount(MI, 2) != 0;
- // Some left shift instructions can be turned into LEA instructions but only
- // if their flags aren't used. Avoid transforming such instructions.
- CASE_ND(SHL8ri)
- CASE_ND(SHL16ri)
- CASE_ND(SHL32ri)
- CASE_ND(SHL64ri) {
- unsigned ShAmt = getTruncatedShiftCount(MI, 2);
- if (isTruncatedShiftCountForLEA(ShAmt))
- return false;
- return ShAmt != 0;
- }
+ // Some left shift instructions can be turned into LEA instructions but only
+ // if their flags aren't used. Avoid transforming such instructions.
+ CASE_ND(SHL8ri)
+ CASE_ND(SHL16ri)
+ CASE_ND(SHL32ri)
+ CASE_ND(SHL64ri) {
+ unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+ if (isTruncatedShiftCountForLEA(ShAmt))
+ return false;
+ return ShAmt != 0;
+ }
- CASE_ND(SHRD16rri8)
- CASE_ND(SHRD32rri8)
- CASE_ND(SHRD64rri8)
- CASE_ND(SHLD16rri8)
- CASE_ND(SHLD32rri8)
- CASE_ND(SHLD64rri8)
+ CASE_ND(SHRD16rri8)
+ CASE_ND(SHRD32rri8)
+ CASE_ND(SHRD64rri8)
+ CASE_ND(SHLD16rri8)
+ CASE_ND(SHLD32rri8)
+ CASE_ND(SHLD64rri8)
return getTruncatedShiftCount(MI, 3) != 0;
- CASE_ND(SUB64ri32)
- CASE_ND(SUB32ri)
- CASE_ND(SUB16ri)
- CASE_ND(SUB8ri)
- CASE_ND(SUB64rr)
- CASE_ND(SUB32rr)
- CASE_ND(SUB16rr)
- CASE_ND(SUB8rr)
- CASE_ND(SUB64rm)
- CASE_ND(SUB32rm)
- CASE_ND(SUB16rm)
- CASE_ND(SUB8rm)
- CASE_ND(DEC64r)
- CASE_ND(DEC32r)
- CASE_ND(DEC16r)
- CASE_ND(DEC8r)
- CASE_ND(ADD64ri32)
- CASE_ND(ADD32ri)
- CASE_ND(ADD16ri)
- CASE_ND(ADD8ri)
- CASE_ND(ADD64rr)
- CASE_ND(ADD32rr)
- CASE_ND(ADD16rr)
- CASE_ND(ADD8rr)
- CASE_ND(ADD64rm)
- CASE_ND(ADD32rm)
- CASE_ND(ADD16rm)
- CASE_ND(ADD8rm)
- CASE_ND(INC64r)
- CASE_ND(INC32r)
- CASE_ND(INC16r)
- CASE_ND(INC8r)
- CASE_ND(ADC64ri32)
- CASE_ND(ADC32ri)
- CASE_ND(ADC16ri)
- CASE_ND(ADC8ri)
- CASE_ND(ADC64rr)
- CASE_ND(ADC32rr)
- CASE_ND(ADC16rr)
- CASE_ND(ADC8rr)
- CASE_ND(ADC64rm)
- CASE_ND(ADC32rm)
- CASE_ND(ADC16rm)
- CASE_ND(ADC8rm)
- CASE_ND(SBB64ri32)
- CASE_ND(SBB32ri)
- CASE_ND(SBB16ri)
- CASE_ND(SBB8ri)
- CASE_ND(SBB64rr)
- CASE_ND(SBB32rr)
- CASE_ND(SBB16rr)
- CASE_ND(SBB8rr)
- CASE_ND(SBB64rm)
- CASE_ND(SBB32rm)
- CASE_ND(SBB16rm)
- CASE_ND(SBB8rm)
- CASE_ND(NEG8r)
- CASE_ND(NEG16r)
- CASE_ND(NEG32r)
- CASE_ND(NEG64r)
+ CASE_ND(SUB64ri32)
+ CASE_ND(SUB32ri)
+ CASE_ND(SUB16ri)
+ CASE_ND(SUB8ri)
+ CASE_ND(SUB64rr)
+ CASE_ND(SUB32rr)
+ CASE_ND(SUB16rr)
+ CASE_ND(SUB8rr)
+ CASE_ND(SUB64rm)
+ CASE_ND(SUB32rm)
+ CASE_ND(SUB16rm)
+ CASE_ND(SUB8rm)
+ CASE_ND(DEC64r)
+ CASE_ND(DEC32r)
+ CASE_ND(DEC16r)
+ CASE_ND(DEC8r)
+ CASE_ND(ADD64ri32)
+ CASE_ND(ADD32ri)
+ CASE_ND(ADD16ri)
+ CASE_ND(ADD8ri)
+ CASE_ND(ADD64rr)
+ CASE_ND(ADD32rr)
+ CASE_ND(ADD16rr)
+ CASE_ND(ADD8rr)
+ CASE_ND(ADD64rm)
+ CASE_ND(ADD32rm)
+ CASE_ND(ADD16rm)
+ CASE_ND(ADD8rm)
+ CASE_ND(INC64r)
+ CASE_ND(INC32r)
+ CASE_ND(INC16r)
+ CASE_ND(INC8r)
+ CASE_ND(ADC64ri32)
+ CASE_ND(ADC32ri)
+ CASE_ND(ADC16ri)
+ CASE_ND(ADC8ri)
+ CASE_ND(ADC64rr)
+ CASE_ND(ADC32rr)
+ CASE_ND(ADC16rr)
+ CASE_ND(ADC8rr)
+ CASE_ND(ADC64rm)
+ CASE_ND(ADC32rm)
+ CASE_ND(ADC16rm)
+ CASE_ND(ADC8rm)
+ CASE_ND(SBB64ri32)
+ CASE_ND(SBB32ri)
+ CASE_ND(SBB16ri)
+ CASE_ND(SBB8ri)
+ CASE_ND(SBB64rr)
+ CASE_ND(SBB32rr)
+ CASE_ND(SBB16rr)
+ CASE_ND(SBB8rr)
+ CASE_ND(SBB64rm)
+ CASE_ND(SBB32rm)
+ CASE_ND(SBB16rm)
+ CASE_ND(SBB8rm)
+ CASE_ND(NEG8r)
+ CASE_ND(NEG16r)
+ CASE_ND(NEG32r)
+ CASE_ND(NEG64r)
case X86::LZCNT16rr:
case X86::LZCNT16rm:
case X86::LZCNT32rr:
@@ -5668,42 +5675,42 @@ inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag,
case X86::TZCNT64rr:
case X86::TZCNT64rm:
return true;
- CASE_ND(AND64ri32)
- CASE_ND(AND32ri)
- CASE_ND(AND16ri)
- CASE_ND(AND8ri)
- CASE_ND(AND64rr)
- CASE_ND(AND32rr)
- CASE_ND(AND16rr)
- CASE_ND(AND8rr)
- CASE_ND(AND64rm)
- CASE_ND(AND32rm)
- CASE_ND(AND16rm)
- CASE_ND(AND8rm)
- CASE_ND(XOR64ri32)
- CASE_ND(XOR32ri)
- CASE_ND(XOR16ri)
- CASE_ND(XOR8ri)
- CASE_ND(XOR64rr)
- CASE_ND(XOR32rr)
- CASE_ND(XOR16rr)
- CASE_ND(XOR8rr)
- CASE_ND(XOR64rm)
- CASE_ND(XOR32rm)
- CASE_ND(XOR16rm)
- CASE_ND(XOR8rm)
- CASE_ND(OR64ri32)
- CASE_ND(OR32ri)
- CASE_ND(OR16ri)
- CASE_ND(OR8ri)
- CASE_ND(OR64rr)
- CASE_ND(OR32rr)
- CASE_ND(OR16rr)
- CASE_ND(OR8rr)
- CASE_ND(OR64rm)
- CASE_ND(OR32rm)
- CASE_ND(OR16rm)
- CASE_ND(OR8rm)
+ CASE_ND(AND64ri32)
+ CASE_ND(AND32ri)
+ CASE_ND(AND16ri)
+ CASE_ND(AND8ri)
+ CASE_ND(AND64rr)
+ CASE_ND(AND32rr)
+ CASE_ND(AND16rr)
+ CASE_ND(AND8rr)
+ CASE_ND(AND64rm)
+ CASE_ND(AND32rm)
+ CASE_ND(AND16rm)
+ CASE_ND(AND8rm)
+ CASE_ND(XOR64ri32)
+ CASE_ND(XOR32ri)
+ CASE_ND(XOR16ri)
+ CASE_ND(XOR8ri)
+ CASE_ND(XOR64rr)
+ CASE_ND(XOR32rr)
+ CASE_ND(XOR16rr)
+ CASE_ND(XOR8rr)
+ CASE_ND(XOR64rm)
+ CASE_ND(XOR32rm)
+ CASE_ND(XOR16rm)
+ CASE_ND(XOR8rm)
+ CASE_ND(OR64ri32)
+ CASE_ND(OR32ri)
+ CASE_ND(OR16ri)
+ CASE_ND(OR8ri)
+ CASE_ND(OR64rr)
+ CASE_ND(OR32rr)
+ CASE_ND(OR16rr)
+ CASE_ND(OR8rr)
+ CASE_ND(OR64rm)
+ CASE_ND(OR32rm)
+ CASE_ND(OR16rm)
+ CASE_ND(OR8rm)
case X86::ANDN32rr:
case X86::ANDN32rm:
case X86::ANDN64rr:
@@ -5781,15 +5788,17 @@ inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag,
}
/// Check whether the use can be converted to remove a comparison against zero.
-/// Returns the EFLAGS condition and the operand that we are comparing against zero.
-static std::pair<X86::CondCode, unsigned> isUseDefConvertible(const MachineInstr &MI) {
+/// Returns the EFLAGS condition and the operand that we are comparing against
+/// zero.
+static std::pair<X86::CondCode, unsigned>
+isUseDefConvertible(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
return std::make_pair(X86::COND_INVALID, ~0U);
- CASE_ND(NEG8r)
- CASE_ND(NEG16r)
- CASE_ND(NEG32r)
- CASE_ND(NEG64r)
+ CASE_ND(NEG8r)
+ CASE_ND(NEG16r)
+ CASE_ND(NEG32r)
+ CASE_ND(NEG64r)
return std::make_pair(X86::COND_AE, 1U);
case X86::LZCNT16rr:
case X86::LZCNT32rr:
@@ -5833,51 +5842,53 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
switch (CmpInstr.getOpcode()) {
default:
break;
- CASE_ND(SUB64ri32)
- CASE_ND(SUB32ri)
- CASE_ND(SUB16ri)
- CASE_ND(SUB8ri)
- CASE_ND(SUB64rm)
- CASE_ND(SUB32rm)
- CASE_ND(SUB16rm)
- CASE_ND(SUB8rm)
- CASE_ND(SUB64rr)
- CASE_ND(SUB32rr)
- CASE_ND(SUB16rr)
- CASE_ND(SUB8rr) {
- if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
- return false;
- // There is no use of the destination register, we can replace SUB with CMP.
- unsigned NewOpcode = 0;
+ CASE_ND(SUB64ri32)
+ CASE_ND(SUB32ri)
+ CASE_ND(SUB16ri)
+ CASE_ND(SUB8ri)
+ CASE_ND(SUB64rm)
+ CASE_ND(SUB32rm)
+ CASE_ND(SUB16rm)
+ CASE_ND(SUB8rm)
+ CASE_ND(SUB64rr)
+ CASE_ND(SUB32rr)
+ CASE_ND(SUB16rr)
+ CASE_ND(SUB8rr) {
+ if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
+ return false;
+ // There is no use of the destination register, we can replace SUB with
+ // CMP.
+ unsigned NewOpcode = 0;
#define FROM_TO(A, B) \
CASE_ND(A) NewOpcode = X86::B; \
break;
- switch (CmpInstr.getOpcode()) {
- default:
- llvm_unreachable("Unreachable!");
- FROM_TO(SUB64rm, CMP64rm)
- FROM_TO(SUB32rm, CMP32rm)
- FROM_TO(SUB16rm, CMP16rm)
- FROM_TO(SUB8rm, CMP8rm)
- FROM_TO(SUB64rr, CMP64rr)
- FROM_TO(SUB32rr, CMP32rr)
- FROM_TO(SUB16rr, CMP16rr)
- FROM_TO(SUB8rr, CMP8rr)
- FROM_TO(SUB64ri32, CMP64ri32)
- FROM_TO(SUB32ri, CMP32ri)
- FROM_TO(SUB16ri, CMP16ri)
- FROM_TO(SUB8ri, CMP8ri)
- }
+ switch (CmpInstr.getOpcode()) {
+ default:
+ llvm_unreachable("Unreachable!");
+ FROM_TO(SUB64rm, CMP64rm)
+ FROM_TO(SUB32rm, CMP32rm)
+ FROM_TO(SUB16rm, CMP16rm)
+ FROM_TO(SUB8rm, CMP8rm)
+ FROM_TO(SUB64rr, CMP64rr)
+ FROM_TO(SUB32rr, CMP32rr)
+ FROM_TO(SUB16rr, CMP16rr)
+ FROM_TO(SUB8rr, CMP8rr)
+ FROM_TO(SUB64ri32, CMP64ri32)
+ FROM_TO(SUB32ri, CMP32ri)
+ FROM_TO(SUB16ri, CMP16ri)
+ FROM_TO(SUB8ri, CMP8ri)
+ }
#undef FROM_TO
- CmpInstr.setDesc(get(NewOpcode));
- CmpInstr.removeOperand(0);
- // Mutating this instruction invalidates any debug data associated with it.
- CmpInstr.dropDebugNumber();
- // Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
- if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
- NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
- return false;
- }
+ CmpInstr.setDesc(get(NewOpcode));
+ CmpInstr.removeOperand(0);
+ // Mutating this instruction invalidates any debug data associated with
+ // it.
+ CmpInstr.dropDebugNumber();
+ // Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
+ if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
+ NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
+ return false;
+ }
}
// The following code tries to remove the comparison by re-using EFLAGS
@@ -6234,14 +6245,14 @@ static bool canConvert2Copy(unsigned Opc) {
switch (Opc) {
default:
return false;
- CASE_ND(ADD64ri32)
- CASE_ND(SUB64ri32)
- CASE_ND(OR64ri32)
- CASE_ND(XOR64ri32)
- CASE_ND(ADD32ri)
- CASE_ND(SUB32ri)
- CASE_ND(OR32ri)
- CASE_ND(XOR32ri)
+ CASE_ND(ADD64ri32)
+ CASE_ND(SUB64ri32)
+ CASE_ND(OR64ri32)
+ CASE_ND(XOR64ri32)
+ CASE_ND(ADD32ri)
+ CASE_ND(SUB32ri)
+ CASE_ND(OR32ri)
+ CASE_ND(XOR32ri)
return true;
}
}
@@ -9656,7 +9667,7 @@ Register X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
static const uint16_t *lookup(unsigned opcode, unsigned domain,
ArrayRef<uint16_t[3]> Table) {
- for (const uint16_t(&Row)[3] : Table)
+ for (const uint16_t (&Row)[3] : Table)
if (Row[domain - 1] == opcode)
return Row;
return nullptr;
@@ -9665,7 +9676,7 @@ static const uint16_t *lookup(unsigned opcode, unsigned domain,
static const uint16_t *lookupAVX512(unsigned opcode, unsigned domain,
ArrayRef<uint16_t[4]> Table) {
// If this is the integer domain make sure to check both integer columns.
- for (const uint16_t(&Row)[4] : Table)
+ for (const uint16_t (&Row)[4] : Table)
if (Row[domain - 1] == opcode || (domain == 3 && Row[3] == opcode))
return Row;
return nullptr;
@@ -10421,25 +10432,25 @@ bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
if (Invert)
return false;
switch (Inst.getOpcode()) {
- CASE_ND(ADD8rr)
- CASE_ND(ADD16rr)
- CASE_ND(ADD32rr)
- CASE_ND(ADD64rr)
- CASE_ND(AND8rr)
- CASE_ND(AND16rr)
- CASE_ND(AND32rr)
- CASE_ND(AND64rr)
- CASE_ND(OR8rr)
- CASE_ND(OR16rr)
- CASE_ND(OR32rr)
- CASE_ND(OR64rr)
- CASE_ND(XOR8rr)
- CASE_ND(XOR16rr)
- CASE_ND(XOR32rr)
- CASE_ND(XOR64rr)
- CASE_ND(IMUL16rr)
- CASE_ND(IMUL32rr)
- CASE_ND(IMUL64rr)
+ CASE_ND(ADD8rr)
+ CASE_ND(ADD16rr)
+ CASE_ND(ADD32rr)
+ CASE_ND(ADD64rr)
+ CASE_ND(AND8rr)
+ CASE_ND(AND16rr)
+ CASE_ND(AND32rr)
+ CASE_ND(AND64rr)
+ CASE_ND(OR8rr)
+ CASE_ND(OR16rr)
+ CASE_ND(OR32rr)
+ CASE_ND(OR64rr)
+ CASE_ND(XOR8rr)
+ CASE_ND(XOR16rr)
+ CASE_ND(XOR32rr)
+ CASE_ND(XOR64rr)
+ CASE_ND(IMUL16rr)
+ CASE_ND(IMUL32rr)
+ CASE_ND(IMUL64rr)
case X86::PANDrr:
case X86::PORrr:
case X86::PXORrr:
@@ -11263,8 +11274,8 @@ bool X86InstrInfo::getMachineCombinerPatterns(
break;
}
}
- return TargetInstrInfo::getMachineCombinerPatterns(Root,
- Patterns, DoRegPressureReduce);
+ return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
+ DoRegPressureReduce);
}
static void
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index 76f18803c2e3d..846bcc85b7ad6 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -319,8 +319,7 @@ class X86InstrInfo final : public X86GenInstrInfo {
Register isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
- Register isLoadFromStackSlot(const MachineInstr &MI,
- int &FrameIndex,
+ Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex,
TypeSize &MemBytes) const override;
/// isLoadFromStackSlotPostFE - Check for post-frame ptr elimination
/// stack locations as well. This uses a heuristic so it isn't
@@ -330,8 +329,7 @@ class X86InstrInfo final : public X86GenInstrInfo {
Register isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
- Register isStoreToStackSlot(const MachineInstr &MI,
- int &FrameIndex,
+ Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex,
TypeSize &MemBytes) const override;
/// isStoreToStackSlotPostFE - Check for post-frame ptr elimination
/// stack locations as well. This uses a heuristic so it isn't
@@ -491,12 +489,12 @@ class X86InstrInfo final : public X86GenInstrInfo {
/// is likely that the referenced instruction has been changed.
///
/// \returns true on success.
- MachineInstr *
- foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
- ArrayRef<unsigned> Ops,
- MachineBasicBlock::iterator InsertPt, int FrameIndex,
- LiveIntervals *LIS = nullptr,
- VirtRegMap *VRM = nullptr) const override;
+ MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
+ ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt,
+ int FrameIndex,
+ LiveIntervals *LIS = nullptr,
+ VirtRegMap *VRM = nullptr) const override;
/// Same as the previous version except it allows folding of any load and
/// store from / to any address, not just from a specific stack slot.
@@ -745,8 +743,7 @@ class X86InstrInfo final : public X86GenInstrInfo {
///
/// If IsIntrinsic is set, operand 1 will be ignored for commuting.
bool findThreeSrcCommutedOpIndices(const MachineInstr &MI,
- unsigned &SrcOpIdx1,
- unsigned &SrcOpIdx2,
+ unsigned &SrcOpIdx1, unsigned &SrcOpIdx2,
bool IsIntrinsic = false) const;
/// Returns true when instruction \p FlagI produces the same flags as \p OI.
diff --git a/llvm/test/CodeGen/X86/ctselect-i386-fp.ll b/llvm/test/CodeGen/X86/ctselect-i386-fp.ll
index b88ec72a37925..4b5f31bad8313 100644
--- a/llvm/test/CodeGen/X86/ctselect-i386-fp.ll
+++ b/llvm/test/CodeGen/X86/ctselect-i386-fp.ll
@@ -209,94 +209,84 @@ define double @test_ctselect_f64_basic(i1 %cond, double %a, double %b) nounwind
define x86_fp80 @test_ctselect_f80_basic(i1 %cond, x86_fp80 %a, x86_fp80 %b) nounwind {
; I386-NOCMOV-LABEL: test_ctselect_f80_basic:
; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebp
+; I386-NOCMOV-NEXT: pushl %ebx
; I386-NOCMOV-NEXT: pushl %edi
; I386-NOCMOV-NEXT: pushl %esi
-; I386-NOCMOV-NEXT: subl $12, %esp
-; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
-; I386-NOCMOV-NEXT: sete %al
-; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: subl $40, %esp
+; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fstpt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fstpt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: testb $1, %cl
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-NOCMOV-NEXT: movl %eax, (%esp) # 4-byte Spill
; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-NOCMOV-NEXT: movb %al, %ah
-; I386-NOCMOV-NEXT: movzbl %ah, %edi
-; I386-NOCMOV-NEXT: negl %edi
-; I386-NOCMOV-NEXT: movl %edx, %esi
-; I386-NOCMOV-NEXT: andl %edi, %esi
-; I386-NOCMOV-NEXT: notl %edi
-; I386-NOCMOV-NEXT: andl %ecx, %edi
-; I386-NOCMOV-NEXT: orl %edi, %esi
-; I386-NOCMOV-NEXT: movl %esi, (%esp)
-; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi
+; I386-NOCMOV-NEXT: sete %ch
+; I386-NOCMOV-NEXT: movb %ch, %al
+; I386-NOCMOV-NEXT: movzbl %al, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %edi, %ebx
+; I386-NOCMOV-NEXT: andl %ebp, %ebx
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %edx, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %ebx
+; I386-NOCMOV-NEXT: testb $1, %cl
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-NOCMOV-NEXT: movb %al, %ah
-; I386-NOCMOV-NEXT: movzbl %ah, %edi
-; I386-NOCMOV-NEXT: negl %edi
-; I386-NOCMOV-NEXT: movl %edx, %esi
-; I386-NOCMOV-NEXT: andl %edi, %esi
-; I386-NOCMOV-NEXT: notl %edi
-; I386-NOCMOV-NEXT: andl %ecx, %edi
-; I386-NOCMOV-NEXT: orl %edi, %esi
-; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-NOCMOV-NEXT: movb %al, %ah
-; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: sete %ch
+; I386-NOCMOV-NEXT: movb %ch, %cl
+; I386-NOCMOV-NEXT: movzbl %cl, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %edx, %edi
+; I386-NOCMOV-NEXT: andl %ebp, %edi
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %eax, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %edi
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl (%esp), %ebx # 4-byte Reload
+; I386-NOCMOV-NEXT: movb %al, %dl
+; I386-NOCMOV-NEXT: movzbl %dl, %edi
; I386-NOCMOV-NEXT: negl %edi
-; I386-NOCMOV-NEXT: movl %edx, %esi
-; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, %ecx
+; I386-NOCMOV-NEXT: andl %edi, %ecx
; I386-NOCMOV-NEXT: notl %edi
-; I386-NOCMOV-NEXT: andl %ecx, %edi
-; I386-NOCMOV-NEXT: orl %edi, %esi
-; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; I386-NOCMOV-NEXT: fldt (%esp)
-; I386-NOCMOV-NEXT: addl $12, %esp
+; I386-NOCMOV-NEXT: andl %ebx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %ecx
+; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: addl $40, %esp
; I386-NOCMOV-NEXT: popl %esi
; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: popl %ebp
; I386-NOCMOV-NEXT: retl
;
; I386-CMOV-LABEL: test_ctselect_f80_basic:
; I386-CMOV: # %bb.0:
-; I386-CMOV-NEXT: pushl %edi
-; I386-CMOV-NEXT: pushl %esi
-; I386-CMOV-NEXT: subl $12, %esp
+; I386-CMOV-NEXT: subl $36, %esp
+; I386-CMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fstpt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fstpt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
-; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-CMOV-NEXT: movb %al, %ah
-; I386-CMOV-NEXT: movzbl %ah, %edi
-; I386-CMOV-NEXT: negl %edi
-; I386-CMOV-NEXT: movl %edx, %esi
-; I386-CMOV-NEXT: andl %edi, %esi
-; I386-CMOV-NEXT: notl %edi
-; I386-CMOV-NEXT: andl %ecx, %edi
-; I386-CMOV-NEXT: orl %edi, %esi
-; I386-CMOV-NEXT: movl %esi, (%esp)
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-CMOV-NEXT: movb %al, %ah
-; I386-CMOV-NEXT: movzbl %ah, %edi
-; I386-CMOV-NEXT: negl %edi
-; I386-CMOV-NEXT: movl %edx, %esi
-; I386-CMOV-NEXT: andl %edi, %esi
-; I386-CMOV-NEXT: notl %edi
-; I386-CMOV-NEXT: andl %ecx, %edi
-; I386-CMOV-NEXT: orl %edi, %esi
-; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-CMOV-NEXT: movb %al, %ah
-; I386-CMOV-NEXT: movzbl %ah, %edi
-; I386-CMOV-NEXT: negl %edi
-; I386-CMOV-NEXT: movl %edx, %esi
-; I386-CMOV-NEXT: andl %edi, %esi
-; I386-CMOV-NEXT: notl %edi
-; I386-CMOV-NEXT: andl %ecx, %edi
-; I386-CMOV-NEXT: orl %edi, %esi
-; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl %eax, (%esp)
; I386-CMOV-NEXT: fldt (%esp)
-; I386-CMOV-NEXT: addl $12, %esp
-; I386-CMOV-NEXT: popl %esi
-; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: addl $36, %esp
; I386-CMOV-NEXT: retl
%result = call x86_fp80 @llvm.ct.select.f80(i1 %cond, x86_fp80 %a, x86_fp80 %b)
ret x86_fp80 %result
@@ -543,94 +533,84 @@ define float @test_ctselect_f32_nan(i1 %cond) nounwind {
define x86_fp80 @test_ctselect_f80_alignment(i1 %cond, x86_fp80 %a, x86_fp80 %b) nounwind {
; I386-NOCMOV-LABEL: test_ctselect_f80_alignment:
; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebp
+; I386-NOCMOV-NEXT: pushl %ebx
; I386-NOCMOV-NEXT: pushl %edi
; I386-NOCMOV-NEXT: pushl %esi
-; I386-NOCMOV-NEXT: subl $12, %esp
-; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
-; I386-NOCMOV-NEXT: sete %al
-; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: subl $40, %esp
+; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fstpt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fstpt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: testb $1, %cl
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-NOCMOV-NEXT: movl %eax, (%esp) # 4-byte Spill
; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-NOCMOV-NEXT: movb %al, %ah
-; I386-NOCMOV-NEXT: movzbl %ah, %edi
-; I386-NOCMOV-NEXT: negl %edi
-; I386-NOCMOV-NEXT: movl %edx, %esi
-; I386-NOCMOV-NEXT: andl %edi, %esi
-; I386-NOCMOV-NEXT: notl %edi
-; I386-NOCMOV-NEXT: andl %ecx, %edi
-; I386-NOCMOV-NEXT: orl %edi, %esi
-; I386-NOCMOV-NEXT: movl %esi, (%esp)
-; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi
+; I386-NOCMOV-NEXT: sete %ch
+; I386-NOCMOV-NEXT: movb %ch, %al
+; I386-NOCMOV-NEXT: movzbl %al, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %edi, %ebx
+; I386-NOCMOV-NEXT: andl %ebp, %ebx
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %edx, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %ebx
+; I386-NOCMOV-NEXT: testb $1, %cl
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-NOCMOV-NEXT: movb %al, %ah
-; I386-NOCMOV-NEXT: movzbl %ah, %edi
-; I386-NOCMOV-NEXT: negl %edi
-; I386-NOCMOV-NEXT: movl %edx, %esi
-; I386-NOCMOV-NEXT: andl %edi, %esi
-; I386-NOCMOV-NEXT: notl %edi
-; I386-NOCMOV-NEXT: andl %ecx, %edi
-; I386-NOCMOV-NEXT: orl %edi, %esi
-; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-NOCMOV-NEXT: movb %al, %ah
-; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: sete %ch
+; I386-NOCMOV-NEXT: movb %ch, %cl
+; I386-NOCMOV-NEXT: movzbl %cl, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %edx, %edi
+; I386-NOCMOV-NEXT: andl %ebp, %edi
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %eax, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %edi
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl (%esp), %ebx # 4-byte Reload
+; I386-NOCMOV-NEXT: movb %al, %dl
+; I386-NOCMOV-NEXT: movzbl %dl, %edi
; I386-NOCMOV-NEXT: negl %edi
-; I386-NOCMOV-NEXT: movl %edx, %esi
-; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, %ecx
+; I386-NOCMOV-NEXT: andl %edi, %ecx
; I386-NOCMOV-NEXT: notl %edi
-; I386-NOCMOV-NEXT: andl %ecx, %edi
-; I386-NOCMOV-NEXT: orl %edi, %esi
-; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; I386-NOCMOV-NEXT: fldt (%esp)
-; I386-NOCMOV-NEXT: addl $12, %esp
+; I386-NOCMOV-NEXT: andl %ebx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %ecx
+; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: addl $40, %esp
; I386-NOCMOV-NEXT: popl %esi
; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: popl %ebp
; I386-NOCMOV-NEXT: retl
;
; I386-CMOV-LABEL: test_ctselect_f80_alignment:
; I386-CMOV: # %bb.0:
-; I386-CMOV-NEXT: pushl %edi
-; I386-CMOV-NEXT: pushl %esi
-; I386-CMOV-NEXT: subl $12, %esp
+; I386-CMOV-NEXT: subl $36, %esp
+; I386-CMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fstpt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fstpt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
-; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-CMOV-NEXT: movb %al, %ah
-; I386-CMOV-NEXT: movzbl %ah, %edi
-; I386-CMOV-NEXT: negl %edi
-; I386-CMOV-NEXT: movl %edx, %esi
-; I386-CMOV-NEXT: andl %edi, %esi
-; I386-CMOV-NEXT: notl %edi
-; I386-CMOV-NEXT: andl %ecx, %edi
-; I386-CMOV-NEXT: orl %edi, %esi
-; I386-CMOV-NEXT: movl %esi, (%esp)
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-CMOV-NEXT: movb %al, %ah
-; I386-CMOV-NEXT: movzbl %ah, %edi
-; I386-CMOV-NEXT: negl %edi
-; I386-CMOV-NEXT: movl %edx, %esi
-; I386-CMOV-NEXT: andl %edi, %esi
-; I386-CMOV-NEXT: notl %edi
-; I386-CMOV-NEXT: andl %ecx, %edi
-; I386-CMOV-NEXT: orl %edi, %esi
-; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-CMOV-NEXT: movb %al, %ah
-; I386-CMOV-NEXT: movzbl %ah, %edi
-; I386-CMOV-NEXT: negl %edi
-; I386-CMOV-NEXT: movl %edx, %esi
-; I386-CMOV-NEXT: andl %edi, %esi
-; I386-CMOV-NEXT: notl %edi
-; I386-CMOV-NEXT: andl %ecx, %edi
-; I386-CMOV-NEXT: orl %edi, %esi
-; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl %eax, (%esp)
; I386-CMOV-NEXT: fldt (%esp)
-; I386-CMOV-NEXT: addl $12, %esp
-; I386-CMOV-NEXT: popl %esi
-; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: addl $36, %esp
; I386-CMOV-NEXT: retl
%result = call x86_fp80 @llvm.ct.select.f80(i1 %cond, x86_fp80 %a, x86_fp80 %b)
ret x86_fp80 %result
More information about the llvm-branch-commits
mailing list