[llvm-branch-commits] [llvm] [ConstantTime] Native ct.select support for X86 and i386 (PR #166704)
Julius Alexandre via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Nov 6 10:27:04 PST 2025
https://github.com/wizardengineer updated https://github.com/llvm/llvm-project/pull/166704
>From cee41562976955a1e4c7b911a304b989a73be16d Mon Sep 17 00:00:00 2001
From: wizardengineer <juliuswoosebert at gmail.com>
Date: Wed, 5 Nov 2025 17:09:23 -0500
Subject: [PATCH 1/2] [LLVM][X86] Add native ct.select support for X86 and i386
Add native X86 implementation with CMOV instructions and comprehensive tests:
- X86 ISelLowering with CMOV for x86_64 and i386
- Fallback bitwise operations for i386 targets without CMOV
- Post-RA expansion for pseudo-instructions
- Comprehensive test coverage:
  - Edge cases (zero conditions, large integers)
  - i386-specific tests (FP, MMX, non-CMOV fallback)
  - Vector operations
  - Optimization patterns
The basic test demonstrating fallback is in the core infrastructure PR.
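For reference, a minimal sketch of the intended lowering (illustrative only;
the intrinsic name and signature are assumed to match the core infrastructure
PR, and the exact output differs between CMOV and non-CMOV targets):

  define i32 @sel(i1 %c, i32 %a, i32 %b) {
    ; x86_64 / CMOV: test of %c followed by a single cmovne, no branches
    ; i386 without CMOV: setcc + bitwise mask bundle, no branches
    %r = call i32 @llvm.ct.select.i32(i1 %c, i32 %a, i32 %b)
    ret i32 %r
  }
  declare i32 @llvm.ct.select.i32(i1, i32, i32)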
---
llvm/lib/Target/X86/X86.td | 8 +-
llvm/lib/Target/X86/X86ISelLowering.cpp | 791 +++++++++-
llvm/lib/Target/X86/X86ISelLowering.h | 7 +
llvm/lib/Target/X86/X86InstrCMovSetCC.td | 205 +++
llvm/lib/Target/X86/X86InstrCompiler.td | 81 ++
llvm/lib/Target/X86/X86InstrFragments.td | 5 +
llvm/lib/Target/X86/X86InstrInfo.cpp | 609 +++++++-
llvm/lib/Target/X86/X86InstrInfo.h | 6 +
llvm/lib/Target/X86/X86InstrPredicates.td | 5 +
llvm/lib/Target/X86/X86TargetMachine.cpp | 5 +-
llvm/test/CodeGen/X86/ctselect-edge-cases.ll | 409 ++++++
llvm/test/CodeGen/X86/ctselect-i386-fp.ll | 722 ++++++++++
llvm/test/CodeGen/X86/ctselect-i386-mmx.ll | 428 ++++++
llvm/test/CodeGen/X86/ctselect-i386.ll | 267 ++++
.../test/CodeGen/X86/ctselect-optimization.ll | 304 ++++
llvm/test/CodeGen/X86/ctselect-vector.ll | 1274 +++++++++++++++++
llvm/test/CodeGen/X86/ctselect.ll | 996 +++++++------
17 files changed, 5671 insertions(+), 451 deletions(-)
create mode 100644 llvm/test/CodeGen/X86/ctselect-edge-cases.ll
create mode 100644 llvm/test/CodeGen/X86/ctselect-i386-fp.ll
create mode 100644 llvm/test/CodeGen/X86/ctselect-i386-mmx.ll
create mode 100644 llvm/test/CodeGen/X86/ctselect-i386.ll
create mode 100644 llvm/test/CodeGen/X86/ctselect-optimization.ll
create mode 100644 llvm/test/CodeGen/X86/ctselect-vector.ll
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 9e291a6ae431f..21826d8289bb9 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -825,9 +825,10 @@ include "X86SchedSapphireRapids.td"
def ProcessorFeatures {
// x86-64 micro-architecture levels: x86-64 and x86-64-v[234]
- list<SubtargetFeature> X86_64V1Features = [
- FeatureX87, FeatureCX8, FeatureCMOV, FeatureMMX, FeatureSSE2,
- FeatureFXSR, FeatureNOPL, FeatureX86_64,
+ list<SubtargetFeature> X86_64V1Features = [FeatureX87, FeatureCX8,
+ FeatureCMOV, FeatureMMX,
+ FeatureSSE2, FeatureFXSR,
+ FeatureNOPL, FeatureX86_64,
];
list<SubtargetFeature> X86_64V1Tuning = [
TuningMacroFusion,
@@ -1161,6 +1162,7 @@ def ProcessorFeatures {
FeatureAVXNECONVERT,
FeatureAVXVNNIINT8,
FeatureAVXVNNIINT16,
+ FeatureUSERMSR,
FeatureSHA512,
FeatureSM3,
FeatureEGPR,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6edf0185df813..833afa717c32c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "X86ISelLowering.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
#include "MCTargetDesc/X86ShuffleDecode.h"
#include "X86.h"
#include "X86FrameLowering.h"
@@ -29,6 +30,8 @@
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -48,6 +51,7 @@
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/PatternMatch.h"
@@ -488,6 +492,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// X86 wants to expand cmov itself.
for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::CTSELECT, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
@@ -496,11 +501,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::CTSELECT, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
}
// Custom action for SELECT MMX and expand action for SELECT_CC MMX
setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
+ setOperationAction(ISD::CTSELECT, MVT::x86mmx, Custom);
setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
@@ -630,6 +637,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::BR_CC, VT, Action);
setOperationAction(ISD::SETCC, VT, Action);
setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::CTSELECT, VT, Custom);
setOperationAction(ISD::SELECT_CC, VT, Action);
setOperationAction(ISD::FROUND, VT, Action);
setOperationAction(ISD::FROUNDEVEN, VT, Action);
@@ -1067,6 +1075,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
+ setOperationAction(ISD::CTSELECT, MVT::v4f32, Custom);
setOperationAction(ISD::FCANONICALIZE, MVT::v4f32, Custom);
setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
@@ -1220,6 +1229,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SELECT, MVT::v8f16, Custom);
setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
+ setOperationAction(ISD::CTSELECT, MVT::v2f64, Custom);
+ setOperationAction(ISD::CTSELECT, MVT::v2i64, Custom);
+ setOperationAction(ISD::CTSELECT, MVT::v4i32, Custom);
+ setOperationAction(ISD::CTSELECT, MVT::v8i16, Custom);
+ setOperationAction(ISD::CTSELECT, MVT::v8f16, Custom);
+ setOperationAction(ISD::CTSELECT, MVT::v16i8, Custom);
+
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
@@ -1541,6 +1557,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
+ setOperationAction(ISD::CTSELECT, MVT::v4f64, Custom);
+ setOperationAction(ISD::CTSELECT, MVT::v4i64, Custom);
+ setOperationAction(ISD::CTSELECT, MVT::v8i32, Custom);
+ setOperationAction(ISD::CTSELECT, MVT::v16i16, Custom);
+ setOperationAction(ISD::CTSELECT, MVT::v16f16, Custom);
+ setOperationAction(ISD::CTSELECT, MVT::v32i8, Custom);
+ setOperationAction(ISD::CTSELECT, MVT::v8f32, Custom);
+
for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
@@ -1727,6 +1751,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
+ setOperationAction(ISD::CTSELECT, MVT::v1i1, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
@@ -1772,6 +1797,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::CTSELECT, VT, Custom);
setOperationAction(ISD::TRUNCATE, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
@@ -2038,6 +2064,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::CTSELECT, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
@@ -2203,6 +2230,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::CTSELECT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
@@ -2269,6 +2297,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::VSELECT, VT, Legal);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::CTSELECT, VT, Custom);
setOperationAction(ISD::FNEG, VT, Custom);
setOperationAction(ISD::FABS, VT, Custom);
@@ -2538,6 +2567,22 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::x86amx, &X86::TILERegClass);
}
+ // Handle 512-bit vector CTSELECT without AVX512 by setting them to Expand
+ // This allows type legalization to split them into smaller vectors
+ for (auto VT : {MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64, MVT::v32f16,
+ MVT::v16f32, MVT::v8f64}) {
+ setOperationAction(ISD::CTSELECT, VT, Expand);
+ }
+
+ // Handle 256-bit vector CTSELECT without AVX by setting them to Expand
+ // This allows type legalization to split them into 128-bit vectors
+ if (!Subtarget.hasAVX()) {
+ for (auto VT : {MVT::v4f64, MVT::v4i64, MVT::v8i32, MVT::v16i16,
+ MVT::v16f16, MVT::v32i8, MVT::v8f32}) {
+ setOperationAction(ISD::CTSELECT, VT, Expand);
+ }
+ }
+
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
@@ -2644,6 +2689,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
ISD::BITCAST,
ISD::VSELECT,
ISD::SELECT,
+ ISD::CTSELECT,
ISD::SHL,
ISD::SRA,
ISD::SRL,
@@ -25325,6 +25371,174 @@ static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl,
return V;
}
+SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const {
+ SDValue Cond = Op.getOperand(0); // condition
+ SDValue TrueOp = Op.getOperand(1); // true_value
+ SDValue FalseOp = Op.getOperand(2); // false_value
+ SDLoc DL(Op);
+ MVT VT = TrueOp.getSimpleValueType();
+
+  // Special handling for i386 targets (no CMOV): route to post-RA expansion
+  // pseudos. Standard type legalization handles i64 automatically (splitting
+  // it into EDX:EAX).
+
+ // Handle soft float16 by converting to integer operations
+ if (isSoftF16(VT, Subtarget)) {
+ MVT NVT = VT.changeTypeToInteger();
+ SDValue CtSelect =
+        DAG.getNode(ISD::CTSELECT, DL, NVT, Cond, DAG.getBitcast(NVT, TrueOp),
+                    DAG.getBitcast(NVT, FalseOp));
+ return DAG.getBitcast(VT, CtSelect);
+ }
+
+ // Handle vector types
+ if (VT.isVector()) {
+ // Handle soft float16 vectors
+ if (isSoftF16(VT, Subtarget)) {
+ MVT NVT = VT.changeVectorElementTypeToInteger();
+ SDValue CtSelect = DAG.getNode(ISD::CTSELECT, DL, NVT, Cond,
+                                     DAG.getBitcast(NVT, TrueOp),
+                                     DAG.getBitcast(NVT, FalseOp));
+ return DAG.getBitcast(VT, CtSelect);
+ }
+
+ unsigned VectorWidth = VT.getSizeInBits();
+ MVT EltVT = VT.getVectorElementType();
+
+    // 512-bit vectors without AVX512 and 256-bit vectors without AVX are now
+    // handled by type legalization (Expand action).
+
+ if (VectorWidth == 128 && !Subtarget.hasSSE1())
+ return SDValue();
+
+ // Handle special cases for floating point vectors
+ if (EltVT.isFloatingPoint()) {
+ // For vector floating point with AVX, use VBLENDV-style operations
+ if (Subtarget.hasAVX() && (VectorWidth == 256 || VectorWidth == 128)) {
+ // Convert to bitwise operations using the condition
+ MVT IntVT = VT.changeVectorElementTypeToInteger();
+ SDValue IntOp1 = DAG.getBitcast(IntVT, TrueOp);
+ SDValue IntOp2 = DAG.getBitcast(IntVT, FalseOp);
+
+ // Create the CTSELECT node with integer types
+ SDValue IntResult =
+ DAG.getNode(X86ISD::CTSELECT, DL, IntVT, IntOp2, IntOp1,
+ DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8),
+ EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget));
+ return DAG.getBitcast(VT, IntResult);
+ }
+ }
+
+ // For integer vectors or when we don't have advanced SIMD support,
+ // use the generic X86 CTSELECT node which will be matched by the patterns
+ SDValue CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
+ SDValue EFLAGS = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
+    // Create the X86 CTSELECT node - note operand order: false, true, cc, flags
+ return DAG.getNode(X86ISD::CTSELECT, DL, VT, FalseOp, TrueOp, CC, EFLAGS);
+ }
+
+ // Look past (and (setcc_carry (cmp ...)), 1)
+ if (Cond.getOpcode() == ISD::AND &&
+ Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
+ isOneConstant(Cond.getOperand(1)))
+ Cond = Cond.getOperand(0);
+
+ /// Process condition flags and prepare for CTSELECT node creation
+ auto ProcessConditionFlags =
+ [&](SDValue Cond, MVT VT, SDLoc DL, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) -> std::pair<SDValue, SDValue> {
+ SDValue CC;
+ bool AddTest = true;
+
+ unsigned CondOpcode = Cond.getOpcode();
+ if (CondOpcode == X86ISD::SETCC || CondOpcode == X86ISD::SETCC_CARRY) {
+ CC = Cond.getOperand(0);
+ SDValue Cmp = Cond.getOperand(1);
+
+ if ((isX86LogicalCmp(Cmp)) || Cmp.getOpcode() == X86ISD::BT) {
+ Cond = Cmp;
+ AddTest = false;
+ }
+ } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
+ CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
+ CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
+ SDValue Value;
+ X86::CondCode X86Cond;
+ std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
+ CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
+ AddTest = false;
+ }
+
+ if (AddTest) {
+ // Look past the truncate if the high bits are known zero
+ if (isTruncWithZeroHighBitsInput(Cond, DAG))
+ Cond = Cond.getOperand(0);
+
+ // Try to match AND to BT instruction
+ if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
+ X86::CondCode X86CondCode;
+ if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
+ CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
+ Cond = BT;
+ AddTest = false;
+ }
+ }
+ }
+
+ if (AddTest) {
+ CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
+ Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
+ }
+
+ return {CC, Cond};
+ };
+
+ // Process condition flags and prepare for CTSELECT
+ auto [CC, ProcessedCond] =
+ ProcessConditionFlags(Cond, VT, DL, DAG, Subtarget);
+
+ // Handle i8 CTSELECT with truncate optimization
+ if (Op.getValueType() == MVT::i8 && TrueOp.getOpcode() == ISD::TRUNCATE &&
+ FalseOp.getOpcode() == ISD::TRUNCATE) {
+ SDValue T1 = TrueOp.getOperand(0), T2 = FalseOp.getOperand(0);
+ if (T1.getValueType() == T2.getValueType() &&
+ T1.getOpcode() != ISD::CopyFromReg &&
+ T2.getOpcode() != ISD::CopyFromReg) {
+ SDValue CtSelect = DAG.getNode(X86ISD::CTSELECT, DL, T1.getValueType(),
+ T2, T1, CC, ProcessedCond);
+ return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), CtSelect);
+ }
+ }
+
+ // Promote small integer types to avoid partial register stalls
+ // Exception: For i8 without CMOV, we can generate a shorter instruction
+ // sequence without movzx so keep it as is.
+ if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMOV()) ||
+ (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(TrueOp, Subtarget) &&
+ !X86::mayFoldLoad(FalseOp, Subtarget))) {
+ TrueOp = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, TrueOp);
+ FalseOp = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, FalseOp);
+ SDValue Ops[] = {FalseOp, TrueOp, CC, ProcessedCond};
+ SDValue CtSelect = DAG.getNode(X86ISD::CTSELECT, DL, MVT::i32, Ops);
+ return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), CtSelect);
+ }
+
+ if (isScalarFPTypeInSSEReg(VT)) {
+ MVT IntVT = (VT == MVT::f32) ? MVT::i32 : MVT::i64;
+ TrueOp = DAG.getBitcast(IntVT, TrueOp);
+ FalseOp = DAG.getBitcast(IntVT, FalseOp);
+ SDValue Ops[] = {FalseOp, TrueOp, CC, ProcessedCond};
+ SDValue CtSelect = DAG.getNode(X86ISD::CTSELECT, DL, IntVT, Ops);
+ return DAG.getBitcast(VT, CtSelect);
+ }
+
+ // Create final CTSELECT node
+ SDValue Ops[] = {FalseOp, TrueOp, CC, ProcessedCond};
+ return DAG.getNode(X86ISD::CTSELECT, DL, Op.getValueType(), Ops,
+ Op->getFlags());
+}
+
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue In = Op->getOperand(0);
@@ -29695,30 +29909,65 @@ static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
const X86Subtarget &Subtarget,
SelectionDAG &DAG,
SDValue *Low = nullptr) {
+ unsigned NumElts = VT.getVectorNumElements();
+
// For vXi8 we will unpack the low and high half of each 128 bit lane to widen
// to a vXi16 type. Do the multiplies, shift the results and pack the half
// lane results back together.
// We'll take different approaches for signed and unsigned.
- // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes to
- // words and use pmullw to calculate the full 16-bit product.
+  // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
+  // and use pmullw to calculate the full 16-bit product.
// For signed we'll use punpcklbw/punpckbw to extend the bytes to words and
// shift them left into the upper byte of each word. This allows us to use
// pmulhw to calculate the full 16-bit product. This trick means we don't
// need to sign extend the bytes to use pmullw.
- MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
+
+ MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
SDValue Zero = DAG.getConstant(0, dl, VT);
- SDValue ALo, AHi, BLo, BHi;
+ SDValue ALo, AHi;
if (IsSigned) {
ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
- BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
- BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
} else {
ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
- BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
+ }
+
+ SDValue BLo, BHi;
+ if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
+ // If the RHS is a constant, manually unpackl/unpackh and extend.
+ SmallVector<SDValue, 16> LoOps, HiOps;
+ for (unsigned i = 0; i != NumElts; i += 16) {
+ for (unsigned j = 0; j != 8; ++j) {
+ SDValue LoOp = B.getOperand(i + j);
+ SDValue HiOp = B.getOperand(i + j + 8);
+
+ if (IsSigned) {
+ LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
+ HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
+ LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
+ DAG.getConstant(8, dl, MVT::i16));
+ HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
+ DAG.getConstant(8, dl, MVT::i16));
+ } else {
+ LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
+ HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
+ }
+
+ LoOps.push_back(LoOp);
+ HiOps.push_back(HiOp);
+ }
+ }
+
+ BLo = DAG.getBuildVector(ExVT, dl, LoOps);
+ BHi = DAG.getBuildVector(ExVT, dl, HiOps);
+ } else if (IsSigned) {
+ BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
+ BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
+ } else {
+ BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
}
@@ -29731,7 +29980,7 @@ static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
if (Low)
*Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
- return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf=*/true);
+ return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
}
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
@@ -33594,6 +33843,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
case ISD::SELECT: return LowerSELECT(Op, DAG);
+ case ISD::CTSELECT: return LowerCTSELECT(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::JumpTable: return LowerJumpTable(Op, DAG);
case ISD::VASTART: return LowerVASTART(Op, DAG);
@@ -33677,6 +33927,12 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
}
}
+bool X86TargetLowering::isSelectSupported(SelectSupportKind Kind) const {
+ if (Kind == SelectSupportKind::CtSelect) {
+ return true;
+ }
+ return TargetLoweringBase::isSelectSupported(Kind);
+}
/// Replace a node with an illegal result type with a new node built out of
/// custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
@@ -34904,6 +35160,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(STRICT_CMPM)
NODE_NAME_CASE(CMPMM_SAE)
NODE_NAME_CASE(SETCC)
+ NODE_NAME_CASE(CTSELECT)
NODE_NAME_CASE(SETCC_CARRY)
NODE_NAME_CASE(FSETCC)
NODE_NAME_CASE(FSETCCM)
@@ -37677,6 +37934,480 @@ X86TargetLowering::emitPatchableEventCall(MachineInstr &MI,
return BB;
}
+/// Helper function to emit i386 CTSELECT with condition materialization.
+/// This converts EFLAGS-based CTSELECT into a condition byte that can be
+/// shared across multiple operations (critical for i64 type legalization).
+///
+/// Phase 1: Materialize condition byte from EFLAGS using SETCC
+/// Phase 2: Create internal pseudo with condition byte for post-RA expansion
+///
+/// This approach ensures that when i64 is type-legalized into two i32
+/// operations, both operations share the same condition byte rather than
+/// each independently reading (and destroying) EFLAGS.
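+///
+/// Conceptually, the final code is branch-free and mask-based (illustrative
+/// sketch only, not the literal post-RA expansion):
+///   setcc  cond_byte                 ; materialized once, before the bundle
+///   mask = 0 - zext(cond_byte)       ; all-ones or all-zeros
+///   dst  = (one source & mask) | (other source & ~mask)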
+static MachineBasicBlock *
+emitCTSelectI386WithConditionMaterialization(MachineInstr &MI,
+ MachineBasicBlock *BB,
+ unsigned InternalPseudoOpcode) {
+ const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
+ const MIMetadata MIMD(MI);
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ // Original pseudo operands: (outs dst), (ins src1, src2, cond)
+ Register Src1Reg = MI.getOperand(1).getReg();
+ Register Src2Reg = MI.getOperand(2).getReg();
+ X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(3).getImm());
+
+ // Get opposite condition (SETCC sets to 1 when condition is TRUE,
+ // but we want to select src1 when condition is FALSE for X86 semantics)
+ X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
+
+ // Step 1: Materialize condition byte from EFLAGS
+ // This is done OUTSIDE the constant-time bundle, before any EFLAGS corruption
+ Register CondByteReg = MRI.createVirtualRegister(&X86::GR8RegClass);
+ BuildMI(*BB, MI, MIMD, TII->get(X86::SETCCr), CondByteReg).addImm(OppCC);
+
+ // Step 2: Create internal pseudo that takes condition byte as input
+ // This pseudo will be expanded post-RA into the actual constant-time bundle
+ // The condition byte can now be safely shared between multiple pseudos
+
+ // Internal pseudo has operands: (outs dst, tmp_byte, tmp_mask), (ins src1,
+ // src2, cond_byte)
+ Register DstReg = MI.getOperand(0).getReg();
+
+ // Create virtual registers for the temporary outputs
+ Register TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass);
+ Register TmpMaskReg;
+
+ // Determine the register class for tmp_mask based on the data type
+ if (InternalPseudoOpcode == X86::CTSELECT_I386_INT_GR8rr) {
+ TmpMaskReg = MRI.createVirtualRegister(&X86::GR8RegClass);
+ } else if (InternalPseudoOpcode == X86::CTSELECT_I386_INT_GR16rr) {
+ TmpMaskReg = MRI.createVirtualRegister(&X86::GR16RegClass);
+ } else if (InternalPseudoOpcode == X86::CTSELECT_I386_INT_GR32rr) {
+ TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+ } else {
+ llvm_unreachable("Unknown internal pseudo opcode");
+ }
+
+ BuildMI(*BB, MI, MIMD, TII->get(InternalPseudoOpcode))
+ .addDef(DstReg) // dst (output)
+ .addDef(TmpByteReg) // tmp_byte (output)
+ .addDef(TmpMaskReg) // tmp_mask (output)
+ .addReg(Src1Reg) // src1 (input)
+ .addReg(Src2Reg) // src2 (input)
+ .addReg(CondByteReg); // pre-materialized condition byte (input)
+
+ MI.eraseFromParent();
+ return BB;
+}
+
+// Helper structure to hold memory operand information for FP loads
+struct FPLoadMemOperands {
+ bool IsValid = false;
+ unsigned BaseReg = 0;
+ int64_t ScaleVal = 1;
+ unsigned IndexReg = 0;
+ int64_t Disp = 0;
+ unsigned SegReg = 0;
+ int FrameIndex = -1;
+ bool IsFrameIndex = false;
+ int ConstantPoolIndex = -1;
+ bool IsConstantPool = false;
+ const GlobalValue *Global = nullptr;
+ int64_t GlobalOffset = 0;
+ bool IsGlobal = false;
+};
+
+// Check if a virtual register is defined by a simple FP load instruction
+// Returns the memory operands if it's a simple load, otherwise returns invalid
+static FPLoadMemOperands getFPLoadMemOperands(Register Reg,
+ MachineRegisterInfo &MRI,
+ unsigned ExpectedLoadOpcode) {
+ FPLoadMemOperands Result;
+
+ if (!Reg.isVirtual())
+ return Result;
+
+ MachineInstr *DefMI = MRI.getVRegDef(Reg);
+ if (!DefMI)
+ return Result;
+
+ // Check if it's the expected load opcode (e.g., LD_Fp32m, LD_Fp64m, LD_Fp80m)
+ if (DefMI->getOpcode() != ExpectedLoadOpcode)
+ return Result;
+
+ // Check that this is a simple load - not volatile, not atomic, etc.
+ // FP loads have hasSideEffects = 0 in their definition for simple loads
+ if (DefMI->hasOrderedMemoryRef())
+ return Result;
+
+ // The load should have a single def (the destination register) and memory operands
+ // Format: %reg = LD_Fpxxm <fi#N>, 1, %noreg, 0, %noreg
+ // or: %reg = LD_Fpxxm %base, scale, %index, disp, %segment
+ if (DefMI->getNumOperands() < 6)
+ return Result;
+
+ // Operand 0 is the destination, operands 1-5 are the memory reference
+ MachineOperand &BaseMO = DefMI->getOperand(1);
+ MachineOperand &ScaleMO = DefMI->getOperand(2);
+ MachineOperand &IndexMO = DefMI->getOperand(3);
+ MachineOperand &DispMO = DefMI->getOperand(4);
+ MachineOperand &SegMO = DefMI->getOperand(5);
+
+ // Check if this is a frame index load
+ if (BaseMO.isFI()) {
+ Result.IsValid = true;
+ Result.IsFrameIndex = true;
+ Result.FrameIndex = BaseMO.getIndex();
+ Result.ScaleVal = ScaleMO.getImm();
+ Result.IndexReg = IndexMO.getReg();
+ Result.Disp = DispMO.getImm();
+ Result.SegReg = SegMO.getReg();
+ return Result;
+ }
+
+ // Check if this is a constant pool load
+ // Format: %reg = LD_Fpxxm $noreg, 1, $noreg, %const.N, $noreg
+ if (BaseMO.isReg() && BaseMO.getReg() == X86::NoRegister &&
+ ScaleMO.isImm() && IndexMO.isReg() &&
+ IndexMO.getReg() == X86::NoRegister &&
+ DispMO.isCPI() && SegMO.isReg()) {
+ Result.IsValid = true;
+ Result.IsConstantPool = true;
+ Result.ConstantPoolIndex = DispMO.getIndex();
+ Result.ScaleVal = ScaleMO.getImm();
+ Result.IndexReg = IndexMO.getReg();
+ Result.Disp = 0;
+ Result.SegReg = SegMO.getReg();
+ return Result;
+ }
+
+ // Check if this is a global variable load
+ // Format: %reg = LD_Fpxxm $noreg, 1, $noreg, @global_name, $noreg
+ if (BaseMO.isReg() && BaseMO.getReg() == X86::NoRegister &&
+ ScaleMO.isImm() && IndexMO.isReg() &&
+ IndexMO.getReg() == X86::NoRegister &&
+ DispMO.isGlobal() && SegMO.isReg()) {
+ Result.IsValid = true;
+ Result.IsGlobal = true;
+ Result.Global = DispMO.getGlobal();
+ Result.GlobalOffset = DispMO.getOffset();
+ Result.ScaleVal = ScaleMO.getImm();
+ Result.IndexReg = IndexMO.getReg();
+ Result.Disp = 0;
+ Result.SegReg = SegMO.getReg();
+ return Result;
+ }
+
+ // Regular memory operands (e.g., pointer loads)
+ if (BaseMO.isReg() && ScaleMO.isImm() && IndexMO.isReg() &&
+ DispMO.isImm() && SegMO.isReg()) {
+ Result.IsValid = true;
+ Result.IsFrameIndex = false;
+ Result.IsConstantPool = false;
+ Result.BaseReg = BaseMO.getReg();
+ Result.ScaleVal = ScaleMO.getImm();
+ Result.IndexReg = IndexMO.getReg();
+ Result.Disp = DispMO.getImm();
+ Result.SegReg = SegMO.getReg();
+ return Result;
+ }
+
+ return Result;
+}
+
+static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI,
+ MachineBasicBlock *BB,
+ unsigned pseudoInstr) {
+ const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
+ const MIMetadata MIMD(MI);
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MachineFrameInfo &MFI = MF->getFrameInfo();
+ unsigned RegSizeInByte = 4;
+
+ // Get operands
+ // MI operands: %result:rfp80 = CTSELECT_I386 %false:rfp80, %true:rfp80, %cond:i8imm
+ unsigned DestReg = MI.getOperand(0).getReg();
+ unsigned FalseReg = MI.getOperand(1).getReg();
+ unsigned TrueReg = MI.getOperand(2).getReg();
+ X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(3).getImm());
+ X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
+
+ // Materialize condition byte from EFLAGS
+ Register CondByteReg = MRI.createVirtualRegister(&X86::GR8RegClass);
+ BuildMI(*BB, MI, MIMD, TII->get(X86::SETCCr), CondByteReg).addImm(OppCC);
+
+ auto storeFpToSlot = [&](unsigned Opcode, int Slot, Register Reg) {
+ addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(Opcode)), Slot)
+ .addReg(Reg, RegState::Kill);
+ };
+
+ // Helper to load integer from memory operands
+ auto loadIntFromMemOperands = [&](const FPLoadMemOperands &MemOps,
+ unsigned Offset) -> unsigned {
+ unsigned IntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), IntReg);
+
+ if (MemOps.IsFrameIndex) {
+ // Frame index: addFrameIndex + scale + index + disp + segment
+ MIB.addFrameIndex(MemOps.FrameIndex)
+ .addImm(MemOps.ScaleVal)
+ .addReg(MemOps.IndexReg)
+ .addImm(MemOps.Disp + Offset)
+ .addReg(MemOps.SegReg);
+ } else if (MemOps.IsConstantPool) {
+ // Constant pool: base_reg + scale + index + CP_index + segment
+ // MOV32rm format: base, scale, index, displacement, segment
+ MIB.addReg(X86::NoRegister) // Base register
+ .addImm(MemOps.ScaleVal) // Scale
+ .addReg(MemOps.IndexReg) // Index register
+ .addConstantPoolIndex(MemOps.ConstantPoolIndex, Offset) // Displacement (CP index)
+ .addReg(MemOps.SegReg); // Segment
+ } else if (MemOps.IsGlobal) {
+ // Global variable: base_reg + scale + index + global + segment
+ // MOV32rm format: base, scale, index, displacement, segment
+ MIB.addReg(X86::NoRegister) // Base register
+ .addImm(MemOps.ScaleVal) // Scale
+ .addReg(MemOps.IndexReg) // Index register
+ .addGlobalAddress(MemOps.Global, MemOps.GlobalOffset + Offset) // Displacement (global address)
+ .addReg(MemOps.SegReg); // Segment
+ } else {
+ // Regular memory: base_reg + scale + index + disp + segment
+ MIB.addReg(MemOps.BaseReg)
+ .addImm(MemOps.ScaleVal)
+ .addReg(MemOps.IndexReg)
+ .addImm(MemOps.Disp + Offset)
+ .addReg(MemOps.SegReg);
+ }
+
+ return IntReg;
+ };
+
+ // Optimized path: load integers directly from memory when both operands are
+ // memory loads, avoiding FP register round-trip
+ auto emitCtSelectFromMemory = [&](unsigned NumValues,
+ const FPLoadMemOperands &TrueMemOps,
+ const FPLoadMemOperands &FalseMemOps,
+ int ResultSlot) {
+ for (unsigned Val = 0; Val < NumValues; ++Val) {
+ unsigned Offset = Val * RegSizeInByte;
+
+ // Load true and false values directly from their memory locations as integers
+ unsigned TrueIntReg = loadIntFromMemOperands(TrueMemOps, Offset);
+ unsigned FalseIntReg = loadIntFromMemOperands(FalseMemOps, Offset);
+
+ // Use CTSELECT_I386_INT_GR32 pseudo instruction for constant-time selection
+ unsigned ResultIntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+ unsigned TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass);
+ unsigned TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+
+ BuildMI(*BB, MI, MIMD, TII->get(X86::CTSELECT_I386_INT_GR32rr))
+ .addDef(ResultIntReg) // dst (output)
+ .addDef(TmpByteReg) // tmp_byte (output)
+ .addDef(TmpMaskReg) // tmp_mask (output)
+ .addReg(FalseIntReg) // src1 (input) - false value
+ .addReg(TrueIntReg) // src2 (input) - true value
+ .addReg(CondByteReg); // pre-materialized condition byte (input)
+
+ // Store result back to result slot
+ BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32mr))
+ .addFrameIndex(ResultSlot)
+ .addImm(1)
+ .addReg(0)
+ .addImm(Offset)
+ .addReg(0)
+ .addReg(ResultIntReg, RegState::Kill);
+ }
+ };
+
+ auto emitCtSelectWithPseudo = [&](unsigned NumValues, int TrueSlot, int FalseSlot, int ResultSlot) {
+ for (unsigned Val = 0; Val < NumValues; ++Val) {
+ unsigned Offset = Val * RegSizeInByte;
+
+ // Load true and false values from stack as 32-bit integers
+ unsigned TrueIntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+ BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), TrueIntReg)
+ .addFrameIndex(TrueSlot)
+ .addImm(1)
+ .addReg(0)
+ .addImm(Offset)
+ .addReg(0);
+
+ unsigned FalseIntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+ BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), FalseIntReg)
+ .addFrameIndex(FalseSlot)
+ .addImm(1)
+ .addReg(0)
+ .addImm(Offset)
+ .addReg(0);
+
+ // Use CTSELECT_I386_INT_GR32 pseudo instruction for constant-time selection
+ unsigned ResultIntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+ unsigned TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass);
+ unsigned TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+
+ BuildMI(*BB, MI, MIMD, TII->get(X86::CTSELECT_I386_INT_GR32rr))
+ .addDef(ResultIntReg) // dst (output)
+ .addDef(TmpByteReg) // tmp_byte (output)
+ .addDef(TmpMaskReg) // tmp_mask (output)
+ .addReg(FalseIntReg) // src1 (input) - false value
+ .addReg(TrueIntReg) // src2 (input) - true value
+ .addReg(CondByteReg); // pre-materialized condition byte (input)
+
+ // Store result back to result slot
+ BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32mr))
+ .addFrameIndex(ResultSlot)
+ .addImm(1)
+ .addReg(0)
+ .addImm(Offset)
+ .addReg(0)
+ .addReg(ResultIntReg, RegState::Kill);
+ }
+ };
+
+ switch (pseudoInstr) {
+ case X86::CTSELECT_I386_FP32rr: {
+ // Check if both operands are simple memory loads
+ FPLoadMemOperands TrueMemOps =
+ getFPLoadMemOperands(TrueReg, MRI, X86::LD_Fp32m);
+ FPLoadMemOperands FalseMemOps =
+ getFPLoadMemOperands(FalseReg, MRI, X86::LD_Fp32m);
+
+ int ResultSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false);
+
+ if (TrueMemOps.IsValid && FalseMemOps.IsValid) {
+ // Optimized path: load directly from memory as integers
+ // Works for both frame index loads (stack parameters) and
+ // constant pool loads (constants)
+ emitCtSelectFromMemory(1, TrueMemOps, FalseMemOps, ResultSlot);
+
+ // Erase the original FP load instructions since we're not using them
+ // and have loaded the data directly as integers instead
+ if (MRI.hasOneUse(TrueReg)) {
+ if (MachineInstr *TrueDefMI = MRI.getVRegDef(TrueReg))
+ TrueDefMI->eraseFromParent();
+ }
+ if (MRI.hasOneUse(FalseReg)) {
+ if (MachineInstr *FalseDefMI = MRI.getVRegDef(FalseReg))
+ FalseDefMI->eraseFromParent();
+ }
+ } else {
+ // General path: spill FP registers to stack first
+ int TrueSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false);
+ int FalseSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false);
+
+ storeFpToSlot(X86::ST_Fp32m, TrueSlot, TrueReg);
+ storeFpToSlot(X86::ST_Fp32m, FalseSlot, FalseReg);
+
+ emitCtSelectWithPseudo(1, TrueSlot, FalseSlot, ResultSlot);
+ }
+
+ // Load result back as f32
+ addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp32m), DestReg),
+ ResultSlot);
+ break;
+ }
+ case X86::CTSELECT_I386_FP64rr: {
+ unsigned StackSlotSize = 8;
+
+ // Check if both operands are simple memory loads
+ FPLoadMemOperands TrueMemOps =
+ getFPLoadMemOperands(TrueReg, MRI, X86::LD_Fp64m);
+ FPLoadMemOperands FalseMemOps =
+ getFPLoadMemOperands(FalseReg, MRI, X86::LD_Fp64m);
+
+ int ResultSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false);
+
+ if (TrueMemOps.IsValid && FalseMemOps.IsValid) {
+ // Optimized path: load directly from memory as integers
+ // Works for both frame index loads (stack parameters) and
+ // constant pool loads (constants)
+ emitCtSelectFromMemory(StackSlotSize / RegSizeInByte, TrueMemOps,
+ FalseMemOps, ResultSlot);
+
+ // Erase the original FP load instructions since we're not using them
+ if (MRI.hasOneUse(TrueReg)) {
+ if (MachineInstr *TrueDefMI = MRI.getVRegDef(TrueReg))
+ TrueDefMI->eraseFromParent();
+ }
+ if (MRI.hasOneUse(FalseReg)) {
+ if (MachineInstr *FalseDefMI = MRI.getVRegDef(FalseReg))
+ FalseDefMI->eraseFromParent();
+ }
+ } else {
+ // General path: spill FP registers to stack first
+ int TrueSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false);
+ int FalseSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false);
+
+ storeFpToSlot(X86::ST_Fp64m, TrueSlot, TrueReg);
+ storeFpToSlot(X86::ST_Fp64m, FalseSlot, FalseReg);
+
+ emitCtSelectWithPseudo(StackSlotSize / RegSizeInByte, TrueSlot, FalseSlot,
+ ResultSlot);
+ }
+
+ // Load result back as f64
+ addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp64m), DestReg),
+ ResultSlot);
+ break;
+ }
+ case X86::CTSELECT_I386_FP80rr: {
+    // f80 is 80 bits (10 bytes) but is stored in a 12-byte stack slot here
+ unsigned StackObjectSize = 12;
+
+ // Check if both operands are simple memory loads
+ FPLoadMemOperands TrueMemOps =
+ getFPLoadMemOperands(TrueReg, MRI, X86::LD_Fp80m);
+ FPLoadMemOperands FalseMemOps =
+ getFPLoadMemOperands(FalseReg, MRI, X86::LD_Fp80m);
+
+ int ResultSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false);
+
+ if (TrueMemOps.IsValid && FalseMemOps.IsValid) {
+ // Optimized path: load directly from memory as integers
+ // Works for both frame index loads (stack parameters) and
+ // constant pool loads (constants)
+ emitCtSelectFromMemory(StackObjectSize / RegSizeInByte, TrueMemOps,
+ FalseMemOps, ResultSlot);
+
+ // Erase the original FP load instructions since we're not using them
+ if (MRI.hasOneUse(TrueReg)) {
+ if (MachineInstr *TrueDefMI = MRI.getVRegDef(TrueReg))
+ TrueDefMI->eraseFromParent();
+ }
+ if (MRI.hasOneUse(FalseReg)) {
+ if (MachineInstr *FalseDefMI = MRI.getVRegDef(FalseReg))
+ FalseDefMI->eraseFromParent();
+ }
+ } else {
+ // General path: spill FP registers to stack first
+ int TrueSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false);
+ int FalseSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false);
+
+ storeFpToSlot(X86::ST_FpP80m, TrueSlot, TrueReg);
+ storeFpToSlot(X86::ST_FpP80m, FalseSlot, FalseReg);
+
+ emitCtSelectWithPseudo(StackObjectSize / RegSizeInByte, TrueSlot,
+ FalseSlot, ResultSlot);
+ }
+
+ // Load result back as f80
+ addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp80m), DestReg),
+ ResultSlot);
+ break;
+ }
+ default:
+ llvm_unreachable("Invalid CTSELECT opcode");
+ }
+
+ MI.eraseFromParent();
+
+ return BB;
+}
+
MachineBasicBlock *
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
@@ -37734,6 +38465,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case X86::CMOV_VK64:
return EmitLoweredSelect(MI, BB);
+ case X86::CTSELECT_I386_GR8rr:
+ return emitCTSelectI386WithConditionMaterialization(
+ MI, BB, X86::CTSELECT_I386_INT_GR8rr);
+
+ case X86::CTSELECT_I386_GR16rr:
+ return emitCTSelectI386WithConditionMaterialization(
+ MI, BB, X86::CTSELECT_I386_INT_GR16rr);
+
+ case X86::CTSELECT_I386_GR32rr:
+ return emitCTSelectI386WithConditionMaterialization(
+ MI, BB, X86::CTSELECT_I386_INT_GR32rr);
+
+ case X86::CTSELECT_I386_FP32rr:
+ return emitCTSelectI386WithFpType(MI, BB, X86::CTSELECT_I386_FP32rr);
+ case X86::CTSELECT_I386_FP64rr:
+ return emitCTSelectI386WithFpType(MI, BB, X86::CTSELECT_I386_FP64rr);
+ case X86::CTSELECT_I386_FP80rr:
+ return emitCTSelectI386WithFpType(MI, BB, X86::CTSELECT_I386_FP80rr);
+
case X86::FP80_ADDr:
case X86::FP80_ADDm32: {
// Change the floating point control register to use double extended
@@ -41695,7 +42445,7 @@ static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
return SDValue();
- Imm = llvm::rotl<uint8_t>(Imm, 4);
+ Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
DAG.getTargetConstant(Imm, DL, MVT::i8));
};
@@ -44662,16 +45412,10 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
}
case X86ISD::PCMPGT:
// icmp sgt(0, R) == ashr(R, BitWidth-1).
- if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode())) {
- // iff we only need the signbit then we can use R directly.
- if (OriginalDemandedBits.isSignMask())
- return TLO.CombineTo(Op, Op.getOperand(1));
- // otherwise we just need R's signbit for the comparison.
- APInt SignMask = APInt::getSignMask(BitWidth);
- if (SimplifyDemandedBits(Op.getOperand(1), SignMask, OriginalDemandedElts,
- Known, TLO, Depth + 1))
- return true;
- }
+ // iff we only need the sign bit then we can use R directly.
+ if (OriginalDemandedBits.isSignMask() &&
+ ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
+ return TLO.CombineTo(Op, Op.getOperand(1));
break;
case X86ISD::MOVMSK: {
SDValue Src = Op.getOperand(0);
@@ -47581,15 +48325,6 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
DL, DAG, Subtarget))
return V;
- // If the sign bit is known then BLENDV can be folded away.
- if (N->getOpcode() == X86ISD::BLENDV) {
- KnownBits KnownCond = DAG.computeKnownBits(Cond);
- if (KnownCond.isNegative())
- return LHS;
- if (KnownCond.isNonNegative())
- return RHS;
- }
-
if (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV) {
SmallVector<int, 64> CondMask;
if (createShuffleMaskFromVSELECT(CondMask, Cond,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index b7151f65942b4..d759895719388 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -114,6 +114,10 @@ namespace llvm {
/// X86 Select
SELECTS,
+  /// X86 constant-time select. Lowered to CMOV when available, or to a
+  /// constant-time bitwise sequence on targets without CMOV.
+ CTSELECT,
+
// Same as SETCC except it's materialized with a sbb and the value is all
// one's or all zero's.
SETCC_CARRY, // R = carry_bit ? ~0 : 0
@@ -1139,6 +1143,8 @@ namespace llvm {
///
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+ bool isSelectSupported(SelectSupportKind Kind) const override;
+
/// Replace the results of node with an illegal result
/// type with new values built out of custom code.
///
@@ -1765,6 +1771,7 @@ namespace llvm {
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/llvm/lib/Target/X86/X86InstrCMovSetCC.td
index 7d5d7cf4a83ab..9c34889f03354 100644
--- a/llvm/lib/Target/X86/X86InstrCMovSetCC.td
+++ b/llvm/lib/Target/X86/X86InstrCMovSetCC.td
@@ -106,6 +106,211 @@ let Predicates = [HasCMOV, HasNDD] in {
def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, timm:$cond, EFLAGS),
(CMOV64rm_ND GR64:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>;
}
+
+// Create pseudo instructions and pattern-match to them. A later machine pass
+// lowers these pseudos into CMOV; keeping them as pseudos prevents backend
+// optimizations from rewriting the constant-time selection.
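+// For example, a CTSELECT32rr matched with a NE condition ends up as a single
+// "cmovne" (illustrative; the lowering pass chooses the concrete CMOV form).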
+let Uses = [EFLAGS], isNotDuplicable = 1, isPseudo = 1 in {
+
+ multiclass CTSELECT<X86TypeInfo t> {
+ // register-only
+ let isCommutable = 0, SchedRW = [WriteCMOV], Predicates = [HasNativeCMOV],
+ AsmString = "ctselect\\t$dst, $src1, $src2, $cond" in {
+ def rr : PseudoI<(outs t.RegClass:$dst),
+ (ins t.RegClass:$src1, t.RegClass:$src2, i8imm:$cond),
+ [(set t.RegClass:$dst, (X86ctselect t.RegClass:$src1, t.RegClass:$src2, timm:$cond, EFLAGS))]>;
+ }
+
+ // register-memory
+ let SchedRW = [WriteCMOV.Folded, WriteCMOV.ReadAfterFold], Predicates = [HasNativeCMOV],
+ AsmString = "ctselect\\t$dst, $src1, $src2, $cond" in {
+ def rm : PseudoI<(outs t.RegClass:$dst),
+ (ins t.RegClass:$src1, t.MemOperand:$src2, i8imm:$cond),
+ [(set t.RegClass:$dst, (X86ctselect t.RegClass:$src1, (t.LoadNode addr:$src2), timm:$cond, EFLAGS))]>;
+ }
+ }
+}
+
+let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in {
+ let Constraints = "$dst = $src1" in {
+ defm CTSELECT16 : CTSELECT<Xi16>;
+ defm CTSELECT32 : CTSELECT<Xi32>;
+ defm CTSELECT64 : CTSELECT<Xi64>;
+ }
+}
+
+// CTSELECT_VEC base class
+class CTSELECT_VEC<RegisterClass VRc, RegisterClass GRc>
+ : PseudoI<
+ (outs VRc:$dst, VRc:$tmpx, GRc:$tmpg),
+ (ins VRc:$t, VRc:$f, i8imm:$cond),
+ []
+ > {
+ let Uses = [EFLAGS];
+ let isPseudo = 1;
+ let isNotDuplicable = 1;
+ let hasSideEffects = 1;
+ let AsmString = "ctselect\t$dst, $f, $t, $cond";
+ let SchedRW = [];
+}
+
+// Width-specific class aliases
+class CTSELECT_VEC128 : CTSELECT_VEC<VR128, GR32>;
+class CTSELECT_VEC128X : CTSELECT_VEC<VR128X, GR32>;
+class CTSELECT_VEC256 : CTSELECT_VEC<VR256, GR32>;
+class CTSELECT_VEC512 : CTSELECT_VEC<VR512, GR32>;
+
+
+//===----------------------------------------------------------------------===//
+// 128-bit pseudos (SSE2 baseline; we use PXOR/PAND/MOVD/PSHUFD in the expander)
+//===----------------------------------------------------------------------===//
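+//
+// Illustrative shape of the expansion for one 128-bit ctselect (exact opcodes
+// are picked per subtarget, see getCtSelectInstructions in X86InstrInfo.cpp):
+//   movd   %mask_gpr, %xmm_tmp     ; 32-bit all-ones/all-zeros mask into XMM
+//   pshufd $0, %xmm_tmp, %xmm_tmp  ; broadcast the mask across the lanes
+//   pand / pandn / por             ; blend $t and $f without a branch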
+
+let Predicates = [HasSSE1] in {
+
+ def CTSELECT_V4F32 : CTSELECT_VEC128 {
+ let Constraints = "@earlyclobber $dst, at earlyclobber $tmpx, at earlyclobber $tmpg";
+ }
+}
+
+let Predicates = [HasSSE2] in {
+
+ def CTSELECT_V2F64 : CTSELECT_VEC128 {
+ let Constraints = "@earlyclobber $dst, at earlyclobber $tmpx, at earlyclobber $tmpg";
+ }
+ def CTSELECT_V4I32 : CTSELECT_VEC128 {
+ let Constraints = "@earlyclobber $dst, at earlyclobber $tmpx, at earlyclobber $tmpg";
+ }
+ def CTSELECT_V2I64 : CTSELECT_VEC128 {
+ let Constraints = "@earlyclobber $dst, at earlyclobber $tmpx, at earlyclobber $tmpg";
+ }
+ def CTSELECT_V8I16 : CTSELECT_VEC128 {
+ let Constraints = "@earlyclobber $dst, at earlyclobber $tmpx, at earlyclobber $tmpg";
+ }
+ def CTSELECT_V16I8 : CTSELECT_VEC128 {
+ let Constraints = "@earlyclobber $dst, at earlyclobber $tmpx, at earlyclobber $tmpg";
+ }
+
+ // If your build has v8f16, keep this; otherwise comment it out.
+ def CTSELECT_V8F16 : CTSELECT_VEC128 {
+ let Constraints = "@earlyclobber $dst, at earlyclobber $tmpx, at earlyclobber $tmpg";
+ }
+}
+
+let Predicates = [HasAVX] in {
+
+ def CTSELECT_V4F32X : CTSELECT_VEC128X {
+ let Constraints = "@earlyclobber $dst, at earlyclobber $tmpx, at earlyclobber $tmpg";
+ }
+ def CTSELECT_V2F64X : CTSELECT_VEC128X {
+ let Constraints = "@earlyclobber $dst, at earlyclobber $tmpx, at earlyclobber $tmpg";
+ }
+ def CTSELECT_V4I32X : CTSELECT_VEC128X {
+ let Constraints = "@earlyclobber $dst, at earlyclobber $tmpx, at earlyclobber $tmpg";
+ }
+ def CTSELECT_V2I64X : CTSELECT_VEC128X {
+ let Constraints = "@earlyclobber $dst, at earlyclobber $tmpx, at earlyclobber $tmpg";
+ }
+ def CTSELECT_V8I16X : CTSELECT_VEC128X {
+ let Constraints = "@earlyclobber $dst, at earlyclobber $tmpx, at earlyclobber $tmpg";
+ }
+ def CTSELECT_V16I8X : CTSELECT_VEC128X {
+ let Constraints = "@earlyclobber $dst, at earlyclobber $tmpx, at earlyclobber $tmpg";
+ }
+
+ // If your build has v8f16, keep this; otherwise comment it out.
+ def CTSELECT_V8F16X : CTSELECT_VEC128X {
+ let Constraints = "@earlyclobber $dst, at earlyclobber $tmpx, at earlyclobber $tmpg";
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// 256-bit pseudos
+//===----------------------------------------------------------------------===//
+let Predicates = [HasAVX] in {
+
+ def CTSELECT_V8F32 : CTSELECT_VEC256 {
+ let Constraints = "@earlyclobber $dst, at earlyclobber $tmpx, at earlyclobber $tmpg";
+ }
+ def CTSELECT_V4F64 : CTSELECT_VEC256 {
+ let Constraints = "@earlyclobber $dst, at earlyclobber $tmpx, at earlyclobber $tmpg";
+ }
+ def CTSELECT_V8I32 : CTSELECT_VEC256 {
+ let Constraints = "@earlyclobber $dst, at earlyclobber $tmpx, at earlyclobber $tmpg";
+ }
+ def CTSELECT_V4I64 : CTSELECT_VEC256 {
+ let Constraints = "@earlyclobber $dst, at earlyclobber $tmpx, at earlyclobber $tmpg";
+ }
+ def CTSELECT_V16I16 : CTSELECT_VEC256 {
+ let Constraints = "@earlyclobber $dst, at earlyclobber $tmpx, at earlyclobber $tmpg";
+ }
+ def CTSELECT_V32I8 : CTSELECT_VEC256 {
+ let Constraints = "@earlyclobber $dst, at earlyclobber $tmpx, at earlyclobber $tmpg";
+ }
+
+ // If your build has v16f16, keep this; otherwise comment it out.
+ def CTSELECT_V16F16 : CTSELECT_VEC256 {
+ let Constraints = "@earlyclobber $dst, at earlyclobber $tmpx, at earlyclobber $tmpg";
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Selection patterns: X86ctselect(...), EFLAGS -> CTSELECT_V*
+//
+// NOTE:
+// * The SDNode carries Glue from CMP/TEST (due to SDNPInGlue).
+// * We list EFLAGS explicitly in the pattern (X86 style) to model the arch read.
+// * Temps ($tmpx, $tmpg) are not in the pattern; they're outs allocated by RA.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasSSE1] in {
+
+ // 128-bit float (bitwise-equivalent ops in expander)
+ def : Pat<(v4f32 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
+ (CTSELECT_V4F32 VR128:$t, VR128:$f, timm:$cc)>;
+}
+
+let Predicates = [HasSSE2] in {
+
+ // 128-bit integer
+ def : Pat<(v4i32 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
+ (CTSELECT_V4I32 VR128:$t, VR128:$f, timm:$cc)>;
+ def : Pat<(v2i64 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
+ (CTSELECT_V2I64 VR128:$t, VR128:$f, timm:$cc)>;
+ def : Pat<(v8i16 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
+ (CTSELECT_V8I16 VR128:$t, VR128:$f, timm:$cc)>;
+ def : Pat<(v16i8 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
+ (CTSELECT_V16I8 VR128:$t, VR128:$f, timm:$cc)>;
+ def : Pat<(v2f64 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
+ (CTSELECT_V2F64 VR128:$t, VR128:$f, timm:$cc)>;
+
+ // 128-bit f16 (optional)
+ def : Pat<(v8f16 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
+ (CTSELECT_V8F16 VR128:$t, VR128:$f, timm:$cc)>;
+}
+
+let Predicates = [HasAVX] in {
+
+ // 256-bit integer
+ def : Pat<(v8i32 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
+ (CTSELECT_V8I32 VR256:$t, VR256:$f, timm:$cc)>;
+ def : Pat<(v4i64 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
+ (CTSELECT_V4I64 VR256:$t, VR256:$f, timm:$cc)>;
+ def : Pat<(v16i16 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
+ (CTSELECT_V16I16 VR256:$t, VR256:$f, timm:$cc)>;
+ def : Pat<(v32i8 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
+ (CTSELECT_V32I8 VR256:$t, VR256:$f, timm:$cc)>;
+
+ // 256-bit float (bitwise-equivalent ops in expander)
+ def : Pat<(v8f32 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
+ (CTSELECT_V8F32 VR256:$t, VR256:$f, timm:$cc)>;
+ def : Pat<(v4f64 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
+ (CTSELECT_V4F64 VR256:$t, VR256:$f, timm:$cc)>;
+
+ // 256-bit f16 (optional)
+ def : Pat<(v16f16 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
+ (CTSELECT_V16F16 VR256:$t, VR256:$f, timm:$cc)>;
+}
+
let Predicates = [HasCMOV, HasCF] in {
def : Pat<(X86cmov GR16:$src1, 0, timm:$cond, EFLAGS),
(CFCMOV16rr GR16:$src1, (inv_cond_XFORM timm:$cond))>;
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index ec31675731b79..d40c91b52c808 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -693,6 +693,87 @@ def : Pat<(v16f32 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
def : Pat<(v8f64 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
(CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>;
+// CTSELECT
+// Enhanced CTSELECT pseudos for i386 with temporary register allocation
+// These use a two-phase approach:
+// 1. Custom inserter materializes condition byte from EFLAGS
+// 2. Post-RA expansion generates constant-time instruction bundles
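+// For example, CTSELECT_I386_GR32rr (phase 1) is rewritten by the custom
+// inserter into SETCC + CTSELECT_I386_INT_GR32rr (phase 2), which the post-RA
+// expander then turns into a constant-time bitwise selection bundle.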
+
+let isPseudo = 1, isNotDuplicable = 1 in {
+ // Phase 1: Initial pseudos that consume EFLAGS (via custom inserter)
+ // These are matched by patterns and convert EFLAGS to condition byte
+ class CTSELECT_I386_INITIAL<RegisterClass RC, ValueType VT>
+ : PseudoI<(outs RC:$dst),
+ (ins RC:$src1, RC:$src2, i8imm:$cond),
+ [(set RC:$dst, (VT(X86ctselect RC:$src1, RC:$src2, timm:$cond,
+ EFLAGS)))]> {
+ let Uses = [EFLAGS];
+ let Defs = [EFLAGS];
+ let usesCustomInserter = 1;
+ let hasNoSchedulingInfo = 1;
+ }
+
+ // Phase 2: Internal pseudos with pre-materialized condition byte (post-RA expansion)
+ // These generate the actual constant-time instruction bundles
+ class CTSELECT_I386_INTERNAL<RegisterClass RC, RegisterClass ByteRC>
+ : PseudoI<(outs RC:$dst, ByteRC:$tmp_byte, RC:$tmp_mask),
+ (ins RC:$src1, RC:$src2, ByteRC:$cond_byte), []> {
+ let hasNoSchedulingInfo = 1;
+ let Constraints = "@earlyclobber $dst, at earlyclobber $tmp_byte, at earlyclobber $tmp_mask";
+ let Defs = [EFLAGS]; // NEG instruction in post-RA expansion clobbers EFLAGS
+ }
+}
+
+// Phase 1 pseudos for non-CMOV targets (custom inserter materializes condition)
+let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in {
+ let Predicates = [NoNativeCMOV] in {
+ def CTSELECT_I386_GR8rr : CTSELECT_I386_INITIAL<GR8, i8>;
+ def CTSELECT_I386_GR16rr : CTSELECT_I386_INITIAL<GR16, i16>;
+ def CTSELECT_I386_GR32rr : CTSELECT_I386_INITIAL<GR32, i32>;
+ }
+}
+
+// Phase 2 pseudos (post-RA expansion with pre-materialized condition byte)
+let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in {
+ let Predicates = [NoNativeCMOV] in {
+ def CTSELECT_I386_INT_GR8rr :
+ CTSELECT_I386_INTERNAL<GR8, GR8>;
+ def CTSELECT_I386_INT_GR16rr :
+ CTSELECT_I386_INTERNAL<GR16, GR8>;
+ def CTSELECT_I386_INT_GR32rr :
+ CTSELECT_I386_INTERNAL<GR32, GR8>;
+ }
+}
+
+let hasSideEffects = 1,
+ ForceDisassemble = 1,
+ Constraints = "$dst = $src1" in {
+
+ let Predicates = [FPStackf32] in
+ def CTSELECT_I386_FP32rr : CTSELECT_I386_INITIAL<RFP32, f32>;
+
+ let Predicates = [FPStackf64] in
+ def CTSELECT_I386_FP64rr : CTSELECT_I386_INITIAL<RFP64, f64>;
+
+ def CTSELECT_I386_FP80rr : CTSELECT_I386_INITIAL<RFP80, f80>;
+}
+
+// Pattern matching for non-native-CMOV CTSELECT (routes to the custom inserter
+// for condition materialization). NoNativeCMOV ensures these patterns are used
+// when a real CMOV instruction is not available, even if canUseCMOV() is true
+// (e.g., i386 with SSE, which can emulate CMOV).
+let Predicates = [NoNativeCMOV] in {
+ def : Pat<(i8(X86ctselect GR8:$src1, GR8:$src2, timm:$cond, EFLAGS)),
+ (CTSELECT_I386_GR8rr GR8:$src1, GR8:$src2, timm:$cond)>;
+
+ def : Pat<(i16(X86ctselect GR16:$src1, GR16:$src2, timm:$cond, EFLAGS)),
+ (CTSELECT_I386_GR16rr GR16:$src1, GR16:$src2, timm:$cond)>;
+
+ def : Pat<(i32(X86ctselect GR32:$src1, GR32:$src2, timm:$cond, EFLAGS)),
+ (CTSELECT_I386_GR32rr GR32:$src1, GR32:$src2, timm:$cond)>;
+
+ // i64 patterns handled automatically by type legalization
+}
+
//===----------------------------------------------------------------------===//
// Normal-Instructions-With-Lock-Prefix Pseudo Instructions
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86InstrFragments.td b/llvm/lib/Target/X86/X86InstrFragments.td
index 116986a0fffea..4c9e5bae3b46c 100644
--- a/llvm/lib/Target/X86/X86InstrFragments.td
+++ b/llvm/lib/Target/X86/X86InstrFragments.td
@@ -28,6 +28,10 @@ def SDTX86Cmov : SDTypeProfile<1, 4,
[SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
+def SDTX86CtSelect : SDTypeProfile<1, 4,
+ [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
+ SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
+
// Unary and binary operator instructions that set EFLAGS as a side-effect.
def SDTUnaryArithWithFlags : SDTypeProfile<2, 1,
[SDTCisSameAs<0, 2>,
@@ -151,6 +155,7 @@ def X86ctest : SDNode<"X86ISD::CTEST", SDTX86Ccmp>;
def X86cload : SDNode<"X86ISD::CLOAD", SDTX86Cload, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def X86cstore : SDNode<"X86ISD::CSTORE", SDTX86Cstore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def X86ctselect: SDNode<"X86ISD::CTSELECT", SDTX86CtSelect, [SDNPInGlue]>;
def X86cmov : SDNode<"X86ISD::CMOV", SDTX86Cmov>;
def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond,
[SDNPHasChain]>;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 6b2a7a4ec3583..765db86ffafb3 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -475,6 +475,556 @@ bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op,
return false;
}
+struct CtSelectInstructions {
+ unsigned PAndOpc;
+ unsigned PAndnOpc;
+ unsigned POrOpc;
+ unsigned BroadcastOpc;
+ unsigned IntMoveOpc;
+ unsigned MoveOpc;
+ bool Use256;
+ bool UseBlendInstr;
+};
+
+static CtSelectInstructions
+getCtSelectInstructions(unsigned Opcode, const X86Subtarget &Subtarget) {
+ CtSelectInstructions Instructions = {};
+
+ switch (Opcode) {
+ case X86::CTSELECT_V2F64:
+ if (Subtarget.hasSSE2()) {
+ Instructions.PAndOpc = X86::PANDrr;
+ Instructions.PAndnOpc = X86::PANDNrr;
+ Instructions.POrOpc = X86::PORrr;
+ Instructions.BroadcastOpc = X86::PSHUFDri;
+ Instructions.IntMoveOpc = X86::MOVDI2PDIrr;
+ Instructions.MoveOpc = X86::MOVAPDrr;
+ Instructions.UseBlendInstr = true;
+ } else {
+ llvm_unreachable("Double precision vectors require SSE2");
+ }
+ break;
+ case X86::CTSELECT_V4F32:
+ if (Subtarget.hasSSE41()) {
+ Instructions.PAndOpc = X86::PANDrr;
+ Instructions.PAndnOpc = X86::PANDNrr;
+ Instructions.POrOpc = X86::PORrr;
+ Instructions.BroadcastOpc = X86::PSHUFDri;
+ Instructions.IntMoveOpc = X86::MOVDI2PDIrr;
+ Instructions.MoveOpc = X86::MOVAPSrr;
+ Instructions.UseBlendInstr = true;
+ } else if (Subtarget.hasSSE2()) {
+ Instructions.PAndOpc = X86::PANDrr;
+ Instructions.PAndnOpc = X86::PANDNrr;
+ Instructions.POrOpc = X86::PORrr;
+ Instructions.BroadcastOpc = X86::PSHUFDri;
+ Instructions.IntMoveOpc = X86::MOVDI2PDIrr;
+ Instructions.MoveOpc = X86::MOVAPSrr;
+ } else {
+      // Fall back to SSE1, which only supports four 32-bit single-precision
+      // floating-point values.
+ Instructions.PAndOpc = X86::ANDPSrr;
+ Instructions.PAndnOpc = X86::ANDNPSrr;
+ Instructions.POrOpc = X86::ORPSrr;
+ Instructions.BroadcastOpc = X86::SHUFPSrri;
+ Instructions.IntMoveOpc = X86::MOVSS2DIrr;
+ Instructions.MoveOpc = X86::MOVAPSrr;
+ }
+ break;
+ case X86::CTSELECT_V4I32:
+ case X86::CTSELECT_V2I64:
+ case X86::CTSELECT_V8I16:
+ case X86::CTSELECT_V16I8:
+ if (Subtarget.hasSSE2()) {
+ Instructions.PAndOpc = X86::PANDrr;
+ Instructions.PAndnOpc = X86::PANDNrr;
+ Instructions.POrOpc = X86::PORrr;
+ Instructions.BroadcastOpc = X86::PSHUFDri;
+ Instructions.IntMoveOpc = X86::MOVDI2PDIrr;
+ Instructions.MoveOpc = X86::MOVDQArr;
+ } else {
+ llvm_unreachable("Integer vector operations require SSE2");
+ }
+ break;
+ case X86::CTSELECT_V8F16:
+ if (Subtarget.hasSSE2()) {
+ Instructions.PAndOpc = X86::PANDrr;
+ Instructions.PAndnOpc = X86::PANDNrr;
+ Instructions.POrOpc = X86::PORrr;
+ Instructions.BroadcastOpc = X86::PSHUFDri;
+ Instructions.IntMoveOpc = X86::MOVDI2PDIrr;
+ Instructions.MoveOpc = X86::MOVDQArr;
+ } else {
+ llvm_unreachable("FP16 vector operations require SSE2");
+ }
+ break;
+ case X86::CTSELECT_V4F32X:
+ case X86::CTSELECT_V4I32X:
+ case X86::CTSELECT_V2F64X:
+ case X86::CTSELECT_V2I64X:
+ case X86::CTSELECT_V8I16X:
+ case X86::CTSELECT_V16I8X:
+ case X86::CTSELECT_V8F16X:
+ if (Subtarget.hasAVX()) {
+ Instructions.PAndOpc = X86::VPANDrr;
+ Instructions.PAndnOpc = X86::VPANDNrr;
+ Instructions.POrOpc = X86::VPORrr;
+ Instructions.BroadcastOpc = X86::VPSHUFDri;
+ Instructions.IntMoveOpc = X86::VMOVDI2PDIrr;
+ Instructions.MoveOpc = (Opcode == X86::CTSELECT_V4F32X) ? X86::VMOVAPSrr
+ : (Opcode == X86::CTSELECT_V2F64X)
+ ? X86::VMOVAPDrr
+ : X86::VMOVDQArr;
+ } else {
+ llvm_unreachable("AVX variants require AVX support");
+ }
+ break;
+ case X86::CTSELECT_V8F32:
+ case X86::CTSELECT_V8I32:
+ if (Subtarget.hasAVX()) {
+ Instructions.PAndOpc = X86::VPANDYrr;
+ Instructions.PAndnOpc = X86::VPANDNYrr;
+ Instructions.POrOpc = X86::VPORYrr;
+ Instructions.BroadcastOpc = X86::VPERMILPSYri;
+ Instructions.IntMoveOpc = X86::VMOVDI2PDIrr;
+ Instructions.MoveOpc =
+ (Opcode == X86::CTSELECT_V8F32) ? X86::VMOVAPSYrr : X86::VMOVDQAYrr;
+ Instructions.Use256 = true;
+ } else {
+ llvm_unreachable("256-bit vectors require AVX");
+ }
+ break;
+ case X86::CTSELECT_V4F64:
+ case X86::CTSELECT_V4I64:
+ if (Subtarget.hasAVX()) {
+ Instructions.PAndOpc = X86::VPANDYrr;
+ Instructions.PAndnOpc = X86::VPANDNYrr;
+ Instructions.POrOpc = X86::VPORYrr;
+ Instructions.BroadcastOpc = X86::VPERMILPDYri;
+ Instructions.IntMoveOpc = X86::VMOVDI2PDIrr;
+ Instructions.MoveOpc =
+ (Opcode == X86::CTSELECT_V4F64) ? X86::VMOVAPDYrr : X86::VMOVDQAYrr;
+ Instructions.Use256 = true;
+ } else {
+ llvm_unreachable("256-bit vectors require AVX");
+ }
+ break;
+ case X86::CTSELECT_V16I16:
+ case X86::CTSELECT_V32I8:
+ case X86::CTSELECT_V16F16:
+ if (Subtarget.hasAVX2()) {
+ Instructions.PAndOpc = X86::VPANDYrr;
+ Instructions.PAndnOpc = X86::VPANDNYrr;
+ Instructions.POrOpc = X86::VPORYrr;
+ Instructions.BroadcastOpc = X86::VPERMILPSYri;
+ Instructions.IntMoveOpc = X86::VMOVDI2PDIrr;
+ Instructions.MoveOpc = X86::VMOVDQAYrr;
+ Instructions.Use256 = true;
+ } else if (Subtarget.hasAVX()) {
+ Instructions.PAndOpc = X86::VPANDYrr;
+ Instructions.PAndnOpc = X86::VPANDNYrr;
+ Instructions.POrOpc = X86::VPORYrr;
+ Instructions.BroadcastOpc = X86::VPERMILPSYri;
+ Instructions.IntMoveOpc = X86::VMOVDI2PDIrr;
+ Instructions.MoveOpc = X86::VMOVDQAYrr;
+ Instructions.Use256 = true;
+ } else {
+ llvm_unreachable("256-bit integer vectors require AVX");
+ }
+ break;
+ default:
+ llvm_unreachable("Unexpected CTSELECT opcode");
+ }
+
+ return Instructions;
+}
+
+bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const {
+ unsigned Opcode = MI.getOpcode();
+ const DebugLoc &DL = MI.getDebugLoc();
+ auto Instruction = getCtSelectInstructions(Opcode, Subtarget);
+
+ MachineBasicBlock *MBB = MI.getParent();
+
+ // Operand layout matches the TableGen definition:
+ // (outs VR128:$dst, VR128:$tmpx, GR32:$tmpg),
+ // (ins VR128:$t, VR128:$f, i8imm:$cond)
+ Register Dst = MI.getOperand(0).getReg();
+ Register MaskReg = MI.getOperand(1).getReg(); // vector mask temp
+ Register TmpGPR = MI.getOperand(2).getReg(); // scalar mask temp (GPR32)
+ Register FalseVal = MI.getOperand(3).getReg(); // true_value
+ Register TrueVal = MI.getOperand(4).getReg(); // false_value
+ X86::CondCode CC = X86::CondCode(MI.getOperand(5).getImm()); // condition
+
+ MachineInstr *FirstInstr = nullptr;
+ MachineInstr *LastInstr = nullptr;
+ auto recordInstr = [&](MachineInstrBuilder MIB) {
+ MachineInstr *NewMI = MIB.getInstr();
+ LastInstr = NewMI;
+ if (!FirstInstr)
+ FirstInstr = NewMI;
+ };
+
+  // Create the scalar mask in TmpGPR, then broadcast it into the vector mask.
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::MOV32ri), TmpGPR)
+ .addImm(0)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ auto SubReg = TRI->getSubReg(TmpGPR, X86::sub_8bit);
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::SETCCr))
+ .addReg(SubReg)
+ .addImm(CC)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ // Zero-extend byte to 32-bit register (movzbl %al, %eax)
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::MOVZX32rr8), TmpGPR)
+ .addReg(SubReg)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ if (Instruction.UseBlendInstr && Subtarget.hasSSE41()) {
+ // Shift left 31 bits to convert 1 -> 0x80000000, 0 -> 0x00000000 (shll $31,
+ // %eax)
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::SHL32ri), TmpGPR)
+ .addReg(TmpGPR)
+ .addImm(31));
+ } else {
+ // Negate to convert 1 -> 0xFFFFFFFF, 0 -> 0x00000000 (negl %eax)
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::NEG32r), TmpGPR)
+ .addReg(TmpGPR));
+ }
+
+ // Broadcast to TmpX (vector mask)
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::PXORrr), MaskReg)
+ .addReg(MaskReg)
+ .addReg(MaskReg)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ // Move scalar mask to vector register
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.IntMoveOpc), MaskReg)
+ .addReg(TmpGPR)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ if (Instruction.Use256) {
+ // Broadcast to 256-bit vector register
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.BroadcastOpc), MaskReg)
+ .addReg(MaskReg)
+ .addImm(0)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+ } else {
+ if (Subtarget.hasSSE2() || Subtarget.hasAVX()) {
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.BroadcastOpc), MaskReg)
+ .addReg(MaskReg)
+ .addImm(0x00)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+ } else {
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.BroadcastOpc), MaskReg)
+ .addReg(MaskReg)
+ .addReg(MaskReg)
+ .addImm(0x00)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+ }
+ }
+
+ if (Instruction.UseBlendInstr && Subtarget.hasSSE41()) {
+ // Use dedicated blend instructions for SSE4.1+
+ unsigned BlendOpc;
+ switch (Opcode) {
+ case X86::CTSELECT_V4F32:
+ BlendOpc = X86::BLENDVPSrr0;
+ break;
+ case X86::CTSELECT_V2F64:
+ BlendOpc = X86::BLENDVPDrr0;
+ break;
+ default:
+ // alias for pblendvb that takes xmm0 as implicit mask register
+ BlendOpc = X86::PBLENDVBrr0;
+ break;
+ }
+
+    // If XMM0 is one of the source registers, save its value in Dst and
+    // update FalseVal/TrueVal to refer to Dst instead.
+ bool DidSaveXMM0 = false;
+ Register SavedXMM0 = X86::XMM0;
+ if (FalseVal == X86::XMM0 || TrueVal == X86::XMM0) {
+ Register SrcXMM0 = (FalseVal == X86::XMM0) ? FalseVal : TrueVal;
+
+      // If XMM0 is one of the source registers it will not be the same as
+      // Dst, so move its value into Dst first.
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+ .addReg(SrcXMM0)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ // update FalseVal and TrueVal to Dst register
+ if (FalseVal == X86::XMM0)
+ FalseVal = Dst;
+ if (TrueVal == X86::XMM0)
+ TrueVal = Dst;
+
+ // update SavedXMM0 to Dst register
+ SavedXMM0 = Dst;
+
+ // set DidSaveXMM0 to true to indicate that we saved XMM0 into Dst
+ // register
+ DidSaveXMM0 = true;
+ } else if (MaskReg != X86::XMM0 && Dst != X86::XMM0) {
+
+    // If XMM0 was not allocated to any of these registers, we still need to
+    // save it and restore it after using it as the mask register.
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+ .addReg(X86::XMM0)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+ SavedXMM0 = Dst;
+ DidSaveXMM0 = true;
+ }
+
+ if (MaskReg != X86::XMM0) {
+ // BLENDV uses XMM0 as implicit mask register
+ // https://www.felixcloutier.com/x86/pblendvb
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), X86::XMM0)
+ .addReg(MaskReg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge));
+
+ // move FalseVal to mask (use MaskReg as the dst of the blend)
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::MOVAPSrr), MaskReg)
+ .addReg(FalseVal)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ // MaskReg := blend(MaskReg /*false*/, TrueVal /*true*/) ; mask in
+ // xmm0
+ recordInstr(BuildMI(*MBB, MI, DL, get(BlendOpc), MaskReg)
+ .addReg(MaskReg)
+ .addReg(TrueVal)
+ .addReg(X86::XMM0)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ // restore XMM0 from SavedXMM0 if we saved it into Dst
+ if (DidSaveXMM0) {
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), X86::XMM0)
+ .addReg(SavedXMM0)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+ }
+ // dst = result (now in MaskReg)
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+ .addReg(MaskReg)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+ } else {
+ // move FalseVal to Dst register since MaskReg is XMM0 and Dst is not
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+ .addReg(FalseVal)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ // Dst := blend(Dst /*false*/, TrueVal /*true*/) ; mask in
+ // xmm0
+ recordInstr(BuildMI(*MBB, MI, DL, get(BlendOpc), Dst)
+ .addReg(Dst)
+ .addReg(TrueVal)
+ .addReg(X86::XMM0)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+ }
+ } else {
+
+ // dst = mask
+ recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+ .addReg(MaskReg)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ // mask &= true_val
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::PANDrr), MaskReg)
+ .addReg(MaskReg)
+ .addReg(TrueVal)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ // dst = ~mask & false_val
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::PANDNrr), Dst)
+ .addReg(Dst)
+ .addReg(FalseVal)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+ // dst |= mask; (mask & t) | (~mask & f)
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::PORrr), Dst)
+ .addReg(Dst)
+ .addReg(MaskReg)
+ .setMIFlags(MachineInstr::MIFlag::NoMerge));
+ }
+
+ assert(FirstInstr && LastInstr && "Expected at least one expanded instruction");
+ auto BundleEnd = LastInstr->getIterator();
+ finalizeBundle(*MBB, FirstInstr->getIterator(), std::next(BundleEnd));
+
+ MI.eraseFromParent();
+
+ return true;
+}
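
For reference, the value computed by the SSE2 AND/ANDN/OR path above can be written as a short C++ sketch using SSE2 intrinsics; the helper name and the already-reduced 0/1 condition argument are illustrative assumptions, not code from this patch.

#include <emmintrin.h>
#include <cstdint>

// Editorial sketch: turn the scalar condition into an all-ones/all-zeros lane
// mask, then compute (mask & t) | (~mask & f), mirroring the SETCC/NEG/broadcast
// plus PAND/PANDN/POR bundle emitted by expandCtSelectVector.
__m128i ct_select_v4i32_sketch(uint8_t cond, __m128i t, __m128i f) {
  __m128i mask = _mm_set1_epi32(-static_cast<int32_t>(cond & 1)); // 1 -> 0xFFFFFFFF, 0 -> 0
  __m128i t_part = _mm_and_si128(mask, t);    // PAND:  mask & true_val
  __m128i f_part = _mm_andnot_si128(mask, f); // PANDN: ~mask & false_val
  return _mm_or_si128(t_part, f_part);        // POR:   combine both halves
}
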
+
+bool X86InstrInfo::expandCtSelectWithCMOV(MachineInstr &MI) const {
+ MachineBasicBlock *MBB = MI.getParent();
+ DebugLoc DL = MI.getDebugLoc();
+
+ // CTSELECT pseudo has: (outs dst), (ins true_val, false_val, cond)
+ MachineOperand &OperandRes = MI.getOperand(0); // destination register
+ MachineOperand &OperandTrue = MI.getOperand(1); // true value
+ MachineOperand &OperandCond = MI.getOperand(3); // condition code
+
+ assert(OperandTrue.isReg() && OperandRes.isReg() && OperandCond.isImm() &&
+ "Invalid operand types");
+ assert(OperandTrue.getReg() == OperandRes.getReg() &&
+ "Result register different from True register");
+
+ assert(Subtarget.hasCMOV() && "target does not support CMOV instructions");
+
+ unsigned Opcode = 0;
+
+ switch (MI.getOpcode()) {
+ case X86::CTSELECT16rr:
+ Opcode = X86::CMOV16rr;
+ break;
+ case X86::CTSELECT32rr:
+ Opcode = X86::CMOV32rr;
+ break;
+ case X86::CTSELECT64rr:
+ Opcode = X86::CMOV64rr;
+ break;
+ case X86::CTSELECT16rm:
+ Opcode = X86::CMOV16rm;
+ break;
+ case X86::CTSELECT32rm:
+ Opcode = X86::CMOV32rm;
+ break;
+ case X86::CTSELECT64rm:
+ Opcode = X86::CMOV64rm;
+ break;
+ default:
+ llvm_unreachable("Invalid CTSELECT opcode");
+ }
+
+ if (!Subtarget.hasCMOV()) {
+ llvm_unreachable("target does not support cmov");
+ }
+
+  // Build the CMOV instruction by copying all operands from the pseudo
+  // (dst, true, false / memory operands, condition code).
+ MachineInstrBuilder CmovBuilder = BuildMI(*MBB, MI, DL, get(Opcode));
+ for (unsigned i = 0u; i < MI.getNumOperands(); ++i) { // Copy
+ CmovBuilder.add(MI.getOperand(i));
+ }
+
+ // Remove the original CTSELECT instruction
+ MI.eraseFromParent();
+ return true;
+}
+
+/// Expand i386-specific CTSELECT pseudo instructions (post-RA, constant-time)
+/// These internal pseudos receive a pre-materialized condition byte from the
+/// custom inserter, avoiding EFLAGS corruption issues during i64 type legalization.
+bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const {
+ MachineBasicBlock *MBB = MI.getParent();
+ DebugLoc DL = MI.getDebugLoc();
+
+ // CTSELECT_I386_INT_GRxxrr has operands: (outs dst, tmp_byte, tmp_mask),
+ // (ins src1, src2, cond_byte)
+ // Note: cond_byte is pre-materialized by custom inserter, not EFLAGS-dependent
+ Register DstReg = MI.getOperand(0).getReg();
+ Register TmpByteReg = MI.getOperand(1).getReg();
+ Register TmpMaskReg = MI.getOperand(2).getReg();
+ Register Src1Reg = MI.getOperand(3).getReg();
+ Register Src2Reg = MI.getOperand(4).getReg();
+ Register CondByteReg = MI.getOperand(5).getReg(); // Pre-materialized condition byte
+
+ // Determine instruction opcodes based on register width
+ unsigned MovZXOp, NegOp, MovOp, AndOp, NotOp, OrOp;
+ if (MI.getOpcode() == X86::CTSELECT_I386_INT_GR8rr) {
+ MovZXOp = 0; // No zero-extend needed for GR8
+ NegOp = X86::NEG8r;
+ MovOp = X86::MOV8rr;
+ AndOp = X86::AND8rr;
+ NotOp = X86::NOT8r;
+ OrOp = X86::OR8rr;
+ } else if (MI.getOpcode() == X86::CTSELECT_I386_INT_GR16rr) {
+ MovZXOp = X86::MOVZX16rr8;
+ NegOp = X86::NEG16r;
+ MovOp = X86::MOV16rr;
+ AndOp = X86::AND16rr;
+ NotOp = X86::NOT16r;
+ OrOp = X86::OR16rr;
+ } else { // X86::CTSELECT_I386_INT_GR32rr
+ MovZXOp = X86::MOVZX32rr8;
+ NegOp = X86::NEG32r;
+ MovOp = X86::MOV32rr;
+ AndOp = X86::AND32rr;
+ NotOp = X86::NOT32r;
+ OrOp = X86::OR32rr;
+ }
+
+  // Constant-time selection bundle (7 instructions for GR8, 8 otherwise; no
+  // SETCC inside): result = (true_val & mask) | (false_val & ~mask).
+  // The condition byte is already materialized, avoiding an EFLAGS dependency.
+
+ // Step 1: Copy pre-materialized condition byte to TmpByteReg
+ // This allows the bundle to work with allocated temporaries
+ auto I1 = BuildMI(*MBB, MI, DL, get(X86::MOV8rr), TmpByteReg)
+ .addReg(CondByteReg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
+ auto BundleStart = I1->getIterator();
+
+ // Step 2: Zero-extend condition byte to register width (0 or 1)
+ if (MI.getOpcode() != X86::CTSELECT_I386_INT_GR8rr) {
+ BuildMI(*MBB, MI, DL, get(MovZXOp), TmpMaskReg)
+ .addReg(TmpByteReg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
+ }
+
+ // Step 3: Convert condition to bitmask (NEG: 1 -> 0xFFFF..., 0 -> 0x0000...)
+ Register MaskReg = (MI.getOpcode() == X86::CTSELECT_I386_INT_GR8rr) ? TmpByteReg : TmpMaskReg;
+ BuildMI(*MBB, MI, DL, get(NegOp), MaskReg)
+ .addReg(MaskReg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+ // Step 4,5: Apply mask to true value - copy src1 to dest, then AND with mask
+ BuildMI(*MBB, MI, DL, get(MovOp), DstReg)
+ .addReg(Src1Reg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+ BuildMI(*MBB, MI, DL, get(AndOp), DstReg)
+ .addReg(DstReg)
+ .addReg(MaskReg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+ // Step 6: Create inverted mask inline (~mask)
+ BuildMI(*MBB, MI, DL, get(NotOp), MaskReg)
+ .addReg(MaskReg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+ // Step 7: Apply inverted mask to false value - reuse mask register directly
+ BuildMI(*MBB, MI, DL, get(AndOp), MaskReg)
+ .addReg(MaskReg)
+ .addReg(Src2Reg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+ // Step 8: Final result: (src1 & mask) | (src2 & ~mask)
+ auto LI = BuildMI(*MBB, MI, DL, get(OrOp), DstReg)
+ .addReg(DstReg)
+ .addReg(MaskReg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+ // Bundle all generated instructions for atomic execution before removing MI
+ auto BundleEnd = std::next(LI->getIterator());
+ if (BundleStart != BundleEnd) {
+ // Only bundle if we have multiple instructions
+ finalizeBundle(*MBB, BundleStart, BundleEnd);
+ }
+
+ // TODO: Optimization opportunity - The register allocator may choose callee-saved
+ // registers (e.g., %ebx, %esi) for TmpByteReg/TmpMaskReg, causing unnecessary
+ // save/restore overhead. Consider constraining these to caller-saved register
+ // classes (e.g., GR8_AL, GR32_CallSaved) in the TableGen definitions to improve
+ // constant-time performance by eliminating prologue/epilogue instructions.
+
+ // Remove the original pseudo instruction
+ MI.eraseFromParent();
+ return true;
+}
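
For reference, the scalar bundle above computes the usual branch-free select; a minimal C++ sketch for the 32-bit case follows (the helper name is an illustrative assumption, not code from this patch).

#include <cstdint>

// Editorial sketch of the MOV/MOVZX/NEG + MOV/AND + NOT/AND + OR sequence
// emitted by expandCtSelectIntWithoutCMOV for GR32.
uint32_t ct_select_u32_sketch(uint8_t cond_byte, uint32_t true_val,
                              uint32_t false_val) {
  uint32_t mask = 0u - static_cast<uint32_t>(cond_byte & 1); // NEG: 1 -> ~0u, 0 -> 0u
  uint32_t dst = true_val & mask;                            // MOV + AND
  uint32_t rest = ~mask & false_val;                         // NOT + AND (mask reused)
  return dst | rest;                                         // OR
}
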
+
static bool isFrameLoadOpcode(int Opcode, TypeSize &MemBytes) {
switch (Opcode) {
default:
@@ -6402,6 +6952,43 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case X86::ADD64ri32_DB:
MIB->setDesc(get(X86::OR64ri32));
break;
+
+ case X86::CTSELECT64rr:
+ case X86::CTSELECT32rr:
+ case X86::CTSELECT16rr:
+ case X86::CTSELECT64rm:
+ case X86::CTSELECT32rm:
+ case X86::CTSELECT16rm:
+ // These CTSELECT pseudos are only selected when CMOV is available
+ // Pattern matching ensures we use CTSELECT_I386 when CMOV is not available
+ return expandCtSelectWithCMOV(MI);
+
+ // non-cmov CTSELECT expansion (post-RA, constant-time)
+ // These are the internal pseudos with pre-materialized condition byte
+ case X86::CTSELECT_I386_INT_GR8rr:
+ case X86::CTSELECT_I386_INT_GR16rr:
+ case X86::CTSELECT_I386_INT_GR32rr:
+ return expandCtSelectIntWithoutCMOV(MI);
+
+ case X86::CTSELECT_V2F64:
+ case X86::CTSELECT_V4F32:
+ case X86::CTSELECT_V2I64:
+ case X86::CTSELECT_V4I32:
+ case X86::CTSELECT_V8I16:
+ case X86::CTSELECT_V16I8:
+ case X86::CTSELECT_V2F64X:
+ case X86::CTSELECT_V4F32X:
+ case X86::CTSELECT_V2I64X:
+ case X86::CTSELECT_V4I32X:
+ case X86::CTSELECT_V8I16X:
+ case X86::CTSELECT_V16I8X:
+ case X86::CTSELECT_V4I64:
+ case X86::CTSELECT_V8I32:
+ case X86::CTSELECT_V16I16:
+ case X86::CTSELECT_V32I8:
+ case X86::CTSELECT_V4F64:
+ case X86::CTSELECT_V8F32:
+ return expandCtSelectVector(MI);
}
return false;
}
@@ -10800,27 +11387,39 @@ void X86InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
if (!ST.hasSSE1())
return;
- BuildMI(MBB, Iter, DL, get(X86::V_SET0), Reg);
+ // PXOR is safe to use because it doesn't affect flags.
+ BuildMI(MBB, Iter, DL, get(X86::PXORrr), Reg)
+ .addReg(Reg, RegState::Undef)
+ .addReg(Reg, RegState::Undef);
} else if (X86::VR256RegClass.contains(Reg)) {
// YMM#
if (!ST.hasAVX())
return;
- BuildMI(MBB, Iter, DL, get(X86::AVX_SET0), Reg);
+ // VPXOR is safe to use because it doesn't affect flags.
+ BuildMI(MBB, Iter, DL, get(X86::VPXORrr), Reg)
+ .addReg(Reg, RegState::Undef)
+ .addReg(Reg, RegState::Undef);
} else if (X86::VR512RegClass.contains(Reg)) {
// ZMM#
if (!ST.hasAVX512())
return;
- BuildMI(MBB, Iter, DL, get(X86::AVX512_512_SET0), Reg);
+ // VPXORY is safe to use because it doesn't affect flags.
+ BuildMI(MBB, Iter, DL, get(X86::VPXORYrr), Reg)
+ .addReg(Reg, RegState::Undef)
+ .addReg(Reg, RegState::Undef);
} else if (X86::VK1RegClass.contains(Reg) || X86::VK2RegClass.contains(Reg) ||
X86::VK4RegClass.contains(Reg) || X86::VK8RegClass.contains(Reg) ||
X86::VK16RegClass.contains(Reg)) {
if (!ST.hasVLX())
return;
- unsigned Op = ST.hasBWI() ? X86::KSET0Q : X86::KSET0W;
- BuildMI(MBB, Iter, DL, get(Op), Reg);
+ // KXOR is safe to use because it doesn't affect flags.
+ unsigned Op = ST.hasBWI() ? X86::KXORQkk : X86::KXORWkk;
+ BuildMI(MBB, Iter, DL, get(Op), Reg)
+ .addReg(Reg, RegState::Undef)
+ .addReg(Reg, RegState::Undef);
}
}
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index 5f75559bd9598..ebd7e070d5fe8 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -724,6 +724,12 @@ class X86InstrInfo final : public X86GenInstrInfo {
bool isFrameOperand(const MachineInstr &MI, unsigned int Op,
int &FrameIndex) const;
+ /// Expand the CTSELECT pseudo-instructions.
+ bool expandCtSelectWithCMOV(MachineInstr &MI) const;
+ bool expandCtSelectIntWithoutCMOV(MachineInstr &MI) const;
+
+ bool expandCtSelectVector(MachineInstr &MI) const;
+
/// Returns true iff the routine could find two commutable operands in the
/// given machine instruction with 3 vector inputs.
/// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index 98104a6fad1a9..6b585a5b0b436 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -49,6 +49,11 @@ def HasZU : Predicate<"Subtarget->hasZU()">;
def HasCF : Predicate<"Subtarget->hasCF()">;
def HasCMOV : Predicate<"Subtarget->canUseCMOV()">;
def NoCMOV : Predicate<"!Subtarget->canUseCMOV()">;
+// Predicates for native CMOV instruction (checks hasCMOV(), not canUseCMOV())
+// HasCMOV may be true even without native CMOV (e.g., via SSE emulation)
+// Use HasNativeCMOV/NoNativeCMOV for constant-time code that requires actual CMOV
+def HasNativeCMOV : Predicate<"Subtarget->hasCMOV()">;
+def NoNativeCMOV : Predicate<"!Subtarget->hasCMOV()">;
def HasNOPL : Predicate<"Subtarget->hasNOPL()">;
def HasMMX : Predicate<"Subtarget->hasMMX()">;
def HasSSE1 : Predicate<"Subtarget->hasSSE1()">;
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 9a76abcd351bf..66c9d75053640 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -617,10 +617,10 @@ void X86PassConfig::addPreEmitPass2() {
// ObjC runtime functions present in the module.
const Function &F = MF.getFunction();
const Module *M = F.getParent();
- return M->getModuleFlag("kcfi") ||
+ return M->getModuleFlag("kcfi") || F.hasFnAttribute("ct-select") ||
(TT.isOSDarwin() &&
(M->getFunction("objc_retainAutoreleasedReturnValue") ||
M->getFunction("objc_unsafeClaimAutoreleasedReturnValue")));
}));
// Analyzes and emits pseudos to support Win x64 Unwind V2. This pass must run
diff --git a/llvm/test/CodeGen/X86/ctselect-edge-cases.ll b/llvm/test/CodeGen/X86/ctselect-edge-cases.ll
new file mode 100644
index 0000000000000..0797265972a1f
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ctselect-edge-cases.ll
@@ -0,0 +1,409 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=X32
+
+; Test ct.select edge cases and corner cases
+
+; Test with very large integers
+define i128 @test_ctselect_i128(i1 %cond, i128 %a, i128 %b) {
+; X64-LABEL: test_ctselect_i128:
+; X64: # %bb.0:
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovneq %rsi, %rax
+; X64-NEXT: cmovneq %rdx, %r8
+; X64-NEXT: movq %r8, %rdx
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_i128:
+; X32: # %bb.0:
+; X32-NEXT: pushl %edi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: pushl %eax
+; X32-NEXT: .cfi_def_cfa_offset 16
+; X32-NEXT: .cfi_offset %esi, -12
+; X32-NEXT: .cfi_offset %edi, -8
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %esi
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %edi
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %edx
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %ecx, 12(%eax)
+; X32-NEXT: movl %edx, 8(%eax)
+; X32-NEXT: movl %edi, 4(%eax)
+; X32-NEXT: movl %esi, (%eax)
+; X32-NEXT: addl $4, %esp
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: popl %edi
+; X32-NEXT: .cfi_def_cfa_offset 4
+; X32-NEXT: retl $4
+ %result = call i128 @llvm.ct.select.i128(i1 %cond, i128 %a, i128 %b)
+ ret i128 %result
+}
+
+; Test with small integer types
+define i1 @test_ctselect_i1(i1 %cond, i1 %a, i1 %b) {
+; X64-LABEL: test_ctselect_i1:
+; X64: # %bb.0:
+; X64-NEXT: movl %edx, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %esi, %eax
+; X64-NEXT: # kill: def $al killed $al killed $eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_i1:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; X32-NEXT: # kill: def $al killed $al killed $eax
+; X32-NEXT: retl
+ %result = call i1 @llvm.ct.select.i1(i1 %cond, i1 %a, i1 %b)
+ ret i1 %result
+}
+
+; Test with extremal values
+define i32 @test_ctselect_extremal_values(i1 %cond) {
+; X64-LABEL: test_ctselect_extremal_values:
+; X64: # %bb.0:
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X64-NEXT: movl $-2147483648, %eax # imm = 0x80000000
+; X64-NEXT: cmovnel %ecx, %eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_extremal_values:
+; X32: # %bb.0:
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X32-NEXT: movl $-2147483648, %eax # imm = 0x80000000
+; X32-NEXT: cmovnel %ecx, %eax
+; X32-NEXT: retl
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 2147483647, i32 -2147483648)
+ ret i32 %result
+}
+
+; Test with floating point special values
+define float @test_ctselect_f32_special_values(i1 %cond) {
+; X64-LABEL: test_ctselect_f32_special_values:
+; X64: # %bb.0:
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: movl $2143289344, %eax # imm = 0x7FC00000
+; X64-NEXT: movl $2139095040, %ecx # imm = 0x7F800000
+; X64-NEXT: cmovnel %eax, %ecx
+; X64-NEXT: movd %ecx, %xmm0
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_f32_special_values:
+; X32: # %bb.0:
+; X32-NEXT: pushl %edi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: pushl %eax
+; X32-NEXT: .cfi_def_cfa_offset 16
+; X32-NEXT: .cfi_offset %esi, -12
+; X32-NEXT: .cfi_offset %edi, -8
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: sete %al
+; X32-NEXT: movl {{\.?LCPI[0-9]+_[0-9]+}}, %ecx
+; X32-NEXT: movl {{\.?LCPI[0-9]+_[0-9]+}}, %edx
+; X32-NEXT: movb %al, %ah
+; X32-NEXT: movzbl %ah, %edi
+; X32-NEXT: negl %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: andl %edi, %esi
+; X32-NEXT: notl %edi
+; X32-NEXT: andl %ecx, %edi
+; X32-NEXT: orl %edi, %esi
+; X32-NEXT: movl %esi, (%esp)
+; X32-NEXT: flds (%esp)
+; X32-NEXT: addl $4, %esp
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: popl %edi
+; X32-NEXT: .cfi_def_cfa_offset 4
+; X32-NEXT: retl
+ %result = call float @llvm.ct.select.f32(i1 %cond, float 0x7FF8000000000000, float 0x7FF0000000000000)
+ ret float %result
+}
+
+define double @test_ctselect_f64_special_values(i1 %cond) {
+; X64-LABEL: test_ctselect_f64_special_values:
+; X64: # %bb.0:
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; X64-NEXT: movabsq $9218868437227405312, %rcx # imm = 0x7FF0000000000000
+; X64-NEXT: cmovneq %rax, %rcx
+; X64-NEXT: movq %rcx, %xmm0
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_f64_special_values:
+; X32: # %bb.0:
+; X32-NEXT: pushl %edi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: subl $24, %esp
+; X32-NEXT: .cfi_def_cfa_offset 36
+; X32-NEXT: .cfi_offset %esi, -12
+; X32-NEXT: .cfi_offset %edi, -8
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}}
+; X32-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}}
+; X32-NEXT: sete %al
+; X32-NEXT: fxch %st(1)
+; X32-NEXT: fstpl {{[0-9]+}}(%esp)
+; X32-NEXT: fstpl (%esp)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl (%esp), %edx
+; X32-NEXT: movb %al, %ah
+; X32-NEXT: movzbl %ah, %edi
+; X32-NEXT: negl %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: andl %edi, %esi
+; X32-NEXT: notl %edi
+; X32-NEXT: andl %ecx, %edi
+; X32-NEXT: orl %edi, %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movb %al, %ah
+; X32-NEXT: movzbl %ah, %edi
+; X32-NEXT: negl %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: andl %edi, %esi
+; X32-NEXT: notl %edi
+; X32-NEXT: andl %ecx, %edi
+; X32-NEXT: orl %edi, %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X32-NEXT: fldl {{[0-9]+}}(%esp)
+; X32-NEXT: addl $24, %esp
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: popl %edi
+; X32-NEXT: .cfi_def_cfa_offset 4
+; X32-NEXT: retl
+ %result = call double @llvm.ct.select.f64(i1 %cond, double 0x7FF8000000000000, double 0x7FF0000000000000)
+ ret double %result
+}
+
+; Test with null pointers
+define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) {
+; X64-LABEL: test_ctselect_null_ptr:
+; X64: # %bb.0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovneq %rsi, %rax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_null_ptr:
+; X32: # %bb.0:
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+ %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %ptr, ptr null)
+ ret ptr %result
+}
+
+; Test with function pointers
+define ptr @test_ctselect_function_ptr(i1 %cond, ptr %func1, ptr %func2) {
+; X64-LABEL: test_ctselect_function_ptr:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovneq %rsi, %rax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_function_ptr:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+ %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %func1, ptr %func2)
+ ret ptr %result
+}
+
+; Test with volatile loads
+define i32 @test_ctselect_volatile_load(i1 %cond, ptr %p1, ptr %p2) {
+; X64-LABEL: test_ctselect_volatile_load:
+; X64: # %bb.0:
+; X64-NEXT: movl (%rsi), %ecx
+; X64-NEXT: movl (%rdx), %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %ecx, %eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_volatile_load:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl (%ecx), %ecx
+; X32-NEXT: movl (%eax), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel %ecx, %eax
+; X32-NEXT: retl
+ %a = load volatile i32, ptr %p1
+ %b = load volatile i32, ptr %p2
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+ ret i32 %result
+}
+
+; Test with atomic loads
+define i32 @test_ctselect_atomic_load(i1 %cond, ptr %p1, ptr %p2) {
+; X64-LABEL: test_ctselect_atomic_load:
+; X64: # %bb.0:
+; X64-NEXT: movl (%rsi), %ecx
+; X64-NEXT: movl (%rdx), %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %ecx, %eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_atomic_load:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl (%ecx), %ecx
+; X32-NEXT: movl (%eax), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel %ecx, %eax
+; X32-NEXT: retl
+ %a = load atomic i32, ptr %p1 acquire, align 4
+ %b = load atomic i32, ptr %p2 acquire, align 4
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+ ret i32 %result
+}
+
+; Test with condition from icmp on pointers
+define ptr @test_ctselect_ptr_cmp(ptr %p1, ptr %p2, ptr %a, ptr %b) {
+; X64-LABEL: test_ctselect_ptr_cmp:
+; X64: # %bb.0:
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: cmpq %rsi, %rdi
+; X64-NEXT: sete %cl
+; X64-NEXT: testb %cl, %cl
+; X64-NEXT: cmovneq %rdx, %rax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_ptr_cmp:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: sete %cl
+; X32-NEXT: testb %cl, %cl
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+ %cmp = icmp eq ptr %p1, %p2
+ %result = call ptr @llvm.ct.select.p0(i1 %cmp, ptr %a, ptr %b)
+ ret ptr %result
+}
+
+; Test with struct pointer types (struct types themselves may not be directly supported)
+%struct.pair = type { i32, i32 }
+
+define ptr @test_ctselect_struct_ptr(i1 %cond, ptr %a, ptr %b) {
+; X64-LABEL: test_ctselect_struct_ptr:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovneq %rsi, %rax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_struct_ptr:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+ %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b)
+ ret ptr %result
+}
+
+; Test with deeply nested conditions (stress test for instruction selection)
+define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
+; X64-LABEL: test_ctselect_deeply_nested:
+; X64: # %bb.0:
+; X64-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movl {{[0-9]+}}(%rsp), %r10d
+; X64-NEXT: movl {{[0-9]+}}(%rsp), %r11d
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %r8d, %r9d
+; X64-NEXT: testb $1, %sil
+; X64-NEXT: cmovnel %r9d, %r11d
+; X64-NEXT: testb $1, %dl
+; X64-NEXT: cmovnel %r11d, %r10d
+; X64-NEXT: testb $1, %cl
+; X64-NEXT: cmovnel %r10d, %eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_deeply_nested:
+; X32: # %bb.0:
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: .cfi_offset %esi, -8
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %esi
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel %esi, %edx
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel %edx, %ecx
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel %ecx, %eax
+; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 4
+; X32-NEXT: retl
+ %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b)
+ %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c)
+ %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d)
+ %sel4 = call i32 @llvm.ct.select.i32(i1 %c4, i32 %sel3, i32 %e)
+ ret i32 %sel4
+}
+
+; Test with misaligned loads
+define i32 @test_ctselect_misaligned_load(i1 %cond, ptr %p1, ptr %p2) {
+; X64-LABEL: test_ctselect_misaligned_load:
+; X64: # %bb.0:
+; X64-NEXT: movl (%rdx), %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel (%rsi), %eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_ctselect_misaligned_load:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl (%eax), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel (%ecx), %eax
+; X32-NEXT: retl
+ %a = load i32, ptr %p1, align 1
+ %b = load i32, ptr %p2, align 1
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+ ret i32 %result
+}
+
+; Declare the intrinsics
+declare i1 @llvm.ct.select.i1(i1, i1, i1)
+declare i128 @llvm.ct.select.i128(i1, i128, i128)
+declare i32 @llvm.ct.select.i32(i1, i32, i32)
+declare float @llvm.ct.select.f32(i1, float, float)
+declare double @llvm.ct.select.f64(i1, double, double)
+declare ptr @llvm.ct.select.p0(i1, ptr, ptr)
diff --git a/llvm/test/CodeGen/X86/ctselect-i386-fp.ll b/llvm/test/CodeGen/X86/ctselect-i386-fp.ll
new file mode 100644
index 0000000000000..ea943307c644f
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ctselect-i386-fp.ll
@@ -0,0 +1,722 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov | FileCheck %s --check-prefix=I386-NOCMOV
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=I386-CMOV
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov -verify-machineinstrs | FileCheck %s --check-prefix=I386-NOCMOV
+
+; Comprehensive CTSELECT tests for i386 targets with floating-point types
+; - Without CMOV: constant-time lowering that selects the FP bit pattern through the existing post-RA integer CTSELECT expansion
+; - With CMOV: CMOV-based implementation
+; - Verifies security properties: no conditional branches, constant execution time
+; Strategy: FP values are stored to memory, their bit patterns are selected with the integer CTSELECT path, and the result is reloaded as FP
+
+; Test basic f32 functionality
+define float @test_ctselect_f32_basic(i1 %cond, float %a, float %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f32_basic:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: pushl %eax
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, (%esp)
+; I386-NOCMOV-NEXT: flds (%esp)
+; I386-NOCMOV-NEXT: addl $4, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f32_basic:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: pushl %eax
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, (%esp)
+; I386-CMOV-NEXT: flds (%esp)
+; I386-CMOV-NEXT: addl $4, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b)
+ ret float %result
+}
+
+; Test f32 with different condition codes
+define float @test_ctselect_f32_eq(float %x, float %y, float %a, float %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f32_eq:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: pushl %eax
+; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fucompp
+; I386-NOCMOV-NEXT: fnstsw %ax
+; I386-NOCMOV-NEXT: # kill: def $ah killed $ah killed $ax
+; I386-NOCMOV-NEXT: sahf
+; I386-NOCMOV-NEXT: setnp %al
+; I386-NOCMOV-NEXT: sete %cl
+; I386-NOCMOV-NEXT: testb %al, %cl
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, (%esp)
+; I386-NOCMOV-NEXT: flds (%esp)
+; I386-NOCMOV-NEXT: addl $4, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f32_eq:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: pushl %eax
+; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fucompi %st(1), %st
+; I386-CMOV-NEXT: fstp %st(0)
+; I386-CMOV-NEXT: setnp %al
+; I386-CMOV-NEXT: sete %cl
+; I386-CMOV-NEXT: testb %al, %cl
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, (%esp)
+; I386-CMOV-NEXT: flds (%esp)
+; I386-CMOV-NEXT: addl $4, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %cmp = fcmp oeq float %x, %y
+ %result = call float @llvm.ct.select.f32(i1 %cmp, float %a, float %b)
+ ret float %result
+}
+
+; Test basic f64 functionality
+define double @test_ctselect_f64_basic(i1 %cond, double %a, double %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f64_basic:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: subl $8, %esp
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, (%esp)
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fldl (%esp)
+; I386-NOCMOV-NEXT: addl $8, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f64_basic:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: subl $8, %esp
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, (%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fldl (%esp)
+; I386-CMOV-NEXT: addl $8, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b)
+ ret double %result
+}
+
+; Test basic x86_fp80 functionality
+define x86_fp80 @test_ctselect_f80_basic(i1 %cond, x86_fp80 %a, x86_fp80 %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f80_basic:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: subl $12, %esp
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, (%esp)
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fldt (%esp)
+; I386-NOCMOV-NEXT: addl $12, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f80_basic:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: subl $12, %esp
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, (%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fldt (%esp)
+; I386-CMOV-NEXT: addl $12, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %result = call x86_fp80 @llvm.ct.select.f80(i1 %cond, x86_fp80 %a, x86_fp80 %b)
+ ret x86_fp80 %result
+}
+
+; Test f32 with complex conditions
+define float @test_ctselect_f32_gt(float %x, float %y, float %a, float %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f32_gt:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: pushl %eax
+; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fucompp
+; I386-NOCMOV-NEXT: fnstsw %ax
+; I386-NOCMOV-NEXT: # kill: def $ah killed $ah killed $ax
+; I386-NOCMOV-NEXT: sahf
+; I386-NOCMOV-NEXT: seta %al
+; I386-NOCMOV-NEXT: testb %al, %al
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, (%esp)
+; I386-NOCMOV-NEXT: flds (%esp)
+; I386-NOCMOV-NEXT: addl $4, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f32_gt:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: pushl %eax
+; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fucompi %st(1), %st
+; I386-CMOV-NEXT: fstp %st(0)
+; I386-CMOV-NEXT: seta %al
+; I386-CMOV-NEXT: testb %al, %al
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, (%esp)
+; I386-CMOV-NEXT: flds (%esp)
+; I386-CMOV-NEXT: addl $4, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %cmp = fcmp ogt float %x, %y
+ %result = call float @llvm.ct.select.f32(i1 %cmp, float %a, float %b)
+ ret float %result
+}
+
+; Test constant-time properties: verify no branches in generated code
+define float @test_ctselect_f32_no_branches(i1 %cond, float %a, float %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f32_no_branches:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: pushl %eax
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, (%esp)
+; I386-NOCMOV-NEXT: flds (%esp)
+; I386-NOCMOV-NEXT: addl $4, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f32_no_branches:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: pushl %eax
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, (%esp)
+; I386-CMOV-NEXT: flds (%esp)
+; I386-CMOV-NEXT: addl $4, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b)
+ ret float %result
+}
+
+; Test that BUNDLE directives are present for constant-time guarantees
+define float @test_ctselect_f32_bundled(i1 %cond, float %a, float %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f32_bundled:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: pushl %eax
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, (%esp)
+; I386-NOCMOV-NEXT: flds (%esp)
+; I386-NOCMOV-NEXT: addl $4, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f32_bundled:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: pushl %eax
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, (%esp)
+; I386-CMOV-NEXT: flds (%esp)
+; I386-CMOV-NEXT: addl $4, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b)
+ ret float %result
+}
+
+; Test edge case: NaN handling
+define float @test_ctselect_f32_nan(i1 %cond) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f32_nan:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: subl $12, %esp
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}}
+; I386-NOCMOV-NEXT: fldz
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: fxch %st(1)
+; I386-NOCMOV-NEXT: fstps {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fstps (%esp)
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl (%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: addl $12, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f32_nan:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: subl $12, %esp
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}}
+; I386-CMOV-NEXT: fldz
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: fxch %st(1)
+; I386-CMOV-NEXT: fstps {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fstps (%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl (%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: flds {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: addl $12, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+  %nan = bitcast i32 2143289344 to float ; 0x7FC00000 = quiet NaN
+ %zero = bitcast i32 0 to float
+ %result = call float @llvm.ct.select.f32(i1 %cond, float %nan, float %zero)
+ ret float %result
+}
+
+; Test x86_fp80 selection (spilled through and reloaded from a 12-byte stack slot)
+define x86_fp80 @test_ctselect_f80_alignment(i1 %cond, x86_fp80 %a, x86_fp80 %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f80_alignment:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: subl $12, %esp
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, (%esp)
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fldt (%esp)
+; I386-NOCMOV-NEXT: addl $12, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f80_alignment:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: subl $12, %esp
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, (%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fldt (%esp)
+; I386-CMOV-NEXT: addl $12, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %result = call x86_fp80 @llvm.ct.select.f80(i1 %cond, x86_fp80 %a, x86_fp80 %b)
+ ret x86_fp80 %result
+}
+
+; Stress test: multiple CTSELECT operations
+define float @test_ctselect_f32_multiple(i1 %cond1, i1 %cond2, float %a, float %b, float %c, float %d) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f32_multiple:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: subl $8, %esp
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %ecx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, (%esp)
+; I386-NOCMOV-NEXT: flds (%esp)
+; I386-NOCMOV-NEXT: addl $8, %esp
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_f32_multiple:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %edi
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: subl $8, %esp
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movb %al, %ah
+; I386-CMOV-NEXT: movzbl %ah, %edi
+; I386-CMOV-NEXT: negl %edi
+; I386-CMOV-NEXT: movl %edx, %esi
+; I386-CMOV-NEXT: andl %edi, %esi
+; I386-CMOV-NEXT: notl %edi
+; I386-CMOV-NEXT: andl %ecx, %edi
+; I386-CMOV-NEXT: orl %edi, %esi
+; I386-CMOV-NEXT: movl %esi, (%esp)
+; I386-CMOV-NEXT: flds (%esp)
+; I386-CMOV-NEXT: addl $8, %esp
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: retl
+ %sel1 = call float @llvm.ct.select.f32(i1 %cond1, float %a, float %b)
+ %sel2 = call float @llvm.ct.select.f32(i1 %cond2, float %sel1, float %c)
+ ret float %sel2
+}
+
+; Declare intrinsics
+declare float @llvm.ct.select.f32(i1, float, float)
+declare double @llvm.ct.select.f64(i1, double, double)
+declare x86_fp80 @llvm.ct.select.f80(i1, x86_fp80, x86_fp80)
diff --git a/llvm/test/CodeGen/X86/ctselect-i386-mmx.ll b/llvm/test/CodeGen/X86/ctselect-i386-mmx.ll
new file mode 100644
index 0000000000000..bc7980c357e0e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ctselect-i386-mmx.ll
@@ -0,0 +1,428 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=-cmov,+mmx < %s | FileCheck %s --check-prefix=I386-NOCMOV
+; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+cmov,+mmx < %s | FileCheck %s --check-prefix=I386-CMOV
+; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=-cmov,+mmx -verify-machineinstrs < %s | FileCheck %s --check-prefix=I386-NOCMOV
+
+; Constant-time selection tests that exercise the VR64 CTSELECT path: the MMX
+; intrinsics force the <1 x i64> values into MMX (VR64) registers.
+
+; Test MMX ct.select using paddd intrinsic to force VR64 allocation
+define <1 x i64> @test_mmx_ctselect_with_paddd(i32 %cond, i64 %a, i64 %b) {
+; I386-NOCMOV-LABEL: test_mmx_ctselect_with_paddd:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT: subl $20, %esp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 40
+; I386-NOCMOV-NEXT: .cfi_offset %esi, -20
+; I386-NOCMOV-NEXT: .cfi_offset %edi, -16
+; I386-NOCMOV-NEXT: .cfi_offset %ebx, -12
+; I386-NOCMOV-NEXT: .cfi_offset %ebp, -8
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
+; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: setne %bl
+; I386-NOCMOV-NEXT: testb %bl, %bl
+; I386-NOCMOV-NEXT: sete %bh
+; I386-NOCMOV-NEXT: movb %bh, %al
+; I386-NOCMOV-NEXT: movzbl %al, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %esi, %edi
+; I386-NOCMOV-NEXT: andl %ebp, %edi
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %ecx, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %edi
+; I386-NOCMOV-NEXT: testb %bl, %bl
+; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %esi
+; I386-NOCMOV-NEXT: negl %esi
+; I386-NOCMOV-NEXT: movl %edx, %ecx
+; I386-NOCMOV-NEXT: andl %esi, %ecx
+; I386-NOCMOV-NEXT: notl %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: orl %esi, %ecx
+; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0
+; I386-NOCMOV-NEXT: paddd %mm0, %mm0
+; I386-NOCMOV-NEXT: movq %mm0, (%esp)
+; I386-NOCMOV-NEXT: movl (%esp), %eax
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: addl $20, %esp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT: popl %ebp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 4
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_mmx_ctselect_with_paddd:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: subl $20, %esp
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 24
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: setne %dl
+; I386-CMOV-NEXT: testb %dl, %dl
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0
+; I386-CMOV-NEXT: paddd %mm0, %mm0
+; I386-CMOV-NEXT: movq %mm0, (%esp)
+; I386-CMOV-NEXT: movl (%esp), %eax
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: addl $20, %esp
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 4
+; I386-CMOV-NEXT: retl
+ %mmx_a = bitcast i64 %a to <1 x i64>
+ %mmx_b = bitcast i64 %b to <1 x i64>
+ %cmp = icmp ne i32 %cond, 0
+ %sel = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp, <1 x i64> %mmx_a, <1 x i64> %mmx_b)
+ %result = call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %sel, <1 x i64> %sel)
+ ret <1 x i64> %result
+}
+
+; Test MMX ct.select using psllw intrinsic
+define <1 x i64> @test_mmx_ctselect_with_psllw(i32 %cond, i64 %a, i64 %b) {
+; I386-NOCMOV-LABEL: test_mmx_ctselect_with_psllw:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT: subl $20, %esp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 40
+; I386-NOCMOV-NEXT: .cfi_offset %esi, -20
+; I386-NOCMOV-NEXT: .cfi_offset %edi, -16
+; I386-NOCMOV-NEXT: .cfi_offset %ebx, -12
+; I386-NOCMOV-NEXT: .cfi_offset %ebp, -8
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
+; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: setne %bl
+; I386-NOCMOV-NEXT: testb %bl, %bl
+; I386-NOCMOV-NEXT: sete %bh
+; I386-NOCMOV-NEXT: movb %bh, %al
+; I386-NOCMOV-NEXT: movzbl %al, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %esi, %edi
+; I386-NOCMOV-NEXT: andl %ebp, %edi
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %ecx, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %edi
+; I386-NOCMOV-NEXT: testb %bl, %bl
+; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %esi
+; I386-NOCMOV-NEXT: negl %esi
+; I386-NOCMOV-NEXT: movl %edx, %ecx
+; I386-NOCMOV-NEXT: andl %esi, %ecx
+; I386-NOCMOV-NEXT: notl %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: orl %esi, %ecx
+; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0
+; I386-NOCMOV-NEXT: psllw %mm0, %mm0
+; I386-NOCMOV-NEXT: movq %mm0, (%esp)
+; I386-NOCMOV-NEXT: movl (%esp), %eax
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: addl $20, %esp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT: popl %ebp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 4
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_mmx_ctselect_with_psllw:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: subl $20, %esp
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 24
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: setne %dl
+; I386-CMOV-NEXT: testb %dl, %dl
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0
+; I386-CMOV-NEXT: psllw %mm0, %mm0
+; I386-CMOV-NEXT: movq %mm0, (%esp)
+; I386-CMOV-NEXT: movl (%esp), %eax
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: addl $20, %esp
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 4
+; I386-CMOV-NEXT: retl
+ %mmx_a = bitcast i64 %a to <1 x i64>
+ %mmx_b = bitcast i64 %b to <1 x i64>
+ %cmp = icmp ne i32 %cond, 0
+ %sel = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp, <1 x i64> %mmx_a, <1 x i64> %mmx_b)
+ %result = call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> %sel, <1 x i64> %sel)
+ ret <1 x i64> %result
+}
+
+; Test nested MMX ct.selects with pand intrinsic
+define <1 x i64> @test_mmx_nested_ctselect_with_pand(i32 %cond1, i32 %cond2, i64 %a, i64 %b, i64 %c) {
+; I386-NOCMOV-LABEL: test_mmx_nested_ctselect_with_pand:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT: subl $20, %esp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 40
+; I386-NOCMOV-NEXT: .cfi_offset %esi, -20
+; I386-NOCMOV-NEXT: .cfi_offset %edi, -16
+; I386-NOCMOV-NEXT: .cfi_offset %ebx, -12
+; I386-NOCMOV-NEXT: .cfi_offset %ebp, -8
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
+; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: setne %bl
+; I386-NOCMOV-NEXT: testb %bl, %bl
+; I386-NOCMOV-NEXT: sete %bh
+; I386-NOCMOV-NEXT: movb %bh, %cl
+; I386-NOCMOV-NEXT: movzbl %cl, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %edx, %edi
+; I386-NOCMOV-NEXT: andl %ebp, %edi
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %eax, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %edi
+; I386-NOCMOV-NEXT: testb %bl, %bl
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: sete %dl
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-NOCMOV-NEXT: movb %dl, %dh
+; I386-NOCMOV-NEXT: movzbl %dh, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %esi, %ebx
+; I386-NOCMOV-NEXT: andl %ebp, %ebx
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %eax, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %ebx
+; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: setne %dl
+; I386-NOCMOV-NEXT: testb %dl, %dl
+; I386-NOCMOV-NEXT: sete %dh
+; I386-NOCMOV-NEXT: movb %dh, %al
+; I386-NOCMOV-NEXT: movzbl %al, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %ecx, %esi
+; I386-NOCMOV-NEXT: andl %ebp, %esi
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %ebx, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %esi
+; I386-NOCMOV-NEXT: testb %dl, %dl
+; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; I386-NOCMOV-NEXT: movb %al, %dl
+; I386-NOCMOV-NEXT: movzbl %dl, %esi
+; I386-NOCMOV-NEXT: negl %esi
+; I386-NOCMOV-NEXT: movl %ebx, %ecx
+; I386-NOCMOV-NEXT: andl %esi, %ecx
+; I386-NOCMOV-NEXT: notl %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: orl %esi, %ecx
+; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0
+; I386-NOCMOV-NEXT: pand %mm0, %mm0
+; I386-NOCMOV-NEXT: movq %mm0, (%esp)
+; I386-NOCMOV-NEXT: movl (%esp), %eax
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: addl $20, %esp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT: popl %ebp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 4
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_mmx_nested_ctselect_with_pand:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: pushl %ebx
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-CMOV-NEXT: pushl %esi
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-CMOV-NEXT: subl $20, %esp
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 32
+; I386-CMOV-NEXT: .cfi_offset %esi, -12
+; I386-CMOV-NEXT: .cfi_offset %ebx, -8
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
+; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: setne %bl
+; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: setne %bh
+; I386-CMOV-NEXT: testb %bh, %bh
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %esi
+; I386-CMOV-NEXT: testb %bl, %bl
+; I386-CMOV-NEXT: cmovnel %esi, %edx
+; I386-CMOV-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel %ecx, %eax
+; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0
+; I386-CMOV-NEXT: pand %mm0, %mm0
+; I386-CMOV-NEXT: movq %mm0, (%esp)
+; I386-CMOV-NEXT: movl (%esp), %eax
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: addl $20, %esp
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-CMOV-NEXT: popl %esi
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-CMOV-NEXT: popl %ebx
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 4
+; I386-CMOV-NEXT: retl
+ %mmx_a = bitcast i64 %a to <1 x i64>
+ %mmx_b = bitcast i64 %b to <1 x i64>
+ %mmx_c = bitcast i64 %c to <1 x i64>
+ %cmp1 = icmp ne i32 %cond1, 0
+ %cmp2 = icmp ne i32 %cond2, 0
+ %sel1 = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp2, <1 x i64> %mmx_a, <1 x i64> %mmx_b)
+ %sel2 = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp1, <1 x i64> %sel1, <1 x i64> %mmx_c)
+ %result = call <1 x i64> @llvm.x86.mmx.pand(<1 x i64> %sel2, <1 x i64> %sel2)
+ ret <1 x i64> %result
+}
+
+; Test MMX ct.select with por intrinsic
+define <1 x i64> @test_mmx_ctselect_with_por(i32 %cond, i64 %a, i64 %b) {
+; I386-NOCMOV-LABEL: test_mmx_ctselect_with_por:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT: subl $20, %esp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 40
+; I386-NOCMOV-NEXT: .cfi_offset %esi, -20
+; I386-NOCMOV-NEXT: .cfi_offset %edi, -16
+; I386-NOCMOV-NEXT: .cfi_offset %ebx, -12
+; I386-NOCMOV-NEXT: .cfi_offset %ebp, -8
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
+; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: setne %bl
+; I386-NOCMOV-NEXT: testb %bl, %bl
+; I386-NOCMOV-NEXT: sete %bh
+; I386-NOCMOV-NEXT: movb %bh, %al
+; I386-NOCMOV-NEXT: movzbl %al, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %esi, %edi
+; I386-NOCMOV-NEXT: andl %ebp, %edi
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %ecx, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %edi
+; I386-NOCMOV-NEXT: testb %bl, %bl
+; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi
+; I386-NOCMOV-NEXT: movb %al, %ah
+; I386-NOCMOV-NEXT: movzbl %ah, %esi
+; I386-NOCMOV-NEXT: negl %esi
+; I386-NOCMOV-NEXT: movl %edx, %ecx
+; I386-NOCMOV-NEXT: andl %esi, %ecx
+; I386-NOCMOV-NEXT: notl %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: orl %esi, %ecx
+; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0
+; I386-NOCMOV-NEXT: por %mm0, %mm0
+; I386-NOCMOV-NEXT: movq %mm0, (%esp)
+; I386-NOCMOV-NEXT: movl (%esp), %eax
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: addl $20, %esp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT: popl %ebp
+; I386-NOCMOV-NEXT: .cfi_def_cfa_offset 4
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_mmx_ctselect_with_por:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: subl $20, %esp
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 24
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: setne %dl
+; I386-CMOV-NEXT: testb %dl, %dl
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movq {{[0-9]+}}(%esp), %mm0
+; I386-CMOV-NEXT: por %mm0, %mm0
+; I386-CMOV-NEXT: movq %mm0, (%esp)
+; I386-CMOV-NEXT: movl (%esp), %eax
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT: addl $20, %esp
+; I386-CMOV-NEXT: .cfi_def_cfa_offset 4
+; I386-CMOV-NEXT: retl
+ %mmx_a = bitcast i64 %a to <1 x i64>
+ %mmx_b = bitcast i64 %b to <1 x i64>
+ %cmp = icmp ne i32 %cond, 0
+ %sel = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp, <1 x i64> %mmx_a, <1 x i64> %mmx_b)
+ %result = call <1 x i64> @llvm.x86.mmx.por(<1 x i64> %sel, <1 x i64> %sel)
+ ret <1 x i64> %result
+}
+
+; Declare MMX intrinsics
+declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.pand(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.por(<1 x i64>, <1 x i64>)
+
+; Declare constant-time selection intrinsic
+declare <1 x i64> @llvm.ct.select.v1i64(i1, <1 x i64>, <1 x i64>)
diff --git a/llvm/test/CodeGen/X86/ctselect-i386.ll b/llvm/test/CodeGen/X86/ctselect-i386.ll
new file mode 100644
index 0000000000000..d7345f1121540
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ctselect-i386.ll
@@ -0,0 +1,267 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov | FileCheck %s --check-prefix=I386-NOCMOV
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=I386-CMOV
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov -verify-machineinstrs | FileCheck %s --check-prefix=I386-NOCMOV
+
+; Comprehensive CTSELECT tests for i386 targets with scalar integer types
+; - Without CMOV: constant-time implementation using post-RA expansion with bundled instructions
+; - With CMOV: CMOV-based implementation
+; - Verifies security properties: no conditional branches, constant execution time
+; All expansion happens post-RA for better optimization control and constant-time guarantees
+
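+; As a reference for reading the NOCMOV check lines below, this is a minimal C
+; sketch (illustration only, not the implementation) of the branchless pattern the
+; expansion lowers to; the emitted code derives the mask from the inverted selector
+; (sete) but is logically equivalent:
+;
+;   #include <stdint.h>
+;   uint32_t ct_select_u32(uint32_t c /* 0 or 1 */, uint32_t a, uint32_t b) {
+;     uint32_t mask = 0u - c;            /* all-ones if c == 1, zero if c == 0 */
+;     return (a & mask) | (b & ~mask);   /* no branch depends on c */
+;   }
+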
+; Test basic i32 functionality
+define i32 @test_ctselect_i32_basic(i1 %cond, i32 %a, i32 %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_i32_basic:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %bl
+; I386-NOCMOV-NEXT: movb %bl, %bh
+; I386-NOCMOV-NEXT: movzbl %bh, %esi
+; I386-NOCMOV-NEXT: negl %esi
+; I386-NOCMOV-NEXT: movl %edx, %eax
+; I386-NOCMOV-NEXT: andl %esi, %eax
+; I386-NOCMOV-NEXT: notl %esi
+; I386-NOCMOV-NEXT: andl %ecx, %esi
+; I386-NOCMOV-NEXT: orl %esi, %eax
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_i32_basic:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: retl
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+ ret i32 %result
+}
+
+; Test i16 functionality
+define i16 @test_ctselect_i16_basic(i1 %cond, i16 %a, i16 %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_i16_basic:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %bl
+; I386-NOCMOV-NEXT: movb %bl, %bh
+; I386-NOCMOV-NEXT: movzbw %bh, %si
+; I386-NOCMOV-NEXT: negw %si
+; I386-NOCMOV-NEXT: movw %dx, %ax
+; I386-NOCMOV-NEXT: andw %si, %ax
+; I386-NOCMOV-NEXT: notw %si
+; I386-NOCMOV-NEXT: andw %cx, %si
+; I386-NOCMOV-NEXT: orw %si, %ax
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_i16_basic:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnew {{[0-9]+}}(%esp), %ax
+; I386-CMOV-NEXT: retl
+ %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b)
+ ret i16 %result
+}
+
+; Test i8 functionality
+define i8 @test_ctselect_i8_basic(i1 %cond, i8 %a, i8 %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_i8_basic:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %ah
+; I386-NOCMOV-NEXT: movb %ah, %ch
+; I386-NOCMOV-NEXT: negb %ch
+; I386-NOCMOV-NEXT: movb %dl, %al
+; I386-NOCMOV-NEXT: andb %ch, %al
+; I386-NOCMOV-NEXT: notb %ch
+; I386-NOCMOV-NEXT: andb %cl, %ch
+; I386-NOCMOV-NEXT: orb %ch, %al
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_i8_basic:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: # kill: def $al killed $al killed $eax
+; I386-CMOV-NEXT: retl
+ %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b)
+ ret i8 %result
+}
+
+; Test security property: constant-time execution for cryptographic use case
+define i32 @test_crypto_key_select(i32 %secret_bit, i32 %key1, i32 %key2) nounwind {
+; I386-NOCMOV-LABEL: test_crypto_key_select:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: setne %al
+; I386-NOCMOV-NEXT: testb %al, %al
+; I386-NOCMOV-NEXT: sete %bl
+; I386-NOCMOV-NEXT: movb %bl, %bh
+; I386-NOCMOV-NEXT: movzbl %bh, %esi
+; I386-NOCMOV-NEXT: negl %esi
+; I386-NOCMOV-NEXT: movl %edx, %eax
+; I386-NOCMOV-NEXT: andl %esi, %eax
+; I386-NOCMOV-NEXT: notl %esi
+; I386-NOCMOV-NEXT: andl %ecx, %esi
+; I386-NOCMOV-NEXT: orl %esi, %eax
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_crypto_key_select:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: setne %cl
+; I386-CMOV-NEXT: testb %cl, %cl
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: retl
+ %cond = icmp ne i32 %secret_bit, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %key1, i32 %key2)
+ ret i32 %result
+}
+
+; Test that no conditional branches appear in constant-time path
+define i32 @test_no_conditional_branches(i32 %secret, i32 %val1, i32 %val2) nounwind {
+; I386-NOCMOV-LABEL: test_no_conditional_branches:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: setne %al
+; I386-NOCMOV-NEXT: testb %al, %al
+; I386-NOCMOV-NEXT: sete %bl
+; I386-NOCMOV-NEXT: movb %bl, %bh
+; I386-NOCMOV-NEXT: movzbl %bh, %esi
+; I386-NOCMOV-NEXT: negl %esi
+; I386-NOCMOV-NEXT: movl %edx, %eax
+; I386-NOCMOV-NEXT: andl %esi, %eax
+; I386-NOCMOV-NEXT: notl %esi
+; I386-NOCMOV-NEXT: andl %ecx, %esi
+; I386-NOCMOV-NEXT: orl %esi, %eax
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_no_conditional_branches:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: setne %cl
+; I386-CMOV-NEXT: testb %cl, %cl
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: retl
+ %cond = icmp ne i32 %secret, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %val1, i32 %val2)
+ ret i32 %result
+}
+
+; Test with comparison condition
+define i32 @test_ctselect_i32_cmp(i32 %a, i32 %b, i32 %c) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_i32_cmp:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: cmpl %edx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: testb %al, %al
+; I386-NOCMOV-NEXT: sete %bl
+; I386-NOCMOV-NEXT: movb %bl, %bh
+; I386-NOCMOV-NEXT: movzbl %bh, %esi
+; I386-NOCMOV-NEXT: negl %esi
+; I386-NOCMOV-NEXT: movl %edx, %eax
+; I386-NOCMOV-NEXT: andl %esi, %eax
+; I386-NOCMOV-NEXT: notl %esi
+; I386-NOCMOV-NEXT: andl %ecx, %esi
+; I386-NOCMOV-NEXT: orl %esi, %eax
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_i32_cmp:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: cmpl %eax, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: sete %cl
+; I386-CMOV-NEXT: testb %cl, %cl
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: retl
+ %cond = icmp eq i32 %a, %c
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %b, i32 %c)
+ ret i32 %result
+}
+
+; Test nested selects
+define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_nested:
+; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebx
+; I386-NOCMOV-NEXT: pushl %edi
+; I386-NOCMOV-NEXT: pushl %esi
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %bl
+; I386-NOCMOV-NEXT: movb %bl, %bh
+; I386-NOCMOV-NEXT: movzbl %bh, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %edx, %esi
+; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %eax, %edi
+; I386-NOCMOV-NEXT: orl %edi, %esi
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %dl
+; I386-NOCMOV-NEXT: movb %dl, %dh
+; I386-NOCMOV-NEXT: movzbl %dh, %edi
+; I386-NOCMOV-NEXT: negl %edi
+; I386-NOCMOV-NEXT: movl %ecx, %eax
+; I386-NOCMOV-NEXT: andl %edi, %eax
+; I386-NOCMOV-NEXT: notl %edi
+; I386-NOCMOV-NEXT: andl %esi, %edi
+; I386-NOCMOV-NEXT: orl %edi, %eax
+; I386-NOCMOV-NEXT: popl %esi
+; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: retl
+;
+; I386-CMOV-LABEL: test_ctselect_nested:
+; I386-CMOV: # %bb.0:
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel %ecx, %eax
+; I386-CMOV-NEXT: retl
+ %sel1 = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b)
+ %sel2 = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %sel1, i32 %c)
+ ret i32 %sel2
+}
+
+; Declare ct.select intrinsics
+declare i8 @llvm.ct.select.i8(i1, i8, i8)
+declare i16 @llvm.ct.select.i16(i1, i16, i16)
+declare i32 @llvm.ct.select.i32(i1, i32, i32)
diff --git a/llvm/test/CodeGen/X86/ctselect-optimization.ll b/llvm/test/CodeGen/X86/ctselect-optimization.ll
new file mode 100644
index 0000000000000..481d49971a937
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ctselect-optimization.ll
@@ -0,0 +1,304 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov | FileCheck %s
+
+; Test ct.select optimization patterns
+
+; Test smin(x, 0) pattern optimization
+define i32 @test_ctselect_smin_zero(i32 %x) {
+; CHECK-LABEL: test_ctselect_smin_zero:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: sets %cl
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %edi, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp slt i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0)
+ ret i32 %result
+}
+
+; Test smax(x, 0) pattern optimization
+define i32 @test_ctselect_smax_zero(i32 %x) {
+; CHECK-LABEL: test_ctselect_smax_zero:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: setg %cl
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %edi, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp sgt i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0)
+ ret i32 %result
+}
+
+; Test generic smin pattern
+define i32 @test_ctselect_smin_generic(i32 %x, i32 %y) {
+; CHECK-LABEL: test_ctselect_smin_generic:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: cmpl %esi, %edi
+; CHECK-NEXT: setl %cl
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %edi, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp slt i32 %x, %y
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y)
+ ret i32 %result
+}
+
+; Test generic smax pattern
+define i32 @test_ctselect_smax_generic(i32 %x, i32 %y) {
+; CHECK-LABEL: test_ctselect_smax_generic:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: cmpl %esi, %edi
+; CHECK-NEXT: setg %cl
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %edi, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp sgt i32 %x, %y
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y)
+ ret i32 %result
+}
+
+; Test umin pattern
+define i32 @test_ctselect_umin_generic(i32 %x, i32 %y) {
+; CHECK-LABEL: test_ctselect_umin_generic:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: cmpl %esi, %edi
+; CHECK-NEXT: setb %cl
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %edi, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp ult i32 %x, %y
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y)
+ ret i32 %result
+}
+
+; Test umax pattern
+define i32 @test_ctselect_umax_generic(i32 %x, i32 %y) {
+; CHECK-LABEL: test_ctselect_umax_generic:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: cmpl %esi, %edi
+; CHECK-NEXT: seta %cl
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %edi, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp ugt i32 %x, %y
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y)
+ ret i32 %result
+}
+
+; Test abs pattern
+define i32 @test_ctselect_abs(i32 %x) {
+; CHECK-LABEL: test_ctselect_abs:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: movl %edi, %ecx
+; CHECK-NEXT: negl %ecx
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: sets %dl
+; CHECK-NEXT: testb %dl, %dl
+; CHECK-NEXT: cmovnel %ecx, %eax
+; CHECK-NEXT: retq
+ %neg = sub i32 0, %x
+ %cmp = icmp slt i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %neg, i32 %x)
+ ret i32 %result
+}
+
+; Test nabs pattern (negative abs)
+define i32 @test_ctselect_nabs(i32 %x) {
+; CHECK-LABEL: test_ctselect_nabs:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: negl %eax
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: sets %cl
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %edi, %eax
+; CHECK-NEXT: retq
+ %neg = sub i32 0, %x
+ %cmp = icmp slt i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %neg)
+ ret i32 %result
+}
+
+; Test sign extension pattern
+define i32 @test_ctselect_sign_extend(i32 %x) {
+; CHECK-LABEL: test_ctselect_sign_extend:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: sets %cl
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: movl $-1, %ecx
+; CHECK-NEXT: cmovnel %ecx, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp slt i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 -1, i32 0)
+ ret i32 %result
+}
+
+; Test zero extension pattern
+define i32 @test_ctselect_zero_extend(i32 %x) {
+; CHECK-LABEL: test_ctselect_zero_extend:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: setne %cl
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: movl $1, %ecx
+; CHECK-NEXT: cmovnel %ecx, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp ne i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 1, i32 0)
+ ret i32 %result
+}
+
+; Test mask generation pattern
+define i32 @test_ctselect_mask_generation(i32 %x) {
+; CHECK-LABEL: test_ctselect_mask_generation:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: sets %cl
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: movl $-1, %ecx
+; CHECK-NEXT: cmovnel %ecx, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp slt i32 %x, 0
+ %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 -1, i32 0)
+ ret i32 %result
+}
+
+; Test handling of compile-time-constant conditions (the select is not folded here)
+define i32 @test_ctselect_constant_folding_true(i32 %a, i32 %b) {
+; CHECK-LABEL: test_ctselect_constant_folding_true:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: movb $1, %cl
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %edi, %eax
+; CHECK-NEXT: retq
+ %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b)
+ ret i32 %result
+}
+
+define i32 @test_ctselect_constant_folding_false(i32 %a, i32 %b) {
+; CHECK-LABEL: test_ctselect_constant_folding_false:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %edi, %eax
+; CHECK-NEXT: retq
+ %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b)
+ ret i32 %result
+}
+
+; Test with identical operands
+define i32 @test_ctselect_identical_operands(i1 %cond, i32 %x) {
+; CHECK-LABEL: test_ctselect_identical_operands:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: testb $1, %dil
+; CHECK-NEXT: cmovnel %esi, %eax
+; CHECK-NEXT: retq
+ %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %x)
+ ret i32 %result
+}
+
+; Test with inverted condition
+define i32 @test_ctselect_inverted_condition(i32 %x, i32 %y, i32 %a, i32 %b) {
+; CHECK-LABEL: test_ctselect_inverted_condition:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: cmpl %esi, %edi
+; CHECK-NEXT: sete %dl
+; CHECK-NEXT: testb %dl, %dl
+; CHECK-NEXT: cmovnel %ecx, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp eq i32 %x, %y
+ %not_cmp = xor i1 %cmp, true
+ %result = call i32 @llvm.ct.select.i32(i1 %not_cmp, i32 %a, i32 %b)
+ ret i32 %result
+}
+
+; Test for 64-bit specific optimizations
+define i64 @test_ctselect_i64_smin_zero(i64 %x) {
+; CHECK-LABEL: test_ctselect_i64_smin_zero:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testq %rdi, %rdi
+; CHECK-NEXT: sets %cl
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovneq %rdi, %rax
+; CHECK-NEXT: retq
+ %cmp = icmp slt i64 %x, 0
+ %result = call i64 @llvm.ct.select.i64(i1 %cmp, i64 %x, i64 0)
+ ret i64 %result
+}
+
+; Test for floating point optimizations
+define float @test_ctselect_f32_zero_positive(float %x) {
+; CHECK-LABEL: test_ctselect_f32_zero_positive:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: ucomiss %xmm1, %xmm0
+; CHECK-NEXT: seta %cl
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovnel %eax, %edx
+; CHECK-NEXT: movd %edx, %xmm0
+; CHECK-NEXT: retq
+ %cmp = fcmp ogt float %x, 0.0
+ %result = call float @llvm.ct.select.f32(i1 %cmp, float %x, float 0.0)
+ ret float %result
+}
+
+define double @test_ctselect_f64_zero_positive(double %x) {
+; CHECK-LABEL: test_ctselect_f64_zero_positive:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %xmm0, %rax
+; CHECK-NEXT: xorpd %xmm1, %xmm1
+; CHECK-NEXT: ucomisd %xmm1, %xmm0
+; CHECK-NEXT: seta %cl
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: testb %cl, %cl
+; CHECK-NEXT: cmovneq %rax, %rdx
+; CHECK-NEXT: movq %rdx, %xmm0
+; CHECK-NEXT: retq
+ %cmp = fcmp ogt double %x, 0.0
+ %result = call double @llvm.ct.select.f64(i1 %cmp, double %x, double 0.0)
+ ret double %result
+}
+
+; Test chain of ct.select operations
+define i32 @test_ctselect_chain(i1 %c1, i1 %c2, i1 %c3, i32 %a, i32 %b, i32 %c, i32 %d) {
+; CHECK-LABEL: test_ctselect_chain:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: testb $1, %dil
+; CHECK-NEXT: cmovnel %ecx, %r8d
+; CHECK-NEXT: testb $1, %sil
+; CHECK-NEXT: cmovnel %r8d, %r9d
+; CHECK-NEXT: testb $1, %dl
+; CHECK-NEXT: cmovnel %r9d, %eax
+; CHECK-NEXT: retq
+ %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b)
+ %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c)
+ %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d)
+ ret i32 %sel3
+}
+
+; Declare the intrinsics
+declare i32 @llvm.ct.select.i32(i1, i32, i32)
+declare i64 @llvm.ct.select.i64(i1, i64, i64)
+declare float @llvm.ct.select.f32(i1, float, float)
+declare double @llvm.ct.select.f64(i1, double, double)
diff --git a/llvm/test/CodeGen/X86/ctselect-vector.ll b/llvm/test/CodeGen/X86/ctselect-vector.ll
new file mode 100644
index 0000000000000..2206e32cd6d34
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ctselect-vector.ll
@@ -0,0 +1,1274 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+
+; Test ct.select functionality for vector types
+
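+; For reference, a minimal sketch of the mask-based lowering the checks below expect,
+; written with SSE2 intrinsics (illustration only, not the implementation): the i1
+; selector becomes an all-ones/all-zero lane mask combined with pand/pandn/por, so no
+; control flow depends on the selector:
+;
+;   #include <emmintrin.h>
+;   __m128i ct_select_v4i32(int c /* 0 or 1 */, __m128i a, __m128i b) {
+;     __m128i m = _mm_set1_epi32(0 - c);            /* broadcast 0 or -1 to all lanes */
+;     return _mm_or_si128(_mm_and_si128(m, a),      /* keep a lanes where m is all-ones */
+;                         _mm_andnot_si128(m, b));  /* keep b lanes where m is zero */
+;   }
+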
+; 128-bit vectors
+define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test_ctselect_v4i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v4i32:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %xmm3, %xmm3
+; AVX-NEXT: movd %eax, %xmm3
+; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX-NEXT: movdqa %xmm3, %xmm2
+; AVX-NEXT: pand %xmm0, %xmm3
+; AVX-NEXT: pandn %xmm1, %xmm2
+; AVX-NEXT: por %xmm3, %xmm2
+; AVX-NEXT: vmovaps %xmm2, %xmm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %xmm3, %xmm3
+; AVX2-NEXT: movd %eax, %xmm3
+; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX2-NEXT: movdqa %xmm3, %xmm2
+; AVX2-NEXT: pand %xmm0, %xmm3
+; AVX2-NEXT: pandn %xmm1, %xmm2
+; AVX2-NEXT: por %xmm3, %xmm2
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v4i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB0_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %xmm0, %xmm1
+; AVX512-NEXT: .LBB0_2:
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-NEXT: retq
+ %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b)
+ ret <4 x i32> %result
+}
+
+define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: test_ctselect_v4f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT: movaps %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v4f32:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %xmm3, %xmm3
+; AVX-NEXT: movd %eax, %xmm3
+; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX-NEXT: movdqa %xmm3, %xmm2
+; AVX-NEXT: pand %xmm0, %xmm3
+; AVX-NEXT: pandn %xmm1, %xmm2
+; AVX-NEXT: por %xmm3, %xmm2
+; AVX-NEXT: vmovaps %xmm2, %xmm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v4f32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %xmm3, %xmm3
+; AVX2-NEXT: movd %eax, %xmm3
+; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX2-NEXT: movdqa %xmm3, %xmm2
+; AVX2-NEXT: pand %xmm0, %xmm3
+; AVX2-NEXT: pandn %xmm1, %xmm2
+; AVX2-NEXT: por %xmm3, %xmm2
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v4f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB1_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %xmm0, %xmm1
+; AVX512-NEXT: .LBB1_2:
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-NEXT: retq
+ %result = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, <4 x float> %b)
+ ret <4 x float> %result
+}
+
+define <2 x i64> @test_ctselect_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) {
+; SSE2-LABEL: test_ctselect_v2i64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v2i64:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %xmm3, %xmm3
+; AVX-NEXT: movd %eax, %xmm3
+; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX-NEXT: movdqa %xmm3, %xmm2
+; AVX-NEXT: pand %xmm0, %xmm3
+; AVX-NEXT: pandn %xmm1, %xmm2
+; AVX-NEXT: por %xmm3, %xmm2
+; AVX-NEXT: vmovaps %xmm2, %xmm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %xmm3, %xmm3
+; AVX2-NEXT: movd %eax, %xmm3
+; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX2-NEXT: movdqa %xmm3, %xmm2
+; AVX2-NEXT: pand %xmm0, %xmm3
+; AVX2-NEXT: pandn %xmm1, %xmm2
+; AVX2-NEXT: por %xmm3, %xmm2
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v2i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB2_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %xmm0, %xmm1
+; AVX512-NEXT: .LBB2_2:
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-NEXT: retq
+ %result = call <2 x i64> @llvm.ct.select.v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b)
+ ret <2 x i64> %result
+}
+
+define <2 x double> @test_ctselect_v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) {
+; SSE2-LABEL: test_ctselect_v2f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT: movapd %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v2f64:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %xmm3, %xmm3
+; AVX-NEXT: movd %eax, %xmm3
+; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX-NEXT: movdqa %xmm3, %xmm2
+; AVX-NEXT: pand %xmm0, %xmm3
+; AVX-NEXT: pandn %xmm1, %xmm2
+; AVX-NEXT: por %xmm3, %xmm2
+; AVX-NEXT: vmovaps %xmm2, %xmm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v2f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %xmm3, %xmm3
+; AVX2-NEXT: movd %eax, %xmm3
+; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX2-NEXT: movdqa %xmm3, %xmm2
+; AVX2-NEXT: pand %xmm0, %xmm3
+; AVX2-NEXT: pandn %xmm1, %xmm2
+; AVX2-NEXT: por %xmm3, %xmm2
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v2f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB3_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovapd %xmm0, %xmm1
+; AVX512-NEXT: .LBB3_2:
+; AVX512-NEXT: vmovapd %xmm1, %xmm0
+; AVX512-NEXT: retq
+ %result = call <2 x double> @llvm.ct.select.v2f64(i1 %cond, <2 x double> %a, <2 x double> %b)
+ ret <2 x double> %result
+}
+
+; 256-bit vectors
+define <8 x i32> @test_ctselect_v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b) {
+; SSE2-LABEL: test_ctselect_v8i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: movd %eax, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: pand %xmm0, %xmm5
+; SSE2-NEXT: pandn %xmm2, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: movaps %xmm4, %xmm0
+; SSE2-NEXT: movaps %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v8i32:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm3, %ymm3
+; AVX-NEXT: vmovd %eax, %ymm3
+; AVX-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX-NEXT: vmovdqa %ymm3, %ymm2
+; AVX-NEXT: pand %ymm0, %ymm3
+; AVX-NEXT: pandn %ymm1, %ymm2
+; AVX-NEXT: por %ymm3, %ymm2
+; AVX-NEXT: vmovaps %ymm2, %ymm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v8i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm3, %ymm3
+; AVX2-NEXT: vmovd %eax, %ymm3
+; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vmovdqa %ymm3, %ymm2
+; AVX2-NEXT: pand %ymm0, %ymm3
+; AVX2-NEXT: pandn %ymm1, %ymm2
+; AVX2-NEXT: por %ymm3, %ymm2
+; AVX2-NEXT: vmovaps %ymm2, %ymm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v8i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB4_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %ymm0, %ymm1
+; AVX512-NEXT: .LBB4_2:
+; AVX512-NEXT: vmovaps %ymm1, %ymm0
+; AVX512-NEXT: retq
+ %result = call <8 x i32> @llvm.ct.select.v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b)
+ ret <8 x i32> %result
+}
+
+define <8 x float> @test_ctselect_v8f32(i1 %cond, <8 x float> %a, <8 x float> %b) {
+; SSE2-LABEL: test_ctselect_v8f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: movd %eax, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0]
+; SSE2-NEXT: movaps %xmm5, %xmm4
+; SSE2-NEXT: pand %xmm0, %xmm5
+; SSE2-NEXT: pandn %xmm2, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: movaps %xmm4, %xmm0
+; SSE2-NEXT: movaps %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v8f32:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm3, %ymm3
+; AVX-NEXT: vmovd %eax, %ymm3
+; AVX-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX-NEXT: vmovdqa %ymm3, %ymm2
+; AVX-NEXT: pand %ymm0, %ymm3
+; AVX-NEXT: pandn %ymm1, %ymm2
+; AVX-NEXT: por %ymm3, %ymm2
+; AVX-NEXT: vmovaps %ymm2, %ymm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v8f32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm3, %ymm3
+; AVX2-NEXT: vmovd %eax, %ymm3
+; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vmovdqa %ymm3, %ymm2
+; AVX2-NEXT: pand %ymm0, %ymm3
+; AVX2-NEXT: pandn %ymm1, %ymm2
+; AVX2-NEXT: por %ymm3, %ymm2
+; AVX2-NEXT: vmovaps %ymm2, %ymm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v8f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB5_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %ymm0, %ymm1
+; AVX512-NEXT: .LBB5_2:
+; AVX512-NEXT: vmovaps %ymm1, %ymm0
+; AVX512-NEXT: retq
+ %result = call <8 x float> @llvm.ct.select.v8f32(i1 %cond, <8 x float> %a, <8 x float> %b)
+ ret <8 x float> %result
+}
+
+define <4 x i64> @test_ctselect_v4i64(i1 %cond, <4 x i64> %a, <4 x i64> %b) {
+; SSE2-LABEL: test_ctselect_v4i64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: movd %eax, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: pand %xmm0, %xmm5
+; SSE2-NEXT: pandn %xmm2, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: movaps %xmm4, %xmm0
+; SSE2-NEXT: movaps %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v4i64:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm3, %ymm3
+; AVX-NEXT: vmovd %eax, %ymm3
+; AVX-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2]
+; AVX-NEXT: vmovdqa %ymm3, %ymm2
+; AVX-NEXT: pand %ymm0, %ymm3
+; AVX-NEXT: pandn %ymm1, %ymm2
+; AVX-NEXT: por %ymm3, %ymm2
+; AVX-NEXT: vmovaps %ymm2, %ymm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v4i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm3, %ymm3
+; AVX2-NEXT: vmovd %eax, %ymm3
+; AVX2-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2]
+; AVX2-NEXT: vmovdqa %ymm3, %ymm2
+; AVX2-NEXT: pand %ymm0, %ymm3
+; AVX2-NEXT: pandn %ymm1, %ymm2
+; AVX2-NEXT: por %ymm3, %ymm2
+; AVX2-NEXT: vmovaps %ymm2, %ymm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v4i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB6_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %ymm0, %ymm1
+; AVX512-NEXT: .LBB6_2:
+; AVX512-NEXT: vmovaps %ymm1, %ymm0
+; AVX512-NEXT: retq
+ %result = call <4 x i64> @llvm.ct.select.v4i64(i1 %cond, <4 x i64> %a, <4 x i64> %b)
+ ret <4 x i64> %result
+}
+
+define <4 x double> @test_ctselect_v4f64(i1 %cond, <4 x double> %a, <4 x double> %b) {
+; SSE2-LABEL: test_ctselect_v4f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: movd %eax, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0]
+; SSE2-NEXT: movapd %xmm5, %xmm4
+; SSE2-NEXT: pand %xmm0, %xmm5
+; SSE2-NEXT: pandn %xmm2, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movapd %xmm0, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: movaps %xmm4, %xmm0
+; SSE2-NEXT: movaps %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v4f64:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm3, %ymm3
+; AVX-NEXT: vmovd %eax, %ymm3
+; AVX-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2]
+; AVX-NEXT: vmovdqa %ymm3, %ymm2
+; AVX-NEXT: pand %ymm0, %ymm3
+; AVX-NEXT: pandn %ymm1, %ymm2
+; AVX-NEXT: por %ymm3, %ymm2
+; AVX-NEXT: vmovaps %ymm2, %ymm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v4f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm3, %ymm3
+; AVX2-NEXT: vmovd %eax, %ymm3
+; AVX2-NEXT: vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2]
+; AVX2-NEXT: vmovdqa %ymm3, %ymm2
+; AVX2-NEXT: pand %ymm0, %ymm3
+; AVX2-NEXT: pandn %ymm1, %ymm2
+; AVX2-NEXT: por %ymm3, %ymm2
+; AVX2-NEXT: vmovaps %ymm2, %ymm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v4f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB7_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovapd %ymm0, %ymm1
+; AVX512-NEXT: .LBB7_2:
+; AVX512-NEXT: vmovapd %ymm1, %ymm0
+; AVX512-NEXT: retq
+ %result = call <4 x double> @llvm.ct.select.v4f64(i1 %cond, <4 x double> %a, <4 x double> %b)
+ ret <4 x double> %result
+}
+
+; 512-bit vectors (AVX512 only)
+define <16 x i32> @test_ctselect_v16i32(i1 %cond, <16 x i32> %a, <16 x i32> %b) {
+; SSE2-LABEL: test_ctselect_v16i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: movd %eax, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm9, %xmm8
+; SSE2-NEXT: pand %xmm0, %xmm9
+; SSE2-NEXT: pandn %xmm4, %xmm8
+; SSE2-NEXT: por %xmm9, %xmm8
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm5, %xmm4
+; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm6, %xmm5
+; SSE2-NEXT: por %xmm0, %xmm5
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm7, %xmm6
+; SSE2-NEXT: por %xmm0, %xmm6
+; SSE2-NEXT: movaps %xmm8, %xmm0
+; SSE2-NEXT: movaps %xmm4, %xmm1
+; SSE2-NEXT: movaps %xmm5, %xmm2
+; SSE2-NEXT: movaps %xmm6, %xmm3
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v16i32:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm5, %ymm5
+; AVX-NEXT: vmovd %eax, %ymm5
+; AVX-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
+; AVX-NEXT: vmovdqa %ymm5, %ymm4
+; AVX-NEXT: pand %ymm0, %ymm5
+; AVX-NEXT: pandn %ymm2, %ymm4
+; AVX-NEXT: por %ymm5, %ymm4
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm0, %ymm0
+; AVX-NEXT: vmovd %eax, %ymm0
+; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX-NEXT: vmovdqa %ymm0, %ymm2
+; AVX-NEXT: pand %ymm1, %ymm0
+; AVX-NEXT: pandn %ymm3, %ymm2
+; AVX-NEXT: por %ymm0, %ymm2
+; AVX-NEXT: vmovaps %ymm4, %ymm0
+; AVX-NEXT: vmovaps %ymm2, %ymm1
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v16i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm5, %ymm5
+; AVX2-NEXT: vmovd %eax, %ymm5
+; AVX2-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vmovdqa %ymm5, %ymm4
+; AVX2-NEXT: pand %ymm0, %ymm5
+; AVX2-NEXT: pandn %ymm2, %ymm4
+; AVX2-NEXT: por %ymm5, %ymm4
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm0, %ymm0
+; AVX2-NEXT: vmovd %eax, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vmovdqa %ymm0, %ymm2
+; AVX2-NEXT: pand %ymm1, %ymm0
+; AVX2-NEXT: pandn %ymm3, %ymm2
+; AVX2-NEXT: por %ymm0, %ymm2
+; AVX2-NEXT: vmovaps %ymm4, %ymm0
+; AVX2-NEXT: vmovaps %ymm2, %ymm1
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v16i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB8_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %zmm0, %zmm1
+; AVX512-NEXT: .LBB8_2:
+; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: retq
+ %result = call <16 x i32> @llvm.ct.select.v16i32(i1 %cond, <16 x i32> %a, <16 x i32> %b)
+ ret <16 x i32> %result
+}
+
+define <16 x float> @test_ctselect_v16f32(i1 %cond, <16 x float> %a, <16 x float> %b) {
+; SSE2-LABEL: test_ctselect_v16f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: movd %eax, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0]
+; SSE2-NEXT: movaps %xmm9, %xmm8
+; SSE2-NEXT: pand %xmm0, %xmm9
+; SSE2-NEXT: pandn %xmm4, %xmm8
+; SSE2-NEXT: por %xmm9, %xmm8
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movaps %xmm0, %xmm4
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm5, %xmm4
+; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movaps %xmm0, %xmm5
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm6, %xmm5
+; SSE2-NEXT: por %xmm0, %xmm5
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movaps %xmm0, %xmm6
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm7, %xmm6
+; SSE2-NEXT: por %xmm0, %xmm6
+; SSE2-NEXT: movaps %xmm8, %xmm0
+; SSE2-NEXT: movaps %xmm4, %xmm1
+; SSE2-NEXT: movaps %xmm5, %xmm2
+; SSE2-NEXT: movaps %xmm6, %xmm3
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v16f32:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm5, %ymm5
+; AVX-NEXT: vmovd %eax, %ymm5
+; AVX-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
+; AVX-NEXT: vmovdqa %ymm5, %ymm4
+; AVX-NEXT: pand %ymm0, %ymm5
+; AVX-NEXT: pandn %ymm2, %ymm4
+; AVX-NEXT: por %ymm5, %ymm4
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm0, %ymm0
+; AVX-NEXT: vmovd %eax, %ymm0
+; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX-NEXT: vmovdqa %ymm0, %ymm2
+; AVX-NEXT: pand %ymm1, %ymm0
+; AVX-NEXT: pandn %ymm3, %ymm2
+; AVX-NEXT: por %ymm0, %ymm2
+; AVX-NEXT: vmovaps %ymm4, %ymm0
+; AVX-NEXT: vmovaps %ymm2, %ymm1
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v16f32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm5, %ymm5
+; AVX2-NEXT: vmovd %eax, %ymm5
+; AVX2-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vmovdqa %ymm5, %ymm4
+; AVX2-NEXT: pand %ymm0, %ymm5
+; AVX2-NEXT: pandn %ymm2, %ymm4
+; AVX2-NEXT: por %ymm5, %ymm4
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm0, %ymm0
+; AVX2-NEXT: vmovd %eax, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vmovdqa %ymm0, %ymm2
+; AVX2-NEXT: pand %ymm1, %ymm0
+; AVX2-NEXT: pandn %ymm3, %ymm2
+; AVX2-NEXT: por %ymm0, %ymm2
+; AVX2-NEXT: vmovaps %ymm4, %ymm0
+; AVX2-NEXT: vmovaps %ymm2, %ymm1
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v16f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB9_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %zmm0, %zmm1
+; AVX512-NEXT: .LBB9_2:
+; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: retq
+ %result = call <16 x float> @llvm.ct.select.v16f32(i1 %cond, <16 x float> %a, <16 x float> %b)
+ ret <16 x float> %result
+}
+
+define <8 x i64> @test_ctselect_v8i64(i1 %cond, <8 x i64> %a, <8 x i64> %b) {
+; SSE2-LABEL: test_ctselect_v8i64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: movd %eax, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm9, %xmm8
+; SSE2-NEXT: pand %xmm0, %xmm9
+; SSE2-NEXT: pandn %xmm4, %xmm8
+; SSE2-NEXT: por %xmm9, %xmm8
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm5, %xmm4
+; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm6, %xmm5
+; SSE2-NEXT: por %xmm0, %xmm5
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm7, %xmm6
+; SSE2-NEXT: por %xmm0, %xmm6
+; SSE2-NEXT: movaps %xmm8, %xmm0
+; SSE2-NEXT: movaps %xmm4, %xmm1
+; SSE2-NEXT: movaps %xmm5, %xmm2
+; SSE2-NEXT: movaps %xmm6, %xmm3
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v8i64:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm5, %ymm5
+; AVX-NEXT: vmovd %eax, %ymm5
+; AVX-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[0,0,2,2]
+; AVX-NEXT: vmovdqa %ymm5, %ymm4
+; AVX-NEXT: pand %ymm0, %ymm5
+; AVX-NEXT: pandn %ymm2, %ymm4
+; AVX-NEXT: por %ymm5, %ymm4
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm0, %ymm0
+; AVX-NEXT: vmovd %eax, %ymm0
+; AVX-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX-NEXT: vmovdqa %ymm0, %ymm2
+; AVX-NEXT: pand %ymm1, %ymm0
+; AVX-NEXT: pandn %ymm3, %ymm2
+; AVX-NEXT: por %ymm0, %ymm2
+; AVX-NEXT: vmovaps %ymm4, %ymm0
+; AVX-NEXT: vmovaps %ymm2, %ymm1
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v8i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm5, %ymm5
+; AVX2-NEXT: vmovd %eax, %ymm5
+; AVX2-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[0,0,2,2]
+; AVX2-NEXT: vmovdqa %ymm5, %ymm4
+; AVX2-NEXT: pand %ymm0, %ymm5
+; AVX2-NEXT: pandn %ymm2, %ymm4
+; AVX2-NEXT: por %ymm5, %ymm4
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm0, %ymm0
+; AVX2-NEXT: vmovd %eax, %ymm0
+; AVX2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX2-NEXT: vmovdqa %ymm0, %ymm2
+; AVX2-NEXT: pand %ymm1, %ymm0
+; AVX2-NEXT: pandn %ymm3, %ymm2
+; AVX2-NEXT: por %ymm0, %ymm2
+; AVX2-NEXT: vmovaps %ymm4, %ymm0
+; AVX2-NEXT: vmovaps %ymm2, %ymm1
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v8i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB10_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %zmm0, %zmm1
+; AVX512-NEXT: .LBB10_2:
+; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: retq
+ %result = call <8 x i64> @llvm.ct.select.v8i64(i1 %cond, <8 x i64> %a, <8 x i64> %b)
+ ret <8 x i64> %result
+}
+
+define <8 x double> @test_ctselect_v8f64(i1 %cond, <8 x double> %a, <8 x double> %b) {
+; SSE2-LABEL: test_ctselect_v8f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: testb $1, %dil
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: movd %eax, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0]
+; SSE2-NEXT: movapd %xmm9, %xmm8
+; SSE2-NEXT: pand %xmm0, %xmm9
+; SSE2-NEXT: pandn %xmm4, %xmm8
+; SSE2-NEXT: por %xmm9, %xmm8
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movapd %xmm0, %xmm4
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pandn %xmm5, %xmm4
+; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movapd %xmm0, %xmm5
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm6, %xmm5
+; SSE2-NEXT: por %xmm0, %xmm5
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movapd %xmm0, %xmm6
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm7, %xmm6
+; SSE2-NEXT: por %xmm0, %xmm6
+; SSE2-NEXT: movaps %xmm8, %xmm0
+; SSE2-NEXT: movaps %xmm4, %xmm1
+; SSE2-NEXT: movaps %xmm5, %xmm2
+; SSE2-NEXT: movaps %xmm6, %xmm3
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v8f64:
+; AVX: # %bb.0:
+; AVX-NEXT: testb $1, %dil
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm5, %ymm5
+; AVX-NEXT: vmovd %eax, %ymm5
+; AVX-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[0,0,2,2]
+; AVX-NEXT: vmovdqa %ymm5, %ymm4
+; AVX-NEXT: pand %ymm0, %ymm5
+; AVX-NEXT: pandn %ymm2, %ymm4
+; AVX-NEXT: por %ymm5, %ymm4
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %ymm0, %ymm0
+; AVX-NEXT: vmovd %eax, %ymm0
+; AVX-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX-NEXT: vmovdqa %ymm0, %ymm2
+; AVX-NEXT: pand %ymm1, %ymm0
+; AVX-NEXT: pandn %ymm3, %ymm2
+; AVX-NEXT: por %ymm0, %ymm2
+; AVX-NEXT: vmovaps %ymm4, %ymm0
+; AVX-NEXT: vmovaps %ymm2, %ymm1
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v8f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: testb $1, %dil
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm5, %ymm5
+; AVX2-NEXT: vmovd %eax, %ymm5
+; AVX2-NEXT: vpermilpd {{.*#+}} ymm5 = ymm5[0,0,2,2]
+; AVX2-NEXT: vmovdqa %ymm5, %ymm4
+; AVX2-NEXT: pand %ymm0, %ymm5
+; AVX2-NEXT: pandn %ymm2, %ymm4
+; AVX2-NEXT: por %ymm5, %ymm4
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %ymm0, %ymm0
+; AVX2-NEXT: vmovd %eax, %ymm0
+; AVX2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX2-NEXT: vmovdqa %ymm0, %ymm2
+; AVX2-NEXT: pand %ymm1, %ymm0
+; AVX2-NEXT: pandn %ymm3, %ymm2
+; AVX2-NEXT: por %ymm0, %ymm2
+; AVX2-NEXT: vmovaps %ymm4, %ymm0
+; AVX2-NEXT: vmovaps %ymm2, %ymm1
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v8f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: testb %dil, %dil
+; AVX512-NEXT: je .LBB11_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovapd %zmm0, %zmm1
+; AVX512-NEXT: .LBB11_2:
+; AVX512-NEXT: vmovapd %zmm1, %zmm0
+; AVX512-NEXT: retq
+ %result = call <8 x double> @llvm.ct.select.v8f64(i1 %cond, <8 x double> %a, <8 x double> %b)
+ ret <8 x double> %result
+}
+
+; Test with constant conditions for vector types
+define <4 x i32> @test_ctselect_v4i32_const_true(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test_ctselect_v4i32_const_true:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movb $1, %al
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v4i32_const_true:
+; AVX: # %bb.0:
+; AVX-NEXT: movb $1, %al
+; AVX-NEXT: testb %al, %al
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %xmm3, %xmm3
+; AVX-NEXT: movd %eax, %xmm3
+; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX-NEXT: movdqa %xmm3, %xmm2
+; AVX-NEXT: pand %xmm0, %xmm3
+; AVX-NEXT: pandn %xmm1, %xmm2
+; AVX-NEXT: por %xmm3, %xmm2
+; AVX-NEXT: vmovaps %xmm2, %xmm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v4i32_const_true:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movb $1, %al
+; AVX2-NEXT: testb %al, %al
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %xmm3, %xmm3
+; AVX2-NEXT: movd %eax, %xmm3
+; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX2-NEXT: movdqa %xmm3, %xmm2
+; AVX2-NEXT: pand %xmm0, %xmm3
+; AVX2-NEXT: pandn %xmm1, %xmm2
+; AVX2-NEXT: por %xmm3, %xmm2
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v4i32_const_true:
+; AVX512: # %bb.0:
+; AVX512-NEXT: retq
+ %result = call <4 x i32> @llvm.ct.select.v4i32(i1 true, <4 x i32> %a, <4 x i32> %b)
+ ret <4 x i32> %result
+}
+
+define <4 x i32> @test_ctselect_v4i32_const_false(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test_ctselect_v4i32_const_false:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v4i32_const_false:
+; AVX: # %bb.0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: testb %al, %al
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %xmm3, %xmm3
+; AVX-NEXT: movd %eax, %xmm3
+; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX-NEXT: movdqa %xmm3, %xmm2
+; AVX-NEXT: pand %xmm0, %xmm3
+; AVX-NEXT: pandn %xmm1, %xmm2
+; AVX-NEXT: por %xmm3, %xmm2
+; AVX-NEXT: vmovaps %xmm2, %xmm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v4i32_const_false:
+; AVX2: # %bb.0:
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: testb %al, %al
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %xmm3, %xmm3
+; AVX2-NEXT: movd %eax, %xmm3
+; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX2-NEXT: movdqa %xmm3, %xmm2
+; AVX2-NEXT: pand %xmm0, %xmm3
+; AVX2-NEXT: pandn %xmm1, %xmm2
+; AVX2-NEXT: por %xmm3, %xmm2
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v4i32_const_false:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-NEXT: retq
+ %result = call <4 x i32> @llvm.ct.select.v4i32(i1 false, <4 x i32> %a, <4 x i32> %b)
+ ret <4 x i32> %result
+}
+
+; Test with comparison conditions for vector types
+define <4 x i32> @test_ctselect_v4i32_icmp(i32 %x, i32 %y, <4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test_ctselect_v4i32_icmp:
+; SSE2: # %bb.0:
+; SSE2-NEXT: cmpl %esi, %edi
+; SSE2-NEXT: sete %al
+; SSE2-NEXT: testb %al, %al
+; SSE2-NEXT: movl $0, %eax
+; SSE2-NEXT: setne %al
+; SSE2-NEXT: movzbl %al, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_ctselect_v4i32_icmp:
+; AVX: # %bb.0:
+; AVX-NEXT: cmpl %esi, %edi
+; AVX-NEXT: sete %al
+; AVX-NEXT: testb %al, %al
+; AVX-NEXT: movl $0, %eax
+; AVX-NEXT: setne %al
+; AVX-NEXT: movzbl %al, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: pxor %xmm3, %xmm3
+; AVX-NEXT: movd %eax, %xmm3
+; AVX-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX-NEXT: movdqa %xmm3, %xmm2
+; AVX-NEXT: pand %xmm0, %xmm3
+; AVX-NEXT: pandn %xmm1, %xmm2
+; AVX-NEXT: por %xmm3, %xmm2
+; AVX-NEXT: vmovaps %xmm2, %xmm0
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: test_ctselect_v4i32_icmp:
+; AVX2: # %bb.0:
+; AVX2-NEXT: cmpl %esi, %edi
+; AVX2-NEXT: sete %al
+; AVX2-NEXT: testb %al, %al
+; AVX2-NEXT: movl $0, %eax
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: movzbl %al, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: pxor %xmm3, %xmm3
+; AVX2-NEXT: movd %eax, %xmm3
+; AVX2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX2-NEXT: movdqa %xmm3, %xmm2
+; AVX2-NEXT: pand %xmm0, %xmm3
+; AVX2-NEXT: pandn %xmm1, %xmm2
+; AVX2-NEXT: por %xmm3, %xmm2
+; AVX2-NEXT: vmovaps %xmm2, %xmm0
+; AVX2-NEXT: retq
+; AVX512-LABEL: test_ctselect_v4i32_icmp:
+; AVX512: # %bb.0:
+; AVX512-NEXT: cmpl %esi, %edi
+; AVX512-NEXT: je .LBB14_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-NEXT: .LBB14_2:
+; AVX512-NEXT: retq
+ %cond = icmp eq i32 %x, %y
+ %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b)
+ ret <4 x i32> %result
+}
+
+; Declare the intrinsics
+declare <4 x i32> @llvm.ct.select.v4i32(i1, <4 x i32>, <4 x i32>)
+declare <4 x float> @llvm.ct.select.v4f32(i1, <4 x float>, <4 x float>)
+declare <2 x i64> @llvm.ct.select.v2i64(i1, <2 x i64>, <2 x i64>)
+declare <2 x double> @llvm.ct.select.v2f64(i1, <2 x double>, <2 x double>)
+declare <8 x i32> @llvm.ct.select.v8i32(i1, <8 x i32>, <8 x i32>)
+declare <8 x float> @llvm.ct.select.v8f32(i1, <8 x float>, <8 x float>)
+declare <4 x i64> @llvm.ct.select.v4i64(i1, <4 x i64>, <4 x i64>)
+declare <4 x double> @llvm.ct.select.v4f64(i1, <4 x double>, <4 x double>)
+declare <16 x i32> @llvm.ct.select.v16i32(i1, <16 x i32>, <16 x i32>)
+declare <16 x float> @llvm.ct.select.v16f32(i1, <16 x float>, <16 x float>)
+declare <8 x i64> @llvm.ct.select.v8i64(i1, <8 x i64>, <8 x i64>)
+declare <8 x double> @llvm.ct.select.v8f64(i1, <8 x double>, <8 x double>)
diff --git a/llvm/test/CodeGen/X86/ctselect.ll b/llvm/test/CodeGen/X86/ctselect.ll
index 095787a5e2a4b..d76ae0365f28c 100644
--- a/llvm/test/CodeGen/X86/ctselect.ll
+++ b/llvm/test/CodeGen/X86/ctselect.ll
@@ -8,39 +8,33 @@
define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) {
; X64-LABEL: test_ctselect_i8:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-NEXT: andb $1, %dil
-; X64-NEXT: leal -1(%rdi), %eax
-; X64-NEXT: movl %edi, %ecx
-; X64-NEXT: negb %cl
-; X64-NEXT: andb %sil, %cl
-; X64-NEXT: andb %dl, %al
-; X64-NEXT: orb %cl, %al
+; X64-NEXT: movl %edx, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %esi, %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_i8:
; X32: # %bb.0:
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: andb $1, %al
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: negb %cl
-; X32-NEXT: andb {{[0-9]+}}(%esp), %cl
-; X32-NEXT: decb %al
-; X32-NEXT: andb {{[0-9]+}}(%esp), %al
-; X32-NEXT: orb %cl, %al
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; X32-NEXT: # kill: def $al killed $al killed $eax
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_i8:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: andb $1, %al
-; X32-NOCMOV-NEXT: movl %eax, %ecx
-; X32-NOCMOV-NEXT: negb %cl
-; X32-NOCMOV-NEXT: andb {{[0-9]+}}(%esp), %cl
-; X32-NOCMOV-NEXT: decb %al
-; X32-NOCMOV-NEXT: andb {{[0-9]+}}(%esp), %al
-; X32-NOCMOV-NEXT: orb %cl, %al
+; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: sete %ah
+; X32-NOCMOV-NEXT: movb %ah, %ch
+; X32-NOCMOV-NEXT: negb %ch
+; X32-NOCMOV-NEXT: movb %dl, %al
+; X32-NOCMOV-NEXT: andb %ch, %al
+; X32-NOCMOV-NEXT: notb %ch
+; X32-NOCMOV-NEXT: andb %cl, %ch
+; X32-NOCMOV-NEXT: orb %ch, %al
; X32-NOCMOV-NEXT: retl
%result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b)
ret i8 %result
@@ -49,39 +43,43 @@ define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) {
define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) {
; X64-LABEL: test_ctselect_i16:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-NEXT: andl $1, %edi
-; X64-NEXT: leal -1(%rdi), %ecx
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: negl %eax
-; X64-NEXT: andl %esi, %eax
-; X64-NEXT: andl %edx, %ecx
-; X64-NEXT: orl %ecx, %eax
+; X64-NEXT: movl %edx, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %esi, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_i16:
; X32: # %bb.0:
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: andl $1, %eax
-; X32-NEXT: leal -1(%eax), %ecx
-; X32-NEXT: andw {{[0-9]+}}(%esp), %cx
-; X32-NEXT: negl %eax
-; X32-NEXT: andw {{[0-9]+}}(%esp), %ax
-; X32-NEXT: orl %ecx, %eax
-; X32-NEXT: # kill: def $ax killed $ax killed $eax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnew {{[0-9]+}}(%esp), %ax
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_i16:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: andl $1, %eax
-; X32-NOCMOV-NEXT: leal -1(%eax), %ecx
-; X32-NOCMOV-NEXT: andw {{[0-9]+}}(%esp), %cx
-; X32-NOCMOV-NEXT: negl %eax
-; X32-NOCMOV-NEXT: andw {{[0-9]+}}(%esp), %ax
-; X32-NOCMOV-NEXT: orl %ecx, %eax
-; X32-NOCMOV-NEXT: # kill: def $ax killed $ax killed $eax
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
+; X32-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT: movzwl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbw %bh, %si
+; X32-NOCMOV-NEXT: negw %si
+; X32-NOCMOV-NEXT: movw %dx, %ax
+; X32-NOCMOV-NEXT: andw %si, %ax
+; X32-NOCMOV-NEXT: notw %si
+; X32-NOCMOV-NEXT: andw %cx, %si
+; X32-NOCMOV-NEXT: orw %si, %ax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b)
ret i16 %result
@@ -90,38 +88,42 @@ define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) {
define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) {
; X64-LABEL: test_ctselect_i32:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-NEXT: andl $1, %edi
-; X64-NEXT: leal -1(%rdi), %eax
-; X64-NEXT: movl %edi, %ecx
-; X64-NEXT: negl %ecx
-; X64-NEXT: andl %esi, %ecx
-; X64-NEXT: andl %edx, %eax
-; X64-NEXT: orl %ecx, %eax
+; X64-NEXT: movl %edx, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %esi, %eax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_i32:
; X32: # %bb.0:
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: andl $1, %eax
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: negl %ecx
-; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: decl %eax
-; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: orl %ecx, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_i32:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: andl $1, %eax
-; X32-NOCMOV-NEXT: movl %eax, %ecx
-; X32-NOCMOV-NEXT: negl %ecx
-; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: decl %eax
-; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: orl %ecx, %eax
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edx, %eax
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ecx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
ret i32 %result
@@ -130,56 +132,66 @@ define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) {
define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) {
; X64-LABEL: test_ctselect_i64:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-NEXT: andl $1, %edi
-; X64-NEXT: leaq -1(%rdi), %rax
-; X64-NEXT: negq %rdi
-; X64-NEXT: andq %rsi, %rdi
-; X64-NEXT: andq %rdx, %rax
-; X64-NEXT: orq %rdi, %rax
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovneq %rsi, %rax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_i64:
; X32: # %bb.0:
-; X32-NEXT: pushl %esi
-; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: .cfi_offset %esi, -8
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: xorl %edx, %eax
-; X32-NEXT: andl $1, %esi
-; X32-NEXT: negl %esi
-; X32-NEXT: andl %esi, %eax
-; X32-NEXT: xorl %edx, %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: xorl %ecx, %edx
-; X32-NEXT: andl %esi, %edx
-; X32-NEXT: xorl %ecx, %edx
-; X32-NEXT: popl %esi
-; X32-NEXT: .cfi_def_cfa_offset 4
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %edx
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_i64:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: pushl %ebp
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
-; X32-NOCMOV-NEXT: .cfi_offset %esi, -8
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %esi
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: pushl %edi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -20
+; X32-NOCMOV-NEXT: .cfi_offset %edi, -16
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebp, -8
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: xorl %edx, %eax
-; X32-NOCMOV-NEXT: andl $1, %esi
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
+; X32-NOCMOV-NEXT: testb $1, %bl
+; X32-NOCMOV-NEXT: sete %bh
+; X32-NOCMOV-NEXT: movb %bh, %cl
+; X32-NOCMOV-NEXT: movzbl %cl, %esi
; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edx, %eax
; X32-NOCMOV-NEXT: andl %esi, %eax
-; X32-NOCMOV-NEXT: xorl %edx, %eax
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: xorl %ecx, %edx
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ebp, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
+; X32-NOCMOV-NEXT: testb $1, %bl
+; X32-NOCMOV-NEXT: sete %cl
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X32-NOCMOV-NEXT: movb %cl, %ch
+; X32-NOCMOV-NEXT: movzbl %ch, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edi, %edx
; X32-NOCMOV-NEXT: andl %esi, %edx
-; X32-NOCMOV-NEXT: xorl %ecx, %edx
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ebx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %edx
; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT: popl %edi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebp
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%result = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b)
@@ -189,51 +201,74 @@ define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) {
define float @test_ctselect_f32(i1 %cond, float %a, float %b) {
; X64-LABEL: test_ctselect_f32:
; X64: # %bb.0:
-; X64-NEXT: movd %xmm1, %eax
-; X64-NEXT: movd %xmm0, %ecx
-; X64-NEXT: andl $1, %edi
-; X64-NEXT: movl %edi, %edx
-; X64-NEXT: negl %edx
-; X64-NEXT: andl %ecx, %edx
-; X64-NEXT: decl %edi
-; X64-NEXT: andl %eax, %edi
-; X64-NEXT: orl %edx, %edi
-; X64-NEXT: movd %edi, %xmm0
+; X64-NEXT: movd %xmm0, %eax
+; X64-NEXT: movd %xmm1, %ecx
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %eax, %ecx
+; X64-NEXT: movd %ecx, %xmm0
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_f32:
; X32: # %bb.0:
-; X32-NEXT: pushl %eax
+; X32-NEXT: pushl %edi
; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: andl $1, %eax
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: negl %ecx
-; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: decl %eax
-; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: orl %ecx, %eax
-; X32-NEXT: movl %eax, (%esp)
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: pushl %eax
+; X32-NEXT: .cfi_def_cfa_offset 16
+; X32-NEXT: .cfi_offset %esi, -12
+; X32-NEXT: .cfi_offset %edi, -8
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: sete %al
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movb %al, %ah
+; X32-NEXT: movzbl %ah, %edi
+; X32-NEXT: negl %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: andl %edi, %esi
+; X32-NEXT: notl %edi
+; X32-NEXT: andl %ecx, %edi
+; X32-NEXT: orl %edi, %esi
+; X32-NEXT: movl %esi, (%esp)
; X32-NEXT: flds (%esp)
-; X32-NEXT: popl %eax
+; X32-NEXT: addl $4, %esp
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: popl %edi
; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_f32:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: pushl %eax
+; X32-NOCMOV-NEXT: pushl %edi
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: andl $1, %eax
-; X32-NOCMOV-NEXT: movl %eax, %ecx
-; X32-NOCMOV-NEXT: negl %ecx
-; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: decl %eax
-; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: orl %ecx, %eax
-; X32-NOCMOV-NEXT: movl %eax, (%esp)
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: pushl %eax
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %edi, -8
+; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: sete %al
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT: movb %al, %ah
+; X32-NOCMOV-NEXT: movzbl %ah, %edi
+; X32-NOCMOV-NEXT: negl %edi
+; X32-NOCMOV-NEXT: movl %edx, %esi
+; X32-NOCMOV-NEXT: andl %edi, %esi
+; X32-NOCMOV-NEXT: notl %edi
+; X32-NOCMOV-NEXT: andl %ecx, %edi
+; X32-NOCMOV-NEXT: orl %edi, %esi
+; X32-NOCMOV-NEXT: movl %esi, (%esp)
; X32-NOCMOV-NEXT: flds (%esp)
-; X32-NOCMOV-NEXT: popl %eax
+; X32-NOCMOV-NEXT: addl $4, %esp
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %edi
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b)
@@ -243,74 +278,96 @@ define float @test_ctselect_f32(i1 %cond, float %a, float %b) {
define double @test_ctselect_f64(i1 %cond, double %a, double %b) {
; X64-LABEL: test_ctselect_f64:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-NEXT: movq %xmm1, %rax
-; X64-NEXT: movq %xmm0, %rcx
-; X64-NEXT: andl $1, %edi
-; X64-NEXT: movq %rdi, %rdx
-; X64-NEXT: negq %rdx
-; X64-NEXT: andq %rcx, %rdx
-; X64-NEXT: decq %rdi
-; X64-NEXT: andq %rax, %rdi
-; X64-NEXT: orq %rdx, %rdi
-; X64-NEXT: movq %rdi, %xmm0
+; X64-NEXT: movq %xmm0, %rax
+; X64-NEXT: movq %xmm1, %rcx
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovneq %rax, %rcx
+; X64-NEXT: movq %rcx, %xmm0
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_f64:
; X32: # %bb.0:
-; X32-NEXT: pushl %esi
+; X32-NEXT: pushl %edi
; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 12
; X32-NEXT: subl $8, %esp
-; X32-NEXT: .cfi_def_cfa_offset 16
-; X32-NEXT: .cfi_offset %esi, -8
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: .cfi_def_cfa_offset 20
+; X32-NEXT: .cfi_offset %esi, -12
+; X32-NEXT: .cfi_offset %edi, -8
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: sete %al
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: xorl %edx, %esi
-; X32-NEXT: andl $1, %ecx
-; X32-NEXT: negl %ecx
-; X32-NEXT: andl %ecx, %esi
-; X32-NEXT: xorl %edx, %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X32-NEXT: movb %al, %ah
+; X32-NEXT: movzbl %ah, %edi
+; X32-NEXT: negl %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: andl %edi, %esi
+; X32-NEXT: notl %edi
+; X32-NEXT: andl %ecx, %edi
+; X32-NEXT: orl %edi, %esi
+; X32-NEXT: movl %esi, (%esp)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: xorl %eax, %edx
-; X32-NEXT: andl %ecx, %edx
-; X32-NEXT: xorl %eax, %edx
-; X32-NEXT: movl %edx, (%esp)
+; X32-NEXT: movb %al, %ah
+; X32-NEXT: movzbl %ah, %edi
+; X32-NEXT: negl %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: andl %edi, %esi
+; X32-NEXT: notl %edi
+; X32-NEXT: andl %ecx, %edi
+; X32-NEXT: orl %edi, %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X32-NEXT: fldl (%esp)
; X32-NEXT: addl $8, %esp
-; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: .cfi_def_cfa_offset 12
; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: popl %edi
; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_f64:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: pushl %edi
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
; X32-NOCMOV-NEXT: subl $8, %esp
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
-; X32-NOCMOV-NEXT: .cfi_offset %esi, -8
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 20
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %edi, -8
+; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: sete %al
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NOCMOV-NEXT: xorl %edx, %esi
-; X32-NOCMOV-NEXT: andl $1, %ecx
-; X32-NOCMOV-NEXT: negl %ecx
-; X32-NOCMOV-NEXT: andl %ecx, %esi
-; X32-NOCMOV-NEXT: xorl %edx, %esi
-; X32-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: movb %al, %ah
+; X32-NOCMOV-NEXT: movzbl %ah, %edi
+; X32-NOCMOV-NEXT: negl %edi
+; X32-NOCMOV-NEXT: movl %edx, %esi
+; X32-NOCMOV-NEXT: andl %edi, %esi
+; X32-NOCMOV-NEXT: notl %edi
+; X32-NOCMOV-NEXT: andl %ecx, %edi
+; X32-NOCMOV-NEXT: orl %edi, %esi
+; X32-NOCMOV-NEXT: movl %esi, (%esp)
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: xorl %eax, %edx
-; X32-NOCMOV-NEXT: andl %ecx, %edx
-; X32-NOCMOV-NEXT: xorl %eax, %edx
-; X32-NOCMOV-NEXT: movl %edx, (%esp)
+; X32-NOCMOV-NEXT: movb %al, %ah
+; X32-NOCMOV-NEXT: movzbl %ah, %edi
+; X32-NOCMOV-NEXT: negl %edi
+; X32-NOCMOV-NEXT: movl %edx, %esi
+; X32-NOCMOV-NEXT: andl %edi, %esi
+; X32-NOCMOV-NEXT: notl %edi
+; X32-NOCMOV-NEXT: andl %ecx, %edi
+; X32-NOCMOV-NEXT: orl %edi, %esi
+; X32-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X32-NOCMOV-NEXT: fldl (%esp)
; X32-NOCMOV-NEXT: addl $8, %esp
-; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %edi
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b)
@@ -320,37 +377,42 @@ define double @test_ctselect_f64(i1 %cond, double %a, double %b) {
define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) {
; X64-LABEL: test_ctselect_ptr:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-NEXT: andl $1, %edi
-; X64-NEXT: leaq -1(%rdi), %rax
-; X64-NEXT: negq %rdi
-; X64-NEXT: andq %rsi, %rdi
-; X64-NEXT: andq %rdx, %rax
-; X64-NEXT: orq %rdi, %rax
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovneq %rsi, %rax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_ptr:
; X32: # %bb.0:
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: andl $1, %eax
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: negl %ecx
-; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: decl %eax
-; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: orl %ecx, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_ptr:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: andl $1, %eax
-; X32-NOCMOV-NEXT: movl %eax, %ecx
-; X32-NOCMOV-NEXT: negl %ecx
-; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: decl %eax
-; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: orl %ecx, %eax
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edx, %eax
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ecx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b)
ret ptr %result
@@ -360,17 +422,45 @@ define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) {
define i32 @test_ctselect_const_true(i32 %a, i32 %b) {
; X64-LABEL: test_ctselect_const_true:
; X64: # %bb.0:
-; X64-NEXT: movl %edi, %eax
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: movb $1, %cl
+; X64-NEXT: testb %cl, %cl
+; X64-NEXT: cmovnel %edi, %eax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_const_true:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movb $1, %cl
+; X32-NEXT: testb %cl, %cl
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_const_true:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT: movb $1, %al
+; X32-NOCMOV-NEXT: testb %al, %al
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edx, %eax
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ecx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b)
ret i32 %result
@@ -380,18 +470,44 @@ define i32 @test_ctselect_const_false(i32 %a, i32 %b) {
; X64-LABEL: test_ctselect_const_false:
; X64: # %bb.0:
; X64-NEXT: movl %esi, %eax
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: testb %cl, %cl
+; X64-NEXT: cmovnel %edi, %eax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_const_false:
; X32: # %bb.0:
-; X32-NEXT: xorl %eax, %eax
-; X32-NEXT: orl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: xorl %ecx, %ecx
+; X32-NEXT: testb %cl, %cl
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_const_false:
; X32-NOCMOV: # %bb.0:
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NOCMOV-NEXT: xorl %eax, %eax
-; X32-NOCMOV-NEXT: orl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT: testb %al, %al
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edx, %eax
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ecx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b)
ret i32 %result
@@ -401,43 +517,50 @@ define i32 @test_ctselect_const_false(i32 %a, i32 %b) {
define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) {
; X64-LABEL: test_ctselect_icmp_eq:
; X64: # %bb.0:
-; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: movl %ecx, %eax
; X64-NEXT: cmpl %esi, %edi
-; X64-NEXT: sete %al
-; X64-NEXT: movl %eax, %esi
-; X64-NEXT: negl %esi
-; X64-NEXT: andl %edx, %esi
-; X64-NEXT: decl %eax
-; X64-NEXT: andl %ecx, %eax
-; X64-NEXT: orl %esi, %eax
+; X64-NEXT: sete %cl
+; X64-NEXT: testb %cl, %cl
+; X64-NEXT: cmovnel %edx, %eax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_icmp_eq:
; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: xorl %eax, %eax
; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: sete %al
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: negl %ecx
-; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: decl %eax
-; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: orl %ecx, %eax
+; X32-NEXT: sete %cl
+; X32-NEXT: testb %cl, %cl
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_icmp_eq:
; X32-NOCMOV: # %bb.0:
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: xorl %eax, %eax
-; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %eax
; X32-NOCMOV-NEXT: sete %al
-; X32-NOCMOV-NEXT: movl %eax, %ecx
-; X32-NOCMOV-NEXT: negl %ecx
-; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: decl %eax
-; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: orl %ecx, %eax
+; X32-NOCMOV-NEXT: testb %al, %al
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edx, %eax
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ecx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%cond = icmp eq i32 %x, %y
%result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
@@ -447,43 +570,50 @@ define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) {
define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) {
; X64-LABEL: test_ctselect_icmp_ne:
; X64: # %bb.0:
-; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: movl %ecx, %eax
; X64-NEXT: cmpl %esi, %edi
-; X64-NEXT: setne %al
-; X64-NEXT: movl %eax, %esi
-; X64-NEXT: negl %esi
-; X64-NEXT: andl %edx, %esi
-; X64-NEXT: decl %eax
-; X64-NEXT: andl %ecx, %eax
-; X64-NEXT: orl %esi, %eax
+; X64-NEXT: setne %cl
+; X64-NEXT: testb %cl, %cl
+; X64-NEXT: cmovnel %edx, %eax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_icmp_ne:
; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: xorl %eax, %eax
; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: setne %al
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: negl %ecx
-; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: decl %eax
-; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: orl %ecx, %eax
+; X32-NEXT: setne %cl
+; X32-NEXT: testb %cl, %cl
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_icmp_ne:
; X32-NOCMOV: # %bb.0:
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: xorl %eax, %eax
-; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %eax
; X32-NOCMOV-NEXT: setne %al
-; X32-NOCMOV-NEXT: movl %eax, %ecx
-; X32-NOCMOV-NEXT: negl %ecx
-; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: decl %eax
-; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: orl %ecx, %eax
+; X32-NOCMOV-NEXT: testb %al, %al
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edx, %eax
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ecx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%cond = icmp ne i32 %x, %y
%result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
@@ -493,43 +623,50 @@ define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) {
define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) {
; X64-LABEL: test_ctselect_icmp_slt:
; X64: # %bb.0:
-; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: movl %ecx, %eax
; X64-NEXT: cmpl %esi, %edi
-; X64-NEXT: setl %al
-; X64-NEXT: movl %eax, %esi
-; X64-NEXT: negl %esi
-; X64-NEXT: andl %edx, %esi
-; X64-NEXT: decl %eax
-; X64-NEXT: andl %ecx, %eax
-; X64-NEXT: orl %esi, %eax
+; X64-NEXT: setl %cl
+; X64-NEXT: testb %cl, %cl
+; X64-NEXT: cmovnel %edx, %eax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_icmp_slt:
; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: xorl %eax, %eax
; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: setl %al
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: negl %ecx
-; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: decl %eax
-; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: orl %ecx, %eax
+; X32-NEXT: setl %cl
+; X32-NEXT: testb %cl, %cl
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_icmp_slt:
; X32-NOCMOV: # %bb.0:
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: xorl %eax, %eax
-; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %eax
; X32-NOCMOV-NEXT: setl %al
-; X32-NOCMOV-NEXT: movl %eax, %ecx
-; X32-NOCMOV-NEXT: negl %ecx
-; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: decl %eax
-; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: orl %ecx, %eax
+; X32-NOCMOV-NEXT: testb %al, %al
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edx, %eax
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ecx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%cond = icmp slt i32 %x, %y
%result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
@@ -539,39 +676,50 @@ define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) {
define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) {
; X64-LABEL: test_ctselect_icmp_ult:
; X64: # %bb.0:
-; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: movl %ecx, %eax
; X64-NEXT: cmpl %esi, %edi
-; X64-NEXT: sbbl %eax, %eax
-; X64-NEXT: andl %eax, %edx
-; X64-NEXT: notl %eax
-; X64-NEXT: andl %ecx, %eax
-; X64-NEXT: orl %edx, %eax
+; X64-NEXT: setb %cl
+; X64-NEXT: testb %cl, %cl
+; X64-NEXT: cmovnel %edx, %eax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_icmp_ult:
; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: xorl %eax, %eax
; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: sbbl %eax, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: andl %eax, %ecx
-; X32-NEXT: notl %eax
-; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: orl %ecx, %eax
+; X32-NEXT: setb %cl
+; X32-NEXT: testb %cl, %cl
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_icmp_ult:
; X32-NOCMOV: # %bb.0:
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: xorl %eax, %eax
-; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: sbbl %eax, %eax
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: andl %eax, %ecx
-; X32-NOCMOV-NEXT: notl %eax
-; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: orl %ecx, %eax
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT: setb %al
+; X32-NOCMOV-NEXT: testb %al, %al
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %esi
+; X32-NOCMOV-NEXT: negl %esi
+; X32-NOCMOV-NEXT: movl %edx, %eax
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ecx, %esi
+; X32-NOCMOV-NEXT: orl %esi, %eax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%cond = icmp ult i32 %x, %y
%result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
@@ -581,45 +729,64 @@ define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) {
define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) {
; X64-LABEL: test_ctselect_fcmp_oeq:
; X64: # %bb.0:
-; X64-NEXT: movd %xmm3, %eax
-; X64-NEXT: cmpeqss %xmm1, %xmm0
-; X64-NEXT: movd %xmm0, %ecx
-; X64-NEXT: pand %xmm2, %xmm0
-; X64-NEXT: movd %xmm0, %edx
-; X64-NEXT: notl %ecx
-; X64-NEXT: andl %eax, %ecx
-; X64-NEXT: orl %edx, %ecx
+; X64-NEXT: movd %xmm2, %eax
+; X64-NEXT: movd %xmm3, %ecx
+; X64-NEXT: ucomiss %xmm1, %xmm0
+; X64-NEXT: setnp %dl
+; X64-NEXT: sete %sil
+; X64-NEXT: testb %dl, %sil
+; X64-NEXT: cmovnel %eax, %ecx
; X64-NEXT: movd %ecx, %xmm0
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_fcmp_oeq:
; X32: # %bb.0:
-; X32-NEXT: pushl %eax
+; X32-NEXT: pushl %edi
; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: pushl %eax
+; X32-NEXT: .cfi_def_cfa_offset 16
+; X32-NEXT: .cfi_offset %esi, -12
+; X32-NEXT: .cfi_offset %edi, -8
; X32-NEXT: flds {{[0-9]+}}(%esp)
; X32-NEXT: flds {{[0-9]+}}(%esp)
; X32-NEXT: fucompi %st(1), %st
; X32-NEXT: fstp %st(0)
; X32-NEXT: setnp %al
; X32-NEXT: sete %cl
-; X32-NEXT: andb %al, %cl
-; X32-NEXT: movzbl %cl, %eax
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: negl %ecx
-; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: decl %eax
-; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: orl %ecx, %eax
-; X32-NEXT: movl %eax, (%esp)
+; X32-NEXT: testb %al, %cl
+; X32-NEXT: sete %al
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movb %al, %ah
+; X32-NEXT: movzbl %ah, %edi
+; X32-NEXT: negl %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: andl %edi, %esi
+; X32-NEXT: notl %edi
+; X32-NEXT: andl %ecx, %edi
+; X32-NEXT: orl %edi, %esi
+; X32-NEXT: movl %esi, (%esp)
; X32-NEXT: flds (%esp)
-; X32-NEXT: popl %eax
+; X32-NEXT: addl $4, %esp
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: popl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: popl %edi
; X32-NEXT: .cfi_def_cfa_offset 4
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_fcmp_oeq:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: pushl %eax
+; X32-NOCMOV-NEXT: pushl %edi
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: pushl %eax
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %edi, -8
; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
; X32-NOCMOV-NEXT: flds {{[0-9]+}}(%esp)
; X32-NOCMOV-NEXT: fucompp
@@ -628,17 +795,25 @@ define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) {
; X32-NOCMOV-NEXT: sahf
; X32-NOCMOV-NEXT: setnp %al
; X32-NOCMOV-NEXT: sete %cl
-; X32-NOCMOV-NEXT: andb %al, %cl
-; X32-NOCMOV-NEXT: movzbl %cl, %eax
-; X32-NOCMOV-NEXT: movl %eax, %ecx
-; X32-NOCMOV-NEXT: negl %ecx
-; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: decl %eax
-; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: orl %ecx, %eax
-; X32-NOCMOV-NEXT: movl %eax, (%esp)
+; X32-NOCMOV-NEXT: testb %al, %cl
+; X32-NOCMOV-NEXT: sete %al
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT: movb %al, %ah
+; X32-NOCMOV-NEXT: movzbl %ah, %edi
+; X32-NOCMOV-NEXT: negl %edi
+; X32-NOCMOV-NEXT: movl %edx, %esi
+; X32-NOCMOV-NEXT: andl %edi, %esi
+; X32-NOCMOV-NEXT: notl %edi
+; X32-NOCMOV-NEXT: andl %ecx, %edi
+; X32-NOCMOV-NEXT: orl %edi, %esi
+; X32-NOCMOV-NEXT: movl %esi, (%esp)
; X32-NOCMOV-NEXT: flds (%esp)
-; X32-NOCMOV-NEXT: popl %eax
+; X32-NOCMOV-NEXT: addl $4, %esp
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %edi
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%cond = fcmp oeq float %x, %y
@@ -650,51 +825,45 @@ define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) {
define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) {
; X64-LABEL: test_ctselect_load:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-NEXT: andl $1, %edi
-; X64-NEXT: leal -1(%rdi), %eax
-; X64-NEXT: movl %edi, %ecx
-; X64-NEXT: negl %ecx
-; X64-NEXT: andl (%rsi), %ecx
-; X64-NEXT: andl (%rdx), %eax
-; X64-NEXT: orl %ecx, %eax
+; X64-NEXT: movl (%rdx), %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel (%rsi), %eax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_load:
; X32: # %bb.0:
-; X32-NEXT: pushl %esi
-; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: .cfi_offset %esi, -8
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: andl $1, %eax
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: negl %esi
-; X32-NEXT: andl (%edx), %esi
-; X32-NEXT: decl %eax
-; X32-NEXT: andl (%ecx), %eax
-; X32-NEXT: orl %esi, %eax
-; X32-NEXT: popl %esi
-; X32-NEXT: .cfi_def_cfa_offset 4
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl (%eax), %eax
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel (%ecx), %eax
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_load:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: pushl %ebx
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
-; X32-NOCMOV-NEXT: .cfi_offset %esi, -8
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: andl $1, %eax
-; X32-NOCMOV-NEXT: movl %eax, %esi
+; X32-NOCMOV-NEXT: movl (%ecx), %ecx
+; X32-NOCMOV-NEXT: movl (%eax), %edx
+; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %esi
; X32-NOCMOV-NEXT: negl %esi
-; X32-NOCMOV-NEXT: andl (%edx), %esi
-; X32-NOCMOV-NEXT: decl %eax
-; X32-NOCMOV-NEXT: andl (%ecx), %eax
+; X32-NOCMOV-NEXT: movl %edx, %eax
+; X32-NOCMOV-NEXT: andl %esi, %eax
+; X32-NOCMOV-NEXT: notl %esi
+; X32-NOCMOV-NEXT: andl %ecx, %esi
; X32-NOCMOV-NEXT: orl %esi, %eax
; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%a = load i32, ptr %p1
@@ -707,62 +876,63 @@ define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) {
define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) {
; X64-LABEL: test_ctselect_nested:
; X64: # %bb.0:
-; X64-NEXT: # kill: def $esi killed $esi def $rsi
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-NEXT: andl $1, %esi
-; X64-NEXT: leal -1(%rsi), %r9d
-; X64-NEXT: movl %esi, %eax
-; X64-NEXT: negl %eax
-; X64-NEXT: andl %edx, %eax
-; X64-NEXT: andl %ecx, %r9d
-; X64-NEXT: orl %eax, %r9d
-; X64-NEXT: andl $1, %edi
-; X64-NEXT: leal -1(%rdi), %eax
-; X64-NEXT: movl %edi, %ecx
-; X64-NEXT: negl %ecx
-; X64-NEXT: andl %r9d, %ecx
-; X64-NEXT: andl %r8d, %eax
-; X64-NEXT: orl %ecx, %eax
+; X64-NEXT: movl %r8d, %eax
+; X64-NEXT: testb $1, %sil
+; X64-NEXT: cmovnel %edx, %ecx
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %ecx, %eax
; X64-NEXT: retq
;
; X32-LABEL: test_ctselect_nested:
; X32: # %bb.0:
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: andl $1, %ecx
-; X32-NEXT: movl %ecx, %edx
-; X32-NEXT: negl %edx
-; X32-NEXT: andl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: decl %ecx
-; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: orl %edx, %ecx
-; X32-NEXT: andl $1, %eax
-; X32-NEXT: movl %eax, %edx
-; X32-NEXT: negl %edx
-; X32-NEXT: andl %ecx, %edx
-; X32-NEXT: decl %eax
-; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: orl %edx, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT: cmovnel %ecx, %eax
; X32-NEXT: retl
;
; X32-NOCMOV-LABEL: test_ctselect_nested:
; X32-NOCMOV: # %bb.0:
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: andl $1, %ecx
-; X32-NOCMOV-NEXT: movl %ecx, %edx
-; X32-NOCMOV-NEXT: negl %edx
-; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT: decl %ecx
-; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT: orl %edx, %ecx
-; X32-NOCMOV-NEXT: andl $1, %eax
-; X32-NOCMOV-NEXT: movl %eax, %edx
-; X32-NOCMOV-NEXT: negl %edx
-; X32-NOCMOV-NEXT: andl %ecx, %edx
-; X32-NOCMOV-NEXT: decl %eax
-; X32-NOCMOV-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT: orl %edx, %eax
+; X32-NOCMOV-NEXT: pushl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: pushl %edi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: pushl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT: .cfi_offset %esi, -16
+; X32-NOCMOV-NEXT: .cfi_offset %edi, -12
+; X32-NOCMOV-NEXT: .cfi_offset %ebx, -8
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: sete %bl
+; X32-NOCMOV-NEXT: movb %bl, %bh
+; X32-NOCMOV-NEXT: movzbl %bh, %edi
+; X32-NOCMOV-NEXT: negl %edi
+; X32-NOCMOV-NEXT: movl %edx, %esi
+; X32-NOCMOV-NEXT: andl %edi, %esi
+; X32-NOCMOV-NEXT: notl %edi
+; X32-NOCMOV-NEXT: andl %eax, %edi
+; X32-NOCMOV-NEXT: orl %edi, %esi
+; X32-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT: sete %dl
+; X32-NOCMOV-NEXT: movb %dl, %dh
+; X32-NOCMOV-NEXT: movzbl %dh, %edi
+; X32-NOCMOV-NEXT: negl %edi
+; X32-NOCMOV-NEXT: movl %ecx, %eax
+; X32-NOCMOV-NEXT: andl %edi, %eax
+; X32-NOCMOV-NEXT: notl %edi
+; X32-NOCMOV-NEXT: andl %esi, %edi
+; X32-NOCMOV-NEXT: orl %edi, %eax
+; X32-NOCMOV-NEXT: popl %esi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT: popl %edi
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT: popl %ebx
+; X32-NOCMOV-NEXT: .cfi_def_cfa_offset 4
; X32-NOCMOV-NEXT: retl
%inner = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b)
%result = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %inner, i32 %c)
>From 4f620622c0aafa01754dfebf076391c103810c2d Mon Sep 17 00:00:00 2001
From: wizardengineer <juliuswoosebert at gmail.com>
Date: Wed, 5 Nov 2025 23:56:12 -0500
Subject: [PATCH 2/2] [LLVM][X86] Add f80 support for ct.select
Add special handling for x86_fp80 types in CTSELECT lowering by splitting
them into three 32-bit chunks, performing constant-time selection on each
chunk, and reassembling the result. This fixes crashes when compiling
tests with f80 types.
Also updated ctselect.ll to match the current generic fallback implementation.
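A minimal conceptual sketch of the chunked selection described above, written
as plain C++ rather than the actual SelectionDAG lowering; it assumes long
double maps to x86_fp80 with the 80 value bits in the low 10 bytes, and the
function name and word layout are illustrative only (the C source itself is
not guaranteed constant time after compilation; the real lowering enforces
that at the MI level):

    #include <cstdint>
    #include <cstring>

    // Branchless select of one of two x86_fp80 values, viewed as three
    // 32-bit words; the lowering builds the equivalent per-chunk DAG nodes.
    long double ctselect_f80(bool cond, long double a, long double b) {
      uint32_t wa[3] = {}, wb[3] = {}, wr[3] = {};
      std::memcpy(wa, &a, 10);                      // copy the 80 significant bits
      std::memcpy(wb, &b, 10);
      uint32_t mask = -static_cast<uint32_t>(cond); // all-ones if cond, else zero
      for (int i = 0; i < 3; ++i)
        wr[i] = (wa[i] & mask) | (wb[i] & ~mask);   // masked merge per 32-bit chunk
      long double r = 0.0L;
      std::memcpy(&r, wr, 10);                      // reassemble the selected value
      return r;
    }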
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 5300 +++++++++++----------
llvm/lib/Target/X86/X86ISelLowering.h | 3781 ++++++++-------
llvm/lib/Target/X86/X86InstrInfo.cpp | 919 ++--
llvm/lib/Target/X86/X86InstrInfo.h | 21 +-
llvm/lib/Target/X86/X86TargetMachine.cpp | 15 +-
llvm/test/CodeGen/X86/ctselect-i386-fp.ll | 272 +-
6 files changed, 5209 insertions(+), 5099 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 833afa717c32c..7c5de8a834d79 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29,9 +29,9 @@
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -193,10 +193,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// We don't accept any truncstore of integer registers.
setTruncStoreAction(MVT::i64, MVT::i32, Expand);
setTruncStoreAction(MVT::i64, MVT::i16, Expand);
- setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
+ setTruncStoreAction(MVT::i64, MVT::i8, Expand);
setTruncStoreAction(MVT::i32, MVT::i16, Expand);
- setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
- setTruncStoreAction(MVT::i16, MVT::i8, Expand);
+ setTruncStoreAction(MVT::i32, MVT::i8, Expand);
+ setTruncStoreAction(MVT::i16, MVT::i8, Expand);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
@@ -208,106 +208,106 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// Integer absolute.
if (Subtarget.canUseCMOV()) {
- setOperationAction(ISD::ABS , MVT::i16 , Custom);
- setOperationAction(ISD::ABS , MVT::i32 , Custom);
+ setOperationAction(ISD::ABS, MVT::i16, Custom);
+ setOperationAction(ISD::ABS, MVT::i32, Custom);
if (Subtarget.is64Bit())
- setOperationAction(ISD::ABS , MVT::i64 , Custom);
+ setOperationAction(ISD::ABS, MVT::i64, Custom);
}
// Absolute difference.
for (auto Op : {ISD::ABDS, ISD::ABDU}) {
- setOperationAction(Op , MVT::i8 , Custom);
- setOperationAction(Op , MVT::i16 , Custom);
- setOperationAction(Op , MVT::i32 , Custom);
+ setOperationAction(Op, MVT::i8, Custom);
+ setOperationAction(Op, MVT::i16, Custom);
+ setOperationAction(Op, MVT::i32, Custom);
if (Subtarget.is64Bit())
- setOperationAction(Op , MVT::i64 , Custom);
+ setOperationAction(Op, MVT::i64, Custom);
}
// Signed saturation subtraction.
- setOperationAction(ISD::SSUBSAT , MVT::i8 , Custom);
- setOperationAction(ISD::SSUBSAT , MVT::i16 , Custom);
- setOperationAction(ISD::SSUBSAT , MVT::i32 , Custom);
+ setOperationAction(ISD::SSUBSAT, MVT::i8, Custom);
+ setOperationAction(ISD::SSUBSAT, MVT::i16, Custom);
+ setOperationAction(ISD::SSUBSAT, MVT::i32, Custom);
if (Subtarget.is64Bit())
- setOperationAction(ISD::SSUBSAT , MVT::i64 , Custom);
+ setOperationAction(ISD::SSUBSAT, MVT::i64, Custom);
// Funnel shifts.
for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
// For slow shld targets we only lower for code size.
LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
- setOperationAction(ShiftOp , MVT::i8 , Custom);
- setOperationAction(ShiftOp , MVT::i16 , Custom);
- setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
+ setOperationAction(ShiftOp, MVT::i8, Custom);
+ setOperationAction(ShiftOp, MVT::i16, Custom);
+ setOperationAction(ShiftOp, MVT::i32, ShiftDoubleAction);
if (Subtarget.is64Bit())
- setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
+ setOperationAction(ShiftOp, MVT::i64, ShiftDoubleAction);
}
if (!Subtarget.useSoftFloat()) {
// Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
// operation.
- setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
- setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
// We have an algorithm for SSE2, and we turn this into a 64-bit
// FILD or VCVTUSI2SS/SD for other targets.
- setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
// We have an algorithm for SSE2->double, and we turn this into a
// 64-bit FILD followed by conditional FADD for other targets.
- setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
// Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
// this operation.
- setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
// SSE has no i16 to fp conversion, only i32. We promote in the handler
// to allow f80 to use i16 and f64 to use i16 with sse1 only
- setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
// f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
- setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
// In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
// are Legal, f80 is custom lowered.
- setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
// Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
// this operation.
- setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
// FIXME: This doesn't generate invalid exception when it should. PR44019.
- setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote);
- setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
- setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
// In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
// are Legal, f80 is custom lowered.
- setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
// Handle FP_TO_UINT by promoting the destination to a larger signed
// conversion.
- setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
// FIXME: This doesn't generate invalid exception when it should. PR44019.
- setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote);
- setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
// FIXME: This doesn't generate invalid exception when it should. PR44019.
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
- setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
- setOperationAction(ISD::LRINT, MVT::f32, Custom);
- setOperationAction(ISD::LRINT, MVT::f64, Custom);
- setOperationAction(ISD::LLRINT, MVT::f32, Custom);
- setOperationAction(ISD::LLRINT, MVT::f64, Custom);
+ setOperationAction(ISD::LRINT, MVT::f32, Custom);
+ setOperationAction(ISD::LRINT, MVT::f64, Custom);
+ setOperationAction(ISD::LLRINT, MVT::f32, Custom);
+ setOperationAction(ISD::LLRINT, MVT::f64, Custom);
if (!Subtarget.is64Bit()) {
- setOperationAction(ISD::LRINT, MVT::i64, Custom);
+ setOperationAction(ISD::LRINT, MVT::i64, Custom);
setOperationAction(ISD::LLRINT, MVT::i64, Custom);
}
}
@@ -315,7 +315,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (Subtarget.hasSSE2()) {
// Custom lowering for saturating float to int conversions.
// We handle promotion to larger result types manually.
- for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
+ for (MVT VT : {MVT::i8, MVT::i16, MVT::i32}) {
setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
}
@@ -348,17 +348,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// TODO: when we have SSE, these could be more efficient, by using movd/movq.
if (!Subtarget.hasSSE2()) {
- setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
- setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
+ setOperationAction(ISD::BITCAST, MVT::f32, Expand);
+ setOperationAction(ISD::BITCAST, MVT::i32, Expand);
setOperationAction(ISD::FCANONICALIZE, MVT::f32, Custom);
setOperationAction(ISD::FCANONICALIZE, MVT::f64, Custom);
if (Subtarget.is64Bit()) {
- setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
+ setOperationAction(ISD::BITCAST, MVT::f64, Expand);
// Without SSE, i64->f64 goes through memory.
- setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
+ setOperationAction(ISD::BITCAST, MVT::i64, Expand);
}
} else if (!Subtarget.is64Bit())
- setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
+ setOperationAction(ISD::BITCAST, MVT::i64, Custom);
// Scalar integer divide and remainder are lowered to use operations that
// produce two results, to match the available instructions. This exposes
@@ -370,7 +370,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// (low) operations are left as Legal, as there are single-result
// instructions for this in x86. Using the two-result multiply instructions
// when both high and low results are needed must be arranged by dagcombine.
- for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+ for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::MULHU, VT, Expand);
setOperationAction(ISD::SDIV, VT, Expand);
@@ -379,47 +379,47 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UREM, VT, Expand);
}
- setOperationAction(ISD::BR_JT , MVT::Other, Expand);
- setOperationAction(ISD::BRCOND , MVT::Other, Custom);
- for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
- MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
- setOperationAction(ISD::BR_CC, VT, Expand);
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+ setOperationAction(ISD::BRCOND, MVT::Other, Custom);
+ for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128, MVT::i8, MVT::i16,
+ MVT::i32, MVT::i64}) {
+ setOperationAction(ISD::BR_CC, VT, Expand);
setOperationAction(ISD::SELECT_CC, VT, Expand);
}
if (Subtarget.is64Bit())
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
- setOperationAction(ISD::FREM , MVT::f32 , Expand);
- setOperationAction(ISD::FREM , MVT::f64 , Expand);
- setOperationAction(ISD::FREM , MVT::f80 , Expand);
- setOperationAction(ISD::FREM , MVT::f128 , Expand);
+ setOperationAction(ISD::FREM, MVT::f32, Expand);
+ setOperationAction(ISD::FREM, MVT::f64, Expand);
+ setOperationAction(ISD::FREM, MVT::f80, Expand);
+ setOperationAction(ISD::FREM, MVT::f128, Expand);
if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
- setOperationAction(ISD::GET_ROUNDING , MVT::i32 , Custom);
- setOperationAction(ISD::SET_ROUNDING , MVT::Other, Custom);
- setOperationAction(ISD::GET_FPENV_MEM , MVT::Other, Custom);
- setOperationAction(ISD::SET_FPENV_MEM , MVT::Other, Custom);
- setOperationAction(ISD::RESET_FPENV , MVT::Other, Custom);
+ setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
+ setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
+ setOperationAction(ISD::GET_FPENV_MEM, MVT::Other, Custom);
+ setOperationAction(ISD::SET_FPENV_MEM, MVT::Other, Custom);
+ setOperationAction(ISD::RESET_FPENV, MVT::Other, Custom);
}
// Promote the i8 variants and force them on up to i32 which has a shorter
// encoding.
- setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
- setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
+ setOperationPromotedToType(ISD::CTTZ, MVT::i8, MVT::i32);
+ setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8, MVT::i32);
// Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
// a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
// promote that too.
- setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
- setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , MVT::i32);
+ setOperationPromotedToType(ISD::CTTZ, MVT::i16, MVT::i32);
+ setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16, MVT::i32);
if (!Subtarget.hasBMI()) {
- setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
+ setOperationAction(ISD::CTTZ, MVT::i32, Custom);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal);
if (Subtarget.is64Bit()) {
- setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
+ setOperationAction(ISD::CTTZ, MVT::i64, Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
}
}
@@ -427,13 +427,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (Subtarget.hasLZCNT()) {
// When promoting the i8 variants, force them to i32 for a shorter
// encoding.
- setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
- setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
+ setOperationPromotedToType(ISD::CTLZ, MVT::i8, MVT::i32);
+ setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8, MVT::i32);
} else {
for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
- setOperationAction(ISD::CTLZ , VT, Custom);
+ setOperationAction(ISD::CTLZ, VT, Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
}
}
@@ -478,31 +478,31 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// on the dest that popcntl hasn't had since Cannon Lake.
setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
} else {
- setOperationAction(ISD::CTPOP , MVT::i8 , Custom);
- setOperationAction(ISD::CTPOP , MVT::i16 , Custom);
- setOperationAction(ISD::CTPOP , MVT::i32 , Custom);
- setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
+ setOperationAction(ISD::CTPOP, MVT::i8, Custom);
+ setOperationAction(ISD::CTPOP, MVT::i16, Custom);
+ setOperationAction(ISD::CTPOP, MVT::i32, Custom);
+ setOperationAction(ISD::CTPOP, MVT::i64, Custom);
}
- setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
+ setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
if (!Subtarget.hasMOVBE())
- setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
+ setOperationAction(ISD::BSWAP, MVT::i16, Expand);
// X86 wants to expand cmov itself.
- for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
+ for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::CTSELECT, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
}
- for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+ for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::CTSELECT, VT, Custom);
- setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
}
// Custom action for SELECT MMX and expand action for SELECT_CC MMX
@@ -510,7 +510,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CTSELECT, MVT::x86mmx, Custom);
setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
- setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
+ setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
// NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
// LLVM/Clang supports zero-cost DWARF and SEH exception handling.
setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
@@ -518,19 +518,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
// Darwin ABI issue.
- for (auto VT : { MVT::i32, MVT::i64 }) {
+ for (auto VT : {MVT::i32, MVT::i64}) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
- setOperationAction(ISD::ConstantPool , VT, Custom);
- setOperationAction(ISD::JumpTable , VT, Custom);
- setOperationAction(ISD::GlobalAddress , VT, Custom);
+ setOperationAction(ISD::ConstantPool, VT, Custom);
+ setOperationAction(ISD::JumpTable, VT, Custom);
+ setOperationAction(ISD::GlobalAddress, VT, Custom);
setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
- setOperationAction(ISD::ExternalSymbol , VT, Custom);
- setOperationAction(ISD::BlockAddress , VT, Custom);
+ setOperationAction(ISD::ExternalSymbol, VT, Custom);
+ setOperationAction(ISD::BlockAddress, VT, Custom);
}
// 64-bit shl, sra, srl (iff 32-bit x86)
- for (auto VT : { MVT::i32, MVT::i64 }) {
+ for (auto VT : {MVT::i32, MVT::i64}) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
setOperationAction(ISD::SHL_PARTS, VT, Custom);
@@ -539,12 +539,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
if (Subtarget.hasSSEPrefetch())
- setOperationAction(ISD::PREFETCH , MVT::Other, Custom);
+ setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
- setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
+ setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
// Expand certain atomics
- for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+ for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
@@ -588,14 +588,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
// VASTART needs to be custom lowered to use the VarArgsFrameIndex
- setOperationAction(ISD::VASTART , MVT::Other, Custom);
- setOperationAction(ISD::VAEND , MVT::Other, Expand);
+ setOperationAction(ISD::VASTART, MVT::Other, Custom);
+ setOperationAction(ISD::VAEND, MVT::Other, Expand);
bool Is64Bit = Subtarget.is64Bit();
- setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
+ setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
- setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
- setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+ setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+ setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
@@ -605,7 +605,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
- auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
+ auto setF16Action = [&](MVT VT, LegalizeAction Action) {
setOperationAction(ISD::FABS, VT, Action);
setOperationAction(ISD::FNEG, VT, Action);
setOperationAction(ISD::FCOPYSIGN, VT, Expand);
@@ -661,7 +661,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// non-optsize case.
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
- for (auto VT : { MVT::f32, MVT::f64 }) {
+ for (auto VT : {MVT::f32, MVT::f64}) {
// Use ANDPD to simulate FABS.
setOperationAction(ISD::FABS, VT, Custom);
@@ -676,8 +676,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FSUB, VT, Custom);
// We don't support sin/cos/fmod
- setOperationAction(ISD::FSIN , VT, Expand);
- setOperationAction(ISD::FCOS , VT, Expand);
+ setOperationAction(ISD::FSIN, VT, Expand);
+ setOperationAction(ISD::FCOS, VT, Expand);
setOperationAction(ISD::FSINCOS, VT, Expand);
}
@@ -740,10 +740,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::f64, &X86::RFP64RegClass);
// Use ANDPS to simulate FABS.
- setOperationAction(ISD::FABS , MVT::f32, Custom);
+ setOperationAction(ISD::FABS, MVT::f32, Custom);
// Use XORP to simulate FNEG.
- setOperationAction(ISD::FNEG , MVT::f32, Custom);
+ setOperationAction(ISD::FNEG, MVT::f32, Custom);
if (UseX87)
setOperationAction(ISD::UNDEF, MVT::f64, Expand);
@@ -754,8 +754,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
// We don't support sin/cos/fmod
- setOperationAction(ISD::FSIN , MVT::f32, Expand);
- setOperationAction(ISD::FCOS , MVT::f32, Expand);
+ setOperationAction(ISD::FSIN, MVT::f32, Expand);
+ setOperationAction(ISD::FCOS, MVT::f32, Expand);
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
if (UseX87) {
@@ -770,13 +770,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::f64, &X86::RFP64RegClass);
addRegisterClass(MVT::f32, &X86::RFP32RegClass);
- for (auto VT : { MVT::f32, MVT::f64 }) {
- setOperationAction(ISD::UNDEF, VT, Expand);
+ for (auto VT : {MVT::f32, MVT::f64}) {
+ setOperationAction(ISD::UNDEF, VT, Expand);
setOperationAction(ISD::FCOPYSIGN, VT, Expand);
// Always expand sin/cos functions even though x87 has an instruction.
- setOperationAction(ISD::FSIN , VT, Expand);
- setOperationAction(ISD::FCOS , VT, Expand);
+ setOperationAction(ISD::FSIN, VT, Expand);
+ setOperationAction(ISD::FCOS, VT, Expand);
setOperationAction(ISD::FSINCOS, VT, Expand);
}
}
@@ -788,7 +788,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addLegalFPImmediate(APFloat(+1.0f)); // FLD1
addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
- } else // SSE immediates.
+ } else // SSE immediates.
addLegalFPImmediate(APFloat(+0.0f)); // xorps
}
// Expand FP64 immediates into loads from the stack, save special cases.
@@ -798,7 +798,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addLegalFPImmediate(APFloat(+1.0)); // FLD1
addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
- } else // SSE immediates.
+ } else // SSE immediates.
addLegalFPImmediate(APFloat(+0.0)); // xorpd
}
// Support fp16 0 immediate.
@@ -806,18 +806,18 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
// Handle constrained floating-point operations of scalar.
- setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
- setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
- setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
- setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
- setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
- setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
- setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
- setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
- setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
- setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
- setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
- setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
// We don't support FMA.
setOperationAction(ISD::FMA, MVT::f64, Expand);
@@ -826,21 +826,21 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// f80 always uses X87.
if (UseX87) {
addRegisterClass(MVT::f80, &X86::RFP80RegClass);
- setOperationAction(ISD::UNDEF, MVT::f80, Expand);
+ setOperationAction(ISD::UNDEF, MVT::f80, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
{
APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
- addLegalFPImmediate(TmpFlt); // FLD0
+ addLegalFPImmediate(TmpFlt); // FLD0
TmpFlt.changeSign();
- addLegalFPImmediate(TmpFlt); // FLD0/FCHS
+ addLegalFPImmediate(TmpFlt); // FLD0/FCHS
bool ignored;
APFloat TmpFlt2(+1.0);
- TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
- &ignored);
- addLegalFPImmediate(TmpFlt2); // FLD1
+ TmpFlt2.convert(APFloat::x87DoubleExtended(),
+ APFloat::rmNearestTiesToEven, &ignored);
+ addLegalFPImmediate(TmpFlt2); // FLD1
TmpFlt2.changeSign();
- addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
+ addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
}
// Always expand sin/cos functions even though x87 has an instruction.
@@ -859,9 +859,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// clang-format on
setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
- setOperationAction(ISD::FCEIL, MVT::f80, Expand);
+ setOperationAction(ISD::FCEIL, MVT::f80, Expand);
setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
- setOperationAction(ISD::FRINT, MVT::f80, Expand);
+ setOperationAction(ISD::FRINT, MVT::f80, Expand);
setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
setOperationAction(ISD::FROUNDEVEN, MVT::f80, Expand);
setOperationAction(ISD::FMA, MVT::f80, Expand);
@@ -871,12 +871,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::LLRINT, MVT::f80, Custom);
// Handle constrained floating-point operations of scalar.
- setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal);
- setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal);
- setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal);
- setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal);
- setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal);
- setOperationAction(ISD::FCANONICALIZE , MVT::f80, Custom);
+ setOperationAction(ISD::STRICT_FADD, MVT::f80, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::f80, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::f80, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::f80, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::f80, Legal);
+ setOperationAction(ISD::FCANONICALIZE, MVT::f80, Custom);
if (isTypeLegal(MVT::f16)) {
setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom);
@@ -895,16 +895,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
- setOperationAction(ISD::FADD, MVT::f128, LibCall);
+ setOperationAction(ISD::FADD, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
- setOperationAction(ISD::FSUB, MVT::f128, LibCall);
+ setOperationAction(ISD::FSUB, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
- setOperationAction(ISD::FDIV, MVT::f128, LibCall);
+ setOperationAction(ISD::FDIV, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
- setOperationAction(ISD::FMUL, MVT::f128, LibCall);
+ setOperationAction(ISD::FMUL, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
- setOperationAction(ISD::FMA, MVT::f128, LibCall);
- setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall);
+ setOperationAction(ISD::FMA, MVT::f128, LibCall);
+ setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall);
setOperationAction(ISD::FABS, MVT::f128, Custom);
setOperationAction(ISD::FNEG, MVT::f128, Custom);
@@ -920,10 +920,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FTAN, MVT::f128, LibCall);
// clang-format on
// No STRICT_FSINCOS
- setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
+ setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
- setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
+ setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
// We need to custom handle any FP_ROUND with an f128 input, but
// LegalizeDAG uses the result type to know when to run a custom handler.
@@ -953,10 +953,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
// Always use a library call for pow.
- setOperationAction(ISD::FPOW , MVT::f32 , Expand);
- setOperationAction(ISD::FPOW , MVT::f64 , Expand);
- setOperationAction(ISD::FPOW , MVT::f80 , Expand);
- setOperationAction(ISD::FPOW , MVT::f128 , Expand);
+ setOperationAction(ISD::FPOW, MVT::f32, Expand);
+ setOperationAction(ISD::FPOW, MVT::f64, Expand);
+ setOperationAction(ISD::FPOW, MVT::f80, Expand);
+ setOperationAction(ISD::FPOW, MVT::f128, Expand);
setOperationAction(ISD::FLOG, MVT::f80, Expand);
setOperationAction(ISD::FLOG2, MVT::f80, Expand);
@@ -968,9 +968,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
// Some FP actions are always expanded for vector types.
- for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
- MVT::v4f32, MVT::v8f32, MVT::v16f32,
- MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
+ for (auto VT : {MVT::v8f16, MVT::v16f16, MVT::v32f16, MVT::v4f32, MVT::v8f32,
+ MVT::v16f32, MVT::v2f64, MVT::v4f64, MVT::v8f64}) {
// clang-format off
setOperationAction(ISD::FSIN, VT, Expand);
setOperationAction(ISD::FSINCOS, VT, Expand);
@@ -996,11 +995,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UDIV, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
- setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
- setOperationAction(ISD::FMA, VT, Expand);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand);
+ setOperationAction(ISD::FMA, VT, Expand);
setOperationAction(ISD::FFLOOR, VT, Expand);
setOperationAction(ISD::FCEIL, VT, Expand);
setOperationAction(ISD::FTRUNC, VT, Expand);
@@ -1024,7 +1023,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_TO_SINT, VT, Expand);
setOperationAction(ISD::UINT_TO_FP, VT, Expand);
setOperationAction(ISD::SINT_TO_FP, VT, Expand);
- setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
setOperationAction(ISD::TRUNCATE, VT, Expand);
setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
@@ -1062,31 +1061,31 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
- setOperationAction(ISD::FMAXIMUM, MVT::f32, Custom);
- setOperationAction(ISD::FMINIMUM, MVT::f32, Custom);
- setOperationAction(ISD::FMAXIMUMNUM, MVT::f32, Custom);
- setOperationAction(ISD::FMINIMUMNUM, MVT::f32, Custom);
-
- setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
- setOperationAction(ISD::FABS, MVT::v4f32, Custom);
- setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
- setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
+ setOperationAction(ISD::FMAXIMUM, MVT::f32, Custom);
+ setOperationAction(ISD::FMINIMUM, MVT::f32, Custom);
+ setOperationAction(ISD::FMAXIMUMNUM, MVT::f32, Custom);
+ setOperationAction(ISD::FMINIMUMNUM, MVT::f32, Custom);
+
+ setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
+ setOperationAction(ISD::FABS, MVT::v4f32, Custom);
+ setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
- setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
+ setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
setOperationAction(ISD::CTSELECT, MVT::v4f32, Custom);
setOperationAction(ISD::FCANONICALIZE, MVT::v4f32, Custom);
- setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
- setOperationAction(ISD::STORE, MVT::v2f32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
+ setOperationAction(ISD::STORE, MVT::v2f32, Custom);
setOperationAction(ISD::FCANONICALIZE, MVT::v2f32, Custom);
- setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
- setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
- setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
- setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
- setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
@@ -1106,74 +1105,74 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
- for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
+ for (auto VT : {MVT::f64, MVT::v4f32, MVT::v2f64}) {
setOperationAction(ISD::FMAXIMUM, VT, Custom);
setOperationAction(ISD::FMINIMUM, VT, Custom);
setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
}
- for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
- MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
+ for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16,
+ MVT::v2i32}) {
setOperationAction(ISD::SDIV, VT, Custom);
setOperationAction(ISD::SREM, VT, Custom);
setOperationAction(ISD::UDIV, VT, Custom);
setOperationAction(ISD::UREM, VT, Custom);
}
- setOperationAction(ISD::MUL, MVT::v2i8, Custom);
- setOperationAction(ISD::MUL, MVT::v4i8, Custom);
- setOperationAction(ISD::MUL, MVT::v8i8, Custom);
-
- setOperationAction(ISD::MUL, MVT::v16i8, Custom);
- setOperationAction(ISD::MUL, MVT::v4i32, Custom);
- setOperationAction(ISD::MUL, MVT::v2i64, Custom);
- setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
- setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
- setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
- setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
- setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
- setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
- setOperationAction(ISD::MUL, MVT::v8i16, Legal);
- setOperationAction(ISD::AVGCEILU, MVT::v16i8, Legal);
- setOperationAction(ISD::AVGCEILU, MVT::v8i16, Legal);
-
- setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
- setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
- setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
-
- setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
+ setOperationAction(ISD::MUL, MVT::v2i8, Custom);
+ setOperationAction(ISD::MUL, MVT::v4i8, Custom);
+ setOperationAction(ISD::MUL, MVT::v8i8, Custom);
+
+ setOperationAction(ISD::MUL, MVT::v16i8, Custom);
+ setOperationAction(ISD::MUL, MVT::v4i32, Custom);
+ setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+ setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
+ setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
+ setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
+ setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
+ setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
+ setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
+ setOperationAction(ISD::MUL, MVT::v8i16, Legal);
+ setOperationAction(ISD::AVGCEILU, MVT::v16i8, Legal);
+ setOperationAction(ISD::AVGCEILU, MVT::v8i16, Legal);
+
+ setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
+ setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
+ setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
+
+ setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
setOperationAction(ISD::FCANONICALIZE, MVT::v2f64, Custom);
- setOperationAction(ISD::FABS, MVT::v2f64, Custom);
- setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
+ setOperationAction(ISD::FABS, MVT::v2f64, Custom);
+ setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
setOperationAction(ISD::LRINT, MVT::v2i32, Custom);
- for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
+ for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
}
- setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
- setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
- setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
- setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
- setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
- setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
- setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
- setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
- setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
- setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
+ setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
+ setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
+ setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
+ setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
+ setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
+ setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
+ setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
+ setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
+ setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
+ setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
- for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
+ for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::ABS, VT, Custom);
@@ -1186,30 +1185,30 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setCondCodeAction(ISD::SETLE, VT, Custom);
}
- setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
- setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
- setOperationAction(ISD::STRICT_FSETCC, MVT::v2f64, Custom);
- setOperationAction(ISD::STRICT_FSETCC, MVT::v4f32, Custom);
+ setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
+ setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, MVT::v2f64, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, MVT::v4f32, Custom);
setOperationAction(ISD::STRICT_FSETCCS, MVT::v2f64, Custom);
setOperationAction(ISD::STRICT_FSETCCS, MVT::v4f32, Custom);
- for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
- setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
- setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
- setOperationAction(ISD::VSELECT, VT, Custom);
+ for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32}) {
+ setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
}
- for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
- setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
- setOperationAction(ISD::VSELECT, VT, Custom);
+ for (auto VT : {MVT::v8f16, MVT::v2f64, MVT::v2i64}) {
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Custom);
if (VT == MVT::v2i64 && !Subtarget.is64Bit())
continue;
- setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
}
setF16Action(MVT::v8f16, Expand);
@@ -1222,12 +1221,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Custom);
// Custom lower v2i64 and v2f64 selects.
- setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
- setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
- setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
- setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
- setOperationAction(ISD::SELECT, MVT::v8f16, Custom);
- setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
+ setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
+ setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
+ setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
+ setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
+ setOperationAction(ISD::SELECT, MVT::v8f16, Custom);
+ setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
setOperationAction(ISD::CTSELECT, MVT::v2f64, Custom);
setOperationAction(ISD::CTSELECT, MVT::v2i64, Custom);
@@ -1236,60 +1235,60 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CTSELECT, MVT::v8f16, Custom);
setOperationAction(ISD::CTSELECT, MVT::v16i8, Custom);
- setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom);
- setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
- setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Custom);
- setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
// Custom legalize these to avoid over promotion or custom promotion.
for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
- setOperationAction(ISD::FP_TO_SINT, VT, Custom);
- setOperationAction(ISD::FP_TO_UINT, VT, Custom);
+ setOperationAction(ISD::FP_TO_SINT, VT, Custom);
+ setOperationAction(ISD::FP_TO_UINT, VT, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
}
- setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
- setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Custom);
- setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
- setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
- setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
- setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom);
// Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
- setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
- setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
- setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom);
- setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
- setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom);
- setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
- setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom);
+ setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom);
// We want to legalize this to an f64 load rather than an i64 load on
// 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
// store.
- setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
- setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
- setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
- setOperationAction(ISD::STORE, MVT::v2i32, Custom);
- setOperationAction(ISD::STORE, MVT::v4i16, Custom);
- setOperationAction(ISD::STORE, MVT::v8i8, Custom);
+ setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
+ setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
+ setOperationAction(ISD::STORE, MVT::v2i32, Custom);
+ setOperationAction(ISD::STORE, MVT::v4i16, Custom);
+ setOperationAction(ISD::STORE, MVT::v8i8, Custom);
// Add 32-bit vector stores to help vectorization opportunities.
- setOperationAction(ISD::STORE, MVT::v2i16, Custom);
- setOperationAction(ISD::STORE, MVT::v4i8, Custom);
+ setOperationAction(ISD::STORE, MVT::v2i16, Custom);
+ setOperationAction(ISD::STORE, MVT::v4i8, Custom);
- setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
- setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
- setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
+ setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
+ setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
+ setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
if (!Subtarget.hasAVX512())
setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
@@ -1299,41 +1298,42 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v2i64, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v4i64, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v2i64, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v4i64, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
// In the customized shift lowering, the legal v4i32/v2i64 cases
// in AVX2 will be recognized.
- for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
- setOperationAction(ISD::SRL, VT, Custom);
- setOperationAction(ISD::SHL, VT, Custom);
- setOperationAction(ISD::SRA, VT, Custom);
- if (VT == MVT::v2i64) continue;
- setOperationAction(ISD::ROTL, VT, Custom);
- setOperationAction(ISD::ROTR, VT, Custom);
- setOperationAction(ISD::FSHL, VT, Custom);
- setOperationAction(ISD::FSHR, VT, Custom);
+ for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ if (VT == MVT::v2i64)
+ continue;
+ setOperationAction(ISD::ROTL, VT, Custom);
+ setOperationAction(ISD::ROTR, VT, Custom);
+ setOperationAction(ISD::FSHL, VT, Custom);
+ setOperationAction(ISD::FSHR, VT, Custom);
}
- setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
- setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
- setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
- setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
- setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasGFNI()) {
@@ -1348,73 +1348,73 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
- setOperationAction(ISD::ABS, MVT::v16i8, Legal);
- setOperationAction(ISD::ABS, MVT::v8i16, Legal);
- setOperationAction(ISD::ABS, MVT::v4i32, Legal);
+ setOperationAction(ISD::ABS, MVT::v16i8, Legal);
+ setOperationAction(ISD::ABS, MVT::v8i16, Legal);
+ setOperationAction(ISD::ABS, MVT::v4i32, Legal);
for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
- setOperationAction(ISD::BITREVERSE, VT, Custom);
- setOperationAction(ISD::CTLZ, VT, Custom);
+ setOperationAction(ISD::BITREVERSE, VT, Custom);
+ setOperationAction(ISD::CTLZ, VT, Custom);
}
// These might be better off as horizontal vector ops.
- setOperationAction(ISD::ADD, MVT::i16, Custom);
- setOperationAction(ISD::ADD, MVT::i32, Custom);
- setOperationAction(ISD::SUB, MVT::i16, Custom);
- setOperationAction(ISD::SUB, MVT::i32, Custom);
+ setOperationAction(ISD::ADD, MVT::i16, Custom);
+ setOperationAction(ISD::ADD, MVT::i32, Custom);
+ setOperationAction(ISD::SUB, MVT::i16, Custom);
+ setOperationAction(ISD::SUB, MVT::i32, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
- setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
- setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal);
- setOperationAction(ISD::FCEIL, RoundedTy, Legal);
- setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);
- setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
- setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);
- setOperationAction(ISD::FRINT, RoundedTy, Legal);
- setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
- setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
- setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
- setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
- setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal);
-
- setOperationAction(ISD::FROUND, RoundedTy, Custom);
- }
-
- setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
- setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
- setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
- setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
- setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
- setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
- setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
- setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
-
- setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
- setOperationAction(ISD::SADDSAT, MVT::v2i64, Custom);
- setOperationAction(ISD::SSUBSAT, MVT::v2i64, Custom);
+ setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
+ setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal);
+ setOperationAction(ISD::FCEIL, RoundedTy, Legal);
+ setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);
+ setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
+ setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);
+ setOperationAction(ISD::FRINT, RoundedTy, Legal);
+ setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
+ setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
+ setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
+ setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
+ setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal);
+
+ setOperationAction(ISD::FROUND, RoundedTy, Custom);
+ }
+
+ setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
+ setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
+ setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
+ setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
+ setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
+ setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
+ setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
+ setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
+
+ setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
+ setOperationAction(ISD::SADDSAT, MVT::v2i64, Custom);
+ setOperationAction(ISD::SSUBSAT, MVT::v2i64, Custom);
// FIXME: Do we need to handle scalar-to-vector here?
- setOperationAction(ISD::MUL, MVT::v4i32, Legal);
- setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
+ setOperationAction(ISD::MUL, MVT::v4i32, Legal);
+ setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
// We directly match byte blends in the backend as they match the VSELECT
// condition form.
- setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
// SSE41 brings specific instructions for doing vector sign extend even in
// cases where we don't have SRA.
- for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
+ for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
}
// SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
- for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
- setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
- setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
- setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
+ for (auto LoadExtOp : {ISD::SEXTLOAD, ISD::ZEXTLOAD}) {
+ setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
@@ -1423,73 +1423,73 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
// We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
// do the pre and post work in the vector domain.
- setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
// We need to mark SINT_TO_FP as Custom even though we want to expand it
// so that DAG combine doesn't try to turn it into uint_to_fp.
- setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
}
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
- setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
+ setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
- for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
- MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
+ for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v32i8,
+ MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
setOperationAction(ISD::ROTL, VT, Custom);
setOperationAction(ISD::ROTR, VT, Custom);
}
// XOP can efficiently perform BITREVERSE with VPPERM.
- for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
+ for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64})
setOperationAction(ISD::BITREVERSE, VT, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
bool HasInt256 = Subtarget.hasInt256();
- addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
- : &X86::VR256RegClass);
+ addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
+ : &X86::VR256RegClass);
addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
- addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
- : &X86::VR256RegClass);
- addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
- : &X86::VR256RegClass);
- addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
- : &X86::VR256RegClass);
- addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
- : &X86::VR256RegClass);
-
- for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
- setOperationAction(ISD::FFLOOR, VT, Legal);
- setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
- setOperationAction(ISD::FCEIL, VT, Legal);
- setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
- setOperationAction(ISD::FTRUNC, VT, Legal);
- setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
- setOperationAction(ISD::FRINT, VT, Legal);
- setOperationAction(ISD::STRICT_FRINT, VT, Legal);
- setOperationAction(ISD::FNEARBYINT, VT, Legal);
+ addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
+ : &X86::VR256RegClass);
+ addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
+ : &X86::VR256RegClass);
+ addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
+ : &X86::VR256RegClass);
+ addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
+ : &X86::VR256RegClass);
+
+ for (auto VT : {MVT::v8f32, MVT::v4f64}) {
+ setOperationAction(ISD::FFLOOR, VT, Legal);
+ setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
+ setOperationAction(ISD::FCEIL, VT, Legal);
+ setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
+ setOperationAction(ISD::FTRUNC, VT, Legal);
+ setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
+ setOperationAction(ISD::FRINT, VT, Legal);
+ setOperationAction(ISD::STRICT_FRINT, VT, Legal);
+ setOperationAction(ISD::FNEARBYINT, VT, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
- setOperationAction(ISD::FROUNDEVEN, VT, Legal);
+ setOperationAction(ISD::FROUNDEVEN, VT, Legal);
setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
- setOperationAction(ISD::FROUND, VT, Custom);
+ setOperationAction(ISD::FROUND, VT, Custom);
- setOperationAction(ISD::FNEG, VT, Custom);
- setOperationAction(ISD::FABS, VT, Custom);
- setOperationAction(ISD::FCOPYSIGN, VT, Custom);
+ setOperationAction(ISD::FNEG, VT, Custom);
+ setOperationAction(ISD::FABS, VT, Custom);
+ setOperationAction(ISD::FCOPYSIGN, VT, Custom);
- setOperationAction(ISD::FMAXIMUM, VT, Custom);
- setOperationAction(ISD::FMINIMUM, VT, Custom);
- setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
- setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
+ setOperationAction(ISD::FMAXIMUM, VT, Custom);
+ setOperationAction(ISD::FMINIMUM, VT, Custom);
+ setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
+ setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
setOperationAction(ISD::FCANONICALIZE, VT, Custom);
}
@@ -1498,64 +1498,65 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
// even though v8i16 is a legal type.
- setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
- setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
+ setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
+ setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
- setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom);
- setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Custom);
-
- setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Custom);
- setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Custom);
- setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Expand);
- setOperationAction(ISD::FP_ROUND, MVT::v8f16, Expand);
- setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
- setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Custom);
-
- setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
- setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
- setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);
- setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal);
- setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal);
- setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal);
- setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
- setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
- setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
- setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
- setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Custom);
+
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Custom);
+ setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::v8f16, Expand);
+ setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Custom);
+
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
+ setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
if (!Subtarget.hasAVX512())
setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
// In the customized shift lowering, the legal v8i32/v4i64 cases
// in AVX2 will be recognized.
- for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
- setOperationAction(ISD::SRL, VT, Custom);
- setOperationAction(ISD::SHL, VT, Custom);
- setOperationAction(ISD::SRA, VT, Custom);
- setOperationAction(ISD::ABDS, VT, Custom);
- setOperationAction(ISD::ABDU, VT, Custom);
- if (VT == MVT::v4i64) continue;
- setOperationAction(ISD::ROTL, VT, Custom);
- setOperationAction(ISD::ROTR, VT, Custom);
- setOperationAction(ISD::FSHL, VT, Custom);
- setOperationAction(ISD::FSHR, VT, Custom);
+ for (auto VT : {MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ setOperationAction(ISD::ABDS, VT, Custom);
+ setOperationAction(ISD::ABDU, VT, Custom);
+ if (VT == MVT::v4i64)
+ continue;
+ setOperationAction(ISD::ROTL, VT, Custom);
+ setOperationAction(ISD::ROTR, VT, Custom);
+ setOperationAction(ISD::FSHL, VT, Custom);
+ setOperationAction(ISD::FSHR, VT, Custom);
}
// These types need custom splitting if their input is a 128-bit vector.
- setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
-
- setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
- setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
- setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
- setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
- setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
- setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
- setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
+
+ setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
+ setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
+ setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
+ setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
+ setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
+ setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
+ setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
setOperationAction(ISD::CTSELECT, MVT::v4f64, Custom);
setOperationAction(ISD::CTSELECT, MVT::v4i64, Custom);
@@ -1565,22 +1566,22 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CTSELECT, MVT::v32i8, Custom);
setOperationAction(ISD::CTSELECT, MVT::v8f32, Custom);
- for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
- setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
- setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
- setOperationAction(ISD::ANY_EXTEND, VT, Custom);
+ for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
+ setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
+ setOperationAction(ISD::ANY_EXTEND, VT, Custom);
}
- setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v32i16, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v32i32, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v32i64, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v32i16, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v32i32, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v32i64, Custom);
- for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
- setOperationAction(ISD::SETCC, VT, Custom);
- setOperationAction(ISD::CTPOP, VT, Custom);
- setOperationAction(ISD::CTLZ, VT, Custom);
- setOperationAction(ISD::BITREVERSE, VT, Custom);
+ for (auto VT : {MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::CTPOP, VT, Custom);
+ setOperationAction(ISD::CTLZ, VT, Custom);
+ setOperationAction(ISD::BITREVERSE, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
@@ -1588,64 +1589,64 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setCondCodeAction(ISD::SETLE, VT, Custom);
}
- setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
- setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
- setOperationAction(ISD::STRICT_FSETCC, MVT::v4f64, Custom);
- setOperationAction(ISD::STRICT_FSETCC, MVT::v8f32, Custom);
+ setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
+ setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, MVT::v4f64, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, MVT::v8f32, Custom);
setOperationAction(ISD::STRICT_FSETCCS, MVT::v4f64, Custom);
setOperationAction(ISD::STRICT_FSETCCS, MVT::v8f32, Custom);
if (Subtarget.hasAnyFMA()) {
- for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
- MVT::v2f64, MVT::v4f64 }) {
+ for (auto VT : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32, MVT::v2f64,
+ MVT::v4f64}) {
setOperationAction(ISD::FMA, VT, Legal);
setOperationAction(ISD::STRICT_FMA, VT, Legal);
}
}
- for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
+ for (auto VT : {MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
}
- setOperationAction(ISD::MUL, MVT::v4i64, Custom);
- setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
- setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
- setOperationAction(ISD::MUL, MVT::v32i8, Custom);
-
- setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
- setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
- setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
- setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
- setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
- setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
- setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
- setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
-
- setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
- setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
-
- setOperationAction(ISD::ABS, MVT::v4i64, Custom);
- setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
- setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
- setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
- setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
-
- setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
- setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
- setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
- setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
- setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
- setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
- setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
- setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
- setOperationAction(ISD::UADDSAT, MVT::v8i32, Custom);
- setOperationAction(ISD::USUBSAT, MVT::v8i32, Custom);
- setOperationAction(ISD::UADDSAT, MVT::v4i64, Custom);
- setOperationAction(ISD::USUBSAT, MVT::v4i64, Custom);
-
- for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
- setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::MUL, MVT::v4i64, Custom);
+ setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::MUL, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
+ setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
+ setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
+ setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
+ setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
+
+ setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
+ setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::ABS, MVT::v4i64, Custom);
+ setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
+ setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
+ setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
+ setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
+
+ setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
+ setOperationAction(ISD::UADDSAT, MVT::v8i32, Custom);
+ setOperationAction(ISD::USUBSAT, MVT::v8i32, Custom);
+ setOperationAction(ISD::UADDSAT, MVT::v4i64, Custom);
+ setOperationAction(ISD::USUBSAT, MVT::v4i64, Custom);
+
+ for (auto VT : {MVT::v32i8, MVT::v16i16, MVT::v8i32}) {
+ setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
@@ -1664,41 +1665,41 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
// AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
- for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
+ for (auto LoadExtOp : {ISD::SEXTLOAD, ISD::ZEXTLOAD}) {
setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
- setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
- setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
- setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
- setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
- setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
}
}
- for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
- MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
- setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
+ for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32,
+ MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
+ setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::MSTORE, VT, Legal);
}
// Extract subvector is special because the value type
// (result) is 128-bit but the source is 256-bit wide.
- for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
- MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
+ for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v8f16,
+ MVT::v4f32, MVT::v2f64}) {
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
}
// Custom lower several nodes for 256-bit types.
- for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
- MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
- setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
- setOperationAction(ISD::VSELECT, VT, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ for (MVT VT : {MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64, MVT::v16f16,
+ MVT::v8f32, MVT::v4f64}) {
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
- setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
- setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
- setOperationAction(ISD::STORE, VT, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+ setOperationAction(ISD::STORE, VT, Custom);
}
setF16Action(MVT::v16f16, Expand);
setOperationAction(ISD::FNEG, MVT::v16f16, Custom);
@@ -1716,21 +1717,21 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
- for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
- MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
- setOperationAction(ISD::MGATHER, VT, Custom);
+ for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
+ MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64})
+ setOperationAction(ISD::MGATHER, VT, Custom);
}
}
if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
Subtarget.hasF16C()) {
- for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
- setOperationAction(ISD::FP_ROUND, VT, Custom);
- setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom);
+ for (MVT VT : {MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16}) {
+ setOperationAction(ISD::FP_ROUND, VT, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom);
}
- for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
- setOperationAction(ISD::FP_EXTEND, VT, Custom);
- setOperationAction(ISD::STRICT_FP_EXTEND, VT, Custom);
+ for (MVT VT : {MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32}) {
+ setOperationAction(ISD::FP_EXTEND, VT, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, VT, Custom);
}
for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
@@ -1744,29 +1745,29 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// available with AVX512. 512-bit vectors are in a separate block controlled
// by useAVX512Regs.
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
- addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
- addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
- addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
- addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
- addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
+ addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
+ addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
+ addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
+ addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
+ addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
- setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
+ setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
setOperationAction(ISD::CTSELECT, MVT::v1i1, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
-
- setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
- setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
- setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
- setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
- setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
- setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
- setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
- setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
- setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
- setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
- setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
+
+ setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
+ setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
+ setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
+ setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);
setOperationAction(ISD::FCANONICALIZE, MVT::v8f16, Custom);
setOperationAction(ISD::FCANONICALIZE, MVT::v16f16, Custom);
setOperationAction(ISD::FCANONICALIZE, MVT::v32f16, Custom);
@@ -1785,30 +1786,30 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
// Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
- for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
+ for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
- setOperationAction(ISD::ANY_EXTEND, VT, Custom);
+ setOperationAction(ISD::ANY_EXTEND, VT, Custom);
}
- for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
- setOperationAction(ISD::VSELECT, VT, Expand);
+ for (auto VT : {MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1})
+ setOperationAction(ISD::VSELECT, VT, Expand);
- for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
- setOperationAction(ISD::SETCC, VT, Custom);
- setOperationAction(ISD::SELECT, VT, Custom);
+ for (auto VT : {MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1}) {
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::CTSELECT, VT, Custom);
- setOperationAction(ISD::TRUNCATE, VT, Custom);
+ setOperationAction(ISD::TRUNCATE, VT, Custom);
- setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
}
- for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
+ for (auto VT : {MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1})
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
}
if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
@@ -1826,30 +1827,30 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
- addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
- addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
+ addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
+ addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
- addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
+ addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
- setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
+ setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
- setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
- setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
- setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
+ setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
+ setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
+ setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
if (HasBWI)
setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
}
- for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
+ for (MVT VT : {MVT::v16f32, MVT::v8f64}) {
setOperationAction(ISD::FMAXIMUM, VT, Custom);
setOperationAction(ISD::FMINIMUM, VT, Custom);
setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
- setOperationAction(ISD::FNEG, VT, Custom);
- setOperationAction(ISD::FABS, VT, Custom);
- setOperationAction(ISD::FMA, VT, Legal);
+ setOperationAction(ISD::FNEG, VT, Custom);
+ setOperationAction(ISD::FABS, VT, Custom);
+ setOperationAction(ISD::FMA, VT, Legal);
setOperationAction(ISD::STRICT_FMA, VT, Legal);
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
setOperationAction(ISD::FCANONICALIZE, VT, Custom);
@@ -1861,93 +1862,93 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (Subtarget.hasDQI())
setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
- for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
- setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
- setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32);
+ for (MVT VT : {MVT::v16i1, MVT::v16i8}) {
+ setOperationPromotedToType(ISD::FP_TO_SINT, VT, MVT::v16i32);
+ setOperationPromotedToType(ISD::FP_TO_UINT, VT, MVT::v16i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
}
- for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
- setOperationAction(ISD::FP_TO_SINT, VT, Custom);
- setOperationAction(ISD::FP_TO_UINT, VT, Custom);
+ for (MVT VT : {MVT::v16i16, MVT::v16i32}) {
+ setOperationAction(ISD::FP_TO_SINT, VT, Custom);
+ setOperationAction(ISD::FP_TO_UINT, VT, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
}
- setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Custom);
- setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
- setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Custom);
-
- setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
- setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
- setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal);
- setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal);
- setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal);
- setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal);
- setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal);
- setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
- setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
- setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
- setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
-
- setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
- setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
- setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
- setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
- setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
+ setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Custom);
+
+ setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
+ setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
+
+ setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
+ setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
+ setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
+ setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
+ setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
if (HasBWI)
- setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
+ setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
// With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
// to 512-bit rather than use the AVX2 instructions so that we can use
// k-masks.
if (!Subtarget.hasVLX()) {
for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
- MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
- setOperationAction(ISD::MLOAD, VT, Custom);
+ MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
+ setOperationAction(ISD::MLOAD, VT, Custom);
setOperationAction(ISD::MSTORE, VT, Custom);
}
}
- setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal);
- setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
- setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
+ setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
- setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
- setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
- setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
if (HasBWI) {
// Extends from v64i1 masks to 512-bit vectors.
- setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
- setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
- }
-
- for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
- setOperationAction(ISD::FFLOOR, VT, Legal);
- setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
- setOperationAction(ISD::FCEIL, VT, Legal);
- setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
- setOperationAction(ISD::FTRUNC, VT, Legal);
- setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
- setOperationAction(ISD::FRINT, VT, Legal);
- setOperationAction(ISD::STRICT_FRINT, VT, Legal);
- setOperationAction(ISD::FNEARBYINT, VT, Legal);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
+ }
+
+ for (auto VT : {MVT::v16f32, MVT::v8f64}) {
+ setOperationAction(ISD::FFLOOR, VT, Legal);
+ setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
+ setOperationAction(ISD::FCEIL, VT, Legal);
+ setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
+ setOperationAction(ISD::FTRUNC, VT, Legal);
+ setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
+ setOperationAction(ISD::FRINT, VT, Legal);
+ setOperationAction(ISD::STRICT_FRINT, VT, Legal);
+ setOperationAction(ISD::FNEARBYINT, VT, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
- setOperationAction(ISD::FROUNDEVEN, VT, Legal);
+ setOperationAction(ISD::FROUNDEVEN, VT, Legal);
setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
- setOperationAction(ISD::FROUND, VT, Custom);
+ setOperationAction(ISD::FROUND, VT, Custom);
}
for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
@@ -1957,36 +1958,36 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
- setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
- setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
- setOperationAction(ISD::MUL, MVT::v8i64, Custom);
+ setOperationAction(ISD::MUL, MVT::v8i64, Custom);
setOperationAction(ISD::MUL, MVT::v16i32, Legal);
setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
- setOperationAction(ISD::MUL, MVT::v64i8, Custom);
+ setOperationAction(ISD::MUL, MVT::v64i8, Custom);
setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
- setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
- setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
+ setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
+ setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
- setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
- for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
- setOperationAction(ISD::SRL, VT, Custom);
- setOperationAction(ISD::SHL, VT, Custom);
- setOperationAction(ISD::SRA, VT, Custom);
- setOperationAction(ISD::ROTL, VT, Custom);
- setOperationAction(ISD::ROTR, VT, Custom);
- setOperationAction(ISD::SETCC, VT, Custom);
- setOperationAction(ISD::ABDS, VT, Custom);
- setOperationAction(ISD::ABDU, VT, Custom);
- setOperationAction(ISD::BITREVERSE, VT, Custom);
+ for (auto VT : {MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ setOperationAction(ISD::ROTL, VT, Custom);
+ setOperationAction(ISD::ROTR, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::ABDS, VT, Custom);
+ setOperationAction(ISD::ABDU, VT, Custom);
+ setOperationAction(ISD::BITREVERSE, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
@@ -1994,83 +1995,84 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setCondCodeAction(ISD::SETLE, VT, Custom);
}
- setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
- setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
- setOperationAction(ISD::STRICT_FSETCC, MVT::v8f64, Custom);
- setOperationAction(ISD::STRICT_FSETCC, MVT::v16f32, Custom);
+ setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
+ setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, MVT::v8f64, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, MVT::v16f32, Custom);
setOperationAction(ISD::STRICT_FSETCCS, MVT::v8f64, Custom);
setOperationAction(ISD::STRICT_FSETCCS, MVT::v16f32, Custom);
- for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
- setOperationAction(ISD::SMAX, VT, Legal);
- setOperationAction(ISD::UMAX, VT, Legal);
- setOperationAction(ISD::SMIN, VT, Legal);
- setOperationAction(ISD::UMIN, VT, Legal);
- setOperationAction(ISD::ABS, VT, Legal);
- setOperationAction(ISD::CTPOP, VT, Custom);
- }
-
- for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
- setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
- setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
- setOperationAction(ISD::CTLZ, VT, Custom);
- setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
- setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
- setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
- setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
+ for (auto VT : {MVT::v16i32, MVT::v8i64}) {
+ setOperationAction(ISD::SMAX, VT, Legal);
+ setOperationAction(ISD::UMAX, VT, Legal);
+ setOperationAction(ISD::SMIN, VT, Legal);
+ setOperationAction(ISD::UMIN, VT, Legal);
+ setOperationAction(ISD::ABS, VT, Legal);
+ setOperationAction(ISD::CTPOP, VT, Custom);
+ }
+
+ for (auto VT : {MVT::v64i8, MVT::v32i16}) {
+ setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::CTPOP, VT,
+ Subtarget.hasBITALG() ? Legal : Custom);
+ setOperationAction(ISD::CTLZ, VT, Custom);
+ setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
}
- setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
- setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
- setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
- setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
- setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
- setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
+ setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
+ setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
+ setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
+ setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
+ setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
+ setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
if (Subtarget.hasDQI() || Subtarget.hasFP16())
for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT})
- setOperationAction(Opc, MVT::v8i64, Custom);
+ setOperationAction(Opc, MVT::v8i64, Custom);
if (Subtarget.hasDQI())
- setOperationAction(ISD::MUL, MVT::v8i64, Legal);
+ setOperationAction(ISD::MUL, MVT::v8i64, Legal);
if (Subtarget.hasCDI()) {
// NonVLX sub-targets extend 128/256 vectors to use the 512 version.
- for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
- setOperationAction(ISD::CTLZ, VT, Legal);
+ for (auto VT : {MVT::v16i32, MVT::v8i64}) {
+ setOperationAction(ISD::CTLZ, VT, Legal);
}
} // Subtarget.hasCDI()
if (Subtarget.hasVPOPCNTDQ()) {
- for (auto VT : { MVT::v16i32, MVT::v8i64 })
+ for (auto VT : {MVT::v16i32, MVT::v8i64})
setOperationAction(ISD::CTPOP, VT, Legal);
}
// Extract subvector is special because the value type
// (result) is 256-bit but the source is 512-bit wide.
// 128-bit was made Legal under AVX1.
- for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
- MVT::v16f16, MVT::v8f32, MVT::v4f64 })
+ for (auto VT : {MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
+ MVT::v16f16, MVT::v8f32, MVT::v4f64})
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
- for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
- MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
- setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
- setOperationAction(ISD::SELECT, VT, Custom);
+ for (auto VT : {MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
+ MVT::v32f16, MVT::v16f32, MVT::v8f64}) {
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
+ setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::CTSELECT, VT, Custom);
- setOperationAction(ISD::VSELECT, VT, Custom);
- setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
- setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
}
setF16Action(MVT::v32f16, Expand);
setOperationAction(ISD::FP_ROUND, MVT::v16f16, Custom);
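For reference, marking ISD::CTSELECT as Custom for these 512-bit types, as the hunk above does, makes SelectionDAG legalization call back into X86TargetLowering::LowerOperation() for every such node. A minimal dispatch sketch follows; it assumes a LowerCTSELECT helper is introduced elsewhere in this patch (the helper name here is illustrative, not confirmed by this hunk):

// Minimal sketch of how a Custom-marked node reaches its lowering hook.
// LowerCTSELECT is an assumed helper; the real LowerOperation handles many
// more opcodes than shown here.
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  case ISD::CTSELECT:
    return LowerCTSELECT(Op, DAG); // hypothetical helper added by the patch
  default:
    llvm_unreachable("Should not custom lower this!");
  }
}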
@@ -2081,20 +2083,20 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
setOperationAction(ISD::SETCC, MVT::v32f16, Custom);
- for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
- setOperationAction(ISD::MLOAD, VT, Legal);
- setOperationAction(ISD::MSTORE, VT, Legal);
- setOperationAction(ISD::MGATHER, VT, Custom);
- setOperationAction(ISD::MSCATTER, VT, Custom);
+ for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64}) {
+ setOperationAction(ISD::MLOAD, VT, Legal);
+ setOperationAction(ISD::MSTORE, VT, Legal);
+ setOperationAction(ISD::MGATHER, VT, Custom);
+ setOperationAction(ISD::MSCATTER, VT, Custom);
}
if (HasBWI) {
- for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
- setOperationAction(ISD::MLOAD, VT, Legal);
- setOperationAction(ISD::MSTORE, VT, Legal);
+ for (auto VT : {MVT::v64i8, MVT::v32i16}) {
+ setOperationAction(ISD::MLOAD, VT, Legal);
+ setOperationAction(ISD::MSTORE, VT, Legal);
}
} else {
setOperationAction(ISD::STORE, MVT::v32i16, Custom);
- setOperationAction(ISD::STORE, MVT::v64i8, Custom);
+ setOperationAction(ISD::STORE, MVT::v64i8, Custom);
}
if (Subtarget.hasVBMI2()) {
@@ -2110,7 +2112,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
setOperationAction(ISD::FABS, MVT::v32f16, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::v32f16, Custom);
- }// useAVX512Regs
+ } // useAVX512Regs
if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
@@ -2127,9 +2129,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// These operations are handled on non-VLX by artificially widening in
// isel patterns.
- setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, Custom);
- setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Custom);
- setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
if (Subtarget.hasDQI()) {
// Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
@@ -2138,31 +2140,31 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
"Unexpected operation action!");
// v2i64 FP_TO_S/UINT(v2f32) custom conversion.
- setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
}
- for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
+ for (auto VT : {MVT::v2i64, MVT::v4i64}) {
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
setOperationAction(ISD::SMIN, VT, Legal);
setOperationAction(ISD::UMIN, VT, Legal);
- setOperationAction(ISD::ABS, VT, Legal);
+ setOperationAction(ISD::ABS, VT, Legal);
}
- for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
- setOperationAction(ISD::ROTL, VT, Custom);
- setOperationAction(ISD::ROTR, VT, Custom);
+ for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64}) {
+ setOperationAction(ISD::ROTL, VT, Custom);
+ setOperationAction(ISD::ROTR, VT, Custom);
}
// Custom legalize 2x32 to get a little better code.
setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
- for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
- MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
+ for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32,
+ MVT::v8f32, MVT::v2f64, MVT::v4f64})
setOperationAction(ISD::MSCATTER, VT, Custom);
if (Subtarget.hasDQI()) {
@@ -2177,13 +2179,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
if (Subtarget.hasCDI()) {
- for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
- setOperationAction(ISD::CTLZ, VT, Legal);
+ for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64}) {
+ setOperationAction(ISD::CTLZ, VT, Legal);
}
} // Subtarget.hasCDI()
if (Subtarget.hasVPOPCNTDQ()) {
- for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
+ for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64})
setOperationAction(ISD::CTPOP, VT, Legal);
}
@@ -2220,34 +2222,34 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// This block controls legalization of v32i1/v64i1 which are available with
// AVX512BW.
if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
- addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
- addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
+ addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
+ addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
- for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
- setOperationAction(ISD::VSELECT, VT, Expand);
- setOperationAction(ISD::TRUNCATE, VT, Custom);
- setOperationAction(ISD::SETCC, VT, Custom);
+ for (auto VT : {MVT::v32i1, MVT::v64i1}) {
+ setOperationAction(ISD::VSELECT, VT, Expand);
+ setOperationAction(ISD::TRUNCATE, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
- setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::CTSELECT, VT, Custom);
- setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
}
- for (auto VT : { MVT::v16i1, MVT::v32i1 })
+ for (auto VT : {MVT::v16i1, MVT::v32i1})
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
// Extends from v32i1 masks to 256-bit vectors.
- setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
- setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
for (auto VT : {MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16,
MVT::v16f16, MVT::v8f16}) {
- setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
}
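As a point of comparison, when no conditional-move style instruction covers a type, one plausible data-oblivious expansion of a ct.select is the classic (A & M) | (B & ~M) form, where M is an all-ones/all-zeros mask derived from the condition. This is a generic sketch under that assumption, not necessarily the expansion this patch emits:

// Branch-free select sketch: both operands are computed unconditionally, so
// the instruction sequence and data flow do not depend on the condition value.
// Mask must already be all-ones (pick A) or all-zeros (pick B) in VT.
static SDValue expandCTSelectAsBitOps(SelectionDAG &DAG, const SDLoc &DL,
                                      EVT VT, SDValue Mask, SDValue A,
                                      SDValue B) {
  SDValue NotMask = DAG.getNOT(DL, Mask, VT);
  SDValue PickA = DAG.getNode(ISD::AND, DL, VT, A, Mask);
  SDValue PickB = DAG.getNode(ISD::AND, DL, VT, B, NotMask);
  return DAG.getNode(ISD::OR, DL, VT, PickA, PickB);
}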
@@ -2256,120 +2258,120 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
if (Subtarget.hasBITALG()) {
- for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
+ for (auto VT : {MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16})
setOperationAction(ISD::CTPOP, VT, Legal);
}
}
if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
- auto setGroup = [&] (MVT VT) {
- setOperationAction(ISD::FADD, VT, Legal);
- setOperationAction(ISD::STRICT_FADD, VT, Legal);
- setOperationAction(ISD::FSUB, VT, Legal);
- setOperationAction(ISD::STRICT_FSUB, VT, Legal);
- setOperationAction(ISD::FMUL, VT, Legal);
- setOperationAction(ISD::STRICT_FMUL, VT, Legal);
- setOperationAction(ISD::FDIV, VT, Legal);
- setOperationAction(ISD::STRICT_FDIV, VT, Legal);
- setOperationAction(ISD::FSQRT, VT, Legal);
- setOperationAction(ISD::STRICT_FSQRT, VT, Legal);
-
- setOperationAction(ISD::FFLOOR, VT, Legal);
- setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
- setOperationAction(ISD::FCEIL, VT, Legal);
- setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
- setOperationAction(ISD::FTRUNC, VT, Legal);
- setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
- setOperationAction(ISD::FRINT, VT, Legal);
- setOperationAction(ISD::STRICT_FRINT, VT, Legal);
- setOperationAction(ISD::FNEARBYINT, VT, Legal);
- setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
+ auto setGroup = [&](MVT VT) {
+ setOperationAction(ISD::FADD, VT, Legal);
+ setOperationAction(ISD::STRICT_FADD, VT, Legal);
+ setOperationAction(ISD::FSUB, VT, Legal);
+ setOperationAction(ISD::STRICT_FSUB, VT, Legal);
+ setOperationAction(ISD::FMUL, VT, Legal);
+ setOperationAction(ISD::STRICT_FMUL, VT, Legal);
+ setOperationAction(ISD::FDIV, VT, Legal);
+ setOperationAction(ISD::STRICT_FDIV, VT, Legal);
+ setOperationAction(ISD::FSQRT, VT, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, VT, Legal);
+
+ setOperationAction(ISD::FFLOOR, VT, Legal);
+ setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
+ setOperationAction(ISD::FCEIL, VT, Legal);
+ setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
+ setOperationAction(ISD::FTRUNC, VT, Legal);
+ setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
+ setOperationAction(ISD::FRINT, VT, Legal);
+ setOperationAction(ISD::STRICT_FRINT, VT, Legal);
+ setOperationAction(ISD::FNEARBYINT, VT, Legal);
+ setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
setOperationAction(ISD::FROUNDEVEN, VT, Legal);
setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
- setOperationAction(ISD::FROUND, VT, Custom);
+ setOperationAction(ISD::FROUND, VT, Custom);
- setOperationAction(ISD::LOAD, VT, Legal);
- setOperationAction(ISD::STORE, VT, Legal);
+ setOperationAction(ISD::LOAD, VT, Legal);
+ setOperationAction(ISD::STORE, VT, Legal);
- setOperationAction(ISD::FMA, VT, Legal);
- setOperationAction(ISD::STRICT_FMA, VT, Legal);
- setOperationAction(ISD::VSELECT, VT, Legal);
- setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::FMA, VT, Legal);
+ setOperationAction(ISD::STRICT_FMA, VT, Legal);
+ setOperationAction(ISD::VSELECT, VT, Legal);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::CTSELECT, VT, Custom);
- setOperationAction(ISD::FNEG, VT, Custom);
- setOperationAction(ISD::FABS, VT, Custom);
- setOperationAction(ISD::FCOPYSIGN, VT, Custom);
+ setOperationAction(ISD::FNEG, VT, Custom);
+ setOperationAction(ISD::FABS, VT, Custom);
+ setOperationAction(ISD::FCOPYSIGN, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
- setOperationAction(ISD::SETCC, VT, Custom);
- setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
- setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
};
// AVX512_FP16 scalar operations
setGroup(MVT::f16);
- setOperationAction(ISD::FREM, MVT::f16, Promote);
- setOperationAction(ISD::STRICT_FREM, MVT::f16, Promote);
- setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
- setOperationAction(ISD::BR_CC, MVT::f16, Expand);
- setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote);
- setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
- setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Legal);
- setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
- setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
- setOperationAction(ISD::FMAXIMUM, MVT::f16, Custom);
- setOperationAction(ISD::FMINIMUM, MVT::f16, Custom);
- setOperationAction(ISD::FMAXIMUMNUM, MVT::f16, Custom);
- setOperationAction(ISD::FMINIMUMNUM, MVT::f16, Custom);
- setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
- setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
- setOperationAction(ISD::LRINT, MVT::f16, Legal);
- setOperationAction(ISD::LLRINT, MVT::f16, Legal);
+ setOperationAction(ISD::FREM, MVT::f16, Promote);
+ setOperationAction(ISD::STRICT_FREM, MVT::f16, Promote);
+ setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
+ setOperationAction(ISD::BR_CC, MVT::f16, Expand);
+ setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote);
+ setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
+ setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Legal);
+ setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
+ setOperationAction(ISD::FMAXIMUM, MVT::f16, Custom);
+ setOperationAction(ISD::FMINIMUM, MVT::f16, Custom);
+ setOperationAction(ISD::FMAXIMUMNUM, MVT::f16, Custom);
+ setOperationAction(ISD::FMINIMUMNUM, MVT::f16, Custom);
+ setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
+ setOperationAction(ISD::LRINT, MVT::f16, Legal);
+ setOperationAction(ISD::LLRINT, MVT::f16, Legal);
setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand);
setCondCodeAction(ISD::SETUNE, MVT::f16, Expand);
if (Subtarget.useAVX512Regs()) {
setGroup(MVT::v32f16);
- setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32f16, Custom);
- setOperationAction(ISD::SINT_TO_FP, MVT::v32i16, Legal);
- setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v32i16, Legal);
- setOperationAction(ISD::UINT_TO_FP, MVT::v32i16, Legal);
- setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v32i16, Legal);
- setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
- setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Legal);
- setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
- setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
- setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
- setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom);
-
- setOperationAction(ISD::FP_TO_SINT, MVT::v32i16, Custom);
- setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v32i16, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v32i16, Custom);
- setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v32i16, Custom);
- setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32f16, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v32i16, Legal);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v32i16, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v32i16, Legal);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v32i16, Legal);
+ setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Legal);
+ setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
+ setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom);
+
+ setOperationAction(ISD::FP_TO_SINT, MVT::v32i16, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v32i16, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v32i16, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v32i16, Custom);
+ setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i8,
MVT::v32i16);
- setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
+ setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i8,
MVT::v32i16);
- setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
+ setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i1,
MVT::v32i16);
- setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
+ setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i1,
MVT::v32i16);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f16, Legal);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32f16, Legal);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v32f16, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f16, Legal);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32f16, Legal);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v32f16, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
setOperationAction(ISD::FMINIMUM, MVT::v32f16, Custom);
@@ -2380,40 +2382,40 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::LLRINT, MVT::v8f16, Legal);
}
- setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
- setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
- setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom);
if (Subtarget.hasVLX()) {
setGroup(MVT::v8f16);
setGroup(MVT::v16f16);
- setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8f16, Legal);
- setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16f16, Custom);
- setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Legal);
- setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i16, Legal);
- setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Legal);
- setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i16, Legal);
- setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Legal);
- setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i16, Legal);
- setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Legal);
- setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i16, Legal);
-
- setOperationAction(ISD::FP_ROUND, MVT::v8f16, Legal);
- setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal);
- setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Custom);
- setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
- setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
- setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8f16, Legal);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16f16, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Legal);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i16, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Legal);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i16, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Legal);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i16, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Legal);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i16, Legal);
+
+ setOperationAction(ISD::FP_ROUND, MVT::v8f16, Legal);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal);
+ setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
+ setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
// INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f16, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16f16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16f16, Custom);
- setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f16, Legal);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16f16, Legal);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f16, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f16, Legal);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16f16, Legal);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f16, Custom);
setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
@@ -2421,7 +2423,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
// Need to custom widen these to prevent scalarization.
- setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
+ setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
setOperationAction(ISD::STORE, MVT::v4f16, Custom);
setOperationAction(ISD::FMINIMUM, MVT::v8f16, Custom);
@@ -2514,52 +2516,52 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
- setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
+ setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
- setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
+ setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
- setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
+ setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
- setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
+ setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
if (Subtarget.hasBWI()) {
- setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
- setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
+ setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
+ setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
}
if (Subtarget.hasFP16()) {
// vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
- setOperationAction(ISD::FP_TO_SINT, MVT::v2f16, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2f16, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f16, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v2f16, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2f16, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f16, Custom);
- setOperationAction(ISD::FP_TO_SINT, MVT::v4f16, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v4f16, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4f16, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v4f16, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v4f16, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4f16, Custom);
// vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
- setOperationAction(ISD::SINT_TO_FP, MVT::v2f16, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2f16, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f16, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v2f16, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2f16, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f16, Custom);
- setOperationAction(ISD::SINT_TO_FP, MVT::v4f16, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4f16, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4f16, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v4f16, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4f16, Custom);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4f16, Custom);
// vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
- setOperationAction(ISD::FP_ROUND, MVT::v2f16, Custom);
- setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f16, Custom);
- setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
- setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f16, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::v2f16, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f16, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f16, Custom);
// vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
- setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom);
- setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f16, Custom);
- setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom);
- setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f16, Custom);
+ setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f16, Custom);
+ setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f16, Custom);
}
}
@@ -2597,7 +2599,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// FIXME: We really should do custom legalization for addition and
// subtraction on x86-32 once PR3203 is fixed. We really can't do much better
// than generic legalization for 64-bit multiplication-with-overflow, though.
- for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+ for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
// Add/Sub/Mul with overflow operations are custom lowered.
@@ -2881,8 +2883,9 @@ static bool isLogicOp(unsigned Opcode) {
}
static bool isTargetShuffle(unsigned Opcode) {
- switch(Opcode) {
- default: return false;
+ switch (Opcode) {
+ default:
+ return false;
case X86ISD::BLENDI:
case X86ISD::PSHUFB:
case X86ISD::PSHUFD:
@@ -2923,7 +2926,8 @@ static bool isTargetShuffle(unsigned Opcode) {
static bool isTargetShuffleVariableMask(unsigned Opcode) {
switch (Opcode) {
- default: return false;
+ default:
+ return false;
// Target Shuffles.
case X86ISD::PSHUFB:
case X86ISD::VPERMILPV:
@@ -2949,9 +2953,8 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
if (ReturnAddrIndex == 0) {
// Set up a frame object for the return address.
unsigned SlotSize = RegInfo->getSlotSize();
- ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
- -(int64_t)SlotSize,
- false);
+ ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(
+ SlotSize, -(int64_t)SlotSize, false);
FuncInfo->setRAIndex(ReturnAddrIndex);
}
@@ -3009,7 +3012,7 @@ static bool isX86CCSigned(X86::CondCode X86CC) {
static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
switch (SetCCOpcode) {
- // clang-format off
+ // clang-format off
default: llvm_unreachable("Invalid integer condition!");
case ISD::SETEQ: return X86::COND_E;
case ISD::SETGT: return X86::COND_G;
@@ -3021,7 +3024,7 @@ static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
case ISD::SETUGT: return X86::COND_A;
case ISD::SETULE: return X86::COND_BE;
case ISD::SETUGE: return X86::COND_AE;
- // clang-format on
+ // clang-format on
}
}
@@ -3059,14 +3062,14 @@ static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
// First determine if it is required or is profitable to flip the operands.
// If LHS is a foldable load, but RHS is not, flip the condition.
- if (ISD::isNON_EXTLoad(LHS.getNode()) &&
- !ISD::isNON_EXTLoad(RHS.getNode())) {
+ if (ISD::isNON_EXTLoad(LHS.getNode()) && !ISD::isNON_EXTLoad(RHS.getNode())) {
SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
std::swap(LHS, RHS);
}
switch (SetCCOpcode) {
- default: break;
+ default:
+ break;
case ISD::SETOLT:
case ISD::SETOLE:
case ISD::SETUGT:
@@ -3082,7 +3085,7 @@ static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
// 1 | 0 | 0 | X == Y
// 1 | 1 | 1 | unordered
switch (SetCCOpcode) {
- // clang-format off
+ // clang-format off
default: llvm_unreachable("Condcode should be pre-legalized away");
case ISD::SETUEQ:
case ISD::SETEQ: return X86::COND_E;
@@ -3104,7 +3107,7 @@ static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
case ISD::SETO: return X86::COND_NP;
case ISD::SETOEQ:
case ISD::SETUNE: return X86::COND_INVALID;
- // clang-format on
+ // clang-format on
}
}
@@ -3139,7 +3142,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags = MachineMemOperand::MONone;
Info.offset = 0;
- const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
+ const IntrinsicData *IntrData = getIntrinsicWithChain(Intrinsic);
if (!IntrData) {
switch (Intrinsic) {
case Intrinsic::x86_aesenc128kl:
@@ -3232,7 +3235,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case TRUNCATE_TO_MEM_VI32: {
Info.opc = ISD::INTRINSIC_VOID;
Info.ptrVal = I.getArgOperand(0);
- MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
+ MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
ScalarVT = MVT::i8;
@@ -3252,8 +3255,8 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.ptrVal = nullptr;
MVT DataVT = MVT::getVT(I.getType());
MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
- unsigned NumElts = std::min(DataVT.getVectorNumElements(),
- IndexVT.getVectorNumElements());
+ unsigned NumElts =
+ std::min(DataVT.getVectorNumElements(), IndexVT.getVectorNumElements());
Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
Info.align = Align(1);
Info.flags |= MachineMemOperand::MOLoad;
@@ -3264,8 +3267,8 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.ptrVal = nullptr;
MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
- unsigned NumElts = std::min(DataVT.getVectorNumElements(),
- IndexVT.getVectorNumElements());
+ unsigned NumElts =
+ std::min(DataVT.getVectorNumElements(), IndexVT.getVectorNumElements());
Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
Info.align = Align(1);
Info.flags |= MachineMemOperand::MOStore;
@@ -3424,8 +3427,9 @@ bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
// Mask vectors support all subregister combinations and operations that
// extract half of vector.
if (ResVT.getVectorElementType() == MVT::i1)
- return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
- (Index == ResVT.getVectorNumElements()));
+ return Index == 0 ||
+ ((ResVT.getSizeInBits() == SrcVT.getSizeInBits() * 2) &&
+ (Index == ResVT.getVectorNumElements()));
return (Index % ResVT.getVectorNumElements()) == 0;
}
@@ -3485,9 +3489,9 @@ bool X86TargetLowering::isScalarFPTypeInSSEReg(EVT VT) const {
(VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
}
-bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
- const SelectionDAG &DAG,
- const MachineMemOperand &MMO) const {
+bool X86TargetLowering::isLoadBitCastBeneficial(
+ EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG,
+ const MachineMemOperand &MMO) const {
if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
BitcastVT.getVectorElementType() == MVT::i1)
return false;
@@ -3496,8 +3500,8 @@ bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
return false;
// If both types are legal vectors, it's always ok to convert them.
- if (LoadVT.isVector() && BitcastVT.isVector() &&
- isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
+ if (LoadVT.isVector() && BitcastVT.isVector() && isTypeLegal(LoadVT) &&
+ isTypeLegal(BitcastVT))
return true;
return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
@@ -3521,9 +3525,7 @@ bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
return true;
}
-bool X86TargetLowering::isCtlzFast() const {
- return Subtarget.hasFastLZCNT();
-}
+bool X86TargetLowering::isCtlzFast() const { return Subtarget.hasFastLZCNT(); }
bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
const Instruction &AndI) const {
@@ -3952,8 +3954,7 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask,
return true;
}
-static bool canWidenShuffleElements(ArrayRef<int> Mask,
- const APInt &Zeroable,
+static bool canWidenShuffleElements(ArrayRef<int> Mask, const APInt &Zeroable,
bool V2IsZero,
SmallVectorImpl<int> &WidenedMask) {
// Create an alternative mask with info about zeroable elements.
@@ -4037,7 +4038,7 @@ bool X86::isZeroNode(SDValue Elt) {
static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
const SDLoc &dl, bool IsMask = false) {
- SmallVector<SDValue, 32> Ops;
+ SmallVector<SDValue, 32> Ops;
bool Split = false;
MVT ConstVecVT = VT;
@@ -4051,12 +4052,12 @@ static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
MVT EltVT = ConstVecVT.getVectorElementType();
for (unsigned i = 0; i < NumElts; ++i) {
bool IsUndef = Values[i] < 0 && IsMask;
- SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
- DAG.getConstant(Values[i], dl, EltVT);
+ SDValue OpNode =
+ IsUndef ? DAG.getUNDEF(EltVT) : DAG.getConstant(Values[i], dl, EltVT);
Ops.push_back(OpNode);
if (Split)
- Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
- DAG.getConstant(0, dl, EltVT));
+ Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT)
+ : DAG.getConstant(0, dl, EltVT));
}
SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
if (Split)
@@ -4064,8 +4065,8 @@ static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
return ConstsNode;
}
-static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
- MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
+static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs, MVT VT,
+ SelectionDAG &DAG, const SDLoc &dl) {
assert(Bits.size() == Undefs.getBitWidth() &&
"Unequal constant and undef arrays");
SmallVector<SDValue, 32> Ops;
@@ -4100,8 +4101,8 @@ static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
return DAG.getBitcast(VT, ConstsNode);
}
-static SDValue getConstVector(ArrayRef<APInt> Bits, MVT VT,
- SelectionDAG &DAG, const SDLoc &dl) {
+static SDValue getConstVector(ArrayRef<APInt> Bits, MVT VT, SelectionDAG &DAG,
+ const SDLoc &dl) {
APInt Undefs = APInt::getZero(Bits.size());
return getConstVector(Bits, Undefs, VT, DAG, dl);
}
@@ -4638,8 +4639,7 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
// May need to promote to a legal type.
Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
- DAG.getConstant(0, dl, WideOpVT),
- SubVec, Idx);
+ DAG.getConstant(0, dl, WideOpVT), SubVec, Idx);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
}
@@ -4654,20 +4654,18 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
if (IdxVal == 0) {
// Zero lower bits of the Vec
SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
- Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
- ZeroIdx);
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
// Merge them together, SubVec should be zero extended.
SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
- DAG.getConstant(0, dl, WideOpVT),
- SubVec, ZeroIdx);
+ DAG.getConstant(0, dl, WideOpVT), SubVec, ZeroIdx);
Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
}
- SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
- Undef, SubVec, ZeroIdx);
+ SubVec =
+ DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, SubVec, ZeroIdx);
if (Vec.isUndef()) {
assert(IdxVal != 0 && "Unexpected index");
@@ -4705,12 +4703,11 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
// isel to optimize when bits are known zero.
Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
- DAG.getConstant(0, dl, WideOpVT),
- Vec, ZeroIdx);
+ DAG.getConstant(0, dl, WideOpVT), Vec, ZeroIdx);
} else {
// Otherwise use explicit shifts to zero the bits.
- Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
- Undef, Vec, ZeroIdx);
+ Vec =
+ DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
NumElems = WideOpVT.getVectorNumElements();
SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
@@ -4763,9 +4760,9 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
// Isolate the bits after the last inserted bit.
unsigned HighShift = IdxVal + SubVecNumElems;
SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
- DAG.getTargetConstant(HighShift, dl, MVT::i8));
+ DAG.getTargetConstant(HighShift, dl, MVT::i8));
High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
- DAG.getTargetConstant(HighShift, dl, MVT::i8));
+ DAG.getTargetConstant(HighShift, dl, MVT::i8));
// Now OR all 3 pieces together.
Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
@@ -4846,8 +4843,8 @@ static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS,
return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
}
-void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
- bool Lo, bool Unary) {
+void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask, bool Lo,
+ bool Unary) {
assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
"Illegal vector type to unpack");
assert(Mask.empty() && "Expected an empty shuffle mask vector");
@@ -4984,13 +4981,12 @@ static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
/// This produces a shuffle where the low element of V2 is swizzled into the
/// zero/undef vector, landing at element Idx.
/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
-static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
- bool IsZero,
+static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, bool IsZero,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = V2.getSimpleValueType();
- SDValue V1 = IsZero
- ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
+ SDValue V1 =
+ IsZero ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
int NumElems = VT.getVectorNumElements();
SmallVector<int, 16> MaskVec(NumElems);
for (int i = 0; i != NumElems; ++i)
@@ -8568,7 +8564,7 @@ static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
case ISD::FADD: HOpcode = X86ISD::FHADD; break;
case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
default: return false;
- // clang-format on
+ // clang-format on
}
}
@@ -8598,8 +8594,7 @@ static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
// op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
- unsigned ExpectedIndex = i * NumEltsIn128Bits +
- (j % NumEltsIn64Bits) * 2;
+ unsigned ExpectedIndex = i * NumEltsIn128Bits + (j % NumEltsIn64Bits) * 2;
if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
continue;
@@ -9249,8 +9244,8 @@ LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL,
return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
}
-SDValue
-X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
@@ -9474,14 +9469,13 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
}
// Is it a vector logical left shift?
- if (NumElems == 2 && Idx == 1 &&
- X86::isZeroNode(Op.getOperand(0)) &&
+ if (NumElems == 2 && Idx == 1 && X86::isZeroNode(Op.getOperand(0)) &&
!X86::isZeroNode(Op.getOperand(1))) {
unsigned NumBits = VT.getSizeInBits();
- return getVShift(true, VT,
- DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
- VT, Op.getOperand(1)),
- NumBits/2, DAG, *this, dl);
+ return getVShift(
+ true, VT,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(1)),
+ NumBits / 2, DAG, *this, dl);
}
if (IsAllConstants) // Otherwise, it's better to do a constpool load.
@@ -9494,7 +9488,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// place.
if (EVTBits == 32) {
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
- return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
+ return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget,
+ DAG);
}
}
@@ -9533,8 +9528,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// build_vector and broadcast it.
// TODO: We could probably generalize this more.
if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
- SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
- DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
+ SDValue Ops[4] = {Op.getOperand(0), Op.getOperand(1), DAG.getUNDEF(EltVT),
+ DAG.getUNDEF(EltVT)};
auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
// Make sure all the even/odd operands match.
for (unsigned i = 2; i != NumElems; ++i)
@@ -9550,8 +9545,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
DAG.getBuildVector(NarrowVT, dl, Ops));
// Broadcast from v2i64/v2f64 and cast to final VT.
MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
- return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
- NewBV));
+ return DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT, NewBV));
}
}
@@ -9564,7 +9559,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
SDValue Lower =
DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
SDValue Upper = DAG.getBuildVector(
- HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
+ HVT, dl, Op->ops().slice(NumElems / 2, NumElems / 2));
// Recreate the wider vector with the lower and upper part.
return concatSubVectors(Lower, Upper, DAG, dl);
@@ -9575,8 +9570,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (NumNonZero == 1) {
// One half is zero or undef.
unsigned Idx = NonZeroMask.countr_zero();
- SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
- Op.getOperand(Idx));
+ SDValue V2 =
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(Idx));
return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
}
return SDValue();
@@ -9611,30 +9606,28 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
for (unsigned i = 0; i < 2; ++i) {
switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
- default: llvm_unreachable("Unexpected NonZero count");
- case 0:
- Ops[i] = Ops[i*2]; // Must be a zero vector.
- break;
- case 1:
- Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
- break;
- case 2:
- Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
- break;
- case 3:
- Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
- break;
+ default:
+ llvm_unreachable("Unexpected NonZero count");
+ case 0:
+ Ops[i] = Ops[i * 2]; // Must be a zero vector.
+ break;
+ case 1:
+ Ops[i] = getMOVL(DAG, dl, VT, Ops[i * 2 + 1], Ops[i * 2]);
+ break;
+ case 2:
+ Ops[i] = getMOVL(DAG, dl, VT, Ops[i * 2], Ops[i * 2 + 1]);
+ break;
+ case 3:
+ Ops[i] = getUnpackl(DAG, dl, VT, Ops[i * 2], Ops[i * 2 + 1]);
+ break;
}
}
bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
- int MaskVec[] = {
- Reverse1 ? 1 : 0,
- Reverse1 ? 0 : 1,
- static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
- static_cast<int>(Reverse2 ? NumElems : NumElems+1)
- };
+ int MaskVec[] = {Reverse1 ? 1 : 0, Reverse1 ? 0 : 1,
+ static_cast<int>(Reverse2 ? NumElems + 1 : NumElems),
+ static_cast<int>(Reverse2 ? NumElems : NumElems + 1)};
return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
}
@@ -9653,7 +9646,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
Result = DAG.getUNDEF(VT);
for (unsigned i = 1; i < NumElems; ++i) {
- if (Op.getOperand(i).isUndef()) continue;
+ if (Op.getOperand(i).isUndef())
+ continue;
Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
Op.getOperand(i), DAG.getVectorIdxConstant(i, dl));
}
@@ -9678,14 +9672,14 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
// Generate scaled UNPCKL shuffle mask.
SmallVector<int, 16> Mask;
- for(unsigned i = 0; i != Scale; ++i)
+ for (unsigned i = 0; i != Scale; ++i)
Mask.push_back(i);
for (unsigned i = 0; i != Scale; ++i)
- Mask.push_back(NumElems+i);
+ Mask.push_back(NumElems + i);
Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
- Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
+ Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2 * i], Ops[(2 * i) + 1], Mask);
}
return Ops[0];
}
@@ -9711,15 +9705,14 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl,
if (SubVec.isUndef())
continue;
if (ISD::isFreezeUndef(SubVec.getNode())) {
- // If the freeze(undef) has multiple uses then we must fold to zero.
- if (SubVec.hasOneUse()) {
- ++NumFreezeUndef;
- } else {
- ++NumZero;
- Undefs.insert(SubVec);
- }
- }
- else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
+ // If the freeze(undef) has multiple uses then we must fold to zero.
+ if (SubVec.hasOneUse()) {
+ ++NumFreezeUndef;
+ } else {
+ ++NumZero;
+ Undefs.insert(SubVec);
+ }
+ } else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
++NumZero;
else {
assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
@@ -9733,9 +9726,9 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl,
MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
ArrayRef<SDUse> Ops = Op->ops();
SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
- Ops.slice(0, NumOperands/2));
+ Ops.slice(0, NumOperands / 2));
SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
- Ops.slice(NumOperands/2));
+ Ops.slice(NumOperands / 2));
return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
}
@@ -9768,7 +9761,7 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl,
// TODO: Merge this with LowerAVXCONCAT_VECTORS?
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl,
const X86Subtarget &Subtarget,
- SelectionDAG & DAG) {
+ SelectionDAG &DAG) {
MVT ResVT = Op.getSimpleValueType();
unsigned NumOperands = Op.getNumOperands();
assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
@@ -9839,8 +9832,7 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl,
DAG.getVectorIdxConstant(NumElems / 2, dl));
}
-static SDValue LowerCONCAT_VECTORS(SDValue Op,
- const X86Subtarget &Subtarget,
+static SDValue LowerCONCAT_VECTORS(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
@@ -10062,8 +10054,8 @@ static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
// Ok, handle the in-lane shuffles by detecting if and when they repeat.
// Adjust second vector indices to start at LaneSize instead of Size.
- int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
- : Mask[i] % LaneSize + LaneSize;
+ int LocalM =
+ Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
if (RepeatedMask[i % LaneSize] < 0)
// This is the first non-undef entry in this slot of a 128-bit lane.
RepeatedMask[i % LaneSize] = LocalM;
@@ -10081,8 +10073,7 @@ is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
}
-static bool
-is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
+static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
SmallVector<int, 32> RepeatedMask;
return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
}
@@ -10381,8 +10372,8 @@ static SDValue getSHUFPDImmForMask(ArrayRef<int> Mask, const SDLoc &DL,
//
// The function looks for a sub-mask that the nonzero elements are in
// increasing order. If such sub-mask exist. The function returns true.
-static bool isNonZeroElementsInOrder(const APInt &Zeroable,
- ArrayRef<int> Mask, const EVT &VectorType,
+static bool isNonZeroElementsInOrder(const APInt &Zeroable, ArrayRef<int> Mask,
+ const EVT &VectorType,
bool &IsZeroSideLeft) {
int NextElement = -1;
// Check if the Mask's nonzero elements are in increasing order.
@@ -11162,7 +11153,7 @@ static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
if (M == SM_SentinelUndef)
continue;
if (M == Elt || (0 <= M && M < NumElts &&
- IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
+ IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
Mask[Elt] = Elt;
LaneV1InUse = true;
continue;
@@ -11295,8 +11286,7 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
// If we have VPTERNLOG, we can use that as a bit blend.
if (Subtarget.hasVLX())
- if (SDValue BitBlend =
- lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
+ if (SDValue BitBlend = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
return BitBlend;
// Scale the blend by the number of bytes per element.
@@ -11604,9 +11594,11 @@ static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
/// permuting the elements of the result in place.
-static SDValue lowerShuffleAsByteRotateAndPermute(
- const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
- const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+static SDValue lowerShuffleAsByteRotateAndPermute(const SDLoc &DL, MVT VT,
+ SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
(VT.is256BitVector() && !Subtarget.hasAVX2()) ||
(VT.is512BitVector() && !Subtarget.hasBWI()))
@@ -11804,9 +11796,9 @@ static SDValue lowerShuffleAsDecomposedShuffleMerge(
// If either input vector provides only a single element which is repeated
// multiple times, unpacking from both input vectors would generate worse
// code. e.g. for
- // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
- // it is better to process t4 first to create a vector of t4[0], then unpack
- // that vector with t2.
+ // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2,
+ // t4 it is better to process t4 first to create a vector of t4[0], then
+ // unpack that vector with t2.
if (!V1Zero && !V2Zero && !isSingleElementRepeatedMask(V1Mask) &&
!isSingleElementRepeatedMask(V2Mask))
if (SDValue UnpackPerm =
@@ -11818,8 +11810,8 @@ static SDValue lowerShuffleAsDecomposedShuffleMerge(
return RotatePerm;
// Unpack/rotate failed - try again with variable blends.
- if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
- DAG))
+ if (SDValue BlendPerm =
+ lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
return BlendPerm;
if (VT.getScalarSizeInBits() >= 32)
@@ -11933,7 +11925,7 @@ static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
SDValue Lo, Hi;
for (int i = 0; i < NumElts; ++i) {
int M = Mask[i];
- assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
+ assert((M == SM_SentinelUndef || (0 <= M && M < (2 * NumElts))) &&
"Unexpected mask index.");
if (M < 0)
continue;
@@ -12055,8 +12047,7 @@ static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
"Rotate-based lowering only supports 128-bit lowering!");
assert(Mask.size() <= 16 &&
"Can shuffle at most 16 bytes in a 128-bit vector!");
- assert(ByteVT == MVT::v16i8 &&
- "SSE2 rotate lowering only needed for v16i8!");
+ assert(ByteVT == MVT::v16i8 && "SSE2 rotate lowering only needed for v16i8!");
// Default SSE2 implementation
int LoByteShift = 16 - ByteRotation;
@@ -12091,8 +12082,9 @@ static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
"Only 32-bit and 64-bit elements are supported!");
// 128/256-bit vectors are only supported with VLX.
- assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
- && "VLX required for 128/256-bit vectors");
+ assert(
+ (Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector())) &&
+ "VLX required for 128/256-bit vectors");
SDValue Lo = V1, Hi = V2;
int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
@@ -12644,8 +12636,7 @@ static SDValue lowerShuffleAsSpecificExtension(const SDLoc &DL, MVT VT,
/// are both incredibly common and often quite performance sensitive.
static SDValue lowerShuffleAsZeroOrAnyExtend(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
- const APInt &Zeroable, const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+ const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
int Bits = VT.getSizeInBits();
int NumLanes = Bits / 128;
int NumElements = VT.getVectorNumElements();
@@ -12771,7 +12762,8 @@ static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
// If the bitcasts shift the element size, we can't extract an equivalent
// element from it.
MVT NewVT = V.getSimpleValueType();
- if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
+ if (!NewVT.isVector() ||
+ NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
return SDValue();
if (V.getOpcode() == ISD::BUILD_VECTOR ||
@@ -12795,7 +12787,7 @@ static bool isShuffleFoldableLoad(SDValue V) {
ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
}
-template<typename T>
+template <typename T>
static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
T EltVT = VT.getScalarType();
return (EltVT == MVT::bf16 && !Subtarget.hasAVX10_2()) ||
@@ -12808,8 +12800,7 @@ static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
/// across all subtarget feature sets.
static SDValue lowerShuffleAsElementInsertion(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
- const APInt &Zeroable, const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+ const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
MVT ExtVT = VT;
MVT EltVT = VT.getVectorElementType();
unsigned NumElts = VT.getVectorNumElements();
@@ -12842,8 +12833,8 @@ static SDValue lowerShuffleAsElementInsertion(
// all the smarts here sunk into that routine. However, the current
// lowering of BUILD_VECTOR makes that nearly impossible until the old
// vector shuffle lowering is dead.
- SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
- DAG);
+ SDValue V2S =
+ getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(), DAG);
if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
// We need to zext the scalar if it is smaller than an i32.
V2S = DAG.getBitcast(EltVT, V2S);
@@ -13046,8 +13037,8 @@ static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
// Check that both sources are extracts of the same source vector.
if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
- N0.getOperand(0) != N1.getOperand(0) ||
- !N0.hasOneUse() || !N1.hasOneUse())
+ N0.getOperand(0) != N1.getOperand(0) || !N0.hasOneUse() ||
+ !N1.hasOneUse())
return SDValue();
SDValue WideVec = N0.getOperand(0);
@@ -13077,8 +13068,8 @@ static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
NewMask.append(NumElts, -1);
// shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
- SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
- NewMask);
+ SDValue Shuf =
+ DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT), NewMask);
// This is free: ymm -> xmm.
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
DAG.getVectorIdxConstant(0, DL));
@@ -13277,8 +13268,8 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
if (!V.getValueType().isVector()) {
assert(V.getScalarValueSizeInBits() == NumEltBits &&
"Unexpected scalar size");
- MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
- VT.getVectorNumElements());
+ MVT BroadcastVT =
+ MVT::getVectorVT(V.getSimpleValueType(), VT.getVectorNumElements());
return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
}
@@ -13303,8 +13294,8 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
// elements are zeroable.
static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
unsigned &InsertPSMask,
- const APInt &Zeroable,
- ArrayRef<int> Mask, SelectionDAG &DAG) {
+ const APInt &Zeroable, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
@@ -13756,8 +13747,8 @@ static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// when the V2 input is targeting element 0 of the mask -- that is the fast
// case here.
if (NumV2Elements == 1 && Mask[0] >= 4)
- if (SDValue V = lowerShuffleAsElementInsertion(
- DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ if (SDValue V = lowerShuffleAsElementInsertion(DL, MVT::v4f32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return V;
if (Subtarget.hasSSE41()) {
@@ -13766,8 +13757,8 @@ static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return V;
if (!isSingleSHUFPSMask(Mask))
- if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
- V2, Mask, DAG))
+ if (SDValue BlendPerm =
+ lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1, V2, Mask, DAG))
return BlendPerm;
}
@@ -13859,8 +13850,8 @@ static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// There are special ways we can lower some single-element blends.
if (NumV2Elements == 1)
- if (SDValue V = lowerShuffleAsElementInsertion(
- DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ if (SDValue V = lowerShuffleAsElementInsertion(DL, MVT::v4i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return V;
// We have different paths for blend lowering, but they all must use the
@@ -13990,7 +13981,7 @@ static SDValue lowerV8I16GeneralSingleInputShuffle(
};
if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
- int PSHUFDMask[4] = { -1, -1, -1, -1 };
+ int PSHUFDMask[4] = {-1, -1, -1, -1};
SmallVector<std::pair<int, int>, 4> DWordPairs;
int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
@@ -14094,7 +14085,8 @@ static SDValue lowerV8I16GeneralSingleInputShuffle(
int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
int TripleNonInputIdx =
- TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
+ TripleInputSum -
+ std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
TripleDWord = TripleNonInputIdx / 2;
// We use xor with one to compute the adjacent DWord to whichever one the
@@ -14172,9 +14164,9 @@ static SDValue lowerV8I16GeneralSingleInputShuffle(
// Adjust the mask to match the new locations of A and B.
for (int &M : Mask)
- if (M >= 0 && M/2 == ADWord)
+ if (M >= 0 && M / 2 == ADWord)
M = 2 * BDWord + M % 2;
- else if (M >= 0 && M/2 == BDWord)
+ else if (M >= 0 && M / 2 == BDWord)
M = 2 * ADWord + M % 2;
// Recurse back into this routine to re-compute state now that this isn't
@@ -14202,33 +14194,33 @@ static SDValue lowerV8I16GeneralSingleInputShuffle(
[&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
MutableArrayRef<int> SourceHalfMask,
MutableArrayRef<int> HalfMask, int HalfOffset) {
- if (InPlaceInputs.empty())
- return;
- if (InPlaceInputs.size() == 1) {
- SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
- InPlaceInputs[0] - HalfOffset;
- PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
- return;
- }
- if (IncomingInputs.empty()) {
- // Just fix all of the in place inputs.
- for (int Input : InPlaceInputs) {
- SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
- PSHUFDMask[Input / 2] = Input / 2;
- }
- return;
- }
+ if (InPlaceInputs.empty())
+ return;
+ if (InPlaceInputs.size() == 1) {
+ SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
+ InPlaceInputs[0] - HalfOffset;
+ PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
+ return;
+ }
+ if (IncomingInputs.empty()) {
+ // Just fix all of the in place inputs.
+ for (int Input : InPlaceInputs) {
+ SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
+ PSHUFDMask[Input / 2] = Input / 2;
+ }
+ return;
+ }
- assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
- SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
- InPlaceInputs[0] - HalfOffset;
- // Put the second input next to the first so that they are packed into
- // a dword. We find the adjacent index by toggling the low bit.
- int AdjIndex = InPlaceInputs[0] ^ 1;
- SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
- llvm::replace(HalfMask, InPlaceInputs[1], AdjIndex);
- PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
- };
+ assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
+ SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
+ InPlaceInputs[0] - HalfOffset;
+ // Put the second input next to the first so that they are packed into
+ // a dword. We find the adjacent index by toggling the low bit.
+ int AdjIndex = InPlaceInputs[0] ^ 1;
+ SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
+ llvm::replace(HalfMask, InPlaceInputs[1], AdjIndex);
+ PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
+ };
fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
@@ -14237,10 +14229,12 @@ static SDValue lowerV8I16GeneralSingleInputShuffle(
// FIXME: This operation could almost certainly be simplified dramatically to
// look more like the 3-1 fixing operation.
auto moveInputsToRightHalf = [&PSHUFDMask](
- MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
- MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
- MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
- int DestOffset) {
+ MutableArrayRef<int> IncomingInputs,
+ ArrayRef<int> ExistingInputs,
+ MutableArrayRef<int> SourceHalfMask,
+ MutableArrayRef<int> HalfMask,
+ MutableArrayRef<int> FinalSourceHalfMask,
+ int SourceOffset, int DestOffset) {
auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
};
@@ -14436,9 +14430,11 @@ static SDValue lowerV8I16GeneralSingleInputShuffle(
/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
/// blend if only one input is used.
-static SDValue lowerShuffleAsBlendOfPSHUFBs(
- const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
- const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
+static SDValue lowerShuffleAsBlendOfPSHUFBs(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ SelectionDAG &DAG, bool &V1InUse,
+ bool &V2InUse) {
assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
"Lane crossing shuffle masks not supported");
@@ -14533,8 +14529,8 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Broadcast;
// Try to use bit rotation instructions.
- if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
- Subtarget, DAG))
+ if (SDValue Rotate =
+ lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask, Subtarget, DAG))
return Rotate;
// Use dedicated unpack instructions for masks that match their pattern.
@@ -14569,14 +14565,14 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// See if we can use SSE4A Extraction / Insertion.
if (Subtarget.hasSSE4A())
- if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
- Zeroable, DAG))
+ if (SDValue V =
+ lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, Zeroable, DAG))
return V;
// There are special ways we can lower some single-element blends.
if (NumV2Inputs == 1)
- if (SDValue V = lowerShuffleAsElementInsertion(
- DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ if (SDValue V = lowerShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return V;
// We have different paths for blend lowering, but they all must use the
@@ -14692,8 +14688,8 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// can both shuffle and set up the inefficient blend.
if (!IsBlendSupported && Subtarget.hasSSSE3()) {
bool V1InUse, V2InUse;
- return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
- Zeroable, DAG, V1InUse, V2InUse);
+ return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
+ DAG, V1InUse, V2InUse);
}
// We can always bit-blend if we have to so the fallback strategy is to
@@ -14826,8 +14822,8 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// See if we can use SSE4A Extraction / Insertion.
if (Subtarget.hasSSE4A())
- if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
- Zeroable, DAG))
+ if (SDValue V =
+ lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG))
return V;
int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
@@ -14840,8 +14836,8 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Broadcast;
// Try to use bit rotation instructions.
- if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
- Subtarget, DAG))
+ if (SDValue Rotate =
+ lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask, Subtarget, DAG))
return Rotate;
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
@@ -14882,7 +14878,7 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
SmallDenseMap<int, int, 8> LaneMap;
for (int I : InPlaceInputs) {
- PreDupI16Shuffle[I/2] = I/2;
+ PreDupI16Shuffle[I / 2] = I / 2;
LaneMap[I] = I;
}
int j = TargetLo ? 0 : 4, je = j + 4;
@@ -14896,7 +14892,8 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
++j;
if (j == je)
- // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
+ // We can't place the inputs into a single half with a simple i16
+ // shuffle, so bail.
return SDValue();
// Map this input with the i16 shuffle.
@@ -15017,8 +15014,8 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Use PALIGNR+Permute if possible - permute might become PSHUFB but the
// PALIGNR will be cheaper than the second PSHUFB+OR.
- if (SDValue V = lowerShuffleAsByteRotateAndPermute(
- DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
+ if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v16i8, V1, V2,
+ Mask, Subtarget, DAG))
return V;
}
@@ -15027,8 +15024,8 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// There are special ways we can lower some single-element blends.
if (NumV2Elements == 1)
- if (SDValue V = lowerShuffleAsElementInsertion(
- DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ if (SDValue V = lowerShuffleAsElementInsertion(DL, MVT::v16i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return V;
if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
@@ -15120,8 +15117,8 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (M >= 0)
M /= 2;
} else {
- // Otherwise just unpack the low half of V into VLoHalf and the high half into
- // VHiHalf so that we can blend them as i16s.
+ // Otherwise just unpack the low half of V into VLoHalf and the high half
+ // into VHiHalf so that we can blend them as i16s.
SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
VLoHalf = DAG.getBitcast(
@@ -15130,8 +15127,10 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
}
- SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
- SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
+ SDValue LoV =
+ DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
+ SDValue HiV =
+ DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
}
@@ -15140,9 +15139,8 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
///
/// This routine breaks down the specific type of 128-bit shuffle and
/// dispatches to the lowering routines accordingly.
-static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- MVT VT, SDValue V1, SDValue V2,
- const APInt &Zeroable,
+static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
+ SDValue V1, SDValue V2, const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
if (VT == MVT::v8bf16) {
@@ -15324,7 +15322,7 @@ static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(!V2.isUndef() && "This routine must not be used to lower single-input "
- "shuffles as it could then recurse on itself.");
+ "shuffles as it could then recurse on itself.");
int Size = Mask.size();
// If this can be modeled as a broadcast of two elements followed by a blend,
@@ -15663,8 +15661,8 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
// instruction bytes needed to explicitly generate the zero vector.
// Blends are faster and handle all the non-lane-crossing cases.
- if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
- Subtarget, DAG))
+ if (SDValue Blend =
+ lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
return Blend;
// If either input operand is a zero vector, use VPERM2X128 because its mask
@@ -15690,8 +15688,8 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
// Try to use SHUF128 if possible.
if (Subtarget.hasVLX()) {
if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
- unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
- ((WidenedMask[1] % 2) << 1);
+ unsigned PermMask =
+ ((WidenedMask[0] % 2) << 0) | ((WidenedMask[1] % 2) << 1);
return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
DAG.getTargetConstant(PermMask, DL, MVT::i8));
}
@@ -15715,7 +15713,7 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
(WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
unsigned PermMask = 0;
- PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
+ PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
// Check the immediate mask and replace unused sources with undef.
@@ -15907,9 +15905,9 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
/// adjusted to access the extracted halves of the original shuffle operands is
/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
/// lower half of each input operand is accessed.
-static bool
-getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
- int &HalfIdx1, int &HalfIdx2) {
+static bool getHalfShuffleMask(ArrayRef<int> Mask,
+ MutableArrayRef<int> HalfMask, int &HalfIdx1,
+ int &HalfIdx2) {
assert((Mask.size() == HalfMask.size() * 2) &&
"Expected input mask to be twice as long as output");
@@ -15962,7 +15960,8 @@ getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
ArrayRef<int> HalfMask, int HalfIdx1,
int HalfIdx2, bool UndefLower,
- SelectionDAG &DAG, bool UseConcat = false) {
+ SelectionDAG &DAG,
+ bool UseConcat = false) {
assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
assert(V1.getValueType().isSimple() && "Expecting only simple types");
@@ -16324,7 +16323,7 @@ static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
"Illegal shuffle mask");
- bool ZeroLane[2] = { true, true };
+ bool ZeroLane[2] = {true, true};
for (int i = 0; i < NumElts; ++i)
ZeroLane[i & 1] &= Zeroable[i];
@@ -16409,9 +16408,9 @@ static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
// The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
// the upper bits of the result using an unpckldq.
- SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
- { 0, 1, 2, 3, 16, 17, 18, 19,
- 4, 5, 6, 7, 20, 21, 22, 23 });
+ SDValue Unpack = DAG.getVectorShuffle(
+ MVT::v16i8, DL, V1, V2,
+ {0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23});
// Insert the unpckldq into a zero vector to widen to v32i8.
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
DAG.getConstant(0, DL, MVT::v32i8), Unpack,
@@ -16648,8 +16647,8 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Blend;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
- Subtarget, DAG))
+ if (SDValue Broadcast =
+ lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
return Broadcast;
// Try to use shift instructions if fast.
@@ -16756,8 +16755,8 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Blend;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
- Subtarget, DAG))
+ if (SDValue Broadcast =
+ lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
return Broadcast;
if (!Subtarget.hasAVX2()) {
@@ -16904,8 +16903,8 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Blend;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
- Subtarget, DAG))
+ if (SDValue Broadcast =
+ lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
return Broadcast;
// Try to use shift instructions if fast.
@@ -17072,7 +17071,8 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Try to produce a fixed cross-128-bit lane permute followed by unpack
// because that should be faster than the variable permute alternatives.
- if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, V1, V2, Mask, DAG))
+ if (SDValue V =
+ lowerShuffleWithUNPCK256(DL, MVT::v16i16, V1, V2, Mask, DAG))
return V;
// There are no generalized cross-lane shuffle operations available on i16
@@ -17091,8 +17091,8 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// As this is a single-input shuffle, the repeated mask should be
// a strictly valid v8i16 mask that we can pass through to the v8i16
// lowering to handle even the v16 case.
- return lowerV8I16GeneralSingleInputShuffle(
- DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
+ return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v16i16, V1,
+ RepeatedMask, Subtarget, DAG);
}
}
@@ -17111,8 +17111,8 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Result;
// Try to permute the lanes and then use a per-lane permute.
- if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
- DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
+ if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v16i16, V1, V2,
+ Mask, DAG, Subtarget))
return V;
// Try to match an interleave of two v16i16s and lower them as unpck and
@@ -17148,8 +17148,8 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return ZExt;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
- Subtarget, DAG))
+ if (SDValue Broadcast =
+ lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
return Broadcast;
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
@@ -17201,8 +17201,8 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, V1, V2, Mask, DAG))
return V;
- if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
- DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
+ if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v32i8, V1, V2,
+ Mask, DAG, Subtarget))
return V;
return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
@@ -17224,16 +17224,16 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Result;
// Try to permute the lanes and then use a per-lane permute.
- if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
- DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
+ if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v32i8, V1, V2,
+ Mask, DAG, Subtarget))
return V;
// Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
// by zeroable elements in the remaining 24 elements. Turn this into two
// vmovqb instructions shuffled together.
if (Subtarget.hasVLX())
- if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
- Mask, Zeroable, DAG))
+ if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2, Mask,
+ Zeroable, DAG))
return V;
// Try to match an interleave of two v32i8s and lower them as unpck and
@@ -17288,7 +17288,8 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
return V;
if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
return V;
- return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
+ /*SimpleOnly*/ false);
}
MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
@@ -17537,8 +17538,7 @@ static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// If we have a single input shuffle with different shuffle patterns in the
// 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
- if (V2.isUndef() &&
- !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
+ if (V2.isUndef() && !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
}
@@ -17805,8 +17805,8 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Whenever we can lower this as a zext, that instruction is strictly faster
// than any alternative. It also allows us to fold memory operands into the
// shuffle in many cases.
- if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
- DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v64i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return ZExt;
// Use dedicated unpack instructions for masks that match their pattern.
@@ -17883,7 +17883,8 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (Subtarget.hasVBMI())
return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
- return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
+ return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG,
+ /*SimpleOnly*/ false);
}
/// High-level routine to lower various 512-bit x86 vector shuffles.
@@ -17891,13 +17892,11 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// This routine either breaks down the specific type of a 512-bit x86 vector
/// shuffle or splits it into two 256-bit shuffles and fuses the results back
/// together based on the available instructions.
-static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- MVT VT, SDValue V1, SDValue V2,
- const APInt &Zeroable,
+static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
+ SDValue V1, SDValue V2, const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- assert(Subtarget.hasAVX512() &&
- "Cannot lower 512-bit vectors w/ basic ISA!");
+ assert(Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/ basic ISA!");
// If we have a single input to the zero element, insert that into V1 if we
// can do so cheaply.
@@ -17915,8 +17914,8 @@ static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return V;
// Check for being able to broadcast a single element.
- if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
- Subtarget, DAG))
+ if (SDValue Broadcast =
+ lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
return Broadcast;
if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
@@ -17928,7 +17927,8 @@ static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
return V;
- return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
+ /*SimpleOnly*/ false);
}
if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
@@ -18035,14 +18035,12 @@ static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
return -1;
}
-
// Lower vXi1 vector shuffles.
// There is no a dedicated instruction on AVX-512 that shuffles the masks.
// The only way to shuffle bits is to sign-extend the mask vector to SIMD
// vector, shuffle and then truncate it back.
-static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- MVT VT, SDValue V1, SDValue V2,
- const APInt &Zeroable,
+static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
+ SDValue V1, SDValue V2, const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Subtarget.hasAVX512() &&
@@ -18173,8 +18171,8 @@ static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
int NumElems = VT.getVectorNumElements();
if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
(Subtarget.hasDQI() && (NumElems < 32)))
- return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
- Shuffle, ISD::SETGT);
+ return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT), Shuffle,
+ ISD::SETGT);
return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
}
@@ -18301,7 +18299,7 @@ static SDValue canonicalizeShuffleMaskWithHorizOp(
unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget);
- /// Top-level lowering for x86 vector shuffles.
+/// Top-level lowering for x86 vector shuffles.
///
/// This handles decomposition, canonicalization, and lowering of all x86
/// vector shuffles. Most of the specific lowering strategies are encapsulated
@@ -18377,8 +18375,8 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
// by obfuscating the operands with bitcasts.
// TODO: Avoid lowering directly from this top-level function: make this
// a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
- if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
- Subtarget, DAG))
+ if (SDValue Broadcast =
+ lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask, Subtarget, DAG))
return Broadcast;
MVT NewEltVT = VT.isFloatingPoint()
@@ -18601,8 +18599,7 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
// Build a mask by testing the condition against zero.
MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
- DAG.getConstant(0, dl, CondVT),
- ISD::SETNE);
+ DAG.getConstant(0, dl, CondVT), ISD::SETNE);
// Now return a new VSELECT using the mask.
return DAG.getSelect(dl, VT, Mask, LHS, RHS);
}
@@ -18709,7 +18706,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
}
if (VT == MVT::i32 || VT == MVT::i64)
- return Op;
+ return Op;
return SDValue();
}
@@ -18722,7 +18719,7 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
SDLoc dl(Vec);
MVT VecVT = Vec.getSimpleValueType();
SDValue Idx = Op.getOperand(1);
- auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
+ auto *IdxC = dyn_cast<ConstantSDNode>(Idx);
MVT EltVT = Op.getSimpleValueType();
assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
@@ -18737,7 +18734,8 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
if (NumElts == 1) {
Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
MVT IntVT = MVT::getIntegerVT(Vec.getValueType().getVectorNumElements());
- return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getBitcast(IntVT, Vec));
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
+ DAG.getBitcast(IntVT, Vec));
}
MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
@@ -18795,14 +18793,13 @@ static APInt getExtractedDemandedElts(SDNode *N) {
return DemandedElts;
}
-SDValue
-X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
+ SelectionDAG &DAG) const {
SDLoc dl(Op);
SDValue Vec = Op.getOperand(0);
MVT VecVT = Vec.getSimpleValueType();
SDValue Idx = Op.getOperand(1);
- auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
+ auto *IdxC = dyn_cast<ConstantSDNode>(Idx);
if (VecVT.getVectorElementType() == MVT::i1)
return ExtractBitFromMaskVector(Op, DAG, Subtarget);
@@ -18833,10 +18830,10 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
// | | Ports pressure in cycles | |
// |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
// ---------------------------------------------------------
- // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
- // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
- // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
- // Total Num Of Uops: 4
+ // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18],
+ // xmm0 |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18] |1
+ // | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1] Total Num
+ // Of Uops: 4
return SDValue();
}
@@ -18941,7 +18938,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
// UNPCKHPD the element to the lowest double word, then movsd.
// Note if the lower 64 bits of the result of the UNPCKHPD is then stored
// to a f64mem, the whole operation is folded into a single MOVHPDmr.
- int Mask[2] = { 1, -1 };
+ int Mask[2] = {1, -1};
Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
DAG.getVectorIdxConstant(0, dl));
@@ -18966,9 +18963,10 @@ static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
unsigned NumElts = VecVT.getVectorNumElements();
MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
- SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
- DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
- DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
+ SDValue ExtOp =
+ DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
+ DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
+ DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
}
@@ -18995,9 +18993,9 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
if (EltVT == MVT::bf16) {
MVT IVT = VT.changeVectorElementTypeToInteger();
- SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT,
- DAG.getBitcast(IVT, N0),
- DAG.getBitcast(MVT::i16, N1), N2);
+ SDValue Res =
+ DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT, DAG.getBitcast(IVT, N0),
+ DAG.getBitcast(MVT::i16, N1), N2);
return DAG.getBitcast(VT, Res);
}
@@ -19258,8 +19256,9 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
}
// Returns the appropriate wrapper opcode for a global reference.
-unsigned X86TargetLowering::getGlobalWrapperKind(
- const GlobalValue *GV, const unsigned char OpFlags) const {
+unsigned
+X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV,
+ const unsigned char OpFlags) const {
// References to absolute symbols are never PC-relative.
if (GV && GV->isAbsoluteSymbolRef())
return X86ISD::Wrapper;
@@ -19283,8 +19282,8 @@ unsigned X86TargetLowering::getGlobalWrapperKind(
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing mode. These wrapped nodes will be selected
// into MOV32ri.
-SDValue
-X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerConstantPool(SDValue Op,
+ SelectionDAG &DAG) const {
ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
@@ -19334,11 +19333,10 @@ SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
}
-SDValue
-X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerBlockAddress(SDValue Op,
+ SelectionDAG &DAG) const {
// Create the TargetBlockAddressAddress node.
- unsigned char OpFlags =
- Subtarget.classifyBlockAddressReference();
+ unsigned char OpFlags = Subtarget.classifyBlockAddressReference();
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
SDLoc dl(Op);
@@ -19443,8 +19441,8 @@ SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
return Result;
}
-SDValue
-X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerGlobalAddress(SDValue Op,
+ SelectionDAG &DAG) const {
return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
}
@@ -19522,24 +19520,24 @@ static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA,
}
// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
-static SDValue
-LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
- const EVT PtrVT) {
+static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA,
+ SelectionDAG &DAG,
+ const EVT PtrVT) {
return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD,
/*LoadGlobalBaseReg=*/true);
}
// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
-static SDValue
-LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
- const EVT PtrVT) {
+static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA,
+ SelectionDAG &DAG,
+ const EVT PtrVT) {
return GetTLSADDR(DAG, GA, PtrVT, X86::RAX, X86II::MO_TLSGD);
}
// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
-static SDValue
-LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
- const EVT PtrVT) {
+static SDValue LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA,
+ SelectionDAG &DAG,
+ const EVT PtrVT) {
return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD);
}
@@ -19571,9 +19569,8 @@ static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
// Build x@dtpoff.
unsigned char OperandFlags = X86II::MO_DTPOFF;
unsigned WrapperKind = X86ISD::Wrapper;
- SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
- GA->getValueType(0),
- GA->getOffset(), OperandFlags);
+ SDValue TGA = DAG.getTargetGlobalAddress(
+ GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags);
SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
// Add x@dtpoff with the base.
@@ -19614,9 +19611,8 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
// emit "addl x at ntpoff,%eax" (local exec)
// or "addl x at indntpoff,%eax" (initial exec)
// or "addl x at gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
- SDValue TGA =
- DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
- GA->getOffset(), OperandFlags);
+ SDValue TGA = DAG.getTargetGlobalAddress(
+ GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags);
SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
if (model == TLSModel::InitialExec) {
@@ -19635,8 +19631,8 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}
-SDValue
-X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerGlobalTLSAddress(SDValue Op,
+ SelectionDAG &DAG) const {
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
@@ -19650,20 +19646,20 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
if (Subtarget.isTargetELF()) {
TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
switch (model) {
- case TLSModel::GeneralDynamic:
- if (Subtarget.is64Bit()) {
- if (Subtarget.isTarget64BitLP64())
- return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
- return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
- }
- return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
- case TLSModel::LocalDynamic:
- return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
- Subtarget.isTarget64BitLP64());
- case TLSModel::InitialExec:
- case TLSModel::LocalExec:
- return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
- PositionIndependent);
+ case TLSModel::GeneralDynamic:
+ if (Subtarget.is64Bit()) {
+ if (Subtarget.isTarget64BitLP64())
+ return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
+ return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
+ }
+ return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
+ case TLSModel::LocalDynamic:
+ return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
+ Subtarget.isTarget64BitLP64());
+ case TLSModel::InitialExec:
+ case TLSModel::LocalExec:
+ return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
+ PositionIndependent);
}
llvm_unreachable("Unknown TLS model.");
}
@@ -19684,9 +19680,8 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
WrapperKind = X86ISD::WrapperRIP;
}
SDLoc DL(Op);
- SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
- GA->getValueType(0),
- GA->getOffset(), OpFlag);
+ SDValue Result = DAG.getTargetGlobalAddress(
+ GA->getGlobal(), DL, GA->getValueType(0), GA->getOffset(), OpFlag);
SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
// With PIC32, the address is actually $g + Offset.
@@ -19700,7 +19695,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = DAG.getEntryNode();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
- SDValue Args[] = { Chain, Offset };
+ SDValue Args[] = {Chain, Offset};
Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
@@ -19768,9 +19763,9 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
// Get the offset of start of .tls section
- SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
- GA->getValueType(0),
- GA->getOffset(), X86II::MO_SECREL);
+ SDValue TGA =
+ DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
+ GA->getOffset(), X86II::MO_SECREL);
SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
// The address of the thread local variable is the add of the thread
@@ -19830,8 +19825,8 @@ static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl,
MVT SrcVT = Src.getSimpleValueType();
MVT VT = Op.getSimpleValueType();
- if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
- (VT != MVT::f32 && VT != MVT::f64))
+ if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
+ (VT != MVT::f32 && VT != MVT::f64))
return SDValue();
// Pack the i64 into a vector, do the operation and extract.
@@ -19896,22 +19891,22 @@ static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
const X86Subtarget &Subtarget) {
switch (Opcode) {
- case ISD::SINT_TO_FP:
- // TODO: Handle wider types with AVX/AVX512.
- if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
- return false;
- // CVTDQ2PS or (V)CVTDQ2PD
- return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
-
- case ISD::UINT_TO_FP:
- // TODO: Handle wider types and i64 elements.
- if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
- return false;
- // VCVTUDQ2PS or VCVTUDQ2PD
- return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
+ case ISD::SINT_TO_FP:
+ // TODO: Handle wider types with AVX/AVX512.
+ if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
+ return false;
+ // CVTDQ2PS or (V)CVTDQ2PD
+ return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
- default:
+ case ISD::UINT_TO_FP:
+ // TODO: Handle wider types and i64 elements.
+ if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
return false;
+ // VCVTUDQ2PS or VCVTUDQ2PD
+ return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
+
+ default:
+ return false;
}
}
@@ -20055,7 +20050,7 @@ static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL,
return SDValue();
SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
- SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
+ SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
@@ -20275,7 +20270,7 @@ std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
Chain = Result.getValue(1);
}
- return { Result, Chain };
+ return {Result, Chain};
}
/// Horizontal vector math instructions may be slower than normal math with
@@ -20312,18 +20307,18 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl,
LLVMContext *Context = DAG.getContext();
// Build some magic constants.
- static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
+ static const uint32_t CV0[] = {0x43300000, 0x45300000, 0, 0};
Constant *C0 = ConstantDataVector::get(*Context, CV0);
auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
- SmallVector<Constant*,2> CV1;
+ SmallVector<Constant *, 2> CV1;
CV1.push_back(
- ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
- APInt(64, 0x4330000000000000ULL))));
+ ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
+ APInt(64, 0x4330000000000000ULL))));
CV1.push_back(
- ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
- APInt(64, 0x4530000000000000ULL))));
+ ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
+ APInt(64, 0x4530000000000000ULL))));
Constant *C1 = ConstantVector::get(CV1);
SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
@@ -20344,11 +20339,10 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl,
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
SDValue Result;
- if (Subtarget.hasSSE3() &&
- shouldUseHorizontalOp(true, DAG, Subtarget)) {
+ if (Subtarget.hasSSE3() && shouldUseHorizontalOp(true, DAG, Subtarget)) {
Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
} else {
- SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
+ SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1, -1});
Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
}
Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
@@ -20374,8 +20368,7 @@ static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl,
// Or the load with the bias.
SDValue Or = DAG.getNode(
- ISD::OR, dl, MVT::v2i64,
- DAG.getBitcast(MVT::v2i64, Load),
+ ISD::OR, dl, MVT::v2i64, DAG.getBitcast(MVT::v2i64, Load),
DAG.getBitcast(MVT::v2i64,
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
@@ -20579,8 +20572,9 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
// Low will be bitcasted right away, so do not bother bitcasting back to its
// original type.
- Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
- VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
+ Low =
+ DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast, VecCstLowBitcast,
+ DAG.getTargetConstant(0xaa, DL, MVT::i8));
// uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
// (uint4) 0x53000000, 0xaa);
SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
@@ -20588,7 +20582,8 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
// High will be bitcasted right away, so do not bother bitcasting back to
// its original type.
High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
- VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
+ VecCstHighBitcast,
+ DAG.getTargetConstant(0xaa, DL, MVT::i8));
} else {
SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
// uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
@@ -20624,7 +20619,8 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
}
-static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
+static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl,
+ SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
SDValue N0 = Op.getOperand(OpNo);
@@ -20835,8 +20831,7 @@ SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
DstTy = MVT::i64;
}
- assert(DstTy.getSimpleVT() <= MVT::i64 &&
- DstTy.getSimpleVT() >= MVT::i16 &&
+ assert(DstTy.getSimpleVT() <= MVT::i64 && DstTy.getSimpleVT() >= MVT::i16 &&
"Unknown FP_TO_INT to lower!");
// We lower FP->int64 into FISTP64 followed by a load from a temporary
@@ -20874,8 +20869,8 @@ SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
bool LosesInfo = false;
if (TheVT == MVT::f64)
// The rounding mode is irrelevant as the conversion should be exact.
- Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
- &LosesInfo);
+ Status = Thresh.convert(APFloat::IEEEdouble(),
+ APFloat::rmNearestTiesToEven, &LosesInfo);
else if (TheVT == MVT::f80)
Status = Thresh.convert(APFloat::x87DoubleExtended(),
APFloat::rmNearestTiesToEven, &LosesInfo);
@@ -20885,8 +20880,8 @@ SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
- EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
- *DAG.getContext(), TheVT);
+ EVT ResVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), TheVT);
SDValue Cmp;
if (IsStrict) {
Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
@@ -20915,8 +20910,8 @@ SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
DAG.getConstantFP(0.0, DL, TheVT));
if (IsStrict) {
- Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
- { Chain, Value, FltOfs });
+ Value = DAG.getNode(ISD::STRICT_FSUB, DL, {TheVT, MVT::Other},
+ {Chain, Value, FltOfs});
Chain = Value.getValue(1);
} else
Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
@@ -20930,7 +20925,7 @@ SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
- SDValue Ops[] = { Chain, StackSlot };
+ SDValue Ops[] = {Chain, StackSlot};
unsigned FLDSize = TheVT.getStoreSize();
assert(FLDSize <= MemSize && "Stack slot not big enough");
@@ -20943,10 +20938,9 @@ SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
// Build the FP_TO_INT*_IN_MEM
MachineMemOperand *MMO = MF.getMachineMemOperand(
MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
- SDValue Ops[] = { Chain, Value, StackSlot };
- SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
- DAG.getVTList(MVT::Other),
- Ops, DstTy, MMO);
+ SDValue Ops[] = {Chain, Value, StackSlot};
+ SDValue FIST = DAG.getMemIntrinsicNode(
+ X86ISD::FP_TO_INT_IN_MEM, DL, DAG.getVTList(MVT::Other), Ops, DstTy, MMO);
SDValue Res = DAG.getLoad(Op.getValueType(), DL, FIST, StackSlot, MPI);
Chain = Res.getValue(1);
@@ -21125,7 +21119,7 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
return In;
unsigned NumElems = SrcVT.getVectorNumElements();
- if (NumElems < 2 || !isPowerOf2_32(NumElems) )
+ if (NumElems < 2 || !isPowerOf2_32(NumElems))
return SDValue();
unsigned DstSizeInBits = DstVT.getSizeInBits();
@@ -21196,7 +21190,7 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
// Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
SmallVector<int, 64> Mask;
int Scale = 64 / OutVT.getScalarSizeInBits();
- narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
+ narrowShuffleMaskElts(Scale, {0, 2, 1, 3}, Mask);
Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
if (DstVT.is256BitVector())
@@ -21440,14 +21434,12 @@ static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL,
if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
// We need to shift to get the lsb into sign position.
// Shift packed bytes not supported natively, bitcast to word
- MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
- In = DAG.getNode(ISD::SHL, DL, ExtVT,
- DAG.getBitcast(ExtVT, In),
+ MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits() / 16);
+ In = DAG.getNode(ISD::SHL, DL, ExtVT, DAG.getBitcast(ExtVT, In),
DAG.getConstant(ShiftInx, DL, ExtVT));
In = DAG.getBitcast(InVT, In);
}
- return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
- In, ISD::SETGT);
+ return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
}
// Use TESTD/Q, extended vector to packed dword/qword.
assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
@@ -21485,7 +21477,8 @@ static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL,
// We either have 8 elements or we're allowed to use 512-bit vectors.
// If we have VLX, we want to use the narrowest vector that can get the
// job done so we use vXi32.
- MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
+ MVT EltVT =
+ Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512 / NumElts);
MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
InVT = ExtVT;
@@ -21599,10 +21592,9 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
// On AVX2, v8i32 -> v8i16 becomes PSHUFB.
if (Subtarget.hasInt256()) {
// The PSHUFB mask:
- static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
- -1, -1, -1, -1, -1, -1, -1, -1,
- 16, 17, 20, 21, 24, 25, 28, 29,
- -1, -1, -1, -1, -1, -1, -1, -1 };
+ static const int ShufMask1[] = {
+ 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1,
+ 16, 17, 20, 21, 24, 25, 28, 29, -1, -1, -1, -1, -1, -1, -1, -1};
In = DAG.getBitcast(MVT::v32i8, In);
In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
In = DAG.getBitcast(MVT::v4i64, In);
@@ -21793,8 +21785,8 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
dl, {NVT, MVT::Other}, {Chain, Src});
Chain = Res.getValue(1);
} else {
- Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
- NVT, Src);
+ Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl, NVT,
+ Src);
}
// TODO: Need to add exception check code for strict FP.
@@ -21896,8 +21888,8 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
DAG.getUNDEF(MVT::v2f32));
if (IsStrict) {
- unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
- : X86ISD::STRICT_CVTTP2UI;
+ unsigned Opc =
+ IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
}
unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
@@ -22022,7 +22014,7 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
makeLibCall(DAG, LC, VT, Src, CallOptions, dl, Chain);
if (IsStrict)
- return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
+ return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
return Tmp.first;
}
@@ -22085,7 +22077,7 @@ SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
- SDValue Ops[] = { Chain, StackPtr };
+ SDValue Ops[] = {Chain, StackPtr};
Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
/*Align*/ std::nullopt,
@@ -22093,7 +22085,7 @@ SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
Chain = Src.getValue(1);
}
- SDValue StoreOps[] = { Chain, Src, StackPtr };
+ SDValue StoreOps[] = {Chain, Src, StackPtr};
Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
MachineMemOperand::MOStore);
@@ -22101,8 +22093,8 @@ SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
}
-SDValue
-X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
+ SelectionDAG &DAG) const {
// This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
// but making use of X86 specifics to produce better instruction sequences.
SDNode *Node = Op.getNode();
@@ -22164,12 +22156,12 @@ X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
APFloat MinFloat(Sem);
APFloat MaxFloat(Sem);
- APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
- MinInt, IsSigned, APFloat::rmTowardZero);
- APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
- MaxInt, IsSigned, APFloat::rmTowardZero);
- bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
- && !(MaxStatus & APFloat::opStatus::opInexact);
+ APFloat::opStatus MinStatus =
+ MinFloat.convertFromAPInt(MinInt, IsSigned, APFloat::rmTowardZero);
+ APFloat::opStatus MaxStatus =
+ MaxFloat.convertFromAPInt(MaxInt, IsSigned, APFloat::rmTowardZero);
+ bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact) &&
+ !(MaxStatus & APFloat::opStatus::opInexact);
SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
@@ -22179,11 +22171,11 @@ X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
if (AreExactFloatBounds) {
if (DstVT != TmpVT) {
// Clamp by MinFloat from below. If Src is NaN, propagate NaN.
- SDValue MinClamped = DAG.getNode(
- X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
+ SDValue MinClamped =
+ DAG.getNode(X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
// Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
- SDValue BothClamped = DAG.getNode(
- X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
+ SDValue BothClamped =
+ DAG.getNode(X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
// Convert clamped value to integer.
SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
@@ -22193,11 +22185,11 @@ X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
}
// Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
- SDValue MinClamped = DAG.getNode(
- X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
+ SDValue MinClamped =
+ DAG.getNode(X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
// Clamp by MaxFloat from above. NaN cannot occur.
- SDValue BothClamped = DAG.getNode(
- X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
+ SDValue BothClamped =
+ DAG.getNode(X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
// Convert clamped value to integer.
SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
@@ -22209,8 +22201,8 @@ X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
// Otherwise, select zero if Src is NaN.
SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
- return DAG.getSelectCC(
- dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
+ return DAG.getSelectCC(dl, Src, Src, ZeroInt, FpToInt,
+ ISD::CondCode::SETUO);
}
SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
@@ -22232,13 +22224,13 @@ X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
// If Src ULT MinFloat, select MinInt. In particular, this also selects
// MinInt if Src is NaN.
- Select = DAG.getSelectCC(
- dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
+ Select = DAG.getSelectCC(dl, Src, MinFloatNode, MinIntNode, Select,
+ ISD::CondCode::SETULT);
}
// If Src OGT MaxFloat, select MaxInt.
- Select = DAG.getSelectCC(
- dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
+ Select = DAG.getSelectCC(dl, Src, MaxFloatNode, MaxIntNode, Select,
+ ISD::CondCode::SETOGT);
// In the unsigned case we are done, because we mapped NaN to MinInt, which
// is already zero. The promoted case was already handled above.
@@ -22248,8 +22240,7 @@ X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
// Otherwise, select 0 if Src is NaN.
SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
- return DAG.getSelectCC(
- dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
+ return DAG.getSelectCC(dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
}
SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
@@ -22304,15 +22295,15 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
Entry.IsZExt = true;
Args.push_back(Entry);
- SDValue Callee = DAG.getExternalSymbol(
- getLibcallName(RTLIB::FPEXT_F16_F32),
- getPointerTy(DAG.getDataLayout()));
+ SDValue Callee =
+ DAG.getExternalSymbol(getLibcallName(RTLIB::FPEXT_F16_F32),
+ getPointerTy(DAG.getDataLayout()));
CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
std::move(Args));
SDValue Res;
- std::tie(Res,Chain) = LowerCallTo(CLI);
+ std::tie(Res, Chain) = LowerCallTo(CLI);
if (IsStrict)
Res = DAG.getMergeValues({Res, Chain}, DL);
@@ -22579,14 +22570,14 @@ static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL,
// TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
unsigned HOpcode;
switch (Op.getOpcode()) {
- // clang-format off
+ // clang-format off
case ISD::ADD: HOpcode = X86ISD::HADD; break;
case ISD::SUB: HOpcode = X86ISD::HSUB; break;
case ISD::FADD: HOpcode = X86ISD::FHADD; break;
case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
default:
llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
- // clang-format on
+ // clang-format on
}
unsigned LExtIndex = LHS.getConstantOperandVal(1);
unsigned RExtIndex = RHS.getConstantOperandVal(1);
@@ -22644,7 +22635,7 @@ static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
bool Ignored;
APFloat Point5Pred = APFloat(0.5f);
Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
- Point5Pred.next(/*nextDown*/true);
+ Point5Pred.next(/*nextDown*/ true);
SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
DAG.getConstantFP(Point5Pred, dl, VT), N0);
@@ -22694,16 +22685,16 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
unsigned EltBits = VT.getScalarSizeInBits();
// For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
- APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
- APInt::getSignMask(EltBits);
+ APInt MaskElt =
+ IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
const fltSemantics &Sem = VT.getFltSemantics();
SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
SDValue Op0 = Op.getOperand(0);
bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
- unsigned LogicOp = IsFABS ? X86ISD::FAND :
- IsFNABS ? X86ISD::FOR :
- X86ISD::FXOR;
+ unsigned LogicOp = IsFABS ? X86ISD::FAND
+ : IsFNABS ? X86ISD::FOR
+ : X86ISD::FXOR;
SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
if (VT.isVector() || IsF128)
@@ -22806,7 +22797,8 @@ static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
}
/// Helper for attempting to create a X86ISD::BT node.
-static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
+static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL,
+ SelectionDAG &DAG) {
// If Src is i8, promote it to i32 with any_extend. There is no i8 BT
// instruction. Since the shift amount is in-range-or-undefined, we know
// that doing a bittest on the i32 value is ok. We extend to i32 because
@@ -23422,8 +23414,7 @@ static bool hasNonFlagsUse(SDValue Op) {
// the node alone and emit a 'cmp' or 'test' instruction.
static bool isProfitableToUseFlagOp(SDValue Op) {
for (SDNode *U : Op->users())
- if (U->getOpcode() != ISD::CopyToReg &&
- U->getOpcode() != ISD::SETCC &&
+ if (U->getOpcode() != ISD::CopyToReg && U->getOpcode() != ISD::SETCC &&
U->getOpcode() != ISD::STORE)
return false;
@@ -23439,14 +23430,20 @@ static SDValue EmitTest(SDValue Op, X86::CondCode X86CC, const SDLoc &dl,
bool NeedCF = false;
bool NeedOF = false;
switch (X86CC) {
- default: break;
- case X86::COND_A: case X86::COND_AE:
- case X86::COND_B: case X86::COND_BE:
+ default:
+ break;
+ case X86::COND_A:
+ case X86::COND_AE:
+ case X86::COND_B:
+ case X86::COND_BE:
NeedCF = true;
break;
- case X86::COND_G: case X86::COND_GE:
- case X86::COND_L: case X86::COND_LE:
- case X86::COND_O: case X86::COND_NO: {
+ case X86::COND_G:
+ case X86::COND_GE:
+ case X86::COND_L:
+ case X86::COND_LE:
+ case X86::COND_O:
+ case X86::COND_NO: {
// Check if we really need to set the
// Overflow flag. If NoSignedWrap is present
// that is not actually needed.
@@ -23498,14 +23495,14 @@ static SDValue EmitTest(SDValue Op, X86::CondCode X86CC, const SDLoc &dl,
// Otherwise use a regular EFLAGS-setting instruction.
switch (ArithOp.getOpcode()) {
- // clang-format off
+ // clang-format off
default: llvm_unreachable("unexpected operator!");
case ISD::ADD: Opcode = X86ISD::ADD; break;
case ISD::SUB: Opcode = X86ISD::SUB; break;
case ISD::XOR: Opcode = X86ISD::XOR; break;
case ISD::AND: Opcode = X86ISD::AND; break;
case ISD::OR: Opcode = X86ISD::OR; break;
- // clang-format on
+ // clang-format on
}
NumOperands = 2;
@@ -23520,8 +23517,9 @@ static SDValue EmitTest(SDValue Op, X86::CondCode X86CC, const SDLoc &dl,
case ISD::USUBO: {
// /USUBO/SSUBO will become a X86ISD::SUB and we can use its Z flag.
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
- return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
- Op->getOperand(1)).getValue(1);
+ return DAG
+ .getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0), Op->getOperand(1))
+ .getValue(1);
}
default:
break;
@@ -23550,8 +23548,9 @@ static SDValue EmitCmp(SDValue Op0, SDValue Op1, X86::CondCode X86CC,
EVT CmpVT = Op0.getValueType();
- assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
- CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
+ assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 || CmpVT == MVT::i32 ||
+ CmpVT == MVT::i64) &&
+ "Unexpected VT!");
// Only promote the compare up to I32 if it is a 16 bit operation
// with an immediate. 16 bit immediates are to be avoided unless the target
@@ -23678,9 +23677,8 @@ bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
/// The minimum architected relative accuracy is 2^-12. We need one
/// Newton-Raphson step to have a good float result (24 bits of precision).
-SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
- SelectionDAG &DAG, int Enabled,
- int &RefinementSteps,
+SDValue X86TargetLowering::getSqrtEstimate(SDValue Op, SelectionDAG &DAG,
+ int Enabled, int &RefinementSteps,
bool &UseOneConstNR,
bool Reciprocal) const {
SDLoc DL(Op);
@@ -23787,9 +23785,7 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
/// This is because we still need one division to calculate the reciprocal and
/// then we need two multiplies by that reciprocal as replacements for the
/// original divisions.
-unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
- return 2;
-}
+unsigned X86TargetLowering::combineRepeatedFPDivisors() const { return 2; }
SDValue
X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
@@ -23797,7 +23793,7 @@ X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
SmallVectorImpl<SDNode *> &Created) const {
AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
if (isIntDivCheap(N->getValueType(0), Attr))
- return SDValue(N,0); // Lower SDIV as SDIV
+ return SDValue(N, 0); // Lower SDIV as SDIV
assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
"Unexpected divisor!");
@@ -23866,8 +23862,8 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
isPowerOf2_64(AndRHSVal)) {
Src = AndLHS;
- BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
- Src.getValueType());
+ BitNo =
+ DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, Src.getValueType());
}
}
}
@@ -23913,7 +23909,7 @@ static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
// 6 - NLE
// 7 - ORD
switch (SetCCOpcode) {
- // clang-format off
+ // clang-format off
default: llvm_unreachable("Unexpected SETCC condition");
case ISD::SETOEQ:
case ISD::SETEQ: SSECC = 0; break;
@@ -23935,7 +23931,7 @@ static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
case ISD::SETO: SSECC = 7; break;
case ISD::SETUEQ: SSECC = 8; break;
case ISD::SETONE: SSECC = 12; break;
- // clang-format on
+ // clang-format on
}
if (Swap)
std::swap(Op0, Op1);
@@ -24220,13 +24216,13 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
Cmp1 = DAG.getNode(
Opc, dl, {VT, MVT::Other},
{Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
- Cmp1.getValue(1));
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ Cmp0.getValue(1), Cmp1.getValue(1));
} else {
- Cmp0 = DAG.getNode(
- Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
- Cmp1 = DAG.getNode(
- Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
+ Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
+ DAG.getTargetConstant(CC0, dl, MVT::i8));
+ Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
+ DAG.getTargetConstant(CC1, dl, MVT::i8));
}
Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
} else {
@@ -24236,8 +24232,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
{Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
Chain = Cmp.getValue(1);
} else
- Cmp = DAG.getNode(
- Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
+ Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
+ DAG.getTargetConstant(SSECC, dl, MVT::i8));
}
} else {
// Handle all other FP comparisons here.
@@ -24249,8 +24245,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
{Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
Chain = Cmp.getValue(1);
} else
- Cmp = DAG.getNode(
- Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
+ Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
+ DAG.getTargetConstant(SSECC, dl, MVT::i8));
}
if (VT.getFixedSizeInBits() >
@@ -24301,7 +24297,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
// Translate compare code to XOP PCOM compare mode.
unsigned CmpMode = 0;
switch (Cond) {
- // clang-format off
+ // clang-format off
default: llvm_unreachable("Unexpected SETCC condition");
case ISD::SETULT:
case ISD::SETLT: CmpMode = 0x00; break;
@@ -24313,7 +24309,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
case ISD::SETGE: CmpMode = 0x03; break;
case ISD::SETEQ: CmpMode = 0x04; break;
case ISD::SETNE: CmpMode = 0x05; break;
- // clang-format on
+ // clang-format on
}
// Are we comparing unsigned or signed integers?
@@ -24411,13 +24407,13 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
bool Invert = false;
unsigned Opc;
switch (Cond) {
- // clang-format off
+ // clang-format off
default: llvm_unreachable("Unexpected condition code");
case ISD::SETUGT: Invert = true; [[fallthrough]];
case ISD::SETULE: Opc = ISD::UMIN; break;
case ISD::SETULT: Invert = true; [[fallthrough]];
case ISD::SETUGE: Opc = ISD::UMAX; break;
- // clang-format on
+ // clang-format on
}
SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
@@ -24441,10 +24437,10 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
// operations may be required for some comparisons.
unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
: X86ISD::PCMPGT;
- bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
- Cond == ISD::SETGE || Cond == ISD::SETUGE;
- bool Invert = Cond == ISD::SETNE ||
- (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
+ bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT || Cond == ISD::SETGE ||
+ Cond == ISD::SETUGE;
+ bool Invert =
+ Cond == ISD::SETNE || (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
if (Swap)
std::swap(Op0, Op1);
@@ -24462,7 +24458,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
Op1 = DAG.getBitcast(MVT::v4i32, Op1);
SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
- static const int MaskHi[] = { 1, 1, 3, 3 };
+ static const int MaskHi[] = {1, 1, 3, 3};
SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
return DAG.getBitcast(VT, Result);
@@ -24473,7 +24469,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
Op1 = DAG.getAllOnesConstant(dl, MVT::v4i32);
SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
- static const int MaskHi[] = { 1, 1, 3, 3 };
+ static const int MaskHi[] = {1, 1, 3, 3};
SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
return DAG.getBitcast(VT, Result);
@@ -24512,8 +24508,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
// Create masks for only the low parts/high parts of the 64 bit integers.
- static const int MaskHi[] = { 1, 1, 3, 3 };
- static const int MaskLo[] = { 0, 0, 2, 2 };
+ static const int MaskHi[] = {1, 1, 3, 3};
+ static const int MaskLo[] = {0, 0, 2, 2};
SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
@@ -24540,7 +24536,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
// Make sure the lower and upper halves are both all-ones.
- static const int Mask[] = { 1, 0, 3, 2 };
+ static const int Mask[] = {1, 0, 3, 2};
SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
@@ -24555,8 +24551,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
// bits of the inputs before performing those operations.
if (FlipSigns) {
MVT EltVT = VT.getVectorElementType();
- SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
- VT);
+ SDValue SM =
+ DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl, VT);
Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
}
@@ -24573,8 +24569,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
const SDLoc &dl, SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- SDValue &X86CC) {
+ const X86Subtarget &Subtarget, SDValue &X86CC) {
assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
// Must be a bitcast from vXi1.
@@ -24721,7 +24716,8 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
Op.getOpcode() == ISD::STRICT_FSETCCS;
MVT VT = Op->getSimpleValueType(0);
- if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
+ if (VT.isVector())
+ return LowerVSETCC(Op, Subtarget, DAG);
assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
@@ -24816,7 +24812,8 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
}
-SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op,
+ SelectionDAG &DAG) const {
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
SDValue Carry = Op.getOperand(2);
@@ -24828,8 +24825,8 @@ SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const
// Recreate the carry if needed.
EVT CarryVT = Carry.getValueType();
- Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
- Carry, DAG.getAllOnesConstant(DL, CarryVT));
+ Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), Carry,
+ DAG.getAllOnesConstant(DL, CarryVT));
SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
@@ -24849,7 +24846,8 @@ getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
unsigned BaseOp = 0;
SDLoc DL(Op);
switch (Op.getOpcode()) {
- default: llvm_unreachable("Unknown ovf instruction!");
+ default:
+ llvm_unreachable("Unknown ovf instruction!");
case ISD::SADDO:
BaseOp = X86ISD::ADD;
Cond = X86::COND_O;
@@ -24923,7 +24921,8 @@ static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
SDValue VOp0 = V.getOperand(0);
unsigned InBits = VOp0.getValueSizeInBits();
unsigned Bits = V.getValueSizeInBits();
- return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
+ return DAG.MaskedValueIsZero(VOp0,
+ APInt::getHighBitsSet(InBits, InBits - Bits));
}
// Lower various (select (icmp CmpVal, 0), LHS, RHS) custom patterns.
@@ -25061,7 +25060,7 @@ static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS,
SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
bool AddTest = true;
- SDValue Cond = Op.getOperand(0);
+ SDValue Cond = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue Op2 = Op.getOperand(2);
SDLoc DL(Op);
@@ -25212,14 +25211,13 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// If condition flag is set by a X86ISD::CMP, then use it as the condition
// setting operand in place of the X86ISD::SETCC.
unsigned CondOpcode = Cond.getOpcode();
- if (CondOpcode == X86ISD::SETCC ||
- CondOpcode == X86ISD::SETCC_CARRY) {
+ if (CondOpcode == X86ISD::SETCC || CondOpcode == X86ISD::SETCC_CARRY) {
CC = Cond.getOperand(0);
SDValue Cmp = Cond.getOperand(1);
bool IllegalFPCMov = false;
- if (VT.isFloatingPoint() && !VT.isVector() &&
- !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
+ if (VT.isFloatingPoint() && !VT.isVector() && !isScalarFPTypeInSSEReg(VT) &&
+ Subtarget.canUseCMOV()) // FPStack?
IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
@@ -25282,14 +25280,15 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// X86 doesn't have an i8 cmov. If both operands are the result of a truncate
// widen the cmov and push the truncate through. This avoids introducing a new
// branch during isel and doesn't add any extensions.
- if (Op.getValueType() == MVT::i8 &&
- Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
+ if (Op.getValueType() == MVT::i8 && Op1.getOpcode() == ISD::TRUNCATE &&
+ Op2.getOpcode() == ISD::TRUNCATE) {
SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
if (T1.getValueType() == T2.getValueType() &&
// Exclude CopyFromReg to avoid partial register stalls.
- T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
- SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
- CC, Cond);
+ T1.getOpcode() != ISD::CopyFromReg &&
+ T2.getOpcode() != ISD::CopyFromReg) {
+ SDValue Cmov =
+ DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1, CC, Cond);
return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
}
}
@@ -25305,14 +25304,14 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
!X86::mayFoldLoad(Op2, Subtarget))) {
Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
- SDValue Ops[] = { Op2, Op1, CC, Cond };
+ SDValue Ops[] = {Op2, Op1, CC, Cond};
SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
}
// X86ISD::CMOV means set the result (which is operand 1) to the RHS if
// condition is true.
- SDValue Ops[] = { Op2, Op1, CC, Cond };
+ SDValue Ops[] = {Op2, Op1, CC, Cond};
return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops, Op->getFlags());
}
@@ -25372,7 +25371,7 @@ static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl,
}
SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const {
- SDValue Cond = Op.getOperand(0); // condition
+ SDValue Cond = Op.getOperand(0); // condition
SDValue TrueOp = Op.getOperand(1); // true_value
SDValue FalseOp = Op.getOperand(2); // false_value
SDLoc DL(Op);
@@ -25533,6 +25532,69 @@ SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const {
return DAG.getBitcast(VT, CtSelect);
}
+ // Handle f80 types by splitting into three 32-bit chunks
+ if (VT == MVT::f80) {
+ SDValue Chain = DAG.getEntryNode();
+
+ // Create temporary stack slots for input f80 values
+ SDValue TrueSlot = DAG.CreateStackTemporary(MVT::f80);
+ SDValue FalseSlot = DAG.CreateStackTemporary(MVT::f80);
+
+ // Store f80 values to memory
+ SDValue StoreTrueF80 =
+ DAG.getStore(Chain, DL, TrueOp, TrueSlot, MachinePointerInfo());
+ SDValue StoreFalseF80 =
+ DAG.getStore(Chain, DL, FalseOp, FalseSlot, MachinePointerInfo());
+
+ // Load i32 parts from memory (3 chunks for 96-bit f80 storage)
+ SDValue TruePart0 =
+ DAG.getLoad(MVT::i32, DL, StoreTrueF80, TrueSlot, MachinePointerInfo());
+ SDValue TruePart1Ptr =
+ DAG.getMemBasePlusOffset(TrueSlot, TypeSize::getFixed(4), DL);
+ SDValue TruePart1 = DAG.getLoad(MVT::i32, DL, StoreTrueF80, TruePart1Ptr,
+ MachinePointerInfo());
+ SDValue TruePart2Ptr =
+ DAG.getMemBasePlusOffset(TrueSlot, TypeSize::getFixed(8), DL);
+ SDValue TruePart2 = DAG.getLoad(MVT::i32, DL, StoreTrueF80, TruePart2Ptr,
+ MachinePointerInfo());
+
+ SDValue FalsePart0 = DAG.getLoad(MVT::i32, DL, StoreFalseF80, FalseSlot,
+ MachinePointerInfo());
+ SDValue FalsePart1Ptr =
+ DAG.getMemBasePlusOffset(FalseSlot, TypeSize::getFixed(4), DL);
+ SDValue FalsePart1 = DAG.getLoad(MVT::i32, DL, StoreFalseF80, FalsePart1Ptr,
+ MachinePointerInfo());
+ SDValue FalsePart2Ptr =
+ DAG.getMemBasePlusOffset(FalseSlot, TypeSize::getFixed(8), DL);
+ SDValue FalsePart2 = DAG.getLoad(MVT::i32, DL, StoreFalseF80, FalsePart2Ptr,
+ MachinePointerInfo());
+
+ // Perform CTSELECT on each 32-bit chunk
+ SDValue Part0Ops[] = {FalsePart0, TruePart0, CC, ProcessedCond};
+ SDValue Part0Select = DAG.getNode(X86ISD::CTSELECT, DL, MVT::i32, Part0Ops);
+ SDValue Part1Ops[] = {FalsePart1, TruePart1, CC, ProcessedCond};
+ SDValue Part1Select = DAG.getNode(X86ISD::CTSELECT, DL, MVT::i32, Part1Ops);
+ SDValue Part2Ops[] = {FalsePart2, TruePart2, CC, ProcessedCond};
+ SDValue Part2Select = DAG.getNode(X86ISD::CTSELECT, DL, MVT::i32, Part2Ops);
+
+ // Create result stack slot and store the selected parts
+ SDValue ResultSlot = DAG.CreateStackTemporary(MVT::f80);
+ SDValue StorePart0 =
+ DAG.getStore(Chain, DL, Part0Select, ResultSlot, MachinePointerInfo());
+ SDValue ResPart1Ptr =
+ DAG.getMemBasePlusOffset(ResultSlot, TypeSize::getFixed(4), DL);
+ SDValue StorePart1 = DAG.getStore(StorePart0, DL, Part1Select, ResPart1Ptr,
+ MachinePointerInfo());
+ SDValue ResPart2Ptr =
+ DAG.getMemBasePlusOffset(ResultSlot, TypeSize::getFixed(8), DL);
+ SDValue StorePart2 = DAG.getStore(StorePart1, DL, Part2Select, ResPart2Ptr,
+ MachinePointerInfo());
+
+ // Load complete f80 result from memory
+ return DAG.getLoad(MVT::f80, DL, StorePart2, ResultSlot,
+ MachinePointerInfo());
+ }
+
// Create final CTSELECT node
SDValue Ops[] = {FalseOp, TrueOp, CC, ProcessedCond};
return DAG.getNode(X86ISD::CTSELECT, DL, Op.getValueType(), Ops,
@@ -25590,9 +25652,9 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
InVT = In.getSimpleValueType();
}
- // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
- // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
- // need to be handled here for 256/512-bit results.
+ // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit
+ // results, so are legal and shouldn't occur here. AVX2/AVX512 pmovsx*
+ // instructions still need to be handled here for 256/512-bit results.
if (Subtarget.hasInt256()) {
assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
@@ -25601,9 +25663,8 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
// FIXME: Apparently we create inreg operations that could be regular
// extends.
- unsigned ExtOpc =
- Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
- : ISD::ZERO_EXTEND;
+ unsigned ExtOpc = Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
+ : ISD::ZERO_EXTEND;
return DAG.getNode(ExtOpc, dl, VT, In);
}
@@ -25721,9 +25782,9 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
unsigned NumElems = InVT.getVectorNumElements();
- SmallVector<int,8> ShufMask(NumElems, -1);
- for (unsigned i = 0; i != NumElems/2; ++i)
- ShufMask[i] = i + NumElems/2;
+ SmallVector<int, 8> ShufMask(NumElems, -1);
+ for (unsigned i = 0; i != NumElems / 2; ++i)
+ ShufMask[i] = i + NumElems / 2;
SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
@@ -25885,11 +25946,10 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
// TODO: It is possible to support ZExt by zeroing the undef values during
// the shuffle phase or after the shuffle.
static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+ SelectionDAG &DAG) {
MVT RegVT = Op.getSimpleValueType();
assert(RegVT.isVector() && "We only custom lower vector loads.");
- assert(RegVT.isInteger() &&
- "We only custom lower integer vector loads.");
+ assert(RegVT.isInteger() && "We only custom lower integer vector loads.");
LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
SDLoc dl(Ld);
@@ -25932,8 +25992,8 @@ static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
- SDValue Cond = Op.getOperand(1);
- SDValue Dest = Op.getOperand(2);
+ SDValue Cond = Op.getOperand(1);
+ SDValue Dest = Op.getOperand(2);
SDLoc dl(Op);
// Bail out when we don't have native compare instructions.
@@ -25983,7 +26043,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
if (User->getOpcode() == ISD::BR) {
SDValue FalseBB = User->getOperand(1);
SDNode *NewBR =
- DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
+ DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
assert(NewBR == User);
(void)NewBR;
Dest = FalseBB;
@@ -26054,9 +26114,8 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
// bytes in one go. Touching the stack at 4K increments is necessary to ensure
// that the guard pages used by the OS virtual memory manager are allocated in
// correct sequence.
-SDValue
-X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+ SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
bool SplitStack = MF.shouldSplitStack();
bool EmitStackProbeCall = hasStackProbeSymbol(MF);
@@ -26067,7 +26126,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
// Get the inputs.
SDNode *Node = Op.getNode();
SDValue Chain = Op.getOperand(0);
- SDValue Size = Op.getOperand(1);
+ SDValue Size = Op.getOperand(1);
MaybeAlign Alignment(Op.getConstantOperandVal(2));
EVT VT = Node->getValueType(0);
@@ -26190,8 +26249,9 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MemOps.push_back(Store);
// Store ptr to reg_save_area.
- FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
- Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
+ FIN = DAG.getNode(
+ ISD::ADD, DL, PtrVT, FIN,
+ DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
Store = DAG.getStore(
Op.getOperand(0), DL, RSFIN, FIN,
@@ -26201,8 +26261,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
}
SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
- assert(Subtarget.is64Bit() &&
- "LowerVAARG only handles 64-bit va_arg!");
+ assert(Subtarget.is64Bit() && "LowerVAARG only handles 64-bit va_arg!");
assert(Op.getNumOperands() == 4);
MachineFunction &MF = DAG.getMachineFunction();
@@ -26226,11 +26285,11 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
// selection mechanism works only for the basic types.
assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
- ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
+ ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
} else {
assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
"Unhandled argument type in LowerVAARG");
- ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
+ ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
}
if (ArgMode == 2) {
@@ -26264,7 +26323,7 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
// where a va_list is still an i8*.
assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
if (Subtarget.isCallingConvWin64(
- DAG.getMachineFunction().getFunction().getCallingConv()))
+ DAG.getMachineFunction().getFunction().getCallingConv()))
// Probably a Win64 va_copy.
return DAG.expandVACopy(Op.getNode());
@@ -26326,15 +26385,17 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
return DAG.getConstant(0, dl, VT);
}
- assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
- && "Unknown target vector shift-by-constant node");
+ assert(
+ (Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI) &&
+ "Unknown target vector shift-by-constant node");
// Fold this packed vector shift into a build vector if SrcOp is a
// vector of Constants or UNDEFs.
if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
unsigned ShiftOpc;
switch (Opc) {
- default: llvm_unreachable("Unknown opcode!");
+ default:
+ llvm_unreachable("Unknown opcode!");
case X86ISD::VSHLI:
ShiftOpc = ISD::SHL;
break;
@@ -26474,8 +26535,8 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
Hi = DAG.getBitcast(MVT::v32i1, Hi);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
} else {
- MVT BitcastVT = MVT::getVectorVT(MVT::i1,
- Mask.getSimpleValueType().getSizeInBits());
+ MVT BitcastVT =
+ MVT::getVectorVT(MVT::i1, Mask.getSimpleValueType().getSizeInBits());
// In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
// are extracted by EXTRACT_SUBVECTOR.
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
@@ -26556,9 +26617,12 @@ static int getSEHRegistrationNodeSize(const Function *Fn) {
// The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
// WinEHStatePass for the full struct definition.
switch (classifyEHPersonality(Fn->getPersonalityFn())) {
- case EHPersonality::MSVC_X86SEH: return 24;
- case EHPersonality::MSVC_CXX: return 16;
- default: break;
+ case EHPersonality::MSVC_X86SEH:
+ return 24;
+ case EHPersonality::MSVC_CXX:
+ return 16;
+ default:
+ break;
}
report_fatal_error(
"can only recover FP for 32-bit MSVC EH personality functions");
@@ -26648,13 +26712,13 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDLoc dl(Op);
unsigned IntNo = Op.getConstantOperandVal(0);
MVT VT = Op.getSimpleValueType();
- const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
+ const IntrinsicData *IntrData = getIntrinsicWithoutChain(IntNo);
// Propagate flags from original node to transformed node(s).
SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
if (IntrData) {
- switch(IntrData->Type) {
+ switch (IntrData->Type) {
case INTR_TYPE_1OP: {
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check if the intrinsic may have non-default rounding mode,
@@ -26780,9 +26844,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
if (!isRoundModeCurDirection(Rnd))
return SDValue();
}
- return getVectorMaskingNode(
- DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
- Subtarget, DAG);
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
+ Mask, PassThru, Subtarget, DAG);
}
case INTR_TYPE_1OP_MASK_SAE: {
SDValue Src = Op.getOperand(1);
@@ -26823,9 +26886,9 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
if (!isRoundModeCurDirection(Rnd))
return SDValue();
}
- return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
- Src2),
- Mask, passThru, Subtarget, DAG);
+ return getScalarMaskingNode(
+ DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2), Mask, passThru,
+ Subtarget, DAG);
}
assert(Op.getNumOperands() == (6U + HasRounding) &&
@@ -26839,9 +26902,9 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
else if (!isRoundModeCurDirection(Sae))
return SDValue();
}
- return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
- Src2, RoundingMode),
- Mask, passThru, Subtarget, DAG);
+ return getScalarMaskingNode(
+ DAG.getNode(Opc, dl, VT, Src1, Src2, RoundingMode), Mask, passThru,
+ Subtarget, DAG);
}
case INTR_TYPE_SCALAR_MASK_RND: {
SDValue Src1 = Op.getOperand(1);
@@ -26876,8 +26939,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
else
return SDValue();
- return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
- Mask, passThru, Subtarget, DAG);
+ return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2), Mask,
+ passThru, Subtarget, DAG);
}
case INTR_TYPE_2OP_MASK: {
SDValue Src1 = Op.getOperand(1);
@@ -26913,8 +26976,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return SDValue();
}
- return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
- Mask, PassThru, Subtarget, DAG);
+ return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2), Mask,
+ PassThru, Subtarget, DAG);
}
case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
SDValue Src1 = Op.getOperand(1);
@@ -26963,12 +27026,12 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
// Reverse the operands to match VSELECT order.
return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
}
- case VPERM_2OP : {
+ case VPERM_2OP: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
// Swap Src1 and Src2 in the node creation
- return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
+ return DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1);
}
case CFMA_OP_MASKZ:
case CFMA_OP_MASK: {
@@ -27012,8 +27075,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDValue Imm = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
- SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
- Subtarget, DAG);
+ SDValue FPclassMask =
+ getScalarMaskingNode(FPclass, Mask, SDValue(), Subtarget, DAG);
// Need to fill with zeros to ensure the bitcast will produce zeroes
// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
@@ -27037,7 +27100,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
if (!isRoundModeCurDirection(Sae))
return SDValue();
}
- //default rounding mode
+ // default rounding mode
return DAG.getNode(IntrData->Opc0, dl, MaskVT,
{Op.getOperand(1), Op.getOperand(2), CC, Mask});
}
@@ -27055,12 +27118,12 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
else if (!isRoundModeCurDirection(Sae))
return SDValue();
}
- //default rounding mode
+ // default rounding mode
if (!Cmp.getNode())
Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
- SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
- Subtarget, DAG);
+ SDValue CmpMask =
+ getScalarMaskingNode(Cmp, Mask, SDValue(), Subtarget, DAG);
// Need to fill with zeros to ensure the bitcast will produce zeroes
// for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
@@ -27228,8 +27291,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
uint64_t Imm = Op.getConstantOperandVal(2);
- SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
- Op.getValueType());
+ SDValue Control =
+ DAG.getTargetConstant(Imm & 0xffff, dl, Op.getValueType());
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
Op.getOperand(1), Control);
}
@@ -27251,7 +27314,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getOperand(3), GenCF.getValue(1));
}
SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
- SDValue Results[] = { SetCC, Res };
+ SDValue Results[] = {SetCC, Res};
return DAG.getMergeValues(Results, dl);
}
case CVTPD2PS_MASK:
@@ -27334,7 +27397,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
switch (IntNo) {
- default: return SDValue(); // Don't custom lower most intrinsics.
+ default:
+ return SDValue(); // Don't custom lower most intrinsics.
// ptest and testp intrinsics. The intrinsic these come from are designed to
// return an integer value, not just an instruction so lower it to the ptest
@@ -27368,7 +27432,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
unsigned TestOpc = X86ISD::PTEST;
X86::CondCode X86CC;
switch (IntNo) {
- default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
+ default:
+ llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
case Intrinsic::x86_avx512_ktestc_b:
case Intrinsic::x86_avx512_ktestc_w:
case Intrinsic::x86_avx512_ktestc_d:
@@ -27439,7 +27504,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
unsigned Opcode;
X86::CondCode X86CC;
switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ default:
+ llvm_unreachable("Impossible intrinsic"); // Can't reach here.
case Intrinsic::x86_sse42_pcmpistria128:
Opcode = X86ISD::PCMPISTR;
X86CC = X86::COND_A;
@@ -27609,7 +27675,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
unsigned NewIntrinsic;
switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ default:
+ llvm_unreachable("Impossible intrinsic"); // Can't reach here.
case Intrinsic::x86_mmx_pslli_w:
NewIntrinsic = Intrinsic::x86_mmx_psll_w;
break;
@@ -27686,16 +27753,16 @@ static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
- SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
+ SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
SDValue Res =
DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
MemIntr->getMemoryVT(), MemIntr->getMemOperand());
return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
}
-static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
- SDValue Src, SDValue Mask, SDValue Base,
- SDValue Index, SDValue ScaleOp, SDValue Chain,
+static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, SDValue Src,
+ SDValue Mask, SDValue Base, SDValue Index,
+ SDValue ScaleOp, SDValue Chain,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
@@ -27724,7 +27791,7 @@ static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
- SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
+ SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
SDValue Res =
DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
MemIntr->getMemoryVT(), MemIntr->getMemOperand());
@@ -27732,9 +27799,9 @@ static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
}
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
- SDValue Src, SDValue Mask, SDValue Base,
- SDValue Index, SDValue ScaleOp, SDValue Chain,
- const X86Subtarget &Subtarget) {
+ SDValue Src, SDValue Mask, SDValue Base,
+ SDValue Index, SDValue ScaleOp, SDValue Chain,
+ const X86Subtarget &Subtarget) {
SDLoc dl(Op);
auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
// Scale must be constant.
@@ -27776,8 +27843,8 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
TLI.getPointerTy(DAG.getDataLayout()));
SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
- MVT MaskVT =
- MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
+ MVT MaskVT = MVT::getVectorVT(
+ MVT::i1, Index.getSimpleValueType().getVectorNumElements());
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
@@ -27793,11 +27860,11 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
/// expanded intrinsics implicitly defines extra registers (i.e. not just
/// EDX:EAX).
static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
- SelectionDAG &DAG,
- unsigned TargetOpcode,
- unsigned SrcReg,
- const X86Subtarget &Subtarget,
- SmallVectorImpl<SDValue> &Results) {
+ SelectionDAG &DAG,
+ unsigned TargetOpcode,
+ unsigned SrcReg,
+ const X86Subtarget &Subtarget,
+ SmallVectorImpl<SDValue> &Results) {
SDValue Chain = N->getOperand(0);
SDValue Glue;
@@ -27837,7 +27904,7 @@ static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
}
// Use a buildpair to merge the two 32-bit values into a 64-bit one.
- SDValue Ops[] = { LO, HI };
+ SDValue Ops[] = {LO, HI};
SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
Results.push_back(Pair);
Results.push_back(Chain);
@@ -27854,9 +27921,9 @@ static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
// The processor's time-stamp counter (a 64-bit MSR) is stored into the
// EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
// and the EAX register is loaded with the low-order 32 bits.
- SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
- /* NoRegister */0, Subtarget,
- Results);
+ SDValue Glue =
+ expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
+ /* NoRegister */ 0, Subtarget, Results);
if (Opcode != X86::RDTSCP)
return;
@@ -27914,24 +27981,24 @@ static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
}
/// Emit Truncating Store with signed or unsigned saturation.
-static SDValue
-EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val,
- SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
- SelectionDAG &DAG) {
+static SDValue EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL,
+ SDValue Val, SDValue Ptr, EVT MemVT,
+ MachineMemOperand *MMO, SelectionDAG &DAG) {
SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
- SDValue Ops[] = { Chain, Val, Ptr, Undef };
+ SDValue Ops[] = {Chain, Val, Ptr, Undef};
unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
}
/// Emit Masked Truncating Store with signed or unsigned saturation.
static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain,
- const SDLoc &DL,
- SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
- MachineMemOperand *MMO, SelectionDAG &DAG) {
+ const SDLoc &DL, SDValue Val, SDValue Ptr,
+ SDValue Mask, EVT MemVT,
+ MachineMemOperand *MMO,
+ SelectionDAG &DAG) {
SDVTList VTs = DAG.getVTList(MVT::Other);
- SDValue Ops[] = { Chain, Val, Ptr, Mask };
+ SDValue Ops[] = {Chain, Val, Ptr, Mask};
unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
}
@@ -27999,9 +28066,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
SDLoc dl(Op);
// Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
// to the EDX and ECX parameters.
- return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
- Op.getOperand(0), Op.getOperand(2),
- DAG.getConstant(0, dl, MVT::i32),
+ return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other, Op.getOperand(0),
+ Op.getOperand(2), DAG.getConstant(0, dl, MVT::i32),
DAG.getConstant(0, dl, MVT::i32));
}
case llvm::Intrinsic::asan_check_memaccess: {
@@ -28032,7 +28098,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
unsigned Opcode;
switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic");
+ default:
+ llvm_unreachable("Impossible intrinsic");
case Intrinsic::x86_umwait:
Opcode = X86ISD::UMWAIT;
break;
@@ -28045,9 +28112,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
break;
}
- SDValue Operation =
- DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
- Op->getOperand(3), Op->getOperand(4));
+ SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
+ Op->getOperand(3), Op->getOperand(4));
SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
Operation.getValue(1));
@@ -28059,7 +28125,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
unsigned Opcode;
switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic!");
+ default:
+ llvm_unreachable("Impossible intrinsic!");
case Intrinsic::x86_enqcmd:
Opcode = X86ISD::ENQCMD;
break;
@@ -28083,7 +28150,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
unsigned Opcode;
switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic");
+ default:
+ llvm_unreachable("Impossible intrinsic");
case Intrinsic::x86_aesenc128kl:
Opcode = X86ISD::AESENC128KL;
break;
@@ -28121,7 +28189,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
unsigned Opcode;
switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic");
+ default:
+ llvm_unreachable("Impossible intrinsic");
case Intrinsic::x86_aesencwide128kl:
Opcode = X86ISD::AESENCWIDE128KL;
break;
@@ -28215,9 +28284,9 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
SDValue Src2 = Op.getOperand(4);
SDValue CC = Op.getOperand(5);
MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
- SDValue Operation = DAG.getMemIntrinsicNode(
- X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
- MVT::i32, MMO);
+ SDValue Operation =
+ DAG.getMemIntrinsicNode(X86ISD::CMPCCXADD, DL, Op->getVTList(),
+ {Chain, Addr, Src1, Src2, CC}, MVT::i32, MMO);
return Operation;
}
case Intrinsic::x86_aadd32:
@@ -28301,8 +28370,9 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
}
SDLoc dl(Op);
- switch(IntrData->Type) {
- default: llvm_unreachable("Unknown Intrinsic Type");
+ switch (IntrData->Type) {
+ default:
+ llvm_unreachable("Unknown Intrinsic Type");
case RDSEED:
case RDRAND: {
// Emit the node with the right value type.
@@ -28323,32 +28393,32 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
}
case GATHER_AVX2: {
SDValue Chain = Op.getOperand(0);
- SDValue Src = Op.getOperand(2);
- SDValue Base = Op.getOperand(3);
+ SDValue Src = Op.getOperand(2);
+ SDValue Base = Op.getOperand(3);
SDValue Index = Op.getOperand(4);
- SDValue Mask = Op.getOperand(5);
+ SDValue Mask = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
Scale, Chain, Subtarget);
}
case GATHER: {
- //gather(v1, mask, index, base, scale);
+ // gather(v1, mask, index, base, scale);
SDValue Chain = Op.getOperand(0);
- SDValue Src = Op.getOperand(2);
- SDValue Base = Op.getOperand(3);
+ SDValue Src = Op.getOperand(2);
+ SDValue Base = Op.getOperand(3);
SDValue Index = Op.getOperand(4);
- SDValue Mask = Op.getOperand(5);
+ SDValue Mask = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
- return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
- Chain, Subtarget);
+ return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale, Chain,
+ Subtarget);
}
case SCATTER: {
- //scatter(base, mask, index, v1, scale);
+ // scatter(base, mask, index, v1, scale);
SDValue Chain = Op.getOperand(0);
- SDValue Base = Op.getOperand(2);
- SDValue Mask = Op.getOperand(3);
+ SDValue Base = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(3);
SDValue Index = Op.getOperand(4);
- SDValue Src = Op.getOperand(5);
+ SDValue Src = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
Scale, Chain, Subtarget);
@@ -28359,9 +28429,9 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
"Wrong prefetch hint in intrinsic: should be 2 or 3");
unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
SDValue Chain = Op.getOperand(0);
- SDValue Mask = Op.getOperand(2);
+ SDValue Mask = Op.getOperand(2);
SDValue Index = Op.getOperand(3);
- SDValue Base = Op.getOperand(4);
+ SDValue Base = Op.getOperand(4);
SDValue Scale = Op.getOperand(5);
return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
Subtarget);
@@ -28396,8 +28466,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
- return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
- Ret, SDValue(InTrans.getNode(), 1));
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Ret,
+ SDValue(InTrans.getNode(), 1));
}
case TRUNCATE_TO_MEM_VI8:
case TRUNCATE_TO_MEM_VI16:
@@ -28410,7 +28480,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
assert(MemIntr && "Expected MemIntrinsicSDNode!");
- EVT MemVT = MemIntr->getMemoryVT();
+ EVT MemVT = MemIntr->getMemoryVT();
uint16_t TruncationOp = IntrData->Opc0;
switch (TruncationOp) {
@@ -28505,7 +28575,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
Register FrameReg =
RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
- SDLoc dl(Op); // FIXME probably not meaningful
+ SDLoc dl(Op); // FIXME probably not meaningful
unsigned Depth = Op.getConstantOperandVal(0);
assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
(FrameReg == X86::EBP && VT == MVT::i32)) &&
@@ -28519,7 +28589,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
-Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
+Register X86TargetLowering::getRegisterByName(const char *RegName, LLT VT,
const MachineFunction &MF) const {
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
@@ -28576,10 +28646,10 @@ bool X86TargetLowering::needsFixedCatchObjects() const {
}
SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
- SDValue Chain = Op.getOperand(0);
- SDValue Offset = Op.getOperand(1);
- SDValue Handler = Op.getOperand(2);
- SDLoc dl (Op);
+ SDValue Chain = Op.getOperand(0);
+ SDValue Offset = Op.getOperand(1);
+ SDValue Handler = Op.getOperand(2);
+ SDLoc dl(Op);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
@@ -28590,9 +28660,9 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
- SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
- DAG.getIntPtrConstant(RegInfo->getSlotSize(),
- dl));
+ SDValue StoreAddr =
+ DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
+ DAG.getIntPtrConstant(RegInfo->getSlotSize(), dl));
StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
@@ -28615,19 +28685,20 @@ SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
(void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
}
return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
- DAG.getVTList(MVT::i32, MVT::Other),
- Op.getOperand(0), Op.getOperand(1));
+ DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
+ Op.getOperand(1));
}
SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
- return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
- Op.getOperand(0), Op.getOperand(1));
+ return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other, Op.getOperand(0),
+ Op.getOperand(1));
}
-SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue
+X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
+ SelectionDAG &DAG) const {
SDLoc DL(Op);
return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
Op.getOperand(0));
@@ -28643,7 +28714,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
SDValue Trmp = Op.getOperand(1); // trampoline
SDValue FPtr = Op.getOperand(2); // nested function
SDValue Nest = Op.getOperand(3); // 'nest' parameter value
- SDLoc dl (Op);
+ SDLoc dl(Op);
const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
@@ -28652,7 +28723,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
SDValue OutChains[6];
// Large code-model.
- const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
+ const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
@@ -28700,7 +28771,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
} else {
const Function *Func =
- cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
+ cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
CallingConv::ID CC = Func->getCallingConv();
unsigned NestReg;
@@ -28722,7 +28793,8 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
unsigned Idx = 0;
for (FunctionType::param_iterator I = FTy->param_begin(),
- E = FTy->param_end(); I != E; ++I, ++Idx)
+ E = FTy->param_end();
+ I != E; ++I, ++Idx)
if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
const DataLayout &DL = DAG.getDataLayout();
// FIXME: should only count parameters that are lowered to integers.
@@ -28828,18 +28900,16 @@ SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
Chain = CWD.getValue(1);
// Mask and turn the control bits into a shift for the lookup table.
- SDValue Shift =
- DAG.getNode(ISD::SRL, DL, MVT::i16,
- DAG.getNode(ISD::AND, DL, MVT::i16,
- CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
- DAG.getConstant(9, DL, MVT::i8));
+ SDValue Shift = DAG.getNode(ISD::SRL, DL, MVT::i16,
+ DAG.getNode(ISD::AND, DL, MVT::i16, CWD,
+ DAG.getConstant(0xc00, DL, MVT::i16)),
+ DAG.getConstant(9, DL, MVT::i8));
Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
- SDValue RetVal =
- DAG.getNode(ISD::AND, DL, MVT::i32,
- DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
- DAG.getConstant(3, DL, MVT::i32));
+ SDValue RetVal = DAG.getNode(ISD::AND, DL, MVT::i32,
+ DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
+ DAG.getConstant(3, DL, MVT::i32));
RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
@@ -29125,17 +29195,15 @@ static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
MVT EltVT = VT.getVectorElementType();
unsigned NumElems = VT.getVectorNumElements();
- assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
- "Unsupported element type");
+ assert((EltVT == MVT::i8 || EltVT == MVT::i16) && "Unsupported element type");
  // Split vector, its Lo and Hi parts will be handled in the next iteration.
- if (NumElems > 16 ||
- (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
+ if (NumElems > 16 || (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
return splitVectorIntUnary(Op, DAG, dl);
MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
- "Unsupported value type for operation");
+ "Unsupported value type for operation");
// Use native supported vector instruction vplzcntd.
Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
@@ -29807,10 +29875,10 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
SmallVector<SDValue, 16> LoOps, HiOps;
for (unsigned i = 0; i != NumElts; i += 16) {
for (unsigned j = 0; j != 8; ++j) {
- LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
- MVT::i16));
- HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
- MVT::i16));
+ LoOps.push_back(
+ DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl, MVT::i16));
+ HiOps.push_back(
+ DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl, MVT::i16));
}
}
@@ -29851,7 +29919,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
// Merge the two vectors back together with a shuffle. This expands into 2
// shuffles.
- static const int ShufMask[] = { 0, 4, 2, 6 };
+ static const int ShufMask[] = {0, 4, 2, 6};
return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
}
@@ -30016,7 +30084,7 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
//
// Place the odd value at an even position (basically, shift all values 1
// step to the left):
- const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
+ const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
9, -1, 11, -1, 13, -1, 15, -1};
// <a|b|c|d> => <b|undef|d|undef>
SDValue Odd0 =
@@ -30066,7 +30134,7 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
// Only i8 vectors should need custom lowering after this.
assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
- (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
+ (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
"Unsupported vector type");
// Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
@@ -30221,7 +30289,8 @@ static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getMergeValues({Low, Ovf}, dl);
}
-SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op,
+ SelectionDAG &DAG) const {
assert(Subtarget.isTargetWin64() && "Unexpected target");
EVT VT = Op.getValueType();
assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
@@ -30236,13 +30305,13 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
RTLIB::Libcall LC;
bool isSigned;
switch (Op->getOpcode()) {
- // clang-format off
+ // clang-format off
default: llvm_unreachable("Unexpected request for libcall!");
case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
- // clang-format on
+ // clang-format on
}
SDLoc dl(Op);
@@ -30381,9 +30450,9 @@ static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
// The shift amount is a variable, but it is the same for all vector lanes.
// These instructions are defined together with shift-immediate.
-static
-bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget,
- unsigned Opcode) {
+static bool supportedVectorShiftWithBaseAmnt(EVT VT,
+ const X86Subtarget &Subtarget,
+ unsigned Opcode) {
return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
}
@@ -30412,7 +30481,7 @@ static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget,
return true;
bool LShift = VT.is128BitVector() || VT.is256BitVector();
- bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
+ bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
return (Opcode == ISD::SRA) ? AShift : LShift;
}
@@ -32350,7 +32419,8 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
auto SSID = AI->getSyncScopeID();
// We must restrict the ordering to avoid generating loads with Release or
// ReleaseAcquire orderings.
- auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
+ auto Order =
+ AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
// Before the load we need a fence. Here is an example lifted from
// http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
@@ -32419,31 +32489,28 @@ static SDValue emitLockedStackOp(SelectionDAG &DAG,
if (Subtarget.is64Bit()) {
SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
- SDValue Ops[] = {
- DAG.getRegister(X86::RSP, MVT::i64), // Base
- DAG.getTargetConstant(1, DL, MVT::i8), // Scale
- DAG.getRegister(0, MVT::i64), // Index
- DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
- DAG.getRegister(0, MVT::i16), // Segment.
- Zero,
- Chain};
- SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
- MVT::Other, Ops);
+ SDValue Ops[] = {DAG.getRegister(X86::RSP, MVT::i64), // Base
+ DAG.getTargetConstant(1, DL, MVT::i8), // Scale
+ DAG.getRegister(0, MVT::i64), // Index
+ DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
+ DAG.getRegister(0, MVT::i16), // Segment.
+ Zero,
+ Chain};
+ SDNode *Res =
+ DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32, MVT::Other, Ops);
return SDValue(Res, 1);
}
SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
- SDValue Ops[] = {
- DAG.getRegister(X86::ESP, MVT::i32), // Base
- DAG.getTargetConstant(1, DL, MVT::i8), // Scale
- DAG.getRegister(0, MVT::i32), // Index
- DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
- DAG.getRegister(0, MVT::i16), // Segment.
- Zero,
- Chain
- };
- SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
- MVT::Other, Ops);
+ SDValue Ops[] = {DAG.getRegister(X86::ESP, MVT::i32), // Base
+ DAG.getTargetConstant(1, DL, MVT::i8), // Scale
+ DAG.getRegister(0, MVT::i32), // Index
+ DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
+ DAG.getRegister(0, MVT::i16), // Segment.
+ Zero,
+ Chain};
+ SDNode *Res =
+ DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32, MVT::Other, Ops);
return SDValue(Res, 1);
}
@@ -32476,36 +32543,44 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
SDLoc DL(Op);
unsigned Reg = 0;
unsigned size = 0;
- switch(T.SimpleTy) {
- default: llvm_unreachable("Invalid value type!");
- case MVT::i8: Reg = X86::AL; size = 1; break;
- case MVT::i16: Reg = X86::AX; size = 2; break;
- case MVT::i32: Reg = X86::EAX; size = 4; break;
+ switch (T.SimpleTy) {
+ default:
+ llvm_unreachable("Invalid value type!");
+ case MVT::i8:
+ Reg = X86::AL;
+ size = 1;
+ break;
+ case MVT::i16:
+ Reg = X86::AX;
+ size = 2;
+ break;
+ case MVT::i32:
+ Reg = X86::EAX;
+ size = 4;
+ break;
case MVT::i64:
assert(Subtarget.is64Bit() && "Node not type legal!");
- Reg = X86::RAX; size = 8;
+ Reg = X86::RAX;
+ size = 8;
break;
}
- SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
- Op.getOperand(2), SDValue());
- SDValue Ops[] = { cpIn.getValue(0),
- Op.getOperand(1),
- Op.getOperand(3),
- DAG.getTargetConstant(size, DL, MVT::i8),
- cpIn.getValue(1) };
+ SDValue cpIn =
+ DAG.getCopyToReg(Op.getOperand(0), DL, Reg, Op.getOperand(2), SDValue());
+ SDValue Ops[] = {cpIn.getValue(0), Op.getOperand(1), Op.getOperand(3),
+ DAG.getTargetConstant(size, DL, MVT::i8), cpIn.getValue(1)};
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
- SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
- Ops, T, MMO);
+ SDValue Result =
+ DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, Ops, T, MMO);
SDValue cpOut =
- DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
+ DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
MVT::i32, cpOut.getValue(2));
SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
- return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
- cpOut, Success, EFLAGS.getValue(1));
+ return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), cpOut, Success,
+ EFLAGS.getValue(1));
}
// Create MOVMSKB, taking into account whether we need to split for AVX1.
@@ -32567,7 +32642,8 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
}
assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
- SrcVT == MVT::i64) && "Unexpected VT!");
+ SrcVT == MVT::i64) &&
+ "Unexpected VT!");
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
@@ -32581,8 +32657,7 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
// Example: from MVT::v2i32 to MVT::v4i32.
MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
SrcVT.getVectorNumElements() * 2);
- Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
- DAG.getUNDEF(SrcVT));
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src, DAG.getUNDEF(SrcVT));
} else {
assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
"Unexpected source type in LowerBITCAST");
@@ -32728,7 +32803,8 @@ static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL,
if (Subtarget.hasVPOPCNTDQ()) {
unsigned NumElems = VT.getVectorNumElements();
assert((VT.getVectorElementType() == MVT::i8 ||
- VT.getVectorElementType() == MVT::i16) && "Unexpected type");
+ VT.getVectorElementType() == MVT::i16) &&
+ "Unexpected type");
if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
@@ -33127,16 +33203,16 @@ static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
assert(!N->hasAnyUseOfValue(0));
// NOTE: The getUNDEF is needed to give something for the unused result 0.
- return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
- DAG.getUNDEF(VT), NewChain);
+ return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), DAG.getUNDEF(VT),
+ NewChain);
}
SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
// RAUW the chain, but don't worry about the result, as it's unused.
assert(!N->hasAnyUseOfValue(0));
// NOTE: The getUNDEF is needed to give something for the unused result 0.
- return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
- DAG.getUNDEF(VT), LockOp.getValue(1));
+ return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), DAG.getUNDEF(VT),
+ LockOp.getValue(1));
}
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
@@ -33236,17 +33312,17 @@ static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
// Set the carry flag.
SDValue Carry = Op.getOperand(2);
EVT CarryVT = Carry.getValueType();
- Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
- Carry, DAG.getAllOnesConstant(DL, CarryVT));
+ Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), Carry,
+ DAG.getAllOnesConstant(DL, CarryVT));
bool IsAdd = Opc == ISD::UADDO_CARRY || Opc == ISD::SADDO_CARRY;
- SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
- Op.getOperand(0), Op.getOperand(1),
- Carry.getValue(1));
+ SDValue Sum =
+ DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs, Op.getOperand(0),
+ Op.getOperand(1), Carry.getValue(1));
bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
- SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
- Sum.getValue(1), DL, DAG);
+ SDValue SetCC =
+ getSETCC(IsSigned ? X86::COND_O : X86::COND_B, Sum.getValue(1), DL, DAG);
if (N->getValueType(1) == MVT::i1)
SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
@@ -33397,8 +33473,8 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
!Index.getSimpleValueType().is512BitVector()) {
// Determine how much we need to widen by to get a 512-bit type.
- unsigned Factor = std::min(512/VT.getSizeInBits(),
- 512/IndexVT.getSizeInBits());
+ unsigned Factor =
+ std::min(512 / VT.getSizeInBits(), 512 / IndexVT.getSizeInBits());
unsigned NumElts = VT.getVectorNumElements() * Factor;
VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
@@ -33440,7 +33516,7 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
N->isExpandingLoad());
// Emit a blend.
SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
- return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
+ return DAG.getMergeValues({Select, NewLoad.getValue(1)}, dl);
}
assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
@@ -33507,7 +33583,7 @@ static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
// This operation is legal for targets with VLX, but without
// VLX the vector should be widened to 512 bit
- unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
+ unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
// Mask element has to be i1.
@@ -33549,8 +33625,8 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
!IndexVT.is512BitVector()) {
// Determine how much we need to widen by to get a 512-bit type.
- unsigned Factor = std::min(512/VT.getSizeInBits(),
- 512/IndexVT.getSizeInBits());
+ unsigned Factor =
+ std::min(512 / VT.getSizeInBits(), 512 / IndexVT.getSizeInBits());
unsigned NumElts = VT.getVectorNumElements() * Factor;
@@ -33567,8 +33643,8 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
if (PassThru.isUndef())
PassThru = getZeroVector(VT, Subtarget, DAG, dl);
- SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
- N->getScale() };
+ SDValue Ops[] = {N->getChain(), PassThru, Mask,
+ N->getBasePtr(), Index, N->getScale()};
SDValue NewGather = DAG.getMemIntrinsicNode(
X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
N->getMemOperand());
@@ -33766,7 +33842,7 @@ SDValue X86TargetLowering::visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL,
/// Provide custom lowering hooks for some operations.
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
- // clang-format off
+ // clang-format off
default: llvm_unreachable("Should not custom lower this!");
case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
@@ -33923,7 +33999,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG);
- // clang-format on
+ // clang-format on
}
}
@@ -33936,7 +34012,7 @@ bool X86TargetLowering::isSelectSupported(SelectSupportKind Kind) const {
/// Replace a node with an illegal result type with a new node built out of
/// custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
- SmallVectorImpl<SDValue>&Results,
+ SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
SDLoc dl(N);
unsigned Opc = N->getOpcode();
@@ -34062,8 +34138,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
  // Widen the result by padding with undef.
- Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
- DAG.getUNDEF(VT));
+ Res =
+ DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res, DAG.getUNDEF(VT));
Results.push_back(Res);
Results.push_back(Ovf);
return;
@@ -34080,11 +34156,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
"Unexpected type action!");
unsigned NumConcat = 128 / InVT.getSizeInBits();
- EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
- InVT.getVectorElementType(),
- NumConcat * InVT.getVectorNumElements());
- EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
- VT.getVectorElementType(),
+ EVT InWideVT =
+ EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(),
+ NumConcat * InVT.getVectorNumElements());
+ EVT WideVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
NumConcat * VT.getVectorNumElements());
SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
@@ -34148,7 +34223,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
- SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
+ SDValue V = LowerWin64_i128OP(SDValue(N, 0), DAG);
Results.push_back(V);
return;
}
@@ -34226,9 +34301,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
- SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
- { 0, 1, 2, 3, 16, 17, 18, 19,
- -1, -1, -1, -1, -1, -1, -1, -1 });
+ SDValue Res = DAG.getVectorShuffle(
+ MVT::v16i8, dl, Lo, Hi,
+ {0, 1, 2, 3, 16, 17, 18, 19, -1, -1, -1, -1, -1, -1, -1, -1});
Results.push_back(Res);
return;
}
@@ -34260,7 +34335,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SDValue In = N->getOperand(0);
EVT InVT = In.getValueType();
if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
- (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
+ (InVT == MVT::v4i16 || InVT == MVT::v4i8)) {
assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
"Unexpected type action!");
assert(Opc == ISD::SIGN_EXTEND && "Unexpected opcode");
@@ -34276,11 +34351,11 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
// Create an unpackl and unpackh to interleave the sign bits then bitcast
// to v2i64.
- SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
- {0, 4, 1, 5});
+ SDValue Lo =
+ DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits, {0, 4, 1, 5});
Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
- SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
- {2, 6, 3, 7});
+ SDValue Hi =
+ DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits, {2, 6, 3, 7});
Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
@@ -34467,7 +34542,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
-
if (VT == MVT::v2i32) {
assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
"Strict unsigned conversion requires AVX512");
@@ -34552,9 +34626,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
}
SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
- SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
- DAG.getConstantFP(0.0, dl, VecInVT), Src,
- ZeroIdx);
+ SDValue Res =
+ DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
+ DAG.getConstantFP(0.0, dl, VecInVT), Src, ZeroIdx);
SDValue Chain;
if (IsStrict) {
SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
@@ -34641,8 +34715,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
EVT SrcVT = Src.getValueType();
if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
if (IsStrict) {
- unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
- : X86ISD::STRICT_CVTUI2P;
+ unsigned Opc =
+ IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
{N->getOperand(0), Src});
Results.push_back(Res);
@@ -34656,7 +34730,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
SDValue Zero = DAG.getConstant(0, dl, SrcVT);
- SDValue One = DAG.getConstant(1, dl, SrcVT);
+ SDValue One = DAG.getConstant(1, dl, SrcVT);
SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
@@ -34722,9 +34796,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
if (IsStrict) {
SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
{N->getOperand(0), Or, VBias});
- SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
- {MVT::v4f32, MVT::Other},
- {Sub.getValue(1), Sub});
+ SDValue Res =
+ DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {MVT::v4f32, MVT::Other},
+ {Sub.getValue(1), Sub});
Results.push_back(Res);
Results.push_back(Res.getValue(1));
} else {
@@ -34805,8 +34879,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::INTRINSIC_W_CHAIN: {
unsigned IntNo = N->getConstantOperandVal(1);
switch (IntNo) {
- default : llvm_unreachable("Do not know how to custom type "
- "legalize this intrinsic operation!");
+ default:
+ llvm_unreachable("Do not know how to custom type "
+ "legalize this intrinsic operation!");
case Intrinsic::x86_rdtsc:
return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
Results);
@@ -34819,7 +34894,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
case Intrinsic::x86_rdpru:
expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
- Results);
+ Results);
return;
case Intrinsic::x86_xgetbv:
expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
@@ -34876,12 +34951,12 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
}
SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
- Regs64bit ? X86::RAX : X86::EAX,
- HalfT, Result.getValue(1));
+ Regs64bit ? X86::RAX : X86::EAX, HalfT,
+ Result.getValue(1));
SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
- Regs64bit ? X86::RDX : X86::EDX,
- HalfT, cpOutL.getValue(2));
- SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
+ Regs64bit ? X86::RDX : X86::EDX, HalfT,
+ cpOutL.getValue(2));
+ SDValue OpsF[] = {cpOutL.getValue(0), cpOutH.getValue(0)};
SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
MVT::i32, cpOutH.getValue(2));
@@ -34923,7 +34998,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
// Then extract the lower 64-bits.
MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
- SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
+ SDValue Ops[] = {Node->getChain(), Node->getBasePtr()};
SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
MVT::i64, Node->getMemOperand());
if (Subtarget.hasSSE2()) {
@@ -34947,10 +35022,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
// First load this into an 80-bit X87 register. This will put the whole
// integer into the significand.
SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
- SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
- SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
- dl, Tys, Ops, MVT::i64,
- Node->getMemOperand());
+ SDValue Ops[] = {Node->getChain(), Node->getBasePtr()};
+ SDValue Result = DAG.getMemIntrinsicNode(
+ X86ISD::FILD, dl, Tys, Ops, MVT::i64, Node->getMemOperand());
SDValue Chain = Result.getValue(1);
// Now store the X87 register to a stack temporary and convert to i64.
@@ -34961,7 +35035,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
MachinePointerInfo MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
- SDValue StoreOps[] = { Chain, Result, StackPtr };
+ SDValue StoreOps[] = {Chain, Result, StackPtr};
Chain = DAG.getMemIntrinsicNode(
X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
@@ -35019,8 +35093,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
"Unexpected type action!");
EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
- SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
- N->getOperand(0));
+ SDValue Res =
+ DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, N->getOperand(0));
Res = DAG.getBitcast(WideVT, Res);
Results.push_back(Res);
return;
@@ -35042,8 +35116,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SDValue Mask = Gather->getMask();
assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
- Gather->getPassThru(),
- DAG.getUNDEF(VT));
+ Gather->getPassThru(), DAG.getUNDEF(VT));
if (!Subtarget.hasVLX()) {
// We need to widen the mask, but the instruction will only use 2
// of its elements. So we can use undef.
@@ -35051,8 +35124,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
DAG.getUNDEF(MVT::v2i1));
Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
}
- SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
- Gather->getBasePtr(), Index, Gather->getScale() };
+ SDValue Ops[] = {Gather->getChain(), PassThru, Mask,
+ Gather->getBasePtr(), Index, Gather->getScale()};
SDValue Res = DAG.getMemIntrinsicNode(
X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
Gather->getMemoryVT(), Gather->getMemOperand());
@@ -35097,7 +35170,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
case ISD::ADDRSPACECAST: {
- SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
+ SDValue V = LowerADDRSPACECAST(SDValue(N, 0), DAG);
Results.push_back(V);
return;
}
@@ -35128,470 +35201,473 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((X86ISD::NodeType)Opcode) {
- case X86ISD::FIRST_NUMBER: break;
-#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
- NODE_NAME_CASE(BSF)
- NODE_NAME_CASE(BSR)
- NODE_NAME_CASE(FSHL)
- NODE_NAME_CASE(FSHR)
- NODE_NAME_CASE(FAND)
- NODE_NAME_CASE(FANDN)
- NODE_NAME_CASE(FOR)
- NODE_NAME_CASE(FXOR)
- NODE_NAME_CASE(FILD)
- NODE_NAME_CASE(FIST)
- NODE_NAME_CASE(FP_TO_INT_IN_MEM)
- NODE_NAME_CASE(FLD)
- NODE_NAME_CASE(FST)
- NODE_NAME_CASE(CALL)
- NODE_NAME_CASE(CALL_RVMARKER)
- NODE_NAME_CASE(IMP_CALL)
- NODE_NAME_CASE(BT)
- NODE_NAME_CASE(CMP)
- NODE_NAME_CASE(FCMP)
- NODE_NAME_CASE(STRICT_FCMP)
- NODE_NAME_CASE(STRICT_FCMPS)
- NODE_NAME_CASE(COMI)
- NODE_NAME_CASE(UCOMI)
- NODE_NAME_CASE(COMX)
- NODE_NAME_CASE(UCOMX)
- NODE_NAME_CASE(CMPM)
- NODE_NAME_CASE(CMPMM)
- NODE_NAME_CASE(STRICT_CMPM)
- NODE_NAME_CASE(CMPMM_SAE)
- NODE_NAME_CASE(SETCC)
- NODE_NAME_CASE(CTSELECT)
- NODE_NAME_CASE(SETCC_CARRY)
- NODE_NAME_CASE(FSETCC)
- NODE_NAME_CASE(FSETCCM)
- NODE_NAME_CASE(FSETCCM_SAE)
- NODE_NAME_CASE(CMOV)
- NODE_NAME_CASE(BRCOND)
- NODE_NAME_CASE(RET_GLUE)
- NODE_NAME_CASE(IRET)
- NODE_NAME_CASE(REP_STOS)
- NODE_NAME_CASE(REP_MOVS)
- NODE_NAME_CASE(GlobalBaseReg)
- NODE_NAME_CASE(Wrapper)
- NODE_NAME_CASE(WrapperRIP)
- NODE_NAME_CASE(MOVQ2DQ)
- NODE_NAME_CASE(MOVDQ2Q)
- NODE_NAME_CASE(MMX_MOVD2W)
- NODE_NAME_CASE(MMX_MOVW2D)
- NODE_NAME_CASE(PEXTRB)
- NODE_NAME_CASE(PEXTRW)
- NODE_NAME_CASE(INSERTPS)
- NODE_NAME_CASE(PINSRB)
- NODE_NAME_CASE(PINSRW)
- NODE_NAME_CASE(PSHUFB)
- NODE_NAME_CASE(ANDNP)
- NODE_NAME_CASE(BLENDI)
- NODE_NAME_CASE(BLENDV)
- NODE_NAME_CASE(HADD)
- NODE_NAME_CASE(HSUB)
- NODE_NAME_CASE(FHADD)
- NODE_NAME_CASE(FHSUB)
- NODE_NAME_CASE(CONFLICT)
- NODE_NAME_CASE(FMAX)
- NODE_NAME_CASE(FMAXS)
- NODE_NAME_CASE(FMAX_SAE)
- NODE_NAME_CASE(FMAXS_SAE)
- NODE_NAME_CASE(STRICT_FMAX)
- NODE_NAME_CASE(FMIN)
- NODE_NAME_CASE(FMINS)
- NODE_NAME_CASE(FMIN_SAE)
- NODE_NAME_CASE(FMINS_SAE)
- NODE_NAME_CASE(STRICT_FMIN)
- NODE_NAME_CASE(FMAXC)
- NODE_NAME_CASE(FMINC)
- NODE_NAME_CASE(FRSQRT)
- NODE_NAME_CASE(FRCP)
- NODE_NAME_CASE(EXTRQI)
- NODE_NAME_CASE(INSERTQI)
- NODE_NAME_CASE(TLSADDR)
- NODE_NAME_CASE(TLSBASEADDR)
- NODE_NAME_CASE(TLSCALL)
- NODE_NAME_CASE(TLSDESC)
- NODE_NAME_CASE(EH_SJLJ_SETJMP)
- NODE_NAME_CASE(EH_SJLJ_LONGJMP)
- NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
- NODE_NAME_CASE(EH_RETURN)
- NODE_NAME_CASE(TC_RETURN)
- NODE_NAME_CASE(FNSTCW16m)
- NODE_NAME_CASE(FLDCW16m)
- NODE_NAME_CASE(FNSTENVm)
- NODE_NAME_CASE(FLDENVm)
- NODE_NAME_CASE(LCMPXCHG_DAG)
- NODE_NAME_CASE(LCMPXCHG8_DAG)
- NODE_NAME_CASE(LCMPXCHG16_DAG)
- NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
- NODE_NAME_CASE(LADD)
- NODE_NAME_CASE(LSUB)
- NODE_NAME_CASE(LOR)
- NODE_NAME_CASE(LXOR)
- NODE_NAME_CASE(LAND)
- NODE_NAME_CASE(LBTS)
- NODE_NAME_CASE(LBTC)
- NODE_NAME_CASE(LBTR)
- NODE_NAME_CASE(LBTS_RM)
- NODE_NAME_CASE(LBTC_RM)
- NODE_NAME_CASE(LBTR_RM)
- NODE_NAME_CASE(AADD)
- NODE_NAME_CASE(AOR)
- NODE_NAME_CASE(AXOR)
- NODE_NAME_CASE(AAND)
- NODE_NAME_CASE(VZEXT_MOVL)
- NODE_NAME_CASE(VZEXT_LOAD)
- NODE_NAME_CASE(VEXTRACT_STORE)
- NODE_NAME_CASE(VTRUNC)
- NODE_NAME_CASE(VTRUNCS)
- NODE_NAME_CASE(VTRUNCUS)
- NODE_NAME_CASE(VMTRUNC)
- NODE_NAME_CASE(VMTRUNCS)
- NODE_NAME_CASE(VMTRUNCUS)
- NODE_NAME_CASE(VTRUNCSTORES)
- NODE_NAME_CASE(VTRUNCSTOREUS)
- NODE_NAME_CASE(VMTRUNCSTORES)
- NODE_NAME_CASE(VMTRUNCSTOREUS)
- NODE_NAME_CASE(VFPEXT)
- NODE_NAME_CASE(STRICT_VFPEXT)
- NODE_NAME_CASE(VFPEXT_SAE)
- NODE_NAME_CASE(VFPEXTS)
- NODE_NAME_CASE(VFPEXTS_SAE)
- NODE_NAME_CASE(VFPROUND)
- NODE_NAME_CASE(VFPROUND2)
- NODE_NAME_CASE(VFPROUND2_RND)
- NODE_NAME_CASE(STRICT_VFPROUND)
- NODE_NAME_CASE(VMFPROUND)
- NODE_NAME_CASE(VFPROUND_RND)
- NODE_NAME_CASE(VFPROUNDS)
- NODE_NAME_CASE(VFPROUNDS_RND)
- NODE_NAME_CASE(VSHLDQ)
- NODE_NAME_CASE(VSRLDQ)
- NODE_NAME_CASE(VSHL)
- NODE_NAME_CASE(VSRL)
- NODE_NAME_CASE(VSRA)
- NODE_NAME_CASE(VSHLI)
- NODE_NAME_CASE(VSRLI)
- NODE_NAME_CASE(VSRAI)
- NODE_NAME_CASE(VSHLV)
- NODE_NAME_CASE(VSRLV)
- NODE_NAME_CASE(VSRAV)
- NODE_NAME_CASE(VROTLI)
- NODE_NAME_CASE(VROTRI)
- NODE_NAME_CASE(VPPERM)
- NODE_NAME_CASE(CMPP)
- NODE_NAME_CASE(STRICT_CMPP)
- NODE_NAME_CASE(PCMPEQ)
- NODE_NAME_CASE(PCMPGT)
- NODE_NAME_CASE(PHMINPOS)
- NODE_NAME_CASE(ADD)
- NODE_NAME_CASE(SUB)
- NODE_NAME_CASE(ADC)
- NODE_NAME_CASE(SBB)
- NODE_NAME_CASE(SMUL)
- NODE_NAME_CASE(UMUL)
- NODE_NAME_CASE(OR)
- NODE_NAME_CASE(XOR)
- NODE_NAME_CASE(AND)
- NODE_NAME_CASE(BEXTR)
- NODE_NAME_CASE(BEXTRI)
- NODE_NAME_CASE(BZHI)
- NODE_NAME_CASE(PDEP)
- NODE_NAME_CASE(PEXT)
- NODE_NAME_CASE(MUL_IMM)
- NODE_NAME_CASE(MOVMSK)
- NODE_NAME_CASE(PTEST)
- NODE_NAME_CASE(TESTP)
- NODE_NAME_CASE(KORTEST)
- NODE_NAME_CASE(KTEST)
- NODE_NAME_CASE(KADD)
- NODE_NAME_CASE(KSHIFTL)
- NODE_NAME_CASE(KSHIFTR)
- NODE_NAME_CASE(PACKSS)
- NODE_NAME_CASE(PACKUS)
- NODE_NAME_CASE(PALIGNR)
- NODE_NAME_CASE(VALIGN)
- NODE_NAME_CASE(VSHLD)
- NODE_NAME_CASE(VSHRD)
- NODE_NAME_CASE(PSHUFD)
- NODE_NAME_CASE(PSHUFHW)
- NODE_NAME_CASE(PSHUFLW)
- NODE_NAME_CASE(SHUFP)
- NODE_NAME_CASE(SHUF128)
- NODE_NAME_CASE(MOVLHPS)
- NODE_NAME_CASE(MOVHLPS)
- NODE_NAME_CASE(MOVDDUP)
- NODE_NAME_CASE(MOVSHDUP)
- NODE_NAME_CASE(MOVSLDUP)
- NODE_NAME_CASE(MOVSD)
- NODE_NAME_CASE(MOVSS)
- NODE_NAME_CASE(MOVSH)
- NODE_NAME_CASE(UNPCKL)
- NODE_NAME_CASE(UNPCKH)
- NODE_NAME_CASE(VBROADCAST)
- NODE_NAME_CASE(VBROADCAST_LOAD)
- NODE_NAME_CASE(VBROADCASTM)
- NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
- NODE_NAME_CASE(VPERMILPV)
- NODE_NAME_CASE(VPERMILPI)
- NODE_NAME_CASE(VPERM2X128)
- NODE_NAME_CASE(VPERMV)
- NODE_NAME_CASE(VPERMV3)
- NODE_NAME_CASE(VPERMI)
- NODE_NAME_CASE(VPTERNLOG)
- NODE_NAME_CASE(FP_TO_SINT_SAT)
- NODE_NAME_CASE(FP_TO_UINT_SAT)
- NODE_NAME_CASE(VFIXUPIMM)
- NODE_NAME_CASE(VFIXUPIMM_SAE)
- NODE_NAME_CASE(VFIXUPIMMS)
- NODE_NAME_CASE(VFIXUPIMMS_SAE)
- NODE_NAME_CASE(VRANGE)
- NODE_NAME_CASE(VRANGE_SAE)
- NODE_NAME_CASE(VRANGES)
- NODE_NAME_CASE(VRANGES_SAE)
- NODE_NAME_CASE(PMULUDQ)
- NODE_NAME_CASE(PMULDQ)
- NODE_NAME_CASE(PSADBW)
- NODE_NAME_CASE(DBPSADBW)
- NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
- NODE_NAME_CASE(VAARG_64)
- NODE_NAME_CASE(VAARG_X32)
- NODE_NAME_CASE(DYN_ALLOCA)
- NODE_NAME_CASE(MFENCE)
- NODE_NAME_CASE(SEG_ALLOCA)
- NODE_NAME_CASE(PROBED_ALLOCA)
- NODE_NAME_CASE(RDRAND)
- NODE_NAME_CASE(RDSEED)
- NODE_NAME_CASE(RDPKRU)
- NODE_NAME_CASE(WRPKRU)
- NODE_NAME_CASE(VPMADDUBSW)
- NODE_NAME_CASE(VPMADDWD)
- NODE_NAME_CASE(VPSHA)
- NODE_NAME_CASE(VPSHL)
- NODE_NAME_CASE(VPCOM)
- NODE_NAME_CASE(VPCOMU)
- NODE_NAME_CASE(VPERMIL2)
- NODE_NAME_CASE(FMSUB)
- NODE_NAME_CASE(STRICT_FMSUB)
- NODE_NAME_CASE(FNMADD)
- NODE_NAME_CASE(STRICT_FNMADD)
- NODE_NAME_CASE(FNMSUB)
- NODE_NAME_CASE(STRICT_FNMSUB)
- NODE_NAME_CASE(FMADDSUB)
- NODE_NAME_CASE(FMSUBADD)
- NODE_NAME_CASE(FMADD_RND)
- NODE_NAME_CASE(FNMADD_RND)
- NODE_NAME_CASE(FMSUB_RND)
- NODE_NAME_CASE(FNMSUB_RND)
- NODE_NAME_CASE(FMADDSUB_RND)
- NODE_NAME_CASE(FMSUBADD_RND)
- NODE_NAME_CASE(VFMADDC)
- NODE_NAME_CASE(VFMADDC_RND)
- NODE_NAME_CASE(VFCMADDC)
- NODE_NAME_CASE(VFCMADDC_RND)
- NODE_NAME_CASE(VFMULC)
- NODE_NAME_CASE(VFMULC_RND)
- NODE_NAME_CASE(VFCMULC)
- NODE_NAME_CASE(VFCMULC_RND)
- NODE_NAME_CASE(VFMULCSH)
- NODE_NAME_CASE(VFMULCSH_RND)
- NODE_NAME_CASE(VFCMULCSH)
- NODE_NAME_CASE(VFCMULCSH_RND)
- NODE_NAME_CASE(VFMADDCSH)
- NODE_NAME_CASE(VFMADDCSH_RND)
- NODE_NAME_CASE(VFCMADDCSH)
- NODE_NAME_CASE(VFCMADDCSH_RND)
- NODE_NAME_CASE(VPMADD52H)
- NODE_NAME_CASE(VPMADD52L)
- NODE_NAME_CASE(VRNDSCALE)
- NODE_NAME_CASE(STRICT_VRNDSCALE)
- NODE_NAME_CASE(VRNDSCALE_SAE)
- NODE_NAME_CASE(VRNDSCALES)
- NODE_NAME_CASE(VRNDSCALES_SAE)
- NODE_NAME_CASE(VREDUCE)
- NODE_NAME_CASE(VREDUCE_SAE)
- NODE_NAME_CASE(VREDUCES)
- NODE_NAME_CASE(VREDUCES_SAE)
- NODE_NAME_CASE(VGETMANT)
- NODE_NAME_CASE(VGETMANT_SAE)
- NODE_NAME_CASE(VGETMANTS)
- NODE_NAME_CASE(VGETMANTS_SAE)
- NODE_NAME_CASE(PCMPESTR)
- NODE_NAME_CASE(PCMPISTR)
- NODE_NAME_CASE(XTEST)
- NODE_NAME_CASE(COMPRESS)
- NODE_NAME_CASE(EXPAND)
- NODE_NAME_CASE(SELECTS)
- NODE_NAME_CASE(ADDSUB)
- NODE_NAME_CASE(RCP14)
- NODE_NAME_CASE(RCP14S)
- NODE_NAME_CASE(RSQRT14)
- NODE_NAME_CASE(RSQRT14S)
- NODE_NAME_CASE(FADD_RND)
- NODE_NAME_CASE(FADDS)
- NODE_NAME_CASE(FADDS_RND)
- NODE_NAME_CASE(FSUB_RND)
- NODE_NAME_CASE(FSUBS)
- NODE_NAME_CASE(FSUBS_RND)
- NODE_NAME_CASE(FMUL_RND)
- NODE_NAME_CASE(FMULS)
- NODE_NAME_CASE(FMULS_RND)
- NODE_NAME_CASE(FDIV_RND)
- NODE_NAME_CASE(FDIVS)
- NODE_NAME_CASE(FDIVS_RND)
- NODE_NAME_CASE(FSQRT_RND)
- NODE_NAME_CASE(FSQRTS)
- NODE_NAME_CASE(FSQRTS_RND)
- NODE_NAME_CASE(FGETEXP)
- NODE_NAME_CASE(FGETEXP_SAE)
- NODE_NAME_CASE(FGETEXPS)
- NODE_NAME_CASE(FGETEXPS_SAE)
- NODE_NAME_CASE(SCALEF)
- NODE_NAME_CASE(SCALEF_RND)
- NODE_NAME_CASE(SCALEFS)
- NODE_NAME_CASE(SCALEFS_RND)
- NODE_NAME_CASE(MULHRS)
- NODE_NAME_CASE(SINT_TO_FP_RND)
- NODE_NAME_CASE(UINT_TO_FP_RND)
- NODE_NAME_CASE(CVTTP2SI)
- NODE_NAME_CASE(CVTTP2UI)
- NODE_NAME_CASE(STRICT_CVTTP2SI)
- NODE_NAME_CASE(STRICT_CVTTP2UI)
- NODE_NAME_CASE(MCVTTP2SI)
- NODE_NAME_CASE(MCVTTP2UI)
- NODE_NAME_CASE(CVTTP2SI_SAE)
- NODE_NAME_CASE(CVTTP2UI_SAE)
- NODE_NAME_CASE(CVTTS2SI)
- NODE_NAME_CASE(CVTTS2UI)
- NODE_NAME_CASE(CVTTS2SI_SAE)
- NODE_NAME_CASE(CVTTS2UI_SAE)
- NODE_NAME_CASE(CVTSI2P)
- NODE_NAME_CASE(CVTUI2P)
- NODE_NAME_CASE(STRICT_CVTSI2P)
- NODE_NAME_CASE(STRICT_CVTUI2P)
- NODE_NAME_CASE(MCVTSI2P)
- NODE_NAME_CASE(MCVTUI2P)
- NODE_NAME_CASE(VFPCLASS)
- NODE_NAME_CASE(VFPCLASSS)
- NODE_NAME_CASE(MULTISHIFT)
- NODE_NAME_CASE(SCALAR_SINT_TO_FP)
- NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
- NODE_NAME_CASE(SCALAR_UINT_TO_FP)
- NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
- NODE_NAME_CASE(CVTPS2PH)
- NODE_NAME_CASE(STRICT_CVTPS2PH)
- NODE_NAME_CASE(CVTPS2PH_SAE)
- NODE_NAME_CASE(MCVTPS2PH)
- NODE_NAME_CASE(MCVTPS2PH_SAE)
- NODE_NAME_CASE(CVTPH2PS)
- NODE_NAME_CASE(STRICT_CVTPH2PS)
- NODE_NAME_CASE(CVTPH2PS_SAE)
- NODE_NAME_CASE(CVTP2SI)
- NODE_NAME_CASE(CVTP2UI)
- NODE_NAME_CASE(MCVTP2SI)
- NODE_NAME_CASE(MCVTP2UI)
- NODE_NAME_CASE(CVTP2SI_RND)
- NODE_NAME_CASE(CVTP2UI_RND)
- NODE_NAME_CASE(CVTS2SI)
- NODE_NAME_CASE(CVTS2UI)
- NODE_NAME_CASE(CVTS2SI_RND)
- NODE_NAME_CASE(CVTS2UI_RND)
- NODE_NAME_CASE(CVTNEPS2BF16)
- NODE_NAME_CASE(MCVTNEPS2BF16)
- NODE_NAME_CASE(DPBF16PS)
- NODE_NAME_CASE(DPFP16PS)
- NODE_NAME_CASE(MPSADBW)
- NODE_NAME_CASE(LWPINS)
- NODE_NAME_CASE(MGATHER)
- NODE_NAME_CASE(MSCATTER)
- NODE_NAME_CASE(VPDPBUSD)
- NODE_NAME_CASE(VPDPBUSDS)
- NODE_NAME_CASE(VPDPWSSD)
- NODE_NAME_CASE(VPDPWSSDS)
- NODE_NAME_CASE(VPSHUFBITQMB)
- NODE_NAME_CASE(GF2P8MULB)
- NODE_NAME_CASE(GF2P8AFFINEQB)
- NODE_NAME_CASE(GF2P8AFFINEINVQB)
- NODE_NAME_CASE(NT_CALL)
- NODE_NAME_CASE(NT_BRIND)
- NODE_NAME_CASE(UMWAIT)
- NODE_NAME_CASE(TPAUSE)
- NODE_NAME_CASE(ENQCMD)
- NODE_NAME_CASE(ENQCMDS)
- NODE_NAME_CASE(VP2INTERSECT)
- NODE_NAME_CASE(VPDPBSUD)
- NODE_NAME_CASE(VPDPBSUDS)
- NODE_NAME_CASE(VPDPBUUD)
- NODE_NAME_CASE(VPDPBUUDS)
- NODE_NAME_CASE(VPDPBSSD)
- NODE_NAME_CASE(VPDPBSSDS)
- NODE_NAME_CASE(VPDPWSUD)
- NODE_NAME_CASE(VPDPWSUDS)
- NODE_NAME_CASE(VPDPWUSD)
- NODE_NAME_CASE(VPDPWUSDS)
- NODE_NAME_CASE(VPDPWUUD)
- NODE_NAME_CASE(VPDPWUUDS)
- NODE_NAME_CASE(VMINMAX)
- NODE_NAME_CASE(VMINMAX_SAE)
- NODE_NAME_CASE(VMINMAXS)
- NODE_NAME_CASE(VMINMAXS_SAE)
- NODE_NAME_CASE(CVTP2IBS)
- NODE_NAME_CASE(CVTP2IUBS)
- NODE_NAME_CASE(CVTP2IBS_RND)
- NODE_NAME_CASE(CVTP2IUBS_RND)
- NODE_NAME_CASE(CVTTP2IBS)
- NODE_NAME_CASE(CVTTP2IUBS)
- NODE_NAME_CASE(CVTTP2IBS_SAE)
- NODE_NAME_CASE(CVTTP2IUBS_SAE)
- NODE_NAME_CASE(VCVT2PH2BF8)
- NODE_NAME_CASE(VCVT2PH2BF8S)
- NODE_NAME_CASE(VCVT2PH2HF8)
- NODE_NAME_CASE(VCVT2PH2HF8S)
- NODE_NAME_CASE(VCVTBIASPH2BF8)
- NODE_NAME_CASE(VCVTBIASPH2BF8S)
- NODE_NAME_CASE(VCVTBIASPH2HF8)
- NODE_NAME_CASE(VCVTBIASPH2HF8S)
- NODE_NAME_CASE(VCVTPH2BF8)
- NODE_NAME_CASE(VCVTPH2BF8S)
- NODE_NAME_CASE(VCVTPH2HF8)
- NODE_NAME_CASE(VCVTPH2HF8S)
- NODE_NAME_CASE(VMCVTBIASPH2BF8)
- NODE_NAME_CASE(VMCVTBIASPH2BF8S)
- NODE_NAME_CASE(VMCVTBIASPH2HF8)
- NODE_NAME_CASE(VMCVTBIASPH2HF8S)
- NODE_NAME_CASE(VMCVTPH2BF8)
- NODE_NAME_CASE(VMCVTPH2BF8S)
- NODE_NAME_CASE(VMCVTPH2HF8)
- NODE_NAME_CASE(VMCVTPH2HF8S)
- NODE_NAME_CASE(VCVTHF82PH)
- NODE_NAME_CASE(AESENC128KL)
- NODE_NAME_CASE(AESDEC128KL)
- NODE_NAME_CASE(AESENC256KL)
- NODE_NAME_CASE(AESDEC256KL)
- NODE_NAME_CASE(AESENCWIDE128KL)
- NODE_NAME_CASE(AESDECWIDE128KL)
- NODE_NAME_CASE(AESENCWIDE256KL)
- NODE_NAME_CASE(AESDECWIDE256KL)
- NODE_NAME_CASE(CMPCCXADD)
- NODE_NAME_CASE(TESTUI)
- NODE_NAME_CASE(FP80_ADD)
- NODE_NAME_CASE(STRICT_FP80_ADD)
- NODE_NAME_CASE(CCMP)
- NODE_NAME_CASE(CTEST)
- NODE_NAME_CASE(CLOAD)
- NODE_NAME_CASE(CSTORE)
- NODE_NAME_CASE(CVTTS2SIS)
- NODE_NAME_CASE(CVTTS2UIS)
- NODE_NAME_CASE(CVTTS2SIS_SAE)
- NODE_NAME_CASE(CVTTS2UIS_SAE)
- NODE_NAME_CASE(CVTTP2SIS)
- NODE_NAME_CASE(MCVTTP2SIS)
- NODE_NAME_CASE(CVTTP2UIS_SAE)
- NODE_NAME_CASE(CVTTP2SIS_SAE)
- NODE_NAME_CASE(CVTTP2UIS)
- NODE_NAME_CASE(MCVTTP2UIS)
- NODE_NAME_CASE(POP_FROM_X87_REG)
+ case X86ISD::FIRST_NUMBER:
+ break;
+#define NODE_NAME_CASE(NODE) \
+ case X86ISD::NODE: \
+ return "X86ISD::" #NODE;
+ NODE_NAME_CASE(BSF)
+ NODE_NAME_CASE(BSR)
+ NODE_NAME_CASE(FSHL)
+ NODE_NAME_CASE(FSHR)
+ NODE_NAME_CASE(FAND)
+ NODE_NAME_CASE(FANDN)
+ NODE_NAME_CASE(FOR)
+ NODE_NAME_CASE(FXOR)
+ NODE_NAME_CASE(FILD)
+ NODE_NAME_CASE(FIST)
+ NODE_NAME_CASE(FP_TO_INT_IN_MEM)
+ NODE_NAME_CASE(FLD)
+ NODE_NAME_CASE(FST)
+ NODE_NAME_CASE(CALL)
+ NODE_NAME_CASE(CALL_RVMARKER)
+ NODE_NAME_CASE(IMP_CALL)
+ NODE_NAME_CASE(BT)
+ NODE_NAME_CASE(CMP)
+ NODE_NAME_CASE(FCMP)
+ NODE_NAME_CASE(STRICT_FCMP)
+ NODE_NAME_CASE(STRICT_FCMPS)
+ NODE_NAME_CASE(COMI)
+ NODE_NAME_CASE(UCOMI)
+ NODE_NAME_CASE(COMX)
+ NODE_NAME_CASE(UCOMX)
+ NODE_NAME_CASE(CMPM)
+ NODE_NAME_CASE(CMPMM)
+ NODE_NAME_CASE(STRICT_CMPM)
+ NODE_NAME_CASE(CMPMM_SAE)
+ NODE_NAME_CASE(SETCC)
+ NODE_NAME_CASE(CTSELECT)
+ NODE_NAME_CASE(SETCC_CARRY)
+ NODE_NAME_CASE(FSETCC)
+ NODE_NAME_CASE(FSETCCM)
+ NODE_NAME_CASE(FSETCCM_SAE)
+ NODE_NAME_CASE(CMOV)
+ NODE_NAME_CASE(BRCOND)
+ NODE_NAME_CASE(RET_GLUE)
+ NODE_NAME_CASE(IRET)
+ NODE_NAME_CASE(REP_STOS)
+ NODE_NAME_CASE(REP_MOVS)
+ NODE_NAME_CASE(GlobalBaseReg)
+ NODE_NAME_CASE(Wrapper)
+ NODE_NAME_CASE(WrapperRIP)
+ NODE_NAME_CASE(MOVQ2DQ)
+ NODE_NAME_CASE(MOVDQ2Q)
+ NODE_NAME_CASE(MMX_MOVD2W)
+ NODE_NAME_CASE(MMX_MOVW2D)
+ NODE_NAME_CASE(PEXTRB)
+ NODE_NAME_CASE(PEXTRW)
+ NODE_NAME_CASE(INSERTPS)
+ NODE_NAME_CASE(PINSRB)
+ NODE_NAME_CASE(PINSRW)
+ NODE_NAME_CASE(PSHUFB)
+ NODE_NAME_CASE(ANDNP)
+ NODE_NAME_CASE(BLENDI)
+ NODE_NAME_CASE(BLENDV)
+ NODE_NAME_CASE(HADD)
+ NODE_NAME_CASE(HSUB)
+ NODE_NAME_CASE(FHADD)
+ NODE_NAME_CASE(FHSUB)
+ NODE_NAME_CASE(CONFLICT)
+ NODE_NAME_CASE(FMAX)
+ NODE_NAME_CASE(FMAXS)
+ NODE_NAME_CASE(FMAX_SAE)
+ NODE_NAME_CASE(FMAXS_SAE)
+ NODE_NAME_CASE(STRICT_FMAX)
+ NODE_NAME_CASE(FMIN)
+ NODE_NAME_CASE(FMINS)
+ NODE_NAME_CASE(FMIN_SAE)
+ NODE_NAME_CASE(FMINS_SAE)
+ NODE_NAME_CASE(STRICT_FMIN)
+ NODE_NAME_CASE(FMAXC)
+ NODE_NAME_CASE(FMINC)
+ NODE_NAME_CASE(FRSQRT)
+ NODE_NAME_CASE(FRCP)
+ NODE_NAME_CASE(EXTRQI)
+ NODE_NAME_CASE(INSERTQI)
+ NODE_NAME_CASE(TLSADDR)
+ NODE_NAME_CASE(TLSBASEADDR)
+ NODE_NAME_CASE(TLSCALL)
+ NODE_NAME_CASE(TLSDESC)
+ NODE_NAME_CASE(EH_SJLJ_SETJMP)
+ NODE_NAME_CASE(EH_SJLJ_LONGJMP)
+ NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
+ NODE_NAME_CASE(EH_RETURN)
+ NODE_NAME_CASE(TC_RETURN)
+ NODE_NAME_CASE(FNSTCW16m)
+ NODE_NAME_CASE(FLDCW16m)
+ NODE_NAME_CASE(FNSTENVm)
+ NODE_NAME_CASE(FLDENVm)
+ NODE_NAME_CASE(LCMPXCHG_DAG)
+ NODE_NAME_CASE(LCMPXCHG8_DAG)
+ NODE_NAME_CASE(LCMPXCHG16_DAG)
+ NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
+ NODE_NAME_CASE(LADD)
+ NODE_NAME_CASE(LSUB)
+ NODE_NAME_CASE(LOR)
+ NODE_NAME_CASE(LXOR)
+ NODE_NAME_CASE(LAND)
+ NODE_NAME_CASE(LBTS)
+ NODE_NAME_CASE(LBTC)
+ NODE_NAME_CASE(LBTR)
+ NODE_NAME_CASE(LBTS_RM)
+ NODE_NAME_CASE(LBTC_RM)
+ NODE_NAME_CASE(LBTR_RM)
+ NODE_NAME_CASE(AADD)
+ NODE_NAME_CASE(AOR)
+ NODE_NAME_CASE(AXOR)
+ NODE_NAME_CASE(AAND)
+ NODE_NAME_CASE(VZEXT_MOVL)
+ NODE_NAME_CASE(VZEXT_LOAD)
+ NODE_NAME_CASE(VEXTRACT_STORE)
+ NODE_NAME_CASE(VTRUNC)
+ NODE_NAME_CASE(VTRUNCS)
+ NODE_NAME_CASE(VTRUNCUS)
+ NODE_NAME_CASE(VMTRUNC)
+ NODE_NAME_CASE(VMTRUNCS)
+ NODE_NAME_CASE(VMTRUNCUS)
+ NODE_NAME_CASE(VTRUNCSTORES)
+ NODE_NAME_CASE(VTRUNCSTOREUS)
+ NODE_NAME_CASE(VMTRUNCSTORES)
+ NODE_NAME_CASE(VMTRUNCSTOREUS)
+ NODE_NAME_CASE(VFPEXT)
+ NODE_NAME_CASE(STRICT_VFPEXT)
+ NODE_NAME_CASE(VFPEXT_SAE)
+ NODE_NAME_CASE(VFPEXTS)
+ NODE_NAME_CASE(VFPEXTS_SAE)
+ NODE_NAME_CASE(VFPROUND)
+ NODE_NAME_CASE(VFPROUND2)
+ NODE_NAME_CASE(VFPROUND2_RND)
+ NODE_NAME_CASE(STRICT_VFPROUND)
+ NODE_NAME_CASE(VMFPROUND)
+ NODE_NAME_CASE(VFPROUND_RND)
+ NODE_NAME_CASE(VFPROUNDS)
+ NODE_NAME_CASE(VFPROUNDS_RND)
+ NODE_NAME_CASE(VSHLDQ)
+ NODE_NAME_CASE(VSRLDQ)
+ NODE_NAME_CASE(VSHL)
+ NODE_NAME_CASE(VSRL)
+ NODE_NAME_CASE(VSRA)
+ NODE_NAME_CASE(VSHLI)
+ NODE_NAME_CASE(VSRLI)
+ NODE_NAME_CASE(VSRAI)
+ NODE_NAME_CASE(VSHLV)
+ NODE_NAME_CASE(VSRLV)
+ NODE_NAME_CASE(VSRAV)
+ NODE_NAME_CASE(VROTLI)
+ NODE_NAME_CASE(VROTRI)
+ NODE_NAME_CASE(VPPERM)
+ NODE_NAME_CASE(CMPP)
+ NODE_NAME_CASE(STRICT_CMPP)
+ NODE_NAME_CASE(PCMPEQ)
+ NODE_NAME_CASE(PCMPGT)
+ NODE_NAME_CASE(PHMINPOS)
+ NODE_NAME_CASE(ADD)
+ NODE_NAME_CASE(SUB)
+ NODE_NAME_CASE(ADC)
+ NODE_NAME_CASE(SBB)
+ NODE_NAME_CASE(SMUL)
+ NODE_NAME_CASE(UMUL)
+ NODE_NAME_CASE(OR)
+ NODE_NAME_CASE(XOR)
+ NODE_NAME_CASE(AND)
+ NODE_NAME_CASE(BEXTR)
+ NODE_NAME_CASE(BEXTRI)
+ NODE_NAME_CASE(BZHI)
+ NODE_NAME_CASE(PDEP)
+ NODE_NAME_CASE(PEXT)
+ NODE_NAME_CASE(MUL_IMM)
+ NODE_NAME_CASE(MOVMSK)
+ NODE_NAME_CASE(PTEST)
+ NODE_NAME_CASE(TESTP)
+ NODE_NAME_CASE(KORTEST)
+ NODE_NAME_CASE(KTEST)
+ NODE_NAME_CASE(KADD)
+ NODE_NAME_CASE(KSHIFTL)
+ NODE_NAME_CASE(KSHIFTR)
+ NODE_NAME_CASE(PACKSS)
+ NODE_NAME_CASE(PACKUS)
+ NODE_NAME_CASE(PALIGNR)
+ NODE_NAME_CASE(VALIGN)
+ NODE_NAME_CASE(VSHLD)
+ NODE_NAME_CASE(VSHRD)
+ NODE_NAME_CASE(PSHUFD)
+ NODE_NAME_CASE(PSHUFHW)
+ NODE_NAME_CASE(PSHUFLW)
+ NODE_NAME_CASE(SHUFP)
+ NODE_NAME_CASE(SHUF128)
+ NODE_NAME_CASE(MOVLHPS)
+ NODE_NAME_CASE(MOVHLPS)
+ NODE_NAME_CASE(MOVDDUP)
+ NODE_NAME_CASE(MOVSHDUP)
+ NODE_NAME_CASE(MOVSLDUP)
+ NODE_NAME_CASE(MOVSD)
+ NODE_NAME_CASE(MOVSS)
+ NODE_NAME_CASE(MOVSH)
+ NODE_NAME_CASE(UNPCKL)
+ NODE_NAME_CASE(UNPCKH)
+ NODE_NAME_CASE(VBROADCAST)
+ NODE_NAME_CASE(VBROADCAST_LOAD)
+ NODE_NAME_CASE(VBROADCASTM)
+ NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
+ NODE_NAME_CASE(VPERMILPV)
+ NODE_NAME_CASE(VPERMILPI)
+ NODE_NAME_CASE(VPERM2X128)
+ NODE_NAME_CASE(VPERMV)
+ NODE_NAME_CASE(VPERMV3)
+ NODE_NAME_CASE(VPERMI)
+ NODE_NAME_CASE(VPTERNLOG)
+ NODE_NAME_CASE(FP_TO_SINT_SAT)
+ NODE_NAME_CASE(FP_TO_UINT_SAT)
+ NODE_NAME_CASE(VFIXUPIMM)
+ NODE_NAME_CASE(VFIXUPIMM_SAE)
+ NODE_NAME_CASE(VFIXUPIMMS)
+ NODE_NAME_CASE(VFIXUPIMMS_SAE)
+ NODE_NAME_CASE(VRANGE)
+ NODE_NAME_CASE(VRANGE_SAE)
+ NODE_NAME_CASE(VRANGES)
+ NODE_NAME_CASE(VRANGES_SAE)
+ NODE_NAME_CASE(PMULUDQ)
+ NODE_NAME_CASE(PMULDQ)
+ NODE_NAME_CASE(PSADBW)
+ NODE_NAME_CASE(DBPSADBW)
+ NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
+ NODE_NAME_CASE(VAARG_64)
+ NODE_NAME_CASE(VAARG_X32)
+ NODE_NAME_CASE(DYN_ALLOCA)
+ NODE_NAME_CASE(MFENCE)
+ NODE_NAME_CASE(SEG_ALLOCA)
+ NODE_NAME_CASE(PROBED_ALLOCA)
+ NODE_NAME_CASE(RDRAND)
+ NODE_NAME_CASE(RDSEED)
+ NODE_NAME_CASE(RDPKRU)
+ NODE_NAME_CASE(WRPKRU)
+ NODE_NAME_CASE(VPMADDUBSW)
+ NODE_NAME_CASE(VPMADDWD)
+ NODE_NAME_CASE(VPSHA)
+ NODE_NAME_CASE(VPSHL)
+ NODE_NAME_CASE(VPCOM)
+ NODE_NAME_CASE(VPCOMU)
+ NODE_NAME_CASE(VPERMIL2)
+ NODE_NAME_CASE(FMSUB)
+ NODE_NAME_CASE(STRICT_FMSUB)
+ NODE_NAME_CASE(FNMADD)
+ NODE_NAME_CASE(STRICT_FNMADD)
+ NODE_NAME_CASE(FNMSUB)
+ NODE_NAME_CASE(STRICT_FNMSUB)
+ NODE_NAME_CASE(FMADDSUB)
+ NODE_NAME_CASE(FMSUBADD)
+ NODE_NAME_CASE(FMADD_RND)
+ NODE_NAME_CASE(FNMADD_RND)
+ NODE_NAME_CASE(FMSUB_RND)
+ NODE_NAME_CASE(FNMSUB_RND)
+ NODE_NAME_CASE(FMADDSUB_RND)
+ NODE_NAME_CASE(FMSUBADD_RND)
+ NODE_NAME_CASE(VFMADDC)
+ NODE_NAME_CASE(VFMADDC_RND)
+ NODE_NAME_CASE(VFCMADDC)
+ NODE_NAME_CASE(VFCMADDC_RND)
+ NODE_NAME_CASE(VFMULC)
+ NODE_NAME_CASE(VFMULC_RND)
+ NODE_NAME_CASE(VFCMULC)
+ NODE_NAME_CASE(VFCMULC_RND)
+ NODE_NAME_CASE(VFMULCSH)
+ NODE_NAME_CASE(VFMULCSH_RND)
+ NODE_NAME_CASE(VFCMULCSH)
+ NODE_NAME_CASE(VFCMULCSH_RND)
+ NODE_NAME_CASE(VFMADDCSH)
+ NODE_NAME_CASE(VFMADDCSH_RND)
+ NODE_NAME_CASE(VFCMADDCSH)
+ NODE_NAME_CASE(VFCMADDCSH_RND)
+ NODE_NAME_CASE(VPMADD52H)
+ NODE_NAME_CASE(VPMADD52L)
+ NODE_NAME_CASE(VRNDSCALE)
+ NODE_NAME_CASE(STRICT_VRNDSCALE)
+ NODE_NAME_CASE(VRNDSCALE_SAE)
+ NODE_NAME_CASE(VRNDSCALES)
+ NODE_NAME_CASE(VRNDSCALES_SAE)
+ NODE_NAME_CASE(VREDUCE)
+ NODE_NAME_CASE(VREDUCE_SAE)
+ NODE_NAME_CASE(VREDUCES)
+ NODE_NAME_CASE(VREDUCES_SAE)
+ NODE_NAME_CASE(VGETMANT)
+ NODE_NAME_CASE(VGETMANT_SAE)
+ NODE_NAME_CASE(VGETMANTS)
+ NODE_NAME_CASE(VGETMANTS_SAE)
+ NODE_NAME_CASE(PCMPESTR)
+ NODE_NAME_CASE(PCMPISTR)
+ NODE_NAME_CASE(XTEST)
+ NODE_NAME_CASE(COMPRESS)
+ NODE_NAME_CASE(EXPAND)
+ NODE_NAME_CASE(SELECTS)
+ NODE_NAME_CASE(ADDSUB)
+ NODE_NAME_CASE(RCP14)
+ NODE_NAME_CASE(RCP14S)
+ NODE_NAME_CASE(RSQRT14)
+ NODE_NAME_CASE(RSQRT14S)
+ NODE_NAME_CASE(FADD_RND)
+ NODE_NAME_CASE(FADDS)
+ NODE_NAME_CASE(FADDS_RND)
+ NODE_NAME_CASE(FSUB_RND)
+ NODE_NAME_CASE(FSUBS)
+ NODE_NAME_CASE(FSUBS_RND)
+ NODE_NAME_CASE(FMUL_RND)
+ NODE_NAME_CASE(FMULS)
+ NODE_NAME_CASE(FMULS_RND)
+ NODE_NAME_CASE(FDIV_RND)
+ NODE_NAME_CASE(FDIVS)
+ NODE_NAME_CASE(FDIVS_RND)
+ NODE_NAME_CASE(FSQRT_RND)
+ NODE_NAME_CASE(FSQRTS)
+ NODE_NAME_CASE(FSQRTS_RND)
+ NODE_NAME_CASE(FGETEXP)
+ NODE_NAME_CASE(FGETEXP_SAE)
+ NODE_NAME_CASE(FGETEXPS)
+ NODE_NAME_CASE(FGETEXPS_SAE)
+ NODE_NAME_CASE(SCALEF)
+ NODE_NAME_CASE(SCALEF_RND)
+ NODE_NAME_CASE(SCALEFS)
+ NODE_NAME_CASE(SCALEFS_RND)
+ NODE_NAME_CASE(MULHRS)
+ NODE_NAME_CASE(SINT_TO_FP_RND)
+ NODE_NAME_CASE(UINT_TO_FP_RND)
+ NODE_NAME_CASE(CVTTP2SI)
+ NODE_NAME_CASE(CVTTP2UI)
+ NODE_NAME_CASE(STRICT_CVTTP2SI)
+ NODE_NAME_CASE(STRICT_CVTTP2UI)
+ NODE_NAME_CASE(MCVTTP2SI)
+ NODE_NAME_CASE(MCVTTP2UI)
+ NODE_NAME_CASE(CVTTP2SI_SAE)
+ NODE_NAME_CASE(CVTTP2UI_SAE)
+ NODE_NAME_CASE(CVTTS2SI)
+ NODE_NAME_CASE(CVTTS2UI)
+ NODE_NAME_CASE(CVTTS2SI_SAE)
+ NODE_NAME_CASE(CVTTS2UI_SAE)
+ NODE_NAME_CASE(CVTSI2P)
+ NODE_NAME_CASE(CVTUI2P)
+ NODE_NAME_CASE(STRICT_CVTSI2P)
+ NODE_NAME_CASE(STRICT_CVTUI2P)
+ NODE_NAME_CASE(MCVTSI2P)
+ NODE_NAME_CASE(MCVTUI2P)
+ NODE_NAME_CASE(VFPCLASS)
+ NODE_NAME_CASE(VFPCLASSS)
+ NODE_NAME_CASE(MULTISHIFT)
+ NODE_NAME_CASE(SCALAR_SINT_TO_FP)
+ NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
+ NODE_NAME_CASE(SCALAR_UINT_TO_FP)
+ NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
+ NODE_NAME_CASE(CVTPS2PH)
+ NODE_NAME_CASE(STRICT_CVTPS2PH)
+ NODE_NAME_CASE(CVTPS2PH_SAE)
+ NODE_NAME_CASE(MCVTPS2PH)
+ NODE_NAME_CASE(MCVTPS2PH_SAE)
+ NODE_NAME_CASE(CVTPH2PS)
+ NODE_NAME_CASE(STRICT_CVTPH2PS)
+ NODE_NAME_CASE(CVTPH2PS_SAE)
+ NODE_NAME_CASE(CVTP2SI)
+ NODE_NAME_CASE(CVTP2UI)
+ NODE_NAME_CASE(MCVTP2SI)
+ NODE_NAME_CASE(MCVTP2UI)
+ NODE_NAME_CASE(CVTP2SI_RND)
+ NODE_NAME_CASE(CVTP2UI_RND)
+ NODE_NAME_CASE(CVTS2SI)
+ NODE_NAME_CASE(CVTS2UI)
+ NODE_NAME_CASE(CVTS2SI_RND)
+ NODE_NAME_CASE(CVTS2UI_RND)
+ NODE_NAME_CASE(CVTNEPS2BF16)
+ NODE_NAME_CASE(MCVTNEPS2BF16)
+ NODE_NAME_CASE(DPBF16PS)
+ NODE_NAME_CASE(DPFP16PS)
+ NODE_NAME_CASE(MPSADBW)
+ NODE_NAME_CASE(LWPINS)
+ NODE_NAME_CASE(MGATHER)
+ NODE_NAME_CASE(MSCATTER)
+ NODE_NAME_CASE(VPDPBUSD)
+ NODE_NAME_CASE(VPDPBUSDS)
+ NODE_NAME_CASE(VPDPWSSD)
+ NODE_NAME_CASE(VPDPWSSDS)
+ NODE_NAME_CASE(VPSHUFBITQMB)
+ NODE_NAME_CASE(GF2P8MULB)
+ NODE_NAME_CASE(GF2P8AFFINEQB)
+ NODE_NAME_CASE(GF2P8AFFINEINVQB)
+ NODE_NAME_CASE(NT_CALL)
+ NODE_NAME_CASE(NT_BRIND)
+ NODE_NAME_CASE(UMWAIT)
+ NODE_NAME_CASE(TPAUSE)
+ NODE_NAME_CASE(ENQCMD)
+ NODE_NAME_CASE(ENQCMDS)
+ NODE_NAME_CASE(VP2INTERSECT)
+ NODE_NAME_CASE(VPDPBSUD)
+ NODE_NAME_CASE(VPDPBSUDS)
+ NODE_NAME_CASE(VPDPBUUD)
+ NODE_NAME_CASE(VPDPBUUDS)
+ NODE_NAME_CASE(VPDPBSSD)
+ NODE_NAME_CASE(VPDPBSSDS)
+ NODE_NAME_CASE(VPDPWSUD)
+ NODE_NAME_CASE(VPDPWSUDS)
+ NODE_NAME_CASE(VPDPWUSD)
+ NODE_NAME_CASE(VPDPWUSDS)
+ NODE_NAME_CASE(VPDPWUUD)
+ NODE_NAME_CASE(VPDPWUUDS)
+ NODE_NAME_CASE(VMINMAX)
+ NODE_NAME_CASE(VMINMAX_SAE)
+ NODE_NAME_CASE(VMINMAXS)
+ NODE_NAME_CASE(VMINMAXS_SAE)
+ NODE_NAME_CASE(CVTP2IBS)
+ NODE_NAME_CASE(CVTP2IUBS)
+ NODE_NAME_CASE(CVTP2IBS_RND)
+ NODE_NAME_CASE(CVTP2IUBS_RND)
+ NODE_NAME_CASE(CVTTP2IBS)
+ NODE_NAME_CASE(CVTTP2IUBS)
+ NODE_NAME_CASE(CVTTP2IBS_SAE)
+ NODE_NAME_CASE(CVTTP2IUBS_SAE)
+ NODE_NAME_CASE(VCVT2PH2BF8)
+ NODE_NAME_CASE(VCVT2PH2BF8S)
+ NODE_NAME_CASE(VCVT2PH2HF8)
+ NODE_NAME_CASE(VCVT2PH2HF8S)
+ NODE_NAME_CASE(VCVTBIASPH2BF8)
+ NODE_NAME_CASE(VCVTBIASPH2BF8S)
+ NODE_NAME_CASE(VCVTBIASPH2HF8)
+ NODE_NAME_CASE(VCVTBIASPH2HF8S)
+ NODE_NAME_CASE(VCVTPH2BF8)
+ NODE_NAME_CASE(VCVTPH2BF8S)
+ NODE_NAME_CASE(VCVTPH2HF8)
+ NODE_NAME_CASE(VCVTPH2HF8S)
+ NODE_NAME_CASE(VMCVTBIASPH2BF8)
+ NODE_NAME_CASE(VMCVTBIASPH2BF8S)
+ NODE_NAME_CASE(VMCVTBIASPH2HF8)
+ NODE_NAME_CASE(VMCVTBIASPH2HF8S)
+ NODE_NAME_CASE(VMCVTPH2BF8)
+ NODE_NAME_CASE(VMCVTPH2BF8S)
+ NODE_NAME_CASE(VMCVTPH2HF8)
+ NODE_NAME_CASE(VMCVTPH2HF8S)
+ NODE_NAME_CASE(VCVTHF82PH)
+ NODE_NAME_CASE(AESENC128KL)
+ NODE_NAME_CASE(AESDEC128KL)
+ NODE_NAME_CASE(AESENC256KL)
+ NODE_NAME_CASE(AESDEC256KL)
+ NODE_NAME_CASE(AESENCWIDE128KL)
+ NODE_NAME_CASE(AESDECWIDE128KL)
+ NODE_NAME_CASE(AESENCWIDE256KL)
+ NODE_NAME_CASE(AESDECWIDE256KL)
+ NODE_NAME_CASE(CMPCCXADD)
+ NODE_NAME_CASE(TESTUI)
+ NODE_NAME_CASE(FP80_ADD)
+ NODE_NAME_CASE(STRICT_FP80_ADD)
+ NODE_NAME_CASE(CCMP)
+ NODE_NAME_CASE(CTEST)
+ NODE_NAME_CASE(CLOAD)
+ NODE_NAME_CASE(CSTORE)
+ NODE_NAME_CASE(CVTTS2SIS)
+ NODE_NAME_CASE(CVTTS2UIS)
+ NODE_NAME_CASE(CVTTS2SIS_SAE)
+ NODE_NAME_CASE(CVTTS2UIS_SAE)
+ NODE_NAME_CASE(CVTTP2SIS)
+ NODE_NAME_CASE(MCVTTP2SIS)
+ NODE_NAME_CASE(CVTTP2UIS_SAE)
+ NODE_NAME_CASE(CVTTP2SIS_SAE)
+ NODE_NAME_CASE(CVTTP2UIS)
+ NODE_NAME_CASE(MCVTTP2UIS)
+ NODE_NAME_CASE(POP_FROM_X87_REG)
}
return nullptr;
#undef NODE_NAME_CASE
@@ -35644,7 +35720,7 @@ bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
if (AM.HasBaseReg)
return false;
break;
- default: // Other stuff never works.
+ default: // Other stuff never works.
return false;
}
@@ -35749,12 +35825,13 @@ bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
if (Val.getOpcode() != ISD::LOAD)
return false;
- if (!VT1.isSimple() || !VT1.isInteger() ||
- !VT2.isSimple() || !VT2.isInteger())
+ if (!VT1.isSimple() || !VT1.isInteger() || !VT2.isSimple() ||
+ !VT2.isInteger())
return false;
switch (VT1.getSimpleVT().SimpleTy) {
- default: break;
+ default:
+ break;
case MVT::i8:
case MVT::i16:
case MVT::i32:
@@ -35985,8 +36062,10 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
// sinkMBB:
// DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
- .addReg(mainDstReg).addMBB(mainMBB)
- .addReg(fallDstReg).addMBB(fallMBB);
+ .addReg(mainDstReg)
+ .addMBB(mainMBB)
+ .addReg(fallDstReg)
+ .addMBB(fallMBB);
MI.eraseFromParent();
return sinkMBB;
@@ -36052,8 +36131,8 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
unsigned TotalNumXMMRegs = 8;
bool UseGPOffset = (ArgMode == 1);
bool UseFPOffset = (ArgMode == 2);
- unsigned MaxOffset = TotalNumIntRegs * 8 +
- (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
+ unsigned MaxOffset =
+ TotalNumIntRegs * 8 + (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
/* Align ArgSize to a multiple of 8 */
unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
@@ -36131,13 +36210,14 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
// Check if there is enough room left to pull this argument.
BuildMI(thisMBB, MIMD, TII->get(X86::CMP32ri))
- .addReg(OffsetReg)
- .addImm(MaxOffset + 8 - ArgSizeA8);
+ .addReg(OffsetReg)
+ .addImm(MaxOffset + 8 - ArgSizeA8);
// Branch to "overflowMBB" if offset >= max
// Fall through to "offsetMBB" otherwise
BuildMI(thisMBB, MIMD, TII->get(X86::JCC_1))
- .addMBB(overflowMBB).addImm(X86::COND_AE);
+ .addMBB(overflowMBB)
+ .addImm(X86::COND_AE);
}
// In offsetMBB, emit code to use the reg_save_area.
@@ -36179,8 +36259,8 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
// Compute the offset for the next argument
Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32ri), NextOffsetReg)
- .addReg(OffsetReg)
- .addImm(UseFPOffset ? 16 : 8);
+ .addReg(OffsetReg)
+ .addImm(UseFPOffset ? 16 : 8);
// Store it back into the va_list.
BuildMI(offsetMBB, MIMD, TII->get(X86::MOV32mr))
@@ -36193,8 +36273,7 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
.setMemRefs(StoreOnlyMMO);
// Jump to endMBB
- BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1))
- .addMBB(endMBB);
+ BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1)).addMBB(endMBB);
}
//
@@ -36235,7 +36314,7 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
.addImm(~(uint64_t)(Alignment.value() - 1));
} else {
BuildMI(overflowMBB, MIMD, TII->get(TargetOpcode::COPY), OverflowDestReg)
- .addReg(OverflowAddrReg);
+ .addReg(OverflowAddrReg);
}
// Compute the next overflow address after this argument.
@@ -36261,10 +36340,11 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
// If we branched, emit the PHI to the front of endMBB.
if (offsetMBB) {
- BuildMI(*endMBB, endMBB->begin(), MIMD,
- TII->get(X86::PHI), DestReg)
- .addReg(OffsetDestReg).addMBB(offsetMBB)
- .addReg(OverflowDestReg).addMBB(overflowMBB);
+ BuildMI(*endMBB, endMBB->begin(), MIMD, TII->get(X86::PHI), DestReg)
+ .addReg(OffsetDestReg)
+ .addMBB(offsetMBB)
+ .addReg(OverflowDestReg)
+ .addMBB(overflowMBB);
}
// Erase the pseudo instruction
@@ -36279,8 +36359,8 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
// kill marker, and set it if it should. Returns the correct kill
// marker value.
static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
- MachineBasicBlock* BB,
- const TargetRegisterInfo* TRI) {
+ MachineBasicBlock *BB,
+ const TargetRegisterInfo *TRI) {
if (isPhysRegUsedAfter(X86::EFLAGS, SelectItr))
return false;
@@ -36747,11 +36827,21 @@ X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
//
// + ---- <- ------------ <- ------------- <- ------------ +
// | |
- // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
- // | |
- // + <- ----------- <- ------------ <- ----------- <- ------------ +
+ // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn
+ // probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
+ // | |
+ // + <-
+ // -----------
+ // <-
+ // ------------
+ // <-
+ // -----------
+ // <-
+ // ------------
+ // +
//
- // The property we want to enforce is to never have more than [page alloc] between two probes.
+ // The property we want to enforce is to never have more than [page alloc]
+ // between two probes.
const unsigned XORMIOpc =
TFI.Uses64BitFramePtr ? X86::XOR64mi32 : X86::XOR32mi;
@@ -36843,56 +36933,61 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
// Add code to the main basic block to check if the stack limit has been hit,
// and if so, jump to mallocMBB otherwise to bumpMBB.
BuildMI(BB, MIMD, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
- BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
- .addReg(tmpSPVReg).addReg(sizeVReg);
- BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
- .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
- .addReg(SPLimitVReg);
+ BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr : X86::SUB32rr), SPLimitVReg)
+ .addReg(tmpSPVReg)
+ .addReg(sizeVReg);
+ BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr : X86::CMP32mr))
+ .addReg(0)
+ .addImm(1)
+ .addReg(0)
+ .addImm(TlsOffset)
+ .addReg(TlsReg)
+ .addReg(SPLimitVReg);
BuildMI(BB, MIMD, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
// bumpMBB simply decreases the stack pointer, since we know the current
// stacklet has enough space.
BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), physSPReg)
- .addReg(SPLimitVReg);
+ .addReg(SPLimitVReg);
BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
- .addReg(SPLimitVReg);
+ .addReg(SPLimitVReg);
BuildMI(bumpMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
// Calls into a routine in libgcc to allocate more space from the heap.
const uint32_t *RegMask =
Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
if (IsLP64) {
- BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI)
- .addReg(sizeVReg);
+ BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI).addReg(sizeVReg);
BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
- .addExternalSymbol("__morestack_allocate_stack_space")
- .addRegMask(RegMask)
- .addReg(X86::RDI, RegState::Implicit)
- .addReg(X86::RAX, RegState::ImplicitDefine);
+ .addExternalSymbol("__morestack_allocate_stack_space")
+ .addRegMask(RegMask)
+ .addReg(X86::RDI, RegState::Implicit)
+ .addReg(X86::RAX, RegState::ImplicitDefine);
} else if (Is64Bit) {
- BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI)
- .addReg(sizeVReg);
+ BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI).addReg(sizeVReg);
BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
- .addExternalSymbol("__morestack_allocate_stack_space")
- .addRegMask(RegMask)
- .addReg(X86::EDI, RegState::Implicit)
- .addReg(X86::EAX, RegState::ImplicitDefine);
+ .addExternalSymbol("__morestack_allocate_stack_space")
+ .addRegMask(RegMask)
+ .addReg(X86::EDI, RegState::Implicit)
+ .addReg(X86::EAX, RegState::ImplicitDefine);
} else {
- BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
- .addImm(12);
+ BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg)
+ .addReg(physSPReg)
+ .addImm(12);
BuildMI(mallocMBB, MIMD, TII->get(X86::PUSH32r)).addReg(sizeVReg);
BuildMI(mallocMBB, MIMD, TII->get(X86::CALLpcrel32))
- .addExternalSymbol("__morestack_allocate_stack_space")
- .addRegMask(RegMask)
- .addReg(X86::EAX, RegState::ImplicitDefine);
+ .addExternalSymbol("__morestack_allocate_stack_space")
+ .addRegMask(RegMask)
+ .addReg(X86::EAX, RegState::ImplicitDefine);
}
if (!Is64Bit)
- BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
- .addImm(16);
+ BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg)
+ .addReg(physSPReg)
+ .addImm(16);
BuildMI(mallocMBB, MIMD, TII->get(TargetOpcode::COPY), mallocPtrVReg)
- .addReg(IsLP64 ? X86::RAX : X86::EAX);
+ .addReg(IsLP64 ? X86::RAX : X86::EAX);
BuildMI(mallocMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
// Set up the CFG correctly.
@@ -36947,7 +37042,8 @@ X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
RestoreMBB->setIsEHPad(true);
auto RestoreMBBI = RestoreMBB->begin();
- BuildMI(*RestoreMBB, RestoreMBBI, MIMD, TII.get(X86::JMP_4)).addMBB(TargetMBB);
+ BuildMI(*RestoreMBB, RestoreMBBI, MIMD, TII.get(X86::JMP_4))
+ .addMBB(TargetMBB);
return BB;
}
@@ -36969,9 +37065,10 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
// FIXME: The 32-bit calls have non-standard calling conventions. Use a
// proper register mask.
const uint32_t *RegMask =
- Subtarget.is64Bit() ?
- Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
- Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
+ Subtarget.is64Bit()
+ ? Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask()
+ : Subtarget.getRegisterInfo()->getCallPreservedMask(*F,
+ CallingConv::C);
if (Subtarget.is64Bit()) {
MachineInstrBuilder MIB =
BuildMI(*BB, MI, MIMD, TII->get(X86::MOV64rm), X86::RDI)
@@ -37227,8 +37324,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
MemOpndSlot = CurOp;
MVT PVT = getPointerTy(MF->getDataLayout());
- assert((PVT == MVT::i64 || PVT == MVT::i32) &&
- "Invalid Pointer Size!");
+ assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
// For v = setjmp(buf), we generate
//
@@ -37276,19 +37372,19 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
LabelReg = MRI.createVirtualRegister(PtrRC);
if (Subtarget.is64Bit()) {
MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA64r), LabelReg)
- .addReg(X86::RIP)
- .addImm(0)
- .addReg(0)
- .addMBB(restoreMBB)
- .addReg(0);
+ .addReg(X86::RIP)
+ .addImm(0)
+ .addReg(0)
+ .addMBB(restoreMBB)
+ .addReg(0);
} else {
- const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
+ const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII);
MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA32r), LabelReg)
- .addReg(XII->getGlobalBaseReg(MF))
- .addImm(0)
- .addReg(0)
- .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
- .addReg(0);
+ .addReg(XII->getGlobalBaseReg(MF))
+ .addImm(0)
+ .addReg(0)
+ .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
+ .addReg(0);
}
} else
PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
@@ -37312,7 +37408,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
// Setup
MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::EH_SjLj_Setup))
- .addMBB(restoreMBB);
+ .addMBB(restoreMBB);
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
MIB.addRegMask(RegInfo->getNoPreservedMask());
@@ -37339,9 +37435,9 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
Register FramePtr = RegInfo->getFrameRegister(*MF);
Register BasePtr = RegInfo->getBaseRegister();
unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
- addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr),
- FramePtr, true, X86FI->getRestoreBasePointerOffset())
- .setMIFlag(MachineInstr::FrameSetup);
+ addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr), FramePtr,
+ true, X86FI->getRestoreBasePointerOffset())
+ .setMIFlag(MachineInstr::FrameSetup);
}
BuildMI(restoreMBB, MIMD, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
BuildMI(restoreMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
@@ -37424,9 +37520,9 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
if (PVT == MVT::i64) {
Register TmpZReg = MRI.createVirtualRegister(PtrRC);
BuildMI(checkSspMBB, MIMD, TII->get(X86::SUBREG_TO_REG), TmpZReg)
- .addImm(0)
- .addReg(ZReg)
- .addImm(X86::sub_32bit);
+ .addImm(0)
+ .addReg(ZReg)
+ .addImm(X86::sub_32bit);
ZReg = TmpZReg;
}
@@ -37557,11 +37653,10 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
MVT PVT = getPointerTy(MF->getDataLayout());
- assert((PVT == MVT::i64 || PVT == MVT::i32) &&
- "Invalid Pointer Size!");
+ assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
const TargetRegisterClass *RC =
- (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
+ (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
Register Tmp = MRI.createVirtualRegister(RC);
// Since FP is only updated here but NOT referenced, it's treated as GPR.
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
@@ -37944,10 +38039,8 @@ X86TargetLowering::emitPatchableEventCall(MachineInstr &MI,
/// This approach ensures that when i64 is type-legalized into two i32
/// operations, both operations share the same condition byte rather than
/// each independently reading (and destroying) EFLAGS.
-static MachineBasicBlock *
-emitCTSelectI386WithConditionMaterialization(MachineInstr &MI,
- MachineBasicBlock *BB,
- unsigned InternalPseudoOpcode) {
+static MachineBasicBlock *emitCTSelectI386WithConditionMaterialization(
+ MachineInstr &MI, MachineBasicBlock *BB, unsigned InternalPseudoOpcode) {
const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
const MIMetadata MIMD(MI);
MachineFunction *MF = BB->getParent();
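(Illustrative aside, not part of the patch: the CTSELECT_I386 fallback described above presumably expands to a mask-based, branch-free select. A minimal C++ sketch of that idea, with hypothetical names, looks like the following.)

#include <cstdint>

// Sketch of the branch-free select the non-CMOV i386 path is assumed to
// perform: 'cond' is the pre-materialized SETcc byte (0 or 1), 'tval'/'fval'
// are the true/false operands. No data-dependent branch is taken.
static inline uint32_t ct_select_u32(uint8_t cond, uint32_t tval,
                                     uint32_t fval) {
  uint32_t mask = 0u - (uint32_t)(cond & 1); // all-ones when cond != 0
  return (tval & mask) | (fval & ~mask);     // pick tval or fval bitwise
}

// An i64 select legalized to two i32 halves would call this twice with the
// same 'cond' byte, which is why the condition is materialized once and
// EFLAGS is read only once.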
@@ -37991,12 +38084,12 @@ emitCTSelectI386WithConditionMaterialization(MachineInstr &MI,
}
BuildMI(*BB, MI, MIMD, TII->get(InternalPseudoOpcode))
- .addDef(DstReg) // dst (output)
- .addDef(TmpByteReg) // tmp_byte (output)
- .addDef(TmpMaskReg) // tmp_mask (output)
- .addReg(Src1Reg) // src1 (input)
- .addReg(Src2Reg) // src2 (input)
- .addReg(CondByteReg); // pre-materialized condition byte (input)
+ .addDef(DstReg) // dst (output)
+ .addDef(TmpByteReg) // tmp_byte (output)
+ .addDef(TmpMaskReg) // tmp_mask (output)
+ .addReg(Src1Reg) // src1 (input)
+ .addReg(Src2Reg) // src2 (input)
+ .addReg(CondByteReg); // pre-materialized condition byte (input)
MI.eraseFromParent();
return BB;
@@ -38022,8 +38115,8 @@ struct FPLoadMemOperands {
// Check if a virtual register is defined by a simple FP load instruction
// Returns the memory operands if it's a simple load, otherwise returns invalid
static FPLoadMemOperands getFPLoadMemOperands(Register Reg,
- MachineRegisterInfo &MRI,
- unsigned ExpectedLoadOpcode) {
+ MachineRegisterInfo &MRI,
+ unsigned ExpectedLoadOpcode) {
FPLoadMemOperands Result;
if (!Reg.isVirtual())
@@ -38042,9 +38135,9 @@ static FPLoadMemOperands getFPLoadMemOperands(Register Reg,
if (DefMI->hasOrderedMemoryRef())
return Result;
- // The load should have a single def (the destination register) and memory operands
- // Format: %reg = LD_Fpxxm <fi#N>, 1, %noreg, 0, %noreg
- // or: %reg = LD_Fpxxm %base, scale, %index, disp, %segment
+ // The load should have a single def (the destination register) and memory
+ // operands Format: %reg = LD_Fpxxm <fi#N>, 1, %noreg, 0, %noreg or: %reg =
+ // LD_Fpxxm %base, scale, %index, disp, %segment
if (DefMI->getNumOperands() < 6)
return Result;
@@ -38069,9 +38162,8 @@ static FPLoadMemOperands getFPLoadMemOperands(Register Reg,
// Check if this is a constant pool load
// Format: %reg = LD_Fpxxm $noreg, 1, $noreg, %const.N, $noreg
- if (BaseMO.isReg() && BaseMO.getReg() == X86::NoRegister &&
- ScaleMO.isImm() && IndexMO.isReg() &&
- IndexMO.getReg() == X86::NoRegister &&
+ if (BaseMO.isReg() && BaseMO.getReg() == X86::NoRegister && ScaleMO.isImm() &&
+ IndexMO.isReg() && IndexMO.getReg() == X86::NoRegister &&
DispMO.isCPI() && SegMO.isReg()) {
Result.IsValid = true;
Result.IsConstantPool = true;
@@ -38085,9 +38177,8 @@ static FPLoadMemOperands getFPLoadMemOperands(Register Reg,
// Check if this is a global variable load
// Format: %reg = LD_Fpxxm $noreg, 1, $noreg, @global_name, $noreg
- if (BaseMO.isReg() && BaseMO.getReg() == X86::NoRegister &&
- ScaleMO.isImm() && IndexMO.isReg() &&
- IndexMO.getReg() == X86::NoRegister &&
+ if (BaseMO.isReg() && BaseMO.getReg() == X86::NoRegister && ScaleMO.isImm() &&
+ IndexMO.isReg() && IndexMO.getReg() == X86::NoRegister &&
DispMO.isGlobal() && SegMO.isReg()) {
Result.IsValid = true;
Result.IsGlobal = true;
@@ -38101,8 +38192,8 @@ static FPLoadMemOperands getFPLoadMemOperands(Register Reg,
}
// Regular memory operands (e.g., pointer loads)
- if (BaseMO.isReg() && ScaleMO.isImm() && IndexMO.isReg() &&
- DispMO.isImm() && SegMO.isReg()) {
+ if (BaseMO.isReg() && ScaleMO.isImm() && IndexMO.isReg() && DispMO.isImm() &&
+ SegMO.isReg()) {
Result.IsValid = true;
Result.IsFrameIndex = false;
Result.IsConstantPool = false;
@@ -38128,7 +38219,8 @@ static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI,
unsigned RegSizeInByte = 4;
// Get operands
- // MI operands: %result:rfp80 = CTSELECT_I386 %false:rfp80, %true:rfp80, %cond:i8imm
+ // MI operands: %result:rfp80 = CTSELECT_I386 %false:rfp80, %true:rfp80,
+ // %cond:i8imm
unsigned DestReg = MI.getOperand(0).getReg();
unsigned FalseReg = MI.getOperand(1).getReg();
unsigned TrueReg = MI.getOperand(2).getReg();
@@ -38146,7 +38238,7 @@ static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI,
// Helper to load integer from memory operands
auto loadIntFromMemOperands = [&](const FPLoadMemOperands &MemOps,
- unsigned Offset) -> unsigned {
+ unsigned Offset) -> unsigned {
unsigned IntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
MachineInstrBuilder MIB =
BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), IntReg);
@@ -38162,18 +38254,21 @@ static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI,
// Constant pool: base_reg + scale + index + CP_index + segment
// MOV32rm format: base, scale, index, displacement, segment
MIB.addReg(X86::NoRegister) // Base register
- .addImm(MemOps.ScaleVal) // Scale
- .addReg(MemOps.IndexReg) // Index register
- .addConstantPoolIndex(MemOps.ConstantPoolIndex, Offset) // Displacement (CP index)
- .addReg(MemOps.SegReg); // Segment
+ .addImm(MemOps.ScaleVal) // Scale
+ .addReg(MemOps.IndexReg) // Index register
+ .addConstantPoolIndex(MemOps.ConstantPoolIndex,
+ Offset) // Displacement (CP index)
+ .addReg(MemOps.SegReg); // Segment
} else if (MemOps.IsGlobal) {
// Global variable: base_reg + scale + index + global + segment
// MOV32rm format: base, scale, index, displacement, segment
MIB.addReg(X86::NoRegister) // Base register
- .addImm(MemOps.ScaleVal) // Scale
- .addReg(MemOps.IndexReg) // Index register
- .addGlobalAddress(MemOps.Global, MemOps.GlobalOffset + Offset) // Displacement (global address)
- .addReg(MemOps.SegReg); // Segment
+ .addImm(MemOps.ScaleVal) // Scale
+ .addReg(MemOps.IndexReg) // Index register
+ .addGlobalAddress(MemOps.Global,
+ MemOps.GlobalOffset +
+ Offset) // Displacement (global address)
+ .addReg(MemOps.SegReg); // Segment
} else {
// Regular memory: base_reg + scale + index + disp + segment
MIB.addReg(MemOps.BaseReg)
@@ -38188,45 +38283,47 @@ static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI,
// Optimized path: load integers directly from memory when both operands are
// memory loads, avoiding FP register round-trip
- auto emitCtSelectFromMemory = [&](unsigned NumValues,
- const FPLoadMemOperands &TrueMemOps,
- const FPLoadMemOperands &FalseMemOps,
- int ResultSlot) {
- for (unsigned Val = 0; Val < NumValues; ++Val) {
- unsigned Offset = Val * RegSizeInByte;
-
- // Load true and false values directly from their memory locations as integers
- unsigned TrueIntReg = loadIntFromMemOperands(TrueMemOps, Offset);
- unsigned FalseIntReg = loadIntFromMemOperands(FalseMemOps, Offset);
-
- // Use CTSELECT_I386_INT_GR32 pseudo instruction for constant-time selection
- unsigned ResultIntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
- unsigned TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass);
- unsigned TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass);
-
- BuildMI(*BB, MI, MIMD, TII->get(X86::CTSELECT_I386_INT_GR32rr))
- .addDef(ResultIntReg) // dst (output)
- .addDef(TmpByteReg) // tmp_byte (output)
- .addDef(TmpMaskReg) // tmp_mask (output)
- .addReg(FalseIntReg) // src1 (input) - false value
- .addReg(TrueIntReg) // src2 (input) - true value
- .addReg(CondByteReg); // pre-materialized condition byte (input)
-
- // Store result back to result slot
- BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32mr))
- .addFrameIndex(ResultSlot)
- .addImm(1)
- .addReg(0)
- .addImm(Offset)
- .addReg(0)
- .addReg(ResultIntReg, RegState::Kill);
- }
- };
+ auto emitCtSelectFromMemory =
+ [&](unsigned NumValues, const FPLoadMemOperands &TrueMemOps,
+ const FPLoadMemOperands &FalseMemOps, int ResultSlot) {
+ for (unsigned Val = 0; Val < NumValues; ++Val) {
+ unsigned Offset = Val * RegSizeInByte;
+
+ // Load true and false values directly from their memory locations as
+ // integers
+ unsigned TrueIntReg = loadIntFromMemOperands(TrueMemOps, Offset);
+ unsigned FalseIntReg = loadIntFromMemOperands(FalseMemOps, Offset);
+
+ // Use CTSELECT_I386_INT_GR32 pseudo instruction for constant-time
+ // selection
+ unsigned ResultIntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+ unsigned TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass);
+ unsigned TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+
+ BuildMI(*BB, MI, MIMD, TII->get(X86::CTSELECT_I386_INT_GR32rr))
+ .addDef(ResultIntReg) // dst (output)
+ .addDef(TmpByteReg) // tmp_byte (output)
+ .addDef(TmpMaskReg) // tmp_mask (output)
+ .addReg(FalseIntReg) // src1 (input) - false value
+ .addReg(TrueIntReg) // src2 (input) - true value
+ .addReg(CondByteReg); // pre-materialized condition byte (input)
+
+ // Store result back to result slot
+ BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32mr))
+ .addFrameIndex(ResultSlot)
+ .addImm(1)
+ .addReg(0)
+ .addImm(Offset)
+ .addReg(0)
+ .addReg(ResultIntReg, RegState::Kill);
+ }
+ };
- auto emitCtSelectWithPseudo = [&](unsigned NumValues, int TrueSlot, int FalseSlot, int ResultSlot) {
+ auto emitCtSelectWithPseudo = [&](unsigned NumValues, int TrueSlot,
+ int FalseSlot, int ResultSlot) {
for (unsigned Val = 0; Val < NumValues; ++Val) {
unsigned Offset = Val * RegSizeInByte;
-
+
// Load true and false values from stack as 32-bit integers
unsigned TrueIntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), TrueIntReg)
@@ -38244,18 +38341,19 @@ static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI,
.addImm(Offset)
.addReg(0);
- // Use CTSELECT_I386_INT_GR32 pseudo instruction for constant-time selection
+ // Use CTSELECT_I386_INT_GR32 pseudo instruction for constant-time
+ // selection
unsigned ResultIntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
unsigned TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass);
unsigned TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass);
-
+
BuildMI(*BB, MI, MIMD, TII->get(X86::CTSELECT_I386_INT_GR32rr))
- .addDef(ResultIntReg) // dst (output)
- .addDef(TmpByteReg) // tmp_byte (output)
- .addDef(TmpMaskReg) // tmp_mask (output)
- .addReg(FalseIntReg) // src1 (input) - false value
- .addReg(TrueIntReg) // src2 (input) - true value
- .addReg(CondByteReg); // pre-materialized condition byte (input)
+ .addDef(ResultIntReg) // dst (output)
+ .addDef(TmpByteReg) // tmp_byte (output)
+ .addDef(TmpMaskReg) // tmp_mask (output)
+ .addReg(FalseIntReg) // src1 (input) - false value
+ .addReg(TrueIntReg) // src2 (input) - true value
+ .addReg(CondByteReg); // pre-materialized condition byte (input)
// Store result back to result slot
BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32mr))
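(Illustrative aside, not part of the patch: the FP path above spills both operands and runs the GR32 pseudo once per 32-bit word. Conceptually that is equivalent to the word-wise sketch below; the helper name and layout are hypothetical.)

#include <cstdint>
#include <cstring>

// Word-wise constant-time select of a spilled FP value: every 32-bit chunk
// of the true/false operands is selected with the same mask, then the
// result is reassembled. Mirrors the per-value loop in the code above.
static double ct_select_f64(uint8_t cond, double tval, double fval) {
  uint32_t t[2], f[2], r[2];
  std::memcpy(t, &tval, sizeof t);
  std::memcpy(f, &fval, sizeof f);
  uint32_t mask = 0u - (uint32_t)(cond & 1); // all-ones when cond != 0
  for (int i = 0; i < 2; ++i)
    r[i] = (t[i] & mask) | (f[i] & ~mask);
  double out;
  std::memcpy(&out, r, sizeof out);
  return out;
}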
@@ -38416,7 +38514,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
const MIMetadata MIMD(MI);
auto TMMImmToTMMReg = [](unsigned Imm) {
- assert (Imm < 8 && "Illegal tmm index");
+ assert(Imm < 8 && "Illegal tmm index");
return X86::TMM0 + Imm;
};
switch (MI.getOpcode()) {
@@ -38483,7 +38581,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return emitCTSelectI386WithFpType(MI, BB, X86::CTSELECT_I386_FP64rr);
case X86::CTSELECT_I386_FP80rr:
return emitCTSelectI386WithFpType(MI, BB, X86::CTSELECT_I386_FP80rr);
-
+
case X86::FP80_ADDr:
case X86::FP80_ADDm32: {
// Change the floating point control register to use double extended
@@ -38571,29 +38669,30 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
// OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero.
Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
- .addReg(OldCW, RegState::Kill).addImm(0xC00);
+ .addReg(OldCW, RegState::Kill)
+ .addImm(0xC00);
// Extract to 16 bits.
Register NewCW16 =
MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
- .addReg(NewCW, RegState::Kill, X86::sub_16bit);
+ .addReg(NewCW, RegState::Kill, X86::sub_16bit);
// Prepare memory for FLDCW.
int NewCWFrameIdx =
MF->getFrameInfo().CreateStackObject(2, Align(2), false);
addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
NewCWFrameIdx)
- .addReg(NewCW16, RegState::Kill);
+ .addReg(NewCW16, RegState::Kill);
// Reload the modified control word now...
- addFrameReference(BuildMI(*BB, MI, MIMD,
- TII->get(X86::FLDCW16m)), NewCWFrameIdx);
+ addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
+ NewCWFrameIdx);
// Get the X86 opcode to use.
unsigned Opc;
switch (MI.getOpcode()) {
- // clang-format off
+ // clang-format off
default: llvm_unreachable("illegal opcode!");
case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
@@ -38604,7 +38703,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
- // clang-format on
+ // clang-format on
}
X86AddressMode AM = getAddressFromInstr(&MI, 0);
@@ -38821,7 +38920,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case X86::PTMMULTF32PS: {
unsigned Opc;
switch (MI.getOpcode()) {
- default: llvm_unreachable("illegal opcode!");
+ default:
+ llvm_unreachable("illegal opcode!");
// clang-format off
case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
@@ -38868,7 +38968,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case X86::PTILESTORED: {
unsigned Opc;
switch (MI.getOpcode()) {
- default: llvm_unreachable("illegal opcode!");
+ default:
+ llvm_unreachable("illegal opcode!");
#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
case X86::PTILELOADD:
Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD);
@@ -38990,11 +39091,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
// X86 Optimization Hooks
//===----------------------------------------------------------------------===//
-bool
-X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
- const APInt &DemandedBits,
- const APInt &DemandedElts,
- TargetLoweringOpt &TLO) const {
+bool X86TargetLowering::targetShrinkDemandedConstant(
+ SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
+ TargetLoweringOpt &TLO) const {
EVT VT = Op.getValueType();
unsigned Opcode = Op.getOpcode();
unsigned EltSize = VT.getScalarSizeInBits();
@@ -39179,16 +39278,15 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
unsigned NumElts = DemandedElts.getBitWidth();
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
- assert((Opc >= ISD::BUILTIN_OP_END ||
- Opc == ISD::INTRINSIC_WO_CHAIN ||
- Opc == ISD::INTRINSIC_W_CHAIN ||
- Opc == ISD::INTRINSIC_VOID) &&
+ assert((Opc >= ISD::BUILTIN_OP_END || Opc == ISD::INTRINSIC_WO_CHAIN ||
+ Opc == ISD::INTRINSIC_W_CHAIN || Opc == ISD::INTRINSIC_VOID) &&
"Should use MaskedValueIsZero if you don't know whether Op"
" is a target node!");
Known.resetAll();
switch (Opc) {
- default: break;
+ default:
+ break;
case X86ISD::MUL_IMM: {
KnownBits Known2;
Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
@@ -39417,7 +39515,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
- if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
+ if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
@@ -39611,7 +39709,8 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
unsigned NumElts = VT.getVectorNumElements();
if (Mask.size() == NumElts) {
SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
- Known.Zero.setAllBits(); Known.One.setAllBits();
+ Known.Zero.setAllBits();
+ Known.One.setAllBits();
for (unsigned i = 0; i != NumElts; ++i) {
if (!DemandedElts[i])
continue;
@@ -39756,16 +39855,18 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
case X86ISD::ANDNP: {
unsigned Tmp0 =
DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
- if (Tmp0 == 1) return 1; // Early out.
+ if (Tmp0 == 1)
+ return 1; // Early out.
unsigned Tmp1 =
DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
return std::min(Tmp0, Tmp1);
}
case X86ISD::CMOV: {
- unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
- if (Tmp0 == 1) return 1; // Early out.
- unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
+ unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
+ if (Tmp0 == 1)
+ return 1; // Early out.
+ unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
return std::min(Tmp0, Tmp1);
}
}
@@ -40141,7 +40242,6 @@ static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
PermuteImm = (unsigned)ShiftAmt;
return true;
}
-
}
}
@@ -40201,7 +40301,8 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
// Attempt to match against either an unary or binary PACKSS/PACKUS shuffle.
if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
- ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
+ ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) &&
+ Subtarget.hasInt256()) ||
((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
Subtarget)) {
@@ -40760,9 +40861,9 @@ static SDValue combineX86ShuffleChain(
SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
- CanonicalizeShuffleInput(RootVT, LHS),
- CanonicalizeShuffleInput(RootVT, RHS),
- DAG.getTargetConstant(PermMask, DL, MVT::i8));
+ CanonicalizeShuffleInput(RootVT, LHS),
+ CanonicalizeShuffleInput(RootVT, RHS),
+ DAG.getTargetConstant(PermMask, DL, MVT::i8));
}
}
}
@@ -40856,8 +40957,8 @@ static SDValue combineX86ShuffleChain(
}
if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
- AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
- PermuteImm) &&
+ AllowIntDomain, DAG, Subtarget, Shuffle,
+ ShuffleVT, PermuteImm) &&
(!IsMaskedShuffle ||
(NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 0 && RootOpc == Shuffle)
@@ -41736,11 +41837,9 @@ static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef<SDValue> Ops,
}
namespace llvm {
- namespace X86 {
- enum {
- MaxShuffleCombineDepth = 8
- };
- } // namespace X86
+namespace X86 {
+enum { MaxShuffleCombineDepth = 8 };
+} // namespace X86
} // namespace llvm
/// Fully generic combining of x86 shuffle instructions.
@@ -42144,7 +42243,8 @@ static SDValue combineX86ShufflesRecursively(
// The Op itself may be of different VT, so we need to scale the mask.
unsigned NumOpElts = Op.getValueType().getVectorNumElements();
- APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
+ APInt OpScaledDemandedElts =
+ APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
// Can this operand be simplified any further, given it's demanded elements?
if (SDValue NewOp = TLI.SimplifyMultipleUseDemandedVectorElts(
@@ -42950,7 +43050,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
ISD::isNormalLoad(Src.getNode())) {
LoadSDNode *LN = cast<LoadSDNode>(Src);
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
- SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
SDValue BcastLd =
DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
LN->getMemoryVT(), LN->getMemOperand());
@@ -42982,7 +43082,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
// Unless its volatile or atomic.
if (LN->isSimple()) {
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
- SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
SDValue BcastLd = DAG.getMemIntrinsicNode(
X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
LN->getPointerInfo(), LN->getBaseAlign(),
@@ -43000,7 +43100,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
if (LN->getMemoryVT().getSizeInBits() == 16) {
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
- SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
SDValue BcastLd =
DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
LN->getMemoryVT(), LN->getMemOperand());
@@ -43027,7 +43127,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
SDValue Ptr = DAG.getMemBasePlusOffset(
LN->getBasePtr(), TypeSize::getFixed(Offset), DL);
- SDValue Ops[] = { LN->getChain(), Ptr };
+ SDValue Ops[] = {LN->getChain(), Ptr};
SDValue BcastLd = DAG.getMemIntrinsicNode(
X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
LN->getPointerInfo().getWithOffset(Offset), LN->getBaseAlign(),
@@ -43045,7 +43145,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
SDVTList Tys = DAG.getVTList(VT, MVT::Other);
- SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
SDValue BcastLd =
DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
LN->getMemoryVT(), LN->getMemOperand());
@@ -43554,13 +43654,13 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
- SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
- MemIntr->getBasePtr(),
- MemIntr->getMemOperand());
- SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
- DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
- Load),
- DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
+ SDValue Load =
+ DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
+ MemIntr->getBasePtr(), MemIntr->getMemOperand());
+ SDValue Insert = DAG.getNode(
+ X86ISD::INSERTPS, DL, VT, Op0,
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Load),
+ DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
return Insert;
}
@@ -43714,8 +43814,8 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
(V.getOpcode() == X86ISD::PSHUFLW ||
V.getOpcode() == X86ISD::PSHUFHW) &&
- V.getOpcode() != N.getOpcode() &&
- V.hasOneUse() && V.getOperand(0).hasOneUse()) {
+ V.getOpcode() != N.getOpcode() && V.hasOneUse() &&
+ V.getOperand(0).hasOneUse()) {
SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
if (D.getOpcode() == X86ISD::PSHUFD) {
SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
@@ -43789,11 +43889,11 @@ static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
/// operation. If true is returned then the operands of ADDSUB(SUBADD) operation
/// are written to the parameters \p Opnd0 and \p Opnd1.
///
-/// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes
-/// so it is easier to generically match. We also insert dummy vector shuffle
-/// nodes for the operands which explicitly discard the lanes which are unused
-/// by this operation to try to flow through the rest of the combiner
-/// the fact that they're unused.
+/// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle
+/// nodes so it is easier to generically match. We also insert dummy vector
+/// shuffle nodes for the operands which explicitly discard the lanes which are
+/// unused by this operation to try to flow through the rest of the combiner the
+/// fact that they're unused.
static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
bool &IsSubAdd, bool &HasAllowContract) {
@@ -43827,13 +43927,15 @@ static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
// commute the FADD operands.
SDValue LHS, RHS;
if (V1.getOpcode() == ISD::FSUB) {
- LHS = V1->getOperand(0); RHS = V1->getOperand(1);
+ LHS = V1->getOperand(0);
+ RHS = V1->getOperand(1);
if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
(V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
return false;
} else {
assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
- LHS = V2->getOperand(0); RHS = V2->getOperand(1);
+ LHS = V2->getOperand(0);
+ RHS = V2->getOperand(1);
if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
(V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
return false;
@@ -43845,8 +43947,8 @@ static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
return false;
// It's a subadd if the vector in the even parity is an FADD.
- IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
- : V2->getOpcode() == ISD::FADD;
+ IsSubAdd =
+ Op0Even ? V1->getOpcode() == ISD::FADD : V2->getOpcode() == ISD::FADD;
HasAllowContract =
V1->getFlags().hasAllowContract() && V2->getFlags().hasAllowContract();
@@ -44135,7 +44237,8 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
// TODO: Multiply by zero.
- // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
+ // If RHS/LHS elements are known zero then we don't need the LHS/RHS
+ // equivalent.
APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
Depth + 1))
@@ -44909,7 +45012,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
// For splats, unless we *only* demand the 0'th element,
// stop attempts at simplification here, we aren't going to improve things,
// this is better than any potential shuffle.
- if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
+ if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/ false))
return false;
// Get target/faux shuffle mask.
@@ -45007,7 +45110,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
EVT VT = Op.getValueType();
unsigned BitWidth = OriginalDemandedBits.getBitWidth();
unsigned Opc = Op.getOpcode();
- switch(Opc) {
+ switch (Opc) {
case X86ISD::VTRUNC: {
KnownBits KnownOp;
SDValue Src = Op.getOperand(0);
@@ -45015,8 +45118,10 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
// Simplify the input, using demanded bit information.
APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
- APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
- if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
+ APInt DemandedElts =
+ OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
+ if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO,
+ Depth + 1))
return true;
break;
}
@@ -45120,7 +45225,8 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
}
}
- // If we are only demanding sign bits then we can use the shift source directly.
+ // If we are only demanding sign bits then we can use the shift source
+ // directly.
unsigned NumSignBits =
TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
@@ -45311,8 +45417,8 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
return true;
KnownBits KnownVec;
- if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
- KnownVec, TLO, Depth + 1))
+ if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts, KnownVec,
+ TLO, Depth + 1))
return true;
if (SDValue V = SimplifyMultipleUseDemandedBits(
@@ -45948,13 +46054,13 @@ static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
static unsigned getAltBitOpcode(unsigned Opcode) {
- switch(Opcode) {
- // clang-format off
+ switch (Opcode) {
+ // clang-format off
case ISD::AND: return X86ISD::FAND;
case ISD::OR: return X86ISD::FOR;
case ISD::XOR: return X86ISD::FXOR;
case X86ISD::ANDNP: return X86ISD::FANDN;
- // clang-format on
+ // clang-format on
}
llvm_unreachable("Unknown bitwise opcode");
}
@@ -46177,8 +46283,7 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
// Convert a vXi1 constant build vector to the same width scalar integer.
static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
EVT SrcVT = Op.getValueType();
- assert(SrcVT.getVectorElementType() == MVT::i1 &&
- "Expected a vXi1 vector");
+ assert(SrcVT.getVectorElementType() == MVT::i1 && "Expected a vXi1 vector");
assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
"Expected a constant build vector");
@@ -46496,7 +46601,7 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
// and the vbroadcast_load are both integer or both fp. In some cases this
// will remove the bitcast entirely.
if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
- VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
+ VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
auto *BCast = cast<MemIntrinsicSDNode>(N0);
unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
@@ -46509,7 +46614,7 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
- SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
+ SDValue Ops[] = {BCast->getChain(), BCast->getBasePtr()};
SDValue ResNode =
DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
MemVT, BCast->getMemOperand());
@@ -46559,7 +46664,7 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
bool LowUndef = true, AllUndefOrZero = true;
for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
SDValue Op = N0.getOperand(i);
- LowUndef &= Op.isUndef() || (i >= e/2);
+ LowUndef &= Op.isUndef() || (i >= e / 2);
AllUndefOrZero &= isNullConstantOrUndef(Op);
}
if (AllUndefOrZero) {
@@ -46601,8 +46706,8 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
// Try to remove a bitcast of constant vXi1 vector. We have to legalize
// most of these to scalar anyway.
- if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
- SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
+ if (Subtarget.hasAVX512() && VT.isScalarInteger() && SrcVT.isVector() &&
+ SrcVT.getVectorElementType() == MVT::i1 &&
ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
return combinevXi1ConstantToInteger(N0, DAG);
}
@@ -46620,8 +46725,8 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
// Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
// Turn it into a sign bit compare that produces a k-register. This avoids
// a trip through a GPR.
- if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
- VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
+ if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() && VT.isVector() &&
+ VT.getVectorElementType() == MVT::i1 &&
isPowerOf2_32(VT.getVectorNumElements())) {
unsigned NumElts = VT.getVectorNumElements();
SDValue Src = N0;
@@ -46675,12 +46780,12 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
// transferring the SSE operand to integer register and back.
unsigned FPOpcode;
switch (N0.getOpcode()) {
- // clang-format off
+ // clang-format off
case ISD::AND: FPOpcode = X86ISD::FAND; break;
case ISD::OR: FPOpcode = X86ISD::FOR; break;
case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
default: return SDValue();
- // clang-format on
+ // clang-format on
}
// Check if we have a bitcast from another integer type as well.
@@ -46781,7 +46886,7 @@ static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
// Actually build the DotProduct, split as 256/512 bits for
// AVXVNNI/AVX512VNNI.
auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
- ArrayRef<SDValue> Ops) {
+ ArrayRef<SDValue> Ops) {
MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
};
@@ -46896,7 +47001,8 @@ static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
DAG.getVectorIdxConstant(0, DL));
}
-// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
+// Attempt to replace an all_of/any_of/parity style horizontal reduction with a
+// MOVMSK.
static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Bail without SSE2.
@@ -47171,9 +47277,9 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
if (Stages > 3) {
unsigned SadElems = SadVT.getVectorNumElements();
- for(unsigned i = Stages - 3; i > 0; --i) {
+ for (unsigned i = Stages - 3; i > 0; --i) {
SmallVector<int, 16> Mask(SadElems, -1);
- for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
+ for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
Mask[j] = MaskEnd + j;
SDValue Shuffle =
@@ -47489,10 +47595,10 @@ static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
Vec.getOperand(0).getValueType().getScalarType(),
Vec.getOperand(0), Index);
- SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
- Vec.getOperand(1), Index);
- SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
- Vec.getOperand(2), Index);
+ SDValue Ext1 =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Vec.getOperand(1), Index);
+ SDValue Ext2 =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Vec.getOperand(2), Index);
return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
}
@@ -47772,8 +47878,9 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
}
- // Convert extract_element(bitcast(<X x i1>) -> bitcast(extract_subvector()).
- // Improves lowering of bool masks on rust which splits them into byte array.
+ // Convert extract_element(bitcast(<X x i1>) ->
+ // bitcast(extract_subvector()). Improves lowering of bool masks on rust
+ // which splits them into byte array.
if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
SDValue Src = peekThroughBitcasts(InputVector);
if (Src.getValueType().getScalarType() == MVT::i1 &&
@@ -48123,8 +48230,7 @@ static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue Cond = N->getOperand(0);
- if ((N->getOpcode() != ISD::VSELECT &&
- N->getOpcode() != X86ISD::BLENDV) ||
+ if ((N->getOpcode() != ISD::VSELECT && N->getOpcode() != X86ISD::BLENDV) ||
ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
return SDValue();
@@ -48397,7 +48503,8 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// Check for x CC y ? x : y.
if (DAG.isEqualTo(LHS, Op0) && DAG.isEqualTo(RHS, Op1)) {
switch (CC) {
- default: break;
+ default:
+ break;
case ISD::SETULT:
// Converting this to a min would handle NaNs incorrectly, and swapping
// the operands would cause it to handle comparisons between positive
@@ -48462,10 +48569,11 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
Opcode = X86ISD::FMAX;
break;
}
- // Check for x CC y ? y : x -- a min/max with reversed arms.
+ // Check for x CC y ? y : x -- a min/max with reversed arms.
} else if (DAG.isEqualTo(LHS, Op1) && DAG.isEqualTo(RHS, Op0)) {
switch (CC) {
- default: break;
+ default:
+ break;
case ISD::SETOGE:
// Converting this to a min would handle comparisons between positive
// and negative zero incorrectly, and swapping the operands would
@@ -48669,13 +48777,13 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
Cond1 == InnerSetCC.getOperand(1)) {
ISD::CondCode NewCC;
switch (CC == ISD::SETEQ ? InnerCC : CC) {
- // clang-format off
+ // clang-format off
case ISD::SETGT: NewCC = ISD::SETGE; break;
case ISD::SETLT: NewCC = ISD::SETLE; break;
case ISD::SETUGT: NewCC = ISD::SETUGE; break;
case ISD::SETULT: NewCC = ISD::SETULE; break;
default: NewCC = ISD::SETCC_INVALID; break;
- // clang-format on
+ // clang-format on
}
if (NewCC != ISD::SETCC_INVALID) {
Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
@@ -48845,9 +48953,9 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// 16-bit lacks a proper blendv.
unsigned EltBitWidth = VT.getScalarSizeInBits();
bool CanShiftBlend =
- TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
- (Subtarget.hasAVX2() && EltBitWidth == 64) ||
- (Subtarget.hasXOP()));
+ TLI.isTypeLegal(VT) &&
+ ((Subtarget.hasAVX() && EltBitWidth == 32) ||
+ (Subtarget.hasAVX2() && EltBitWidth == 64) || (Subtarget.hasXOP()));
if (CanShiftBlend &&
ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
return C->getAPIntValue().isPowerOf2();
@@ -49086,7 +49194,7 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
SDValue Op2 = Cmp.getOperand(1);
SDValue SetCC;
- const ConstantSDNode* C = nullptr;
+ const ConstantSDNode *C = nullptr;
bool needOppositeCond = (CC == X86::COND_E);
bool checkAgainstTrue = false; // Is it a comparison against 1?
@@ -49107,8 +49215,7 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
bool truncatedToBoolWithAnd = false;
// Skip (zext $x), (trunc $x), or (and $x, 1) node.
while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
- SetCC.getOpcode() == ISD::TRUNCATE ||
- SetCC.getOpcode() == ISD::AND) {
+ SetCC.getOpcode() == ISD::TRUNCATE || SetCC.getOpcode() == ISD::AND) {
if (SetCC.getOpcode() == ISD::AND) {
int OpIdx = -1;
if (isOneConstant(SetCC.getOperand(0)))
@@ -49151,13 +49258,13 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
if (!FVal) {
SDValue Op = SetCC.getOperand(0);
// Skip 'zext' or 'trunc' node.
- if (Op.getOpcode() == ISD::ZERO_EXTEND ||
- Op.getOpcode() == ISD::TRUNCATE)
+ if (Op.getOpcode() == ISD::ZERO_EXTEND || Op.getOpcode() == ISD::TRUNCATE)
Op = Op.getOperand(0);
// A special case for rdrand/rdseed, where 0 is set if false cond is
// found.
if ((Op.getOpcode() != X86ISD::RDRAND &&
- Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
+ Op.getOpcode() != X86ISD::RDSEED) ||
+ Op.getResNo() != 0)
return SDValue();
}
// Quit if false value is not the constant 0 or 1.
@@ -49202,7 +49309,8 @@ static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
SDValue SetCC0, SetCC1;
switch (Cond->getOpcode()) {
- default: return false;
+ default:
+ return false;
case ISD::AND:
case X86ISD::AND:
isAnd = true;
@@ -49267,8 +49375,7 @@ static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
}
// If this is a check of the z flag of an add with 1, switch to the
// C flag.
- if (CarryCC == X86::COND_E &&
- CarryOp1.getOpcode() == X86ISD::ADD &&
+ if (CarryCC == X86::COND_E && CarryOp1.getOpcode() == X86ISD::ADD &&
isOneConstant(CarryOp1.getOperand(1)))
return CarryOp1;
} else if (FoundAndLSB) {
@@ -49801,12 +49908,11 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
// Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient
// for any integer data type, including i8/i16.
- if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
+ if (FalseC->getAPIntValue() + 1 == TrueC->getAPIntValue()) {
Cond = getSETCC(CC, Cond, DL, DAG);
// Zero extend the condition if needed.
- Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
- FalseC->getValueType(0), Cond);
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
SDValue(FalseC, 0));
return Cond;
@@ -49822,24 +49928,25 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
bool isFastMultiplier = false;
if (Diff.ult(10)) {
switch (Diff.getZExtValue()) {
- default: break;
- case 1: // result = add base, cond
- case 2: // result = lea base( , cond*2)
- case 3: // result = lea base(cond, cond*2)
- case 4: // result = lea base( , cond*4)
- case 5: // result = lea base(cond, cond*4)
- case 8: // result = lea base( , cond*8)
- case 9: // result = lea base(cond, cond*8)
+ default:
+ break;
+ case 1: // result = add base, cond
+ case 2: // result = lea base( , cond*2)
+ case 3: // result = lea base(cond, cond*2)
+ case 4: // result = lea base( , cond*4)
+ case 5: // result = lea base(cond, cond*4)
+ case 8: // result = lea base( , cond*8)
+ case 9: // result = lea base(cond, cond*8)
isFastMultiplier = true;
break;
}
}
if (isFastMultiplier) {
- Cond = getSETCC(CC, Cond, DL ,DAG);
+ Cond = getSETCC(CC, Cond, DL, DAG);
// Zero extend the condition if needed.
- Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
- Cond);
+ Cond =
+ DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
// Scale the condition by the difference.
if (Diff != 1)
Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
@@ -50630,8 +50737,7 @@ static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG,
// fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
// since the result of setcc_c is all zero's or all ones.
- if (VT.isInteger() && !VT.isVector() &&
- N1C && N0.getOpcode() == ISD::AND &&
+ if (VT.isInteger() && !VT.isVector() && N1C && N0.getOpcode() == ISD::AND &&
N0.getOperand(1).getOpcode() == ISD::Constant) {
SDValue N00 = N0.getOperand(0);
APInt Mask = N0.getConstantOperandAPInt(1);
@@ -50715,7 +50821,7 @@ static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
if (SraConst.isNegative())
return SDValue();
- for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
+ for (MVT SVT : {MVT::i8, MVT::i16, MVT::i32}) {
unsigned ShiftSize = SVT.getSizeInBits();
// Only deal with (Size - ShlConst) being equal to 8, 16 or 32.
if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
@@ -51049,8 +51155,8 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
// Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
// truncate to create a larger truncate.
- if (Subtarget.hasAVX512() &&
- N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
+ if (Subtarget.hasAVX512() && N0.getOpcode() == ISD::TRUNCATE &&
+ N1.isUndef() && VT == MVT::v16i8 &&
N0.getOperand(0).getValueType() == MVT::v8i32) {
if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
(!IsSigned &&
@@ -51397,7 +51503,7 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
SDValue CMP00 = CMP0->getOperand(0);
SDValue CMP01 = CMP0->getOperand(1);
- EVT VT = CMP00.getValueType();
+ EVT VT = CMP00.getValueType();
if (VT == MVT::f32 || VT == MVT::f64 ||
(VT == MVT::f16 && Subtarget.hasFP16())) {
@@ -51423,8 +51529,10 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
}
if (!ExpectingFlags) {
- enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
- enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
+ enum X86::CondCode cc0 =
+ (enum X86::CondCode)N0.getConstantOperandVal(0);
+ enum X86::CondCode cc1 =
+ (enum X86::CondCode)N1.getConstantOperandVal(0);
if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
X86::CondCode tmp = cc0;
@@ -51432,7 +51540,7 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
cc1 = tmp;
}
- if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
+ if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
(cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
// FIXME: need symbolic constants for these magic numbers.
// See X86ATTInstPrinter.cpp:printSSECC().
@@ -51442,7 +51550,8 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
DAG.getTargetConstant(x86cc, DL, MVT::i8));
// Need to fill with zeros to ensure the bitcast will produce zeroes
- // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
+ // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee
+ // that.
SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
DAG.getConstant(0, DL, MVT::v16i1),
FSetCC, DAG.getVectorIdxConstant(0, DL));
@@ -51474,8 +51583,8 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
DAG.getConstant(1, DL, IntVT));
- SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
- ANDed);
+ SDValue OneBitOfTruth =
+ DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
return OneBitOfTruth;
}
}
@@ -51670,7 +51779,8 @@ static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL,
assert(VT.isVector() && "Expected vector type");
assert((N.getOpcode() == ISD::ANY_EXTEND ||
N.getOpcode() == ISD::ZERO_EXTEND ||
- N.getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
+ N.getOpcode() == ISD::SIGN_EXTEND) &&
+ "Invalid Node");
SDValue Narrow = N.getOperand(0);
EVT NarrowVT = Narrow.getValueType();
@@ -51680,26 +51790,27 @@ static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL,
if (!Op)
return SDValue();
switch (N.getOpcode()) {
- default: llvm_unreachable("Unexpected opcode");
+ default:
+ llvm_unreachable("Unexpected opcode");
case ISD::ANY_EXTEND:
return Op;
case ISD::ZERO_EXTEND:
return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
case ISD::SIGN_EXTEND:
- return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
- Op, DAG.getValueType(NarrowVT));
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Op,
+ DAG.getValueType(NarrowVT));
}
}
static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
unsigned FPOpcode;
switch (Opcode) {
- // clang-format off
+ // clang-format off
default: llvm_unreachable("Unexpected input node for FP logic conversion");
case ISD::AND: FPOpcode = X86ISD::FAND; break;
case ISD::OR: FPOpcode = X86ISD::FOR; break;
case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
- // clang-format on
+ // clang-format on
}
return FPOpcode;
}
@@ -52142,8 +52253,7 @@ static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
DAG.getConstant(0, dl, SubVecVT));
Ops[0] = SubVec;
- SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
- Ops);
+ SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, Ops);
EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
}
@@ -52492,7 +52602,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
return R;
- if (SDValue R = combineAndNotIntoANDNP(N, dl ,DAG))
+ if (SDValue R = combineAndNotIntoANDNP(N, dl, DAG))
return R;
if (SDValue ShiftRight = combineAndMaskToShift(N, dl, DAG, Subtarget))
@@ -53268,7 +53378,8 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
if (NotCond) {
SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
- R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
+ R = DAG.getNode(ISD::MUL, dl, VT, R,
+ DAG.getConstant(Val + 1, dl, VT));
R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
return R;
}
@@ -53405,7 +53516,7 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
return SDValue();
switch (VT.getSimpleVT().SimpleTy) {
- // clang-format off
+ // clang-format off
default: return SDValue();
case MVT::v16i8:
case MVT::v8i16:
@@ -53535,8 +53646,8 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
// split across two registers. We can use a packusdw+perm to clamp to 0-65535
// and concatenate at the same time. Then we can use a final vpmovuswb to
// clip to 0-255.
- if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
- InVT == MVT::v16i32 && VT == MVT::v16i8) {
+ if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() && InVT == MVT::v16i32 &&
+ VT == MVT::v16i8) {
if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
// Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
@@ -53552,11 +53663,12 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
// FIXME: We could widen truncates to 512 to remove the VLX restriction.
  // If the result type is 256 bits or larger and we have disabled 512-bit
// registers, we should go ahead and use the pack instructions if possible.
- bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
- (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
- (InVT.getSizeInBits() > 128) &&
- (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
- !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
+ bool PreferAVX512 =
+ ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
+ (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
+ (InVT.getSizeInBits() > 128) &&
+ (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
+ !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
if (!PreferAVX512 && VT.getVectorNumElements() > 1 &&
isPowerOf2_32(VT.getVectorNumElements()) &&
@@ -53569,8 +53681,8 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
DAG, Subtarget);
assert(Mid && "Failed to pack!");
- SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
- Subtarget);
+ SDValue V =
+ truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG, Subtarget);
assert(V && "Failed to pack!");
return V;
} else if (SVT == MVT::i8 || Subtarget.hasSSE41())
@@ -53894,10 +54006,9 @@ reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
CastVT = VT.changeVectorElementType(EltVT);
}
- SDValue Load =
- DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
- ML->getPointerInfo().getWithOffset(Offset),
- Alignment, ML->getMemOperand()->getFlags());
+ SDValue Load = DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
+ ML->getPointerInfo().getWithOffset(Offset),
+ Alignment, ML->getMemOperand()->getFlags());
SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
@@ -53928,8 +54039,8 @@ combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
if (LoadFirstElt && LoadLastElt) {
SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
ML->getMemOperand());
- SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
- ML->getPassThru());
+ SDValue Blend =
+ DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getPassThru());
return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
}
@@ -53951,8 +54062,8 @@ combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
ML->getAddressingMode(), ML->getExtensionType());
- SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
- ML->getPassThru());
+ SDValue Blend =
+ DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getPassThru());
return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
}
@@ -54032,8 +54143,8 @@ static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
// Store that element at the appropriate offset from the base pointer.
return DAG.getStore(MS->getChain(), DL, Extract, Addr,
- MS->getPointerInfo().getWithOffset(Offset),
- Alignment, MS->getMemOperand()->getFlags());
+ MS->getPointerInfo().getWithOffset(Offset), Alignment,
+ MS->getMemOperand()->getFlags());
}
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
@@ -54230,15 +54341,16 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
// Turn vXi1 stores of constants into a scalar store.
if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
- VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
+ VT == MVT::v64i1) &&
+ VT == StVT && TLI.isTypeLegal(VT) &&
ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
    // If it's a v64i1 store without 64-bit support, we need two stores.
if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
- SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
- StoredVal->ops().slice(0, 32));
+ SDValue Lo =
+ DAG.getBuildVector(MVT::v32i1, dl, StoredVal->ops().slice(0, 32));
Lo = combinevXi1ConstantToInteger(Lo, DAG);
- SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
- StoredVal->ops().slice(32, 32));
+ SDValue Hi =
+ DAG.getBuildVector(MVT::v32i1, dl, StoredVal->ops().slice(32, 32));
Hi = combinevXi1ConstantToInteger(Hi, DAG);
SDValue Ptr0 = St->getBasePtr();
@@ -54338,9 +54450,9 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
StoredVal.hasOneUse() &&
TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
- return EmitTruncSStore(IsSigned, St->getChain(),
- dl, StoredVal.getOperand(0), St->getBasePtr(),
- VT, St->getMemOperand(), DAG);
+ return EmitTruncSStore(IsSigned, St->getChain(), dl,
+ StoredVal.getOperand(0), St->getBasePtr(), VT,
+ St->getMemOperand(), DAG);
}
// Try to fold a extract_element(VTRUNC) pattern into a truncating store.
@@ -54379,14 +54491,14 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
if (St->isTruncatingStore() && VT.isVector()) {
if (TLI.isTruncStoreLegal(VT, StVT)) {
if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
- return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
- dl, Val, St->getBasePtr(),
- St->getMemoryVT(), St->getMemOperand(), DAG);
- if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
- DAG, dl))
+ return EmitTruncSStore(true /* Signed saturation */, St->getChain(), dl,
+ Val, St->getBasePtr(), St->getMemoryVT(),
+ St->getMemOperand(), DAG);
+ if (SDValue Val =
+ detectUSatPattern(St->getValue(), St->getMemoryVT(), DAG, dl))
return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
- dl, Val, St->getBasePtr(),
- St->getMemoryVT(), St->getMemOperand(), DAG);
+ dl, Val, St->getBasePtr(), St->getMemoryVT(),
+ St->getMemOperand(), DAG);
}
return SDValue();
@@ -55194,8 +55306,7 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- const SDLoc &DL) {
+ const X86Subtarget &Subtarget, const SDLoc &DL) {
using namespace SDPatternMatch;
if (!VT.isVector() || !Subtarget.hasSSSE3())
return SDValue();
@@ -55269,8 +55380,8 @@ static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
std::swap(IdxN01, IdxN11);
}
  // N0 indices must be the even element. N1 indices must be the next odd element.
- if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
- IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
+ if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 || IdxN01 != 2 * i ||
+ IdxN11 != 2 * i + 1)
return SDValue();
SDValue N00In = N00Elt.getOperand(0);
SDValue N01In = N01Elt.getOperand(0);
@@ -55281,8 +55392,8 @@ static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
ZExtIn = N00In;
SExtIn = N01In;
}
- if (ZExtIn != N00In || SExtIn != N01In ||
- ZExtIn != N10In || SExtIn != N11In)
+ if (ZExtIn != N00In || SExtIn != N01In || ZExtIn != N10In ||
+ SExtIn != N11In)
return SDValue();
}
@@ -55302,14 +55413,13 @@ static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
// Shrink by adding truncate nodes and let DAGCombine fold with the
// sources.
EVT InVT = Ops[0].getValueType();
- assert(InVT.getScalarType() == MVT::i8 &&
- "Unexpected scalar element type");
+ assert(InVT.getScalarType() == MVT::i8 && "Unexpected scalar element type");
assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
InVT.getVectorNumElements() / 2);
return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
};
- return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
+ return SplitOpsAndApply(DAG, Subtarget, DL, VT, {ZExtIn, SExtIn},
PMADDBuilder);
}
@@ -55494,7 +55604,7 @@ static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
bool NegRes) {
if (NegMul) {
switch (Opcode) {
- // clang-format off
+ // clang-format off
default: llvm_unreachable("Unexpected opcode");
case ISD::FMA: Opcode = X86ISD::FNMADD; break;
case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
@@ -55508,13 +55618,13 @@ static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
- // clang-format on
+ // clang-format on
}
}
if (NegAcc) {
switch (Opcode) {
- // clang-format off
+ // clang-format off
default: llvm_unreachable("Unexpected opcode");
case ISD::FMA: Opcode = X86ISD::FMSUB; break;
case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
@@ -55532,7 +55642,7 @@ static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
- // clang-format on
+ // clang-format on
}
}
@@ -55549,7 +55659,7 @@ static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
- // clang-format on
+ // clang-format on
}
}
@@ -55681,13 +55791,13 @@ static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
unsigned IntOpcode;
switch (N->getOpcode()) {
- // clang-format off
+ // clang-format off
default: llvm_unreachable("Unexpected FP logic op");
case X86ISD::FOR: IntOpcode = ISD::OR; break;
case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
case X86ISD::FAND: IntOpcode = ISD::AND; break;
case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
- // clang-format on
+ // clang-format on
}
SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
return DAG.getBitcast(VT, IntOp);
@@ -56039,13 +56149,18 @@ static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
// into FMINC and FMAXC, which are Commutative operations.
unsigned NewOp = 0;
switch (N->getOpcode()) {
- default: llvm_unreachable("unknown opcode");
- case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
- case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
+ default:
+ llvm_unreachable("unknown opcode");
+ case X86ISD::FMIN:
+ NewOp = X86ISD::FMINC;
+ break;
+ case X86ISD::FMAX:
+ NewOp = X86ISD::FMAXC;
+ break;
}
- return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
- N->getOperand(0), N->getOperand(1));
+ return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0), N->getOperand(0),
+ N->getOperand(1));
}
static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
@@ -56091,8 +56206,8 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
return SDValue();
- EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
- VT);
+ EVT SetCCType =
+ TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
// There are 4 possibilities involving NaN inputs, and these are the required
// outputs:
@@ -56142,8 +56257,8 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
SDLoc dl(N);
- SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
- DAG.getBitcast(InVT, VZLoad));
+ SDValue Convert =
+ DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
DCI.CombineTo(N, Convert);
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
DCI.recursivelyDeleteUnusedNodes(LN);
@@ -56638,8 +56753,8 @@ static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
// Only combine legal element types.
EVT SVT = VT.getVectorElementType();
- if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
- SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
+ if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 && SVT != MVT::i64 &&
+ SVT != MVT::f32 && SVT != MVT::f64)
return SDValue();
// We don't have CMPP Instruction for vxf16
@@ -56679,16 +56794,15 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
SDLoc DL(N);
// (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
- if (!DCI.isBeforeLegalizeOps() &&
- N0.getOpcode() == X86ISD::SETCC_CARRY) {
+ if (!DCI.isBeforeLegalizeOps() && N0.getOpcode() == X86ISD::SETCC_CARRY) {
SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
- N0->getOperand(1));
+ N0->getOperand(1));
bool ReplaceOtherUses = !N0.hasOneUse();
DCI.CombineTo(N, Setcc);
// Replace other uses with a truncate of the widened setcc_carry.
if (ReplaceOtherUses) {
- SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
- N0.getValueType(), Setcc);
+ SDValue Trunc =
+ DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), Setcc);
DCI.CombineTo(N0.getNode(), Trunc);
}
@@ -56981,13 +57095,13 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
N0.getOpcode() == X86ISD::SETCC_CARRY) {
SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
- N0->getOperand(1));
+ N0->getOperand(1));
bool ReplaceOtherUses = !N0.hasOneUse();
DCI.CombineTo(N, Setcc);
// Replace other uses with a truncate of the widened setcc_carry.
if (ReplaceOtherUses) {
- SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
- N0.getValueType(), Setcc);
+ SDValue Trunc =
+ DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), Setcc);
DCI.CombineTo(N0.getNode(), Trunc);
}
@@ -57263,8 +57377,8 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
SDValue BaseOp = LHS.getOperand(0);
SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
- SDValue SETCC1 = DAG.getSetCC(
- DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
+ SDValue SETCC1 = DAG.getSetCC(DL, VT, BaseOp,
+ DAG.getConstant(-CInt, DL, OpVT), CC);
return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
SETCC0, SETCC1);
}
@@ -57624,19 +57738,25 @@ static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
SDLoc DL(GorS);
if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
- SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
- Gather->getMask(), Base, Index, Scale } ;
- return DAG.getMaskedGather(Gather->getVTList(),
- Gather->getMemoryVT(), DL, Ops,
- Gather->getMemOperand(),
+ SDValue Ops[] = {Gather->getChain(),
+ Gather->getPassThru(),
+ Gather->getMask(),
+ Base,
+ Index,
+ Scale};
+ return DAG.getMaskedGather(Gather->getVTList(), Gather->getMemoryVT(), DL,
+ Ops, Gather->getMemOperand(),
Gather->getIndexType(),
Gather->getExtensionType());
}
auto *Scatter = cast<MaskedScatterSDNode>(GorS);
- SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
- Scatter->getMask(), Base, Index, Scale };
- return DAG.getMaskedScatter(Scatter->getVTList(),
- Scatter->getMemoryVT(), DL,
+ SDValue Ops[] = {Scatter->getChain(),
+ Scatter->getValue(),
+ Scatter->getMask(),
+ Base,
+ Index,
+ Scale};
+ return DAG.getMaskedScatter(Scatter->getVTList(), Scatter->getMemoryVT(), DL,
Ops, Scatter->getMemOperand(),
Scatter->getIndexType(),
Scatter->isTruncatingStore());
@@ -57867,8 +57987,8 @@ static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
// The AND node needs bitcasts to/from an integer vector type around it.
SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
- SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
- MaskConst);
+ SDValue NewAnd =
+ DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0), MaskConst);
SDValue Res = DAG.getBitcast(VT, NewAnd);
if (IsStrict)
return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
@@ -58054,8 +58174,8 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
// use CVTSI2P.
assert(InVT == MVT::v2i64 && "Unexpected VT!");
SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
- SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
- { 0, 2, -1, -1 });
+ SDValue Shuf =
+ DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast, {0, 2, -1, -1});
if (IsStrict)
return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
{N->getOperand(0), Shuf});
@@ -58156,7 +58276,7 @@ static bool needCarryOrOverflowFlag(SDValue Flags) {
}
switch (CC) {
- // clang-format off
+ // clang-format off
default: break;
case X86::COND_A: case X86::COND_AE:
case X86::COND_B: case X86::COND_BE:
@@ -58164,7 +58284,7 @@ static bool needCarryOrOverflowFlag(SDValue Flags) {
case X86::COND_G: case X86::COND_GE:
case X86::COND_L: case X86::COND_LE:
return true;
- // clang-format on
+ // clang-format on
}
}
@@ -58300,11 +58420,12 @@ static SDValue combineCMP(SDNode *N, SelectionDAG &DAG,
// After this the truncate and arithmetic op must have a single use.
if (!Trunc.hasOneUse() || !Op.hasOneUse())
- return SDValue();
+ return SDValue();
unsigned NewOpc;
switch (Op.getOpcode()) {
- default: return SDValue();
+ default:
+ return SDValue();
case ISD::AND:
// Skip and with constant. We have special handling for and with immediate
// during isel to generate test instructions.
@@ -58312,8 +58433,12 @@ static SDValue combineCMP(SDNode *N, SelectionDAG &DAG,
return SDValue();
NewOpc = X86ISD::AND;
break;
- case ISD::OR: NewOpc = X86ISD::OR; break;
- case ISD::XOR: NewOpc = X86ISD::XOR; break;
+ case ISD::OR:
+ NewOpc = X86ISD::OR;
+ break;
+ case ISD::XOR:
+ NewOpc = X86ISD::XOR;
+ break;
case ISD::ADD:
// If the carry or overflow flag is used, we can't truncate.
if (needCarryOrOverflowFlag(SDValue(N, 0)))
@@ -58490,9 +58615,8 @@ static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N,
- const SDLoc &DL, EVT VT,
- const X86Subtarget &Subtarget) {
+static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N, const SDLoc &DL,
+ EVT VT, const X86Subtarget &Subtarget) {
using namespace SDPatternMatch;
// Example of pattern we try to detect:
@@ -58600,9 +58724,8 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N,
// Attempt to turn this pattern into PMADDWD.
// (add (mul (sext (build_vector)), (sext (build_vector))),
// (mul (sext (build_vector)), (sext (build_vector)))
-static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N,
- const SDLoc &DL, EVT VT,
- const X86Subtarget &Subtarget) {
+static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N, const SDLoc &DL,
+ EVT VT, const X86Subtarget &Subtarget) {
using namespace SDPatternMatch;
if (!Subtarget.hasSSE2())
@@ -58698,7 +58821,7 @@ static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N,
// If the output is narrower than an input, extract the low part of the input
// vector.
EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
- VT.getVectorNumElements() * 2);
+ VT.getVectorNumElements() * 2);
if (OutVT16.bitsLT(In0.getValueType())) {
In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
DAG.getVectorIdxConstant(0, DL));
@@ -58707,8 +58830,7 @@ static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N,
In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
DAG.getVectorIdxConstant(0, DL));
}
- return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
- PMADDBuilder);
+ return SplitOpsAndApply(DAG, Subtarget, DL, VT, {In0, In1}, PMADDBuilder);
}
// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
@@ -59677,8 +59799,8 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
unsigned Imm1 = Ops[1].getConstantOperandVal(2);
// TODO: Handle zero'd subvectors.
if ((Imm0 & 0x88) == 0 && (Imm1 & 0x88) == 0) {
- int Mask[4] = {(int)(Imm0 & 0x03), (int)((Imm0 >> 4) & 0x3), (int)(Imm1 & 0x03),
- (int)((Imm1 >> 4) & 0x3)};
+ int Mask[4] = {(int)(Imm0 & 0x03), (int)((Imm0 >> 4) & 0x3),
+ (int)(Imm1 & 0x03), (int)((Imm1 >> 4) & 0x3)};
MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
Ops[0].getOperand(1), DAG, DL);
@@ -59866,8 +59988,7 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
break;
}
- ISD::CondCode ICC =
- Opcode == X86ISD::PCMPEQ ? ISD::SETEQ : ISD::SETGT;
+ ISD::CondCode ICC = Opcode == X86ISD::PCMPEQ ? ISD::SETEQ : ISD::SETGT;
ISD::CondCode FCC =
Opcode == X86ISD::PCMPEQ ? ISD::SETOEQ : ISD::SETOGT;
@@ -60217,7 +60338,8 @@ static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
APInt Constant = APInt::getZero(VT.getSizeInBits());
for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
auto *C = dyn_cast<ConstantSDNode>(peekThroughBitcasts(Ops[I]));
- if (!C) break;
+ if (!C)
+ break;
Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
if (I == (E - 1)) {
EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
@@ -60290,9 +60412,9 @@ static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
SubVecVT.getFixedSizeInBits())
- return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
- getZeroVector(OpVT, Subtarget, DAG, dl),
- Ins.getOperand(1), N->getOperand(2));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
+ getZeroVector(OpVT, Subtarget, DAG, dl),
+ Ins.getOperand(1), N->getOperand(2));
}
}
@@ -60982,7 +61104,7 @@ static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
LHS.getOperand(0).getValueType() == MVT::v4i32) {
SDLoc dl(N);
LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
- LHS.getOperand(0), { 0, -1, 1, -1 });
+ LHS.getOperand(0), {0, -1, 1, -1});
LHS = DAG.getBitcast(MVT::v2i64, LHS);
return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
}
@@ -60992,7 +61114,7 @@ static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
RHS.getOperand(0).getValueType() == MVT::v4i32) {
SDLoc dl(N);
RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
- RHS.getOperand(0), { 0, -1, 1, -1 });
+ RHS.getOperand(0), {0, -1, 1, -1});
RHS = DAG.getBitcast(MVT::v2i64, RHS);
return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
}
@@ -61263,16 +61385,16 @@ static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
// Widen to at least 8 input elements.
if (NumElts < 8) {
unsigned NumConcats = 8 / NumElts;
- SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
- : DAG.getConstant(0, dl, IntVT);
+ SDValue Fill =
+ NumElts == 4 ? DAG.getUNDEF(IntVT) : DAG.getConstant(0, dl, IntVT);
SmallVector<SDValue, 4> Ops(NumConcats, Fill);
Ops[0] = Src;
Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
}
// Destination is vXf32 with at least 4 elements.
- EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
- std::max(4U, NumElts));
+ EVT CvtVT =
+ EVT::getVectorVT(*DAG.getContext(), MVT::f32, std::max(4U, NumElts));
SDValue Cvt, Chain;
if (IsStrict) {
Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
@@ -61542,7 +61664,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
switch (N->getOpcode()) {
- // clang-format off
+ // clang-format off
default: break;
case ISD::SCALAR_TO_VECTOR:
return combineSCALAR_TO_VECTOR(N, DAG, Subtarget);
@@ -61893,7 +62015,8 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
bool Commute = false;
switch (Op.getOpcode()) {
- default: return false;
+ default:
+ return false;
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
@@ -61933,8 +62056,7 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
((Commute && !isa<ConstantSDNode>(N1)) ||
(Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
return false;
- if (IsFoldableAtomicRMW(N0, Op) ||
- (Commute && IsFoldableAtomicRMW(N1, Op)))
+ if (IsFoldableAtomicRMW(N0, Op) || (Commute && IsFoldableAtomicRMW(N1, Op)))
return false;
}
}
@@ -62021,8 +62143,7 @@ X86TargetLowering::getConstraintType(StringRef Constraint) const {
default:
break;
}
- }
- else if (Constraint.size() == 2) {
+ } else if (Constraint.size() == 2) {
switch (Constraint[0]) {
default:
break;
@@ -62211,8 +62332,7 @@ X86TargetLowering::getSingleConstraintMatchWeight(
/// Try to replace an X constraint, which matches anything, with another that
/// has more specific requirements based on the type of the corresponding
/// operand.
-const char *X86TargetLowering::
-LowerXConstraint(EVT ConstraintVT) const {
+const char *X86TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
// FP X constraints get lowered to SSE1/2 registers if available, otherwise
// 'f' like normal targets.
if (ConstraintVT.isFloatingPoint()) {
@@ -62258,7 +62378,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
SDValue Result;
char ConstraintLetter = Constraint[0];
switch (ConstraintLetter) {
- default: break;
+ default:
+ break;
case 'I':
if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->getZExtValue() <= 31) {
@@ -62332,8 +62453,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
break;
}
- // FIXME gcc accepts some relocatable values here too, but only in certain
- // memory models; it's complicated.
+ // FIXME gcc accepts some relocatable values here too, but only in certain
+ // memory models; it's complicated.
}
return;
}
@@ -62376,8 +62497,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
BooleanContent BCont = getBooleanContents(MVT::i64);
- ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
- : ISD::SIGN_EXTEND;
+ ISD::NodeType ExtOpc =
+ IsBool ? getExtendForContent(BCont) : ISD::SIGN_EXTEND;
int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
: CST->getSExtValue();
Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
@@ -62456,7 +62577,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
if (Constraint.size() == 1) {
// GCC Constraint Letters
switch (Constraint[0]) {
- default: break;
+ default:
+ break;
// 'A' means [ER]AX + [ER]DX.
case 'A':
if (Subtarget.is64Bit())
@@ -62484,7 +62606,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(0U, &X86::VK64RegClass);
}
break;
- case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
+ case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
if (Subtarget.is64Bit()) {
if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
@@ -62506,7 +62628,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
}
[[fallthrough]];
// 32-bit fallthrough
- case 'Q': // Q_REGS
+ case 'Q': // Q_REGS
if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
if (VT == MVT::i16)
@@ -62517,8 +62639,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
if (VT != MVT::f80 && !VT.isVector())
return std::make_pair(0U, &X86::GR64_ABCDRegClass);
break;
- case 'r': // GENERAL_REGS
- case 'l': // INDEX_REGS
+ case 'r': // GENERAL_REGS
+ case 'l': // INDEX_REGS
if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
? &X86::GR8RegClass
@@ -62537,7 +62659,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
? &X86::GR64RegClass
: &X86::GR64_NOREX2RegClass);
break;
- case 'R': // LEGACY_REGS
+ case 'R': // LEGACY_REGS
if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, &X86::GR8_NOREXRegClass);
if (VT == MVT::i16)
@@ -62548,7 +62670,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
if (VT != MVT::f80 && !VT.isVector())
return std::make_pair(0U, &X86::GR64_NOREXRegClass);
break;
- case 'f': // FP Stack registers.
+ case 'f': // FP Stack registers.
// If SSE is enabled for this VT, use f80 to ensure the isel moves the
// value to the correct fpstack register class.
if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
@@ -62558,16 +62680,19 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
return std::make_pair(0U, &X86::RFP80RegClass);
break;
- case 'y': // MMX_REGS if MMX allowed.
- if (!Subtarget.hasMMX()) break;
+ case 'y': // MMX_REGS if MMX allowed.
+ if (!Subtarget.hasMMX())
+ break;
return std::make_pair(0U, &X86::VR64RegClass);
case 'v':
- case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
- if (!Subtarget.hasSSE1()) break;
+ case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
+ if (!Subtarget.hasSSE1())
+ break;
bool VConstraint = (Constraint[0] == 'v');
switch (VT.SimpleTy) {
- default: break;
+ default:
+ break;
// Scalar SSE types.
case MVT::f16:
if (VConstraint && Subtarget.hasFP16())
@@ -62658,7 +62783,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
case MVT::v16f32:
case MVT::v16i32:
case MVT::v8i64:
- if (!Subtarget.hasAVX512()) break;
+ if (!Subtarget.hasAVX512())
+ break;
if (VConstraint)
return std::make_pair(0U, &X86::VR512RegClass);
return std::make_pair(0U, &X86::VR512_0_15RegClass);
@@ -62674,12 +62800,15 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
case '2':
return getRegForInlineAsmConstraint(TRI, "x", VT);
case 'm':
- if (!Subtarget.hasMMX()) break;
+ if (!Subtarget.hasMMX())
+ break;
return std::make_pair(0U, &X86::VR64RegClass);
case 'z':
- if (!Subtarget.hasSSE1()) break;
+ if (!Subtarget.hasSSE1())
+ break;
switch (VT.SimpleTy) {
- default: break;
+ default:
+ break;
// Scalar SSE types.
case MVT::f16:
if (!Subtarget.hasFP16())
@@ -62794,14 +62923,15 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
// Use the default implementation in TargetLowering to convert the register
// constraint into a member of a register class.
- std::pair<Register, const TargetRegisterClass*> Res;
+ std::pair<Register, const TargetRegisterClass *> Res;
Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
// Not found as a standard register?
if (!Res.second) {
// Only match x87 registers if the VT is one SelectionDAGBuilder can convert
// to/from f80.
- if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
+ if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 ||
+ VT == MVT::f80) {
// Map st(0) -> st(7) -> ST0
if (Constraint.size() == 7 && Constraint[0] == '{' &&
tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
@@ -62859,7 +62989,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
// turn into {ax},{dx}.
// MVT::Other is used to specify clobber names.
if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
- return Res; // Correct type already, nothing to do.
+ return Res; // Correct type already, nothing to do.
  // Get a matching integer of the correct size. i.e. "ax" with MVT::i32 should
  // return "eax". This should even work for things like getting 64-bit integer
@@ -62871,7 +63001,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
// Therefore, use a helper method.
if (isGRClass(*Class)) {
unsigned Size = VT.getSizeInBits();
- if (Size == 1) Size = 8;
+ if (Size == 1)
+ Size = 8;
if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
return std::make_pair(0, nullptr);
Register DestReg = getX86SubSuperRegister(Res.first, Size);
@@ -62879,9 +63010,11 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
bool is64Bit = Subtarget.is64Bit();
const TargetRegisterClass *RC =
Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
- : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
- : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
- : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
+ : Size == 16
+ ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
+ : Size == 32
+ ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
+ : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
if (Size == 64 && !is64Bit) {
// Model GCC's behavior here and select a fixed pair of 32-bit
// registers.
@@ -63133,8 +63266,7 @@ X86TargetLowering::getStackProbeSymbolName(const MachineFunction &MF) const {
return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
}
-unsigned
-X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const {
+unsigned X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const {
// The default stack probe size is 4096 if the function has no stackprobesize
// attribute.
return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
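
Most of the X86ISelLowering.cpp hunks above are clang-format churn around the functional change, the X86ISD::CTSELECT node (a constant-time select implemented with CMOV, per the node description in the header diff below). As a rough sketch of the CMOV idea only -- the helper name and shape here are illustrative, not code from this patch, and it assumes the usual X86ISelLowering.cpp includes and SelectionDAG context -- a branch-free select over EFLAGS can be built along these lines:

    // Sketch: select TrueV when Cond != 0, FalseV otherwise, without a branch.
    // lowerCTSelectSketch is a made-up name, not a helper added by this patch.
    static SDValue lowerCTSelectSketch(SDValue Cond, SDValue TrueV,
                                       SDValue FalseV, const SDLoc &DL,
                                       SelectionDAG &DAG) {
      EVT VT = TrueV.getValueType();
      // Compare the condition against zero to produce EFLAGS.
      SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Cond,
                                  DAG.getConstant(0, DL, Cond.getValueType()));
      SDValue CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
      // X86ISD::CMOV yields operand 1 when the condition holds and operand 0
      // otherwise; the choice depends on EFLAGS, not on control flow.
      return DAG.getNode(X86ISD::CMOV, DL, VT, FalseV, TrueV, CC, Flags);
    }

Because the selection flows through EFLAGS and a conditional move rather than a branch, the instruction sequence that executes is the same for either value of the condition.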
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index d759895719388..df3838fab4ae9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -18,1975 +18,1964 @@
#include "llvm/CodeGen/TargetLowering.h"
namespace llvm {
- class X86Subtarget;
- class X86TargetMachine;
-
- namespace X86ISD {
- // X86 Specific DAG Nodes
- enum NodeType : unsigned {
- // Start the numbering where the builtin ops leave off.
- FIRST_NUMBER = ISD::BUILTIN_OP_END,
-
- /// Bit scan forward.
- BSF,
- /// Bit scan reverse.
- BSR,
-
- /// X86 funnel/double shift i16 instructions. These correspond to
- /// X86::SHLDW and X86::SHRDW instructions which have different amt
- /// modulo rules to generic funnel shifts.
- /// NOTE: The operand order matches ISD::FSHL/FSHR not SHLD/SHRD.
- FSHL,
- FSHR,
-
- /// Bitwise logical AND of floating point values. This corresponds
- /// to X86::ANDPS or X86::ANDPD.
- FAND,
-
- /// Bitwise logical OR of floating point values. This corresponds
- /// to X86::ORPS or X86::ORPD.
- FOR,
-
- /// Bitwise logical XOR of floating point values. This corresponds
- /// to X86::XORPS or X86::XORPD.
- FXOR,
-
- /// Bitwise logical ANDNOT of floating point values. This
- /// corresponds to X86::ANDNPS or X86::ANDNPD.
- FANDN,
-
- /// These operations represent an abstract X86 call
- /// instruction, which includes a bunch of information. In particular the
- /// operands of these node are:
- ///
- /// #0 - The incoming token chain
- /// #1 - The callee
- /// #2 - The number of arg bytes the caller pushes on the stack.
- /// #3 - The number of arg bytes the callee pops off the stack.
- /// #4 - The value to pass in AL/AX/EAX (optional)
- /// #5 - The value to pass in DL/DX/EDX (optional)
- ///
- /// The result values of these nodes are:
- ///
- /// #0 - The outgoing token chain
- /// #1 - The first register result value (optional)
- /// #2 - The second register result value (optional)
- ///
- CALL,
-
- /// Same as call except it adds the NoTrack prefix.
- NT_CALL,
-
- // Pseudo for a OBJC call that gets emitted together with a special
- // marker instruction.
- CALL_RVMARKER,
-
- /// The same as ISD::CopyFromReg except that this node makes it explicit
- /// that it may lower to an x87 FPU stack pop. Optimizations should be more
- /// cautious when handling this node than a normal CopyFromReg to avoid
- /// removing a required FPU stack pop. A key requirement is optimizations
- /// should not optimize any users of a chain that contains a
- /// POP_FROM_X87_REG to use a chain from a point earlier than the
- /// POP_FROM_X87_REG (which may remove a required FPU stack pop).
- POP_FROM_X87_REG,
-
- // Pseudo for a call to an imported function to ensure the correct machine
- // instruction is emitted for Import Call Optimization.
- IMP_CALL,
-
- /// X86 compare and logical compare instructions.
- CMP,
- FCMP,
- COMI,
- UCOMI,
-
- // X86 compare with Intrinsics similar to COMI.
- COMX,
- UCOMX,
-
- /// X86 bit-test instructions.
- BT,
-
- /// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS
- /// operand, usually produced by a CMP instruction.
- SETCC,
-
- /// X86 Select
- SELECTS,
-
-  /// X86 Constant-time Select, implemented with a CMOV instruction. This is
-  /// used to lower the ct.select intrinsic without a data-dependent branch.
- CTSELECT,
-
- // Same as SETCC except it's materialized with a sbb and the value is all
- // one's or all zero's.
- SETCC_CARRY, // R = carry_bit ? ~0 : 0
-
- /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
- /// Operands are two FP values to compare; result is a mask of
- /// 0s or 1s. Generally DTRT for C/C++ with NaNs.
- FSETCC,
-
-  /// X86 FP SETCC, similar to above, but with the output as an i1 mask,
-  /// and a version with SAE.
- FSETCCM,
- FSETCCM_SAE,
-
- /// X86 conditional moves. Operand 0 and operand 1 are the two values
- /// to select from. Operand 2 is the condition code, and operand 3 is the
- /// flag operand produced by a CMP or TEST instruction.
- CMOV,
-
- /// X86 conditional branches. Operand 0 is the chain operand, operand 1
- /// is the block to branch if condition is true, operand 2 is the
- /// condition code, and operand 3 is the flag operand produced by a CMP
- /// or TEST instruction.
- BRCOND,
-
- /// BRIND node with NoTrack prefix. Operand 0 is the chain operand and
- /// operand 1 is the target address.
- NT_BRIND,
-
- /// Return with a glue operand. Operand 0 is the chain operand, operand
- /// 1 is the number of bytes of stack to pop.
- RET_GLUE,
-
- /// Return from interrupt. Operand 0 is the number of bytes to pop.
- IRET,
-
- /// Repeat fill, corresponds to X86::REP_STOSx.
- REP_STOS,
-
- /// Repeat move, corresponds to X86::REP_MOVSx.
- REP_MOVS,
-
- /// On Darwin, this node represents the result of the popl
- /// at function entry, used for PIC code.
- GlobalBaseReg,
-
- /// A wrapper node for TargetConstantPool, TargetJumpTable,
- /// TargetExternalSymbol, TargetGlobalAddress, TargetGlobalTLSAddress,
- /// MCSymbol and TargetBlockAddress.
- Wrapper,
-
- /// Special wrapper used under X86-64 PIC mode for RIP
- /// relative displacements.
- WrapperRIP,
-
- /// Copies a 64-bit value from an MMX vector to the low word
- /// of an XMM vector, with the high word zero filled.
- MOVQ2DQ,
-
- /// Copies a 64-bit value from the low word of an XMM vector
- /// to an MMX vector.
- MOVDQ2Q,
-
- /// Copies a 32-bit value from the low word of a MMX
- /// vector to a GPR.
- MMX_MOVD2W,
-
- /// Copies a GPR into the low 32-bit word of a MMX vector
- /// and zero out the high word.
- MMX_MOVW2D,
-
- /// Extract an 8-bit value from a vector and zero extend it to
- /// i32, corresponds to X86::PEXTRB.
- PEXTRB,
-
- /// Extract a 16-bit value from a vector and zero extend it to
- /// i32, corresponds to X86::PEXTRW.
- PEXTRW,
-
- /// Insert any element of a 4 x float vector into any element
-  /// of a destination 4 x float vector.
- INSERTPS,
-
- /// Insert the lower 8-bits of a 32-bit value to a vector,
- /// corresponds to X86::PINSRB.
- PINSRB,
-
- /// Insert the lower 16-bits of a 32-bit value to a vector,
- /// corresponds to X86::PINSRW.
- PINSRW,
-
- /// Shuffle 16 8-bit values within a vector.
- PSHUFB,
-
- /// Compute Sum of Absolute Differences.
- PSADBW,
- /// Compute Double Block Packed Sum-Absolute-Differences
- DBPSADBW,
-
- /// Bitwise Logical AND NOT of Packed FP values.
- ANDNP,
-
- /// Blend where the selector is an immediate.
- BLENDI,
-
- /// Dynamic (non-constant condition) vector blend where only the sign bits
- /// of the condition elements are used. This is used to enforce that the
- /// condition mask is not valid for generic VSELECT optimizations. This
- /// is also used to implement the intrinsics.
- /// Operands are in VSELECT order: MASK, TRUE, FALSE
- BLENDV,
-
- /// Combined add and sub on an FP vector.
- ADDSUB,
-
- // FP vector ops with rounding mode.
- FADD_RND,
- FADDS,
- FADDS_RND,
- FSUB_RND,
- FSUBS,
- FSUBS_RND,
- FMUL_RND,
- FMULS,
- FMULS_RND,
- FDIV_RND,
- FDIVS,
- FDIVS_RND,
- FMAX_SAE,
- FMAXS_SAE,
- FMIN_SAE,
- FMINS_SAE,
- FSQRT_RND,
- FSQRTS,
- FSQRTS_RND,
-
- // FP vector get exponent.
- FGETEXP,
- FGETEXP_SAE,
- FGETEXPS,
- FGETEXPS_SAE,
- // Extract Normalized Mantissas.
- VGETMANT,
- VGETMANT_SAE,
- VGETMANTS,
- VGETMANTS_SAE,
- // FP Scale.
- SCALEF,
- SCALEF_RND,
- SCALEFS,
- SCALEFS_RND,
-
- /// Integer horizontal add/sub.
- HADD,
- HSUB,
-
- /// Floating point horizontal add/sub.
- FHADD,
- FHSUB,
-
- // Detect Conflicts Within a Vector
- CONFLICT,
-
- /// Floating point max and min.
- FMAX,
- FMIN,
-
- /// Commutative FMIN and FMAX.
- FMAXC,
- FMINC,
-
- /// Scalar intrinsic floating point max and min.
- FMAXS,
- FMINS,
-
- /// Floating point reciprocal-sqrt and reciprocal approximation.
- /// Note that these typically require refinement
- /// in order to obtain suitable precision.
- FRSQRT,
- FRCP,
-
- // AVX-512 reciprocal approximations with a little more precision.
- RSQRT14,
- RSQRT14S,
- RCP14,
- RCP14S,
-
- // Thread Local Storage.
- TLSADDR,
-
- // Thread Local Storage. A call to get the start address
- // of the TLS block for the current module.
- TLSBASEADDR,
-
- // Thread Local Storage. When calling to an OS provided
- // thunk at the address from an earlier relocation.
- TLSCALL,
-
- // Thread Local Storage. A descriptor containing pointer to
- // code and to argument to get the TLS offset for the symbol.
- TLSDESC,
-
- // Exception Handling helpers.
- EH_RETURN,
-
- // SjLj exception handling setjmp.
- EH_SJLJ_SETJMP,
-
- // SjLj exception handling longjmp.
- EH_SJLJ_LONGJMP,
-
- // SjLj exception handling dispatch.
- EH_SJLJ_SETUP_DISPATCH,
-
- /// Tail call return. See X86TargetLowering::LowerCall for
- /// the list of operands.
- TC_RETURN,
-
- // Vector move to low scalar and zero higher vector elements.
- VZEXT_MOVL,
-
- // Vector integer truncate.
- VTRUNC,
- // Vector integer truncate with unsigned/signed saturation.
- VTRUNCUS,
- VTRUNCS,
-
- // Masked version of the above. Used when less than a 128-bit result is
- // produced since the mask only applies to the lower elements and can't
- // be represented by a select.
- // SRC, PASSTHRU, MASK
- VMTRUNC,
- VMTRUNCUS,
- VMTRUNCS,
-
- // Vector FP extend.
- VFPEXT,
- VFPEXT_SAE,
- VFPEXTS,
- VFPEXTS_SAE,
-
- // Vector FP round.
- VFPROUND,
- // Convert TWO packed single data to one packed data
- VFPROUND2,
- VFPROUND2_RND,
- VFPROUND_RND,
- VFPROUNDS,
- VFPROUNDS_RND,
-
- // Masked version of above. Used for v2f64->v4f32.
- // SRC, PASSTHRU, MASK
- VMFPROUND,
-
- // 128-bit vector logical left / right shift
- VSHLDQ,
- VSRLDQ,
-
- // Vector shift elements
- VSHL,
- VSRL,
- VSRA,
-
- // Vector variable shift
- VSHLV,
- VSRLV,
- VSRAV,
-
- // Vector shift elements by immediate
- VSHLI,
- VSRLI,
- VSRAI,
-
- // Shifts of mask registers.
- KSHIFTL,
- KSHIFTR,
-
- // Bit rotate by immediate
- VROTLI,
- VROTRI,
-
- // Vector packed double/float comparison.
- CMPP,
-
- // Vector integer comparisons.
- PCMPEQ,
- PCMPGT,
-
- // v8i16 Horizontal minimum and position.
- PHMINPOS,
-
- MULTISHIFT,
-
- /// Vector comparison generating mask bits for fp and
- /// integer signed and unsigned data types.
- CMPM,
- // Vector mask comparison generating mask bits for FP values.
- CMPMM,
- // Vector mask comparison with SAE for FP values.
- CMPMM_SAE,
-
- // Arithmetic operations with FLAGS results.
- ADD,
- SUB,
- ADC,
- SBB,
- SMUL,
- UMUL,
- OR,
- XOR,
- AND,
-
- // Bit field extract.
- BEXTR,
- BEXTRI,
-
- // Zero High Bits Starting with Specified Bit Position.
- BZHI,
-
- // Parallel extract and deposit.
- PDEP,
- PEXT,
-
- // X86-specific multiply by immediate.
- MUL_IMM,
-
- // Vector sign bit extraction.
- MOVMSK,
-
- // Vector bitwise comparisons.
- PTEST,
-
- // Vector packed fp sign bitwise comparisons.
- TESTP,
-
- // OR/AND test for masks.
- KORTEST,
- KTEST,
-
- // ADD for masks.
- KADD,
-
- // Several flavors of instructions with vector shuffle behaviors.
- // Saturated signed/unnsigned packing.
- PACKSS,
- PACKUS,
- // Intra-lane alignr.
- PALIGNR,
- // AVX512 inter-lane alignr.
- VALIGN,
- PSHUFD,
- PSHUFHW,
- PSHUFLW,
- SHUFP,
- // VBMI2 Concat & Shift.
- VSHLD,
- VSHRD,
-
- // Shuffle Packed Values at 128-bit granularity.
- SHUF128,
- MOVDDUP,
- MOVSHDUP,
- MOVSLDUP,
- MOVLHPS,
- MOVHLPS,
- MOVSD,
- MOVSS,
- MOVSH,
- UNPCKL,
- UNPCKH,
- VPERMILPV,
- VPERMILPI,
- VPERMI,
- VPERM2X128,
-
- // Variable Permute (VPERM).
- // Res = VPERMV MaskV, V0
- VPERMV,
-
- // 3-op Variable Permute (VPERMT2).
- // Res = VPERMV3 V0, MaskV, V1
- VPERMV3,
-
- // Bitwise ternary logic.
- VPTERNLOG,
- // Fix Up Special Packed Float32/64 values.
- VFIXUPIMM,
- VFIXUPIMM_SAE,
- VFIXUPIMMS,
- VFIXUPIMMS_SAE,
- // Range Restriction Calculation For Packed Pairs of Float32/64 values.
- VRANGE,
- VRANGE_SAE,
- VRANGES,
- VRANGES_SAE,
- // Reduce - Perform Reduction Transformation on scalar\packed FP.
- VREDUCE,
- VREDUCE_SAE,
- VREDUCES,
- VREDUCES_SAE,
- // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
- // Also used by the legacy (V)ROUND intrinsics where we mask out the
- // scaling part of the immediate.
- VRNDSCALE,
- VRNDSCALE_SAE,
- VRNDSCALES,
- VRNDSCALES_SAE,
- // Tests Types Of a FP Values for packed types.
- VFPCLASS,
- // Tests Types Of a FP Values for scalar types.
- VFPCLASSS,
-
- // Broadcast (splat) scalar or element 0 of a vector. If the operand is
- // a vector, this node may change the vector length as part of the splat.
- VBROADCAST,
- // Broadcast mask to vector.
- VBROADCASTM,
-
- /// SSE4A Extraction and Insertion.
- EXTRQI,
- INSERTQI,
-
- // XOP arithmetic/logical shifts.
- VPSHA,
- VPSHL,
- // XOP signed/unsigned integer comparisons.
- VPCOM,
- VPCOMU,
- // XOP packed permute bytes.
- VPPERM,
- // XOP two source permutation.
- VPERMIL2,
-
- // Vector multiply packed unsigned doubleword integers.
- PMULUDQ,
- // Vector multiply packed signed doubleword integers.
- PMULDQ,
- // Vector Multiply Packed UnsignedIntegers with Round and Scale.
- MULHRS,
-
- // Multiply and Add Packed Integers.
- VPMADDUBSW,
- VPMADDWD,
-
- // AVX512IFMA multiply and add.
- // NOTE: These are different than the instruction and perform
- // op0 x op1 + op2.
- VPMADD52L,
- VPMADD52H,
-
- // VNNI
- VPDPBUSD,
- VPDPBUSDS,
- VPDPWSSD,
- VPDPWSSDS,
-
- // FMA nodes.
- // We use the target independent ISD::FMA for the non-inverted case.
- FNMADD,
- FMSUB,
- FNMSUB,
- FMADDSUB,
- FMSUBADD,
-
- // FMA with rounding mode.
- FMADD_RND,
- FNMADD_RND,
- FMSUB_RND,
- FNMSUB_RND,
- FMADDSUB_RND,
- FMSUBADD_RND,
-
- // AVX512-FP16 complex addition and multiplication.
- VFMADDC,
- VFMADDC_RND,
- VFCMADDC,
- VFCMADDC_RND,
-
- VFMULC,
- VFMULC_RND,
- VFCMULC,
- VFCMULC_RND,
-
- VFMADDCSH,
- VFMADDCSH_RND,
- VFCMADDCSH,
- VFCMADDCSH_RND,
-
- VFMULCSH,
- VFMULCSH_RND,
- VFCMULCSH,
- VFCMULCSH_RND,
-
- VPDPBSUD,
- VPDPBSUDS,
- VPDPBUUD,
- VPDPBUUDS,
- VPDPBSSD,
- VPDPBSSDS,
-
- VPDPWSUD,
- VPDPWSUDS,
- VPDPWUSD,
- VPDPWUSDS,
- VPDPWUUD,
- VPDPWUUDS,
-
- VMINMAX,
- VMINMAX_SAE,
- VMINMAXS,
- VMINMAXS_SAE,
-
- CVTP2IBS,
- CVTP2IUBS,
- CVTP2IBS_RND,
- CVTP2IUBS_RND,
- CVTTP2IBS,
- CVTTP2IUBS,
- CVTTP2IBS_SAE,
- CVTTP2IUBS_SAE,
-
- MPSADBW,
-
- VCVT2PH2BF8,
- VCVT2PH2BF8S,
- VCVT2PH2HF8,
- VCVT2PH2HF8S,
- VCVTBIASPH2BF8,
- VCVTBIASPH2BF8S,
- VCVTBIASPH2HF8,
- VCVTBIASPH2HF8S,
- VCVTPH2BF8,
- VCVTPH2BF8S,
- VCVTPH2HF8,
- VCVTPH2HF8S,
- VMCVTBIASPH2BF8,
- VMCVTBIASPH2BF8S,
- VMCVTBIASPH2HF8,
- VMCVTBIASPH2HF8S,
- VMCVTPH2BF8,
- VMCVTPH2BF8S,
- VMCVTPH2HF8,
- VMCVTPH2HF8S,
- VCVTHF82PH,
-
- // Compress and expand.
- COMPRESS,
- EXPAND,
-
- // Bits shuffle
- VPSHUFBITQMB,
-
- // Convert Unsigned/Integer to Floating-Point Value with rounding mode.
- SINT_TO_FP_RND,
- UINT_TO_FP_RND,
- SCALAR_SINT_TO_FP,
- SCALAR_UINT_TO_FP,
- SCALAR_SINT_TO_FP_RND,
- SCALAR_UINT_TO_FP_RND,
-
- // Vector float/double to signed/unsigned integer.
- CVTP2SI,
- CVTP2UI,
- CVTP2SI_RND,
- CVTP2UI_RND,
- // Scalar float/double to signed/unsigned integer.
- CVTS2SI,
- CVTS2UI,
- CVTS2SI_RND,
- CVTS2UI_RND,
-
- // Vector float/double to signed/unsigned integer with truncation.
- CVTTP2SI,
- CVTTP2UI,
- CVTTP2SI_SAE,
- CVTTP2UI_SAE,
-
- // Saturation enabled Vector float/double to signed/unsigned
- // integer with truncation.
- CVTTP2SIS,
- CVTTP2UIS,
- CVTTP2SIS_SAE,
- CVTTP2UIS_SAE,
- // Masked versions of above. Used for v2f64 to v4i32.
- // SRC, PASSTHRU, MASK
- MCVTTP2SIS,
- MCVTTP2UIS,
-
- // Scalar float/double to signed/unsigned integer with truncation.
- CVTTS2SI,
- CVTTS2UI,
- CVTTS2SI_SAE,
- CVTTS2UI_SAE,
-
- // Vector signed/unsigned integer to float/double.
- CVTSI2P,
- CVTUI2P,
-
- // Scalar float/double to signed/unsigned integer with saturation.
- CVTTS2SIS,
- CVTTS2UIS,
- CVTTS2SIS_SAE,
- CVTTS2UIS_SAE,
-
- // Masked versions of above. Used for v2f64->v4f32.
- // SRC, PASSTHRU, MASK
- MCVTP2SI,
- MCVTP2UI,
- MCVTTP2SI,
- MCVTTP2UI,
- MCVTSI2P,
- MCVTUI2P,
-
- // Custom handling for FP_TO_xINT_SAT
- FP_TO_SINT_SAT,
- FP_TO_UINT_SAT,
-
- // Vector float to bfloat16.
- // Convert packed single data to packed BF16 data
- CVTNEPS2BF16,
- // Masked version of above.
- // SRC, PASSTHRU, MASK
- MCVTNEPS2BF16,
-
-  // Dot product of BF16/FP16 pairs accumulated into
- // packed single precision.
- DPBF16PS,
- DPFP16PS,
-
- // A stack checking function call. On Windows it's _chkstk call.
- DYN_ALLOCA,
-
- // For allocating variable amounts of stack space when using
- // segmented stacks. Check if the current stacklet has enough space, and
- // falls back to heap allocation if not.
- SEG_ALLOCA,
-
- // For allocating stack space when using stack clash protector.
- // Allocation is performed by block, and each block is probed.
- PROBED_ALLOCA,
-
- // Memory barriers.
- MFENCE,
-
- // Get a random integer and indicate whether it is valid in CF.
- RDRAND,
-
- // Get a NIST SP800-90B & C compliant random integer and
- // indicate whether it is valid in CF.
- RDSEED,
-
- // Protection keys
- // RDPKRU - Operand 0 is chain. Operand 1 is value for ECX.
- // WRPKRU - Operand 0 is chain. Operand 1 is value for EDX. Operand 2 is
- // value for ECX.
- RDPKRU,
- WRPKRU,
-
- // SSE42 string comparisons.
- // These nodes produce 3 results, index, mask, and flags. X86ISelDAGToDAG
-  // will emit one or two instructions based on which results are used. If both
-  // the flags and the index/mask are used, we can emit a single instruction
-  // without having to pick an opcode for the flags. Instead we can rely on the
- // DAG to CSE everything and decide at isel.
- PCMPISTR,
- PCMPESTR,
-
- // Test if in transactional execution.
- XTEST,
-
- // Conversions between float and half-float.
- CVTPS2PH,
- CVTPS2PH_SAE,
- CVTPH2PS,
- CVTPH2PS_SAE,
-
- // Masked version of above.
- // SRC, RND, PASSTHRU, MASK
- MCVTPS2PH,
- MCVTPS2PH_SAE,
-
- // Galois Field Arithmetic Instructions
- GF2P8AFFINEINVQB,
- GF2P8AFFINEQB,
- GF2P8MULB,
-
- // LWP insert record.
- LWPINS,
-
- // User level wait
- UMWAIT,
- TPAUSE,
-
- // Enqueue Stores Instructions
- ENQCMD,
- ENQCMDS,
-
- // For avx512-vp2intersect
- VP2INTERSECT,
-
- // User level interrupts - testui
- TESTUI,
-
- // Perform an FP80 add after changing precision control in FPCW.
- FP80_ADD,
-
- // Conditional compare instructions
- CCMP,
- CTEST,
-
- /// X86 strict FP compare instructions.
- FIRST_STRICTFP_OPCODE,
- STRICT_FCMP = FIRST_STRICTFP_OPCODE,
- STRICT_FCMPS,
-
- // Vector packed double/float comparison.
- STRICT_CMPP,
-
- /// Vector comparison generating mask bits for fp and
- /// integer signed and unsigned data types.
- STRICT_CMPM,
-
- // Vector float/double to signed/unsigned integer with truncation.
- STRICT_CVTTP2SI,
- STRICT_CVTTP2UI,
-
- // Vector FP extend.
- STRICT_VFPEXT,
-
- // Vector FP round.
- STRICT_VFPROUND,
-
- // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
- // Also used by the legacy (V)ROUND intrinsics where we mask out the
- // scaling part of the immediate.
- STRICT_VRNDSCALE,
-
- // Vector signed/unsigned integer to float/double.
- STRICT_CVTSI2P,
- STRICT_CVTUI2P,
-
- // Strict FMA nodes.
- STRICT_FNMADD,
- STRICT_FMSUB,
- STRICT_FNMSUB,
-
- // Conversions between float and half-float.
- STRICT_CVTPS2PH,
- STRICT_CVTPH2PS,
-
- // Perform an FP80 add after changing precision control in FPCW.
- STRICT_FP80_ADD,
-
- /// Floating point max and min.
- STRICT_FMAX,
- STRICT_FMIN,
- LAST_STRICTFP_OPCODE = STRICT_FMIN,
-
- // Compare and swap.
- FIRST_MEMORY_OPCODE,
- LCMPXCHG_DAG = FIRST_MEMORY_OPCODE,
- LCMPXCHG8_DAG,
- LCMPXCHG16_DAG,
- LCMPXCHG16_SAVE_RBX_DAG,
-
- /// LOCK-prefixed arithmetic read-modify-write instructions.
- /// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS)
- LADD,
- LSUB,
- LOR,
- LXOR,
- LAND,
- LBTS,
- LBTC,
- LBTR,
- LBTS_RM,
- LBTC_RM,
- LBTR_RM,
-
- /// RAO arithmetic instructions.
- /// OUTCHAIN = AADD(INCHAIN, PTR, RHS)
- AADD,
- AOR,
- AXOR,
- AAND,
-
- // Load, scalar_to_vector, and zero extend.
- VZEXT_LOAD,
-
- // extract_vector_elt, store.
- VEXTRACT_STORE,
-
- // scalar broadcast from memory.
- VBROADCAST_LOAD,
-
- // subvector broadcast from memory.
- SUBV_BROADCAST_LOAD,
-
- // Store FP control word into i16 memory.
- FNSTCW16m,
-
- // Load FP control word from i16 memory.
- FLDCW16m,
-
- // Store x87 FPU environment into memory.
- FNSTENVm,
-
- // Load x87 FPU environment from memory.
- FLDENVm,
-
- /// This instruction implements FP_TO_SINT with the
- /// integer destination in memory and a FP reg source. This corresponds
- /// to the X86::FIST*m instructions and the rounding mode change stuff. It
- /// has two inputs (token chain and address) and two outputs (int value
- /// and token chain). Memory VT specifies the type to store to.
- FP_TO_INT_IN_MEM,
-
- /// This instruction implements SINT_TO_FP with the
- /// integer source in memory and FP reg result. This corresponds to the
- /// X86::FILD*m instructions. It has two inputs (token chain and address)
- /// and two outputs (FP value and token chain). The integer source type is
- /// specified by the memory VT.
- FILD,
-
- /// This instruction implements a fp->int store from FP stack
- /// slots. This corresponds to the fist instruction. It takes a
- /// chain operand, value to store, address, and glue. The memory VT
- /// specifies the type to store as.
- FIST,
-
- /// This instruction implements an extending load to FP stack slots.
- /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
- /// operand, and ptr to load from. The memory VT specifies the type to
- /// load from.
- FLD,
-
- /// This instruction implements a truncating store from FP stack
- /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
- /// chain operand, value to store, address, and glue. The memory VT
- /// specifies the type to store as.
- FST,
-
- /// These instructions grab the address of the next argument
- /// from a va_list. (reads and modifies the va_list in memory)
- VAARG_64,
- VAARG_X32,
-
- // Vector truncating store with unsigned/signed saturation
- VTRUNCSTOREUS,
- VTRUNCSTORES,
- // Vector truncating masked store with unsigned/signed saturation
- VMTRUNCSTOREUS,
- VMTRUNCSTORES,
-
- // X86 specific gather and scatter
- MGATHER,
- MSCATTER,
-
- // Key locker nodes that produce flags.
- AESENC128KL,
- AESDEC128KL,
- AESENC256KL,
- AESDEC256KL,
- AESENCWIDE128KL,
- AESDECWIDE128KL,
- AESENCWIDE256KL,
- AESDECWIDE256KL,
-
- /// Compare and Add if Condition is Met. Compare value in operand 2 with
- /// value in memory of operand 1. If condition of operand 4 is met, add
- /// value operand 3 to m32 and write new value in operand 1. Operand 2 is
- /// always updated with the original value from operand 1.
- CMPCCXADD,
-
- // Save xmm argument registers to the stack, according to %al. An operator
- // is needed so that this can be expanded with control flow.
- VASTART_SAVE_XMM_REGS,
-
- // Conditional load/store instructions
- CLOAD,
- CSTORE,
- LAST_MEMORY_OPCODE = CSTORE,
- };
- } // end namespace X86ISD
-
- namespace X86 {
- /// Current rounding mode is represented in bits 11:10 of FPSR. These
- /// values are same as corresponding constants for rounding mode used
- /// in glibc.
- enum RoundingMode {
- rmInvalid = -1, // For handle Invalid rounding mode
- rmToNearest = 0, // FE_TONEAREST
- rmDownward = 1 << 10, // FE_DOWNWARD
- rmUpward = 2 << 10, // FE_UPWARD
- rmTowardZero = 3 << 10, // FE_TOWARDZERO
- rmMask = 3 << 10 // Bit mask selecting rounding mode
- };
+class X86Subtarget;
+class X86TargetMachine;
+
+namespace X86ISD {
+// X86 Specific DAG Nodes
+enum NodeType : unsigned {
+ // Start the numbering where the builtin ops leave off.
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+ /// Bit scan forward.
+ BSF,
+ /// Bit scan reverse.
+ BSR,
+
+ /// X86 funnel/double shift i16 instructions. These correspond to
+ /// X86::SHLDW and X86::SHRDW instructions which have different amt
+ /// modulo rules to generic funnel shifts.
+ /// NOTE: The operand order matches ISD::FSHL/FSHR not SHLD/SHRD.
+ FSHL,
+ FSHR,
+
+ /// Bitwise logical AND of floating point values. This corresponds
+ /// to X86::ANDPS or X86::ANDPD.
+ FAND,
+
+ /// Bitwise logical OR of floating point values. This corresponds
+ /// to X86::ORPS or X86::ORPD.
+ FOR,
+
+ /// Bitwise logical XOR of floating point values. This corresponds
+ /// to X86::XORPS or X86::XORPD.
+ FXOR,
+
+ /// Bitwise logical ANDNOT of floating point values. This
+ /// corresponds to X86::ANDNPS or X86::ANDNPD.
+ FANDN,
+
+ /// These operations represent an abstract X86 call
+ /// instruction, which includes a bunch of information. In particular the
+ /// operands of these node are:
+ ///
+ /// #0 - The incoming token chain
+ /// #1 - The callee
+ /// #2 - The number of arg bytes the caller pushes on the stack.
+ /// #3 - The number of arg bytes the callee pops off the stack.
+ /// #4 - The value to pass in AL/AX/EAX (optional)
+ /// #5 - The value to pass in DL/DX/EDX (optional)
+ ///
+ /// The result values of these nodes are:
+ ///
+ /// #0 - The outgoing token chain
+ /// #1 - The first register result value (optional)
+ /// #2 - The second register result value (optional)
+ ///
+ CALL,
+
+ /// Same as call except it adds the NoTrack prefix.
+ NT_CALL,
+
+ // Pseudo for a OBJC call that gets emitted together with a special
+ // marker instruction.
+ CALL_RVMARKER,
+
+ /// The same as ISD::CopyFromReg except that this node makes it explicit
+ /// that it may lower to an x87 FPU stack pop. Optimizations should be more
+ /// cautious when handling this node than a normal CopyFromReg to avoid
+ /// removing a required FPU stack pop. A key requirement is optimizations
+ /// should not optimize any users of a chain that contains a
+ /// POP_FROM_X87_REG to use a chain from a point earlier than the
+ /// POP_FROM_X87_REG (which may remove a required FPU stack pop).
+ POP_FROM_X87_REG,
+
+ // Pseudo for a call to an imported function to ensure the correct machine
+ // instruction is emitted for Import Call Optimization.
+ IMP_CALL,
+
+ /// X86 compare and logical compare instructions.
+ CMP,
+ FCMP,
+ COMI,
+ UCOMI,
+
+ // X86 compare with Intrinsics similar to COMI.
+ COMX,
+ UCOMX,
+
+ /// X86 bit-test instructions.
+ BT,
+
+ /// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS
+ /// operand, usually produced by a CMP instruction.
+ SETCC,
+
+ /// X86 Select
+ SELECTS,
+
+ /// X86 constant-time select, implemented with the CMOV instruction so that
+ /// the value is chosen without a data-dependent branch.
+ CTSELECT,
+
+ // Same as SETCC except it's materialized with an sbb and the value is all
+ // ones or all zeros.
+ SETCC_CARRY, // R = carry_bit ? ~0 : 0
+
+ /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
+ /// Operands are two FP values to compare; result is a mask of
+ /// 0s or 1s. Generally DTRT for C/C++ with NaNs.
+ FSETCC,
+
+ /// X86 FP SETCC, similar to above, but with output as an i1 mask, and
+ /// a version with SAE.
+ FSETCCM,
+ FSETCCM_SAE,
+
+ /// X86 conditional moves. Operand 0 and operand 1 are the two values
+ /// to select from. Operand 2 is the condition code, and operand 3 is the
+ /// flag operand produced by a CMP or TEST instruction.
+ CMOV,
+
+ /// X86 conditional branches. Operand 0 is the chain operand, operand 1
+ /// is the block to branch if condition is true, operand 2 is the
+ /// condition code, and operand 3 is the flag operand produced by a CMP
+ /// or TEST instruction.
+ BRCOND,
+
+ /// BRIND node with NoTrack prefix. Operand 0 is the chain operand and
+ /// operand 1 is the target address.
+ NT_BRIND,
+
+ /// Return with a glue operand. Operand 0 is the chain operand, operand
+ /// 1 is the number of bytes of stack to pop.
+ RET_GLUE,
+
+ /// Return from interrupt. Operand 0 is the number of bytes to pop.
+ IRET,
+
+ /// Repeat fill, corresponds to X86::REP_STOSx.
+ REP_STOS,
+
+ /// Repeat move, corresponds to X86::REP_MOVSx.
+ REP_MOVS,
+
+ /// On Darwin, this node represents the result of the popl
+ /// at function entry, used for PIC code.
+ GlobalBaseReg,
+
+ /// A wrapper node for TargetConstantPool, TargetJumpTable,
+ /// TargetExternalSymbol, TargetGlobalAddress, TargetGlobalTLSAddress,
+ /// MCSymbol and TargetBlockAddress.
+ Wrapper,
+
+ /// Special wrapper used under X86-64 PIC mode for RIP
+ /// relative displacements.
+ WrapperRIP,
+
+ /// Copies a 64-bit value from an MMX vector to the low word
+ /// of an XMM vector, with the high word zero filled.
+ MOVQ2DQ,
+
+ /// Copies a 64-bit value from the low word of an XMM vector
+ /// to an MMX vector.
+ MOVDQ2Q,
+
+ /// Copies a 32-bit value from the low word of a MMX
+ /// vector to a GPR.
+ MMX_MOVD2W,
+
+ /// Copies a GPR into the low 32-bit word of a MMX vector
+ /// and zero out the high word.
+ MMX_MOVW2D,
+
+ /// Extract an 8-bit value from a vector and zero extend it to
+ /// i32, corresponds to X86::PEXTRB.
+ PEXTRB,
+
+ /// Extract a 16-bit value from a vector and zero extend it to
+ /// i32, corresponds to X86::PEXTRW.
+ PEXTRW,
+
+ /// Insert any element of a 4 x float vector into any element
+ /// of a destination 4 x float vector.
+ INSERTPS,
+
+ /// Insert the lower 8-bits of a 32-bit value to a vector,
+ /// corresponds to X86::PINSRB.
+ PINSRB,
+
+ /// Insert the lower 16-bits of a 32-bit value to a vector,
+ /// corresponds to X86::PINSRW.
+ PINSRW,
+
+ /// Shuffle 16 8-bit values within a vector.
+ PSHUFB,
+
+ /// Compute Sum of Absolute Differences.
+ PSADBW,
+ /// Compute Double Block Packed Sum-Absolute-Differences
+ DBPSADBW,
+
+ /// Bitwise Logical AND NOT of Packed FP values.
+ ANDNP,
+
+ /// Blend where the selector is an immediate.
+ BLENDI,
+
+ /// Dynamic (non-constant condition) vector blend where only the sign bits
+ /// of the condition elements are used. This is used to enforce that the
+ /// condition mask is not valid for generic VSELECT optimizations. This
+ /// is also used to implement the intrinsics.
+ /// Operands are in VSELECT order: MASK, TRUE, FALSE
+ BLENDV,
+
+ /// Combined add and sub on an FP vector.
+ ADDSUB,
+
+ // FP vector ops with rounding mode.
+ FADD_RND,
+ FADDS,
+ FADDS_RND,
+ FSUB_RND,
+ FSUBS,
+ FSUBS_RND,
+ FMUL_RND,
+ FMULS,
+ FMULS_RND,
+ FDIV_RND,
+ FDIVS,
+ FDIVS_RND,
+ FMAX_SAE,
+ FMAXS_SAE,
+ FMIN_SAE,
+ FMINS_SAE,
+ FSQRT_RND,
+ FSQRTS,
+ FSQRTS_RND,
+
+ // FP vector get exponent.
+ FGETEXP,
+ FGETEXP_SAE,
+ FGETEXPS,
+ FGETEXPS_SAE,
+ // Extract Normalized Mantissas.
+ VGETMANT,
+ VGETMANT_SAE,
+ VGETMANTS,
+ VGETMANTS_SAE,
+ // FP Scale.
+ SCALEF,
+ SCALEF_RND,
+ SCALEFS,
+ SCALEFS_RND,
+
+ /// Integer horizontal add/sub.
+ HADD,
+ HSUB,
+
+ /// Floating point horizontal add/sub.
+ FHADD,
+ FHSUB,
+
+ // Detect Conflicts Within a Vector
+ CONFLICT,
+
+ /// Floating point max and min.
+ FMAX,
+ FMIN,
+
+ /// Commutative FMIN and FMAX.
+ FMAXC,
+ FMINC,
+
+ /// Scalar intrinsic floating point max and min.
+ FMAXS,
+ FMINS,
+
+ /// Floating point reciprocal-sqrt and reciprocal approximation.
+ /// Note that these typically require refinement
+ /// in order to obtain suitable precision.
+ FRSQRT,
+ FRCP,
+
+ // AVX-512 reciprocal approximations with a little more precision.
+ RSQRT14,
+ RSQRT14S,
+ RCP14,
+ RCP14S,
+
+ // Thread Local Storage.
+ TLSADDR,
+
+ // Thread Local Storage. A call to get the start address
+ // of the TLS block for the current module.
+ TLSBASEADDR,
+
+ // Thread Local Storage. When calling to an OS provided
+ // thunk at the address from an earlier relocation.
+ TLSCALL,
+
+ // Thread Local Storage. A descriptor containing pointer to
+ // code and to argument to get the TLS offset for the symbol.
+ TLSDESC,
+
+ // Exception Handling helpers.
+ EH_RETURN,
+
+ // SjLj exception handling setjmp.
+ EH_SJLJ_SETJMP,
+
+ // SjLj exception handling longjmp.
+ EH_SJLJ_LONGJMP,
+
+ // SjLj exception handling dispatch.
+ EH_SJLJ_SETUP_DISPATCH,
+
+ /// Tail call return. See X86TargetLowering::LowerCall for
+ /// the list of operands.
+ TC_RETURN,
+
+ // Vector move to low scalar and zero higher vector elements.
+ VZEXT_MOVL,
+
+ // Vector integer truncate.
+ VTRUNC,
+ // Vector integer truncate with unsigned/signed saturation.
+ VTRUNCUS,
+ VTRUNCS,
+
+ // Masked version of the above. Used when less than a 128-bit result is
+ // produced since the mask only applies to the lower elements and can't
+ // be represented by a select.
+ // SRC, PASSTHRU, MASK
+ VMTRUNC,
+ VMTRUNCUS,
+ VMTRUNCS,
+
+ // Vector FP extend.
+ VFPEXT,
+ VFPEXT_SAE,
+ VFPEXTS,
+ VFPEXTS_SAE,
+
+ // Vector FP round.
+ VFPROUND,
+ // Convert TWO packed single data to one packed data
+ VFPROUND2,
+ VFPROUND2_RND,
+ VFPROUND_RND,
+ VFPROUNDS,
+ VFPROUNDS_RND,
+
+ // Masked version of above. Used for v2f64->v4f32.
+ // SRC, PASSTHRU, MASK
+ VMFPROUND,
+
+ // 128-bit vector logical left / right shift
+ VSHLDQ,
+ VSRLDQ,
+
+ // Vector shift elements
+ VSHL,
+ VSRL,
+ VSRA,
+
+ // Vector variable shift
+ VSHLV,
+ VSRLV,
+ VSRAV,
+
+ // Vector shift elements by immediate
+ VSHLI,
+ VSRLI,
+ VSRAI,
+
+ // Shifts of mask registers.
+ KSHIFTL,
+ KSHIFTR,
+
+ // Bit rotate by immediate
+ VROTLI,
+ VROTRI,
+
+ // Vector packed double/float comparison.
+ CMPP,
+
+ // Vector integer comparisons.
+ PCMPEQ,
+ PCMPGT,
+
+ // v8i16 Horizontal minimum and position.
+ PHMINPOS,
+
+ MULTISHIFT,
+
+ /// Vector comparison generating mask bits for fp and
+ /// integer signed and unsigned data types.
+ CMPM,
+ // Vector mask comparison generating mask bits for FP values.
+ CMPMM,
+ // Vector mask comparison with SAE for FP values.
+ CMPMM_SAE,
+
+ // Arithmetic operations with FLAGS results.
+ ADD,
+ SUB,
+ ADC,
+ SBB,
+ SMUL,
+ UMUL,
+ OR,
+ XOR,
+ AND,
+
+ // Bit field extract.
+ BEXTR,
+ BEXTRI,
+
+ // Zero High Bits Starting with Specified Bit Position.
+ BZHI,
+
+ // Parallel extract and deposit.
+ PDEP,
+ PEXT,
+
+ // X86-specific multiply by immediate.
+ MUL_IMM,
+
+ // Vector sign bit extraction.
+ MOVMSK,
+
+ // Vector bitwise comparisons.
+ PTEST,
+
+ // Vector packed fp sign bitwise comparisons.
+ TESTP,
+
+ // OR/AND test for masks.
+ KORTEST,
+ KTEST,
+
+ // ADD for masks.
+ KADD,
+
+ // Several flavors of instructions with vector shuffle behaviors.
+ // Saturated signed/unsigned packing.
+ PACKSS,
+ PACKUS,
+ // Intra-lane alignr.
+ PALIGNR,
+ // AVX512 inter-lane alignr.
+ VALIGN,
+ PSHUFD,
+ PSHUFHW,
+ PSHUFLW,
+ SHUFP,
+ // VBMI2 Concat & Shift.
+ VSHLD,
+ VSHRD,
+
+ // Shuffle Packed Values at 128-bit granularity.
+ SHUF128,
+ MOVDDUP,
+ MOVSHDUP,
+ MOVSLDUP,
+ MOVLHPS,
+ MOVHLPS,
+ MOVSD,
+ MOVSS,
+ MOVSH,
+ UNPCKL,
+ UNPCKH,
+ VPERMILPV,
+ VPERMILPI,
+ VPERMI,
+ VPERM2X128,
+
+ // Variable Permute (VPERM).
+ // Res = VPERMV MaskV, V0
+ VPERMV,
+
+ // 3-op Variable Permute (VPERMT2).
+ // Res = VPERMV3 V0, MaskV, V1
+ VPERMV3,
+
+ // Bitwise ternary logic.
+ VPTERNLOG,
+ // Fix Up Special Packed Float32/64 values.
+ VFIXUPIMM,
+ VFIXUPIMM_SAE,
+ VFIXUPIMMS,
+ VFIXUPIMMS_SAE,
+ // Range Restriction Calculation For Packed Pairs of Float32/64 values.
+ VRANGE,
+ VRANGE_SAE,
+ VRANGES,
+ VRANGES_SAE,
+ // Reduce - Perform Reduction Transformation on scalar/packed FP.
+ VREDUCE,
+ VREDUCE_SAE,
+ VREDUCES,
+ VREDUCES_SAE,
+ // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
+ // Also used by the legacy (V)ROUND intrinsics where we mask out the
+ // scaling part of the immediate.
+ VRNDSCALE,
+ VRNDSCALE_SAE,
+ VRNDSCALES,
+ VRNDSCALES_SAE,
+ // Tests Types Of a FP Values for packed types.
+ VFPCLASS,
+ // Tests Types Of a FP Values for scalar types.
+ VFPCLASSS,
+
+ // Broadcast (splat) scalar or element 0 of a vector. If the operand is
+ // a vector, this node may change the vector length as part of the splat.
+ VBROADCAST,
+ // Broadcast mask to vector.
+ VBROADCASTM,
+
+ /// SSE4A Extraction and Insertion.
+ EXTRQI,
+ INSERTQI,
+
+ // XOP arithmetic/logical shifts.
+ VPSHA,
+ VPSHL,
+ // XOP signed/unsigned integer comparisons.
+ VPCOM,
+ VPCOMU,
+ // XOP packed permute bytes.
+ VPPERM,
+ // XOP two source permutation.
+ VPERMIL2,
+
+ // Vector multiply packed unsigned doubleword integers.
+ PMULUDQ,
+ // Vector multiply packed signed doubleword integers.
+ PMULDQ,
+ // Vector Multiply Packed Unsigned Integers with Round and Scale.
+ MULHRS,
+
+ // Multiply and Add Packed Integers.
+ VPMADDUBSW,
+ VPMADDWD,
+
+ // AVX512IFMA multiply and add.
+ // NOTE: These are different than the instruction and perform
+ // op0 x op1 + op2.
+ VPMADD52L,
+ VPMADD52H,
+
+ // VNNI
+ VPDPBUSD,
+ VPDPBUSDS,
+ VPDPWSSD,
+ VPDPWSSDS,
+
+ // FMA nodes.
+ // We use the target independent ISD::FMA for the non-inverted case.
+ FNMADD,
+ FMSUB,
+ FNMSUB,
+ FMADDSUB,
+ FMSUBADD,
+
+ // FMA with rounding mode.
+ FMADD_RND,
+ FNMADD_RND,
+ FMSUB_RND,
+ FNMSUB_RND,
+ FMADDSUB_RND,
+ FMSUBADD_RND,
+
+ // AVX512-FP16 complex addition and multiplication.
+ VFMADDC,
+ VFMADDC_RND,
+ VFCMADDC,
+ VFCMADDC_RND,
+
+ VFMULC,
+ VFMULC_RND,
+ VFCMULC,
+ VFCMULC_RND,
+
+ VFMADDCSH,
+ VFMADDCSH_RND,
+ VFCMADDCSH,
+ VFCMADDCSH_RND,
+
+ VFMULCSH,
+ VFMULCSH_RND,
+ VFCMULCSH,
+ VFCMULCSH_RND,
+
+ VPDPBSUD,
+ VPDPBSUDS,
+ VPDPBUUD,
+ VPDPBUUDS,
+ VPDPBSSD,
+ VPDPBSSDS,
+
+ VPDPWSUD,
+ VPDPWSUDS,
+ VPDPWUSD,
+ VPDPWUSDS,
+ VPDPWUUD,
+ VPDPWUUDS,
+
+ VMINMAX,
+ VMINMAX_SAE,
+ VMINMAXS,
+ VMINMAXS_SAE,
+
+ CVTP2IBS,
+ CVTP2IUBS,
+ CVTP2IBS_RND,
+ CVTP2IUBS_RND,
+ CVTTP2IBS,
+ CVTTP2IUBS,
+ CVTTP2IBS_SAE,
+ CVTTP2IUBS_SAE,
+
+ MPSADBW,
+
+ VCVT2PH2BF8,
+ VCVT2PH2BF8S,
+ VCVT2PH2HF8,
+ VCVT2PH2HF8S,
+ VCVTBIASPH2BF8,
+ VCVTBIASPH2BF8S,
+ VCVTBIASPH2HF8,
+ VCVTBIASPH2HF8S,
+ VCVTPH2BF8,
+ VCVTPH2BF8S,
+ VCVTPH2HF8,
+ VCVTPH2HF8S,
+ VMCVTBIASPH2BF8,
+ VMCVTBIASPH2BF8S,
+ VMCVTBIASPH2HF8,
+ VMCVTBIASPH2HF8S,
+ VMCVTPH2BF8,
+ VMCVTPH2BF8S,
+ VMCVTPH2HF8,
+ VMCVTPH2HF8S,
+ VCVTHF82PH,
+
+ // Compress and expand.
+ COMPRESS,
+ EXPAND,
+
+ // Bits shuffle
+ VPSHUFBITQMB,
+
+ // Convert Unsigned/Integer to Floating-Point Value with rounding mode.
+ SINT_TO_FP_RND,
+ UINT_TO_FP_RND,
+ SCALAR_SINT_TO_FP,
+ SCALAR_UINT_TO_FP,
+ SCALAR_SINT_TO_FP_RND,
+ SCALAR_UINT_TO_FP_RND,
+
+ // Vector float/double to signed/unsigned integer.
+ CVTP2SI,
+ CVTP2UI,
+ CVTP2SI_RND,
+ CVTP2UI_RND,
+ // Scalar float/double to signed/unsigned integer.
+ CVTS2SI,
+ CVTS2UI,
+ CVTS2SI_RND,
+ CVTS2UI_RND,
+
+ // Vector float/double to signed/unsigned integer with truncation.
+ CVTTP2SI,
+ CVTTP2UI,
+ CVTTP2SI_SAE,
+ CVTTP2UI_SAE,
+
+ // Saturation enabled Vector float/double to signed/unsigned
+ // integer with truncation.
+ CVTTP2SIS,
+ CVTTP2UIS,
+ CVTTP2SIS_SAE,
+ CVTTP2UIS_SAE,
+ // Masked versions of above. Used for v2f64 to v4i32.
+ // SRC, PASSTHRU, MASK
+ MCVTTP2SIS,
+ MCVTTP2UIS,
+
+ // Scalar float/double to signed/unsigned integer with truncation.
+ CVTTS2SI,
+ CVTTS2UI,
+ CVTTS2SI_SAE,
+ CVTTS2UI_SAE,
+
+ // Vector signed/unsigned integer to float/double.
+ CVTSI2P,
+ CVTUI2P,
+
+ // Scalar float/double to signed/unsigned integer with saturation.
+ CVTTS2SIS,
+ CVTTS2UIS,
+ CVTTS2SIS_SAE,
+ CVTTS2UIS_SAE,
+
+ // Masked versions of above. Used for v2f64->v4f32.
+ // SRC, PASSTHRU, MASK
+ MCVTP2SI,
+ MCVTP2UI,
+ MCVTTP2SI,
+ MCVTTP2UI,
+ MCVTSI2P,
+ MCVTUI2P,
+
+ // Custom handling for FP_TO_xINT_SAT
+ FP_TO_SINT_SAT,
+ FP_TO_UINT_SAT,
+
+ // Vector float to bfloat16.
+ // Convert packed single data to packed BF16 data
+ CVTNEPS2BF16,
+ // Masked version of above.
+ // SRC, PASSTHRU, MASK
+ MCVTNEPS2BF16,
+
+ // Dot product of BF16/FP16 pairs, accumulated into
+ // packed single precision.
+ DPBF16PS,
+ DPFP16PS,
+
+ // A stack checking function call. On Windows it's the _chkstk call.
+ DYN_ALLOCA,
+
+ // For allocating variable amounts of stack space when using
+ // segmented stacks. Check if the current stacklet has enough space, and
+ // falls back to heap allocation if not.
+ SEG_ALLOCA,
+
+ // For allocating stack space when using stack clash protector.
+ // Allocation is performed by block, and each block is probed.
+ PROBED_ALLOCA,
+
+ // Memory barriers.
+ MFENCE,
+
+ // Get a random integer and indicate whether it is valid in CF.
+ RDRAND,
+
+ // Get a NIST SP800-90B & C compliant random integer and
+ // indicate whether it is valid in CF.
+ RDSEED,
+
+ // Protection keys
+ // RDPKRU - Operand 0 is chain. Operand 1 is value for ECX.
+ // WRPKRU - Operand 0 is chain. Operand 1 is value for EDX. Operand 2 is
+ // value for ECX.
+ RDPKRU,
+ WRPKRU,
+
+ // SSE42 string comparisons.
+ // These nodes produce 3 results: index, mask, and flags. X86ISelDAGToDAG
+ // will emit one or two instructions based on which results are used. If
+ // both flags and index/mask are needed, this allows us to use a single
+ // instruction since we won't have to pick an opcode for flags. Instead we
+ // can rely on the DAG to CSE everything and decide at isel.
+ PCMPISTR,
+ PCMPESTR,
+
+ // Test if in transactional execution.
+ XTEST,
+
+ // Conversions between float and half-float.
+ CVTPS2PH,
+ CVTPS2PH_SAE,
+ CVTPH2PS,
+ CVTPH2PS_SAE,
+
+ // Masked version of above.
+ // SRC, RND, PASSTHRU, MASK
+ MCVTPS2PH,
+ MCVTPS2PH_SAE,
+
+ // Galois Field Arithmetic Instructions
+ GF2P8AFFINEINVQB,
+ GF2P8AFFINEQB,
+ GF2P8MULB,
+
+ // LWP insert record.
+ LWPINS,
+
+ // User level wait
+ UMWAIT,
+ TPAUSE,
+
+ // Enqueue Stores Instructions
+ ENQCMD,
+ ENQCMDS,
+
+ // For avx512-vp2intersect
+ VP2INTERSECT,
+
+ // User level interrupts - testui
+ TESTUI,
+
+ // Perform an FP80 add after changing precision control in FPCW.
+ FP80_ADD,
+
+ // Conditional compare instructions
+ CCMP,
+ CTEST,
+
+ /// X86 strict FP compare instructions.
+ FIRST_STRICTFP_OPCODE,
+ STRICT_FCMP = FIRST_STRICTFP_OPCODE,
+ STRICT_FCMPS,
+
+ // Vector packed double/float comparison.
+ STRICT_CMPP,
+
+ /// Vector comparison generating mask bits for fp and
+ /// integer signed and unsigned data types.
+ STRICT_CMPM,
+
+ // Vector float/double to signed/unsigned integer with truncation.
+ STRICT_CVTTP2SI,
+ STRICT_CVTTP2UI,
+
+ // Vector FP extend.
+ STRICT_VFPEXT,
+
+ // Vector FP round.
+ STRICT_VFPROUND,
+
+ // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
+ // Also used by the legacy (V)ROUND intrinsics where we mask out the
+ // scaling part of the immediate.
+ STRICT_VRNDSCALE,
+
+ // Vector signed/unsigned integer to float/double.
+ STRICT_CVTSI2P,
+ STRICT_CVTUI2P,
+
+ // Strict FMA nodes.
+ STRICT_FNMADD,
+ STRICT_FMSUB,
+ STRICT_FNMSUB,
+
+ // Conversions between float and half-float.
+ STRICT_CVTPS2PH,
+ STRICT_CVTPH2PS,
+
+ // Perform an FP80 add after changing precision control in FPCW.
+ STRICT_FP80_ADD,
+
+ /// Floating point max and min.
+ STRICT_FMAX,
+ STRICT_FMIN,
+ LAST_STRICTFP_OPCODE = STRICT_FMIN,
+
+ // Compare and swap.
+ FIRST_MEMORY_OPCODE,
+ LCMPXCHG_DAG = FIRST_MEMORY_OPCODE,
+ LCMPXCHG8_DAG,
+ LCMPXCHG16_DAG,
+ LCMPXCHG16_SAVE_RBX_DAG,
+
+ /// LOCK-prefixed arithmetic read-modify-write instructions.
+ /// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS)
+ LADD,
+ LSUB,
+ LOR,
+ LXOR,
+ LAND,
+ LBTS,
+ LBTC,
+ LBTR,
+ LBTS_RM,
+ LBTC_RM,
+ LBTR_RM,
+
+ /// RAO arithmetic instructions.
+ /// OUTCHAIN = AADD(INCHAIN, PTR, RHS)
+ AADD,
+ AOR,
+ AXOR,
+ AAND,
+
+ // Load, scalar_to_vector, and zero extend.
+ VZEXT_LOAD,
+
+ // extract_vector_elt, store.
+ VEXTRACT_STORE,
+
+ // scalar broadcast from memory.
+ VBROADCAST_LOAD,
+
+ // subvector broadcast from memory.
+ SUBV_BROADCAST_LOAD,
+
+ // Store FP control word into i16 memory.
+ FNSTCW16m,
+
+ // Load FP control word from i16 memory.
+ FLDCW16m,
+
+ // Store x87 FPU environment into memory.
+ FNSTENVm,
+
+ // Load x87 FPU environment from memory.
+ FLDENVm,
+
+ /// This instruction implements FP_TO_SINT with the
+ /// integer destination in memory and a FP reg source. This corresponds
+ /// to the X86::FIST*m instructions and the rounding mode change stuff. It
+ /// has two inputs (token chain and address) and two outputs (int value
+ /// and token chain). Memory VT specifies the type to store to.
+ FP_TO_INT_IN_MEM,
+
+ /// This instruction implements SINT_TO_FP with the
+ /// integer source in memory and FP reg result. This corresponds to the
+ /// X86::FILD*m instructions. It has two inputs (token chain and address)
+ /// and two outputs (FP value and token chain). The integer source type is
+ /// specified by the memory VT.
+ FILD,
+
+ /// This instruction implements a fp->int store from FP stack
+ /// slots. This corresponds to the fist instruction. It takes a
+ /// chain operand, value to store, address, and glue. The memory VT
+ /// specifies the type to store as.
+ FIST,
+
+ /// This instruction implements an extending load to FP stack slots.
+ /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
+ /// operand, and ptr to load from. The memory VT specifies the type to
+ /// load from.
+ FLD,
+
+ /// This instruction implements a truncating store from FP stack
+ /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
+ /// chain operand, value to store, address, and glue. The memory VT
+ /// specifies the type to store as.
+ FST,
+
+ /// These instructions grab the address of the next argument
+ /// from a va_list. (reads and modifies the va_list in memory)
+ VAARG_64,
+ VAARG_X32,
+
+ // Vector truncating store with unsigned/signed saturation
+ VTRUNCSTOREUS,
+ VTRUNCSTORES,
+ // Vector truncating masked store with unsigned/signed saturation
+ VMTRUNCSTOREUS,
+ VMTRUNCSTORES,
+
+ // X86 specific gather and scatter
+ MGATHER,
+ MSCATTER,
+
+ // Key locker nodes that produce flags.
+ AESENC128KL,
+ AESDEC128KL,
+ AESENC256KL,
+ AESDEC256KL,
+ AESENCWIDE128KL,
+ AESDECWIDE128KL,
+ AESENCWIDE256KL,
+ AESDECWIDE256KL,
+
+ /// Compare and Add if Condition is Met. Compare value in operand 2 with
+ /// value in memory of operand 1. If condition of operand 4 is met, add
+ /// value operand 3 to m32 and write new value in operand 1. Operand 2 is
+ /// always updated with the original value from operand 1.
+ CMPCCXADD,
+
+ // Save xmm argument registers to the stack, according to %al. An operator
+ // is needed so that this can be expanded with control flow.
+ VASTART_SAVE_XMM_REGS,
+
+ // Conditional load/store instructions
+ CLOAD,
+ CSTORE,
+ LAST_MEMORY_OPCODE = CSTORE,
+};
+} // end namespace X86ISD
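(Editorial aside, illustrative only, not part of the patch.) The CTSELECT node
added above is the target-level representation of a constant-time select. The
semantics it has to preserve are those of the classic branch-free select idiom
sketched below; CMOV performs the same operation in a single instruction, while
the mask form makes the "no data-dependent branch" requirement concrete. The
helper name is ours, not something defined by this patch.

  #include <cstdint>

  // Branch-free select: returns A when Cond is true, B otherwise. The mask is
  // all ones or all zeros, so both inputs are always read and no conditional
  // branch is involved.
  static inline uint32_t ctSelectU32(bool Cond, uint32_t A, uint32_t B) {
    uint32_t Mask = 0u - static_cast<uint32_t>(Cond); // 0xFFFFFFFF or 0x0
    return (A & Mask) | (B & ~Mask);
  }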
+
+namespace X86 {
+/// The current rounding mode is represented in bits 11:10 of FPSR. These
+/// values are the same as the corresponding rounding-mode constants used
+/// in glibc.
+enum RoundingMode {
+ rmInvalid = -1, // For handling an invalid rounding mode
+ rmToNearest = 0, // FE_TONEAREST
+ rmDownward = 1 << 10, // FE_DOWNWARD
+ rmUpward = 2 << 10, // FE_UPWARD
+ rmTowardZero = 3 << 10, // FE_TOWARDZERO
+ rmMask = 3 << 10 // Bit mask selecting rounding mode
+};
+} // namespace X86
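(Editorial aside, illustrative only, not part of the patch.) The RoundingMode
enum above keys the mode off bits 11:10 of the FP status/control word, matching
the glibc FE_* layout, so a raw word can be classified with a simple mask
comparison. A minimal sketch, assuming the X86::RoundingMode constants above
are in scope; the helper names are ours:

  // Bits 11:10 of the raw FP status/control word select the rounding mode, so
  // masking with rmMask and comparing against an enumerator classifies it.
  static bool roundsToNearest(unsigned CWValue) {
    return (CWValue & X86::rmMask) == X86::rmToNearest;
  }
  static bool roundsTowardZero(unsigned CWValue) {
    return (CWValue & X86::rmMask) == X86::rmTowardZero;
  }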
+
+/// Define some predicates that are used for node matching.
+namespace X86 {
+/// Returns true if Elt is a constant zero or floating point constant +0.0.
+bool isZeroNode(SDValue Elt);
+
+/// Returns true if the given offset can
+/// fit into the displacement field of the instruction.
+bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
+ bool hasSymbolicDisplacement);
+
+/// Determines whether the callee is required to pop its
+/// own arguments. Callee pop is necessary to support tail calls.
+bool isCalleePop(CallingConv::ID CallingConv, bool is64Bit, bool IsVarArg,
+ bool GuaranteeTCO);
+
+/// If Op is a constant whose elements are all the same constant or
+/// undefined, return true and return the constant value in \p SplatVal.
+/// If we have undef bits that don't cover an entire element, we treat these
+/// as zero if AllowPartialUndefs is set, else we fail and return false.
+bool isConstantSplat(SDValue Op, APInt &SplatVal,
+ bool AllowPartialUndefs = true);
+
+/// Check if Op is a load operation that could be folded into some other x86
+/// instruction as a memory operand. Example: vpaddd (%rdi), %xmm0, %xmm0.
+bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
+ bool AssumeSingleUse = false);
+
+/// Check if Op is a load operation that could be folded into a vector splat
+/// instruction as a memory operand. Example: vbroadcastss 16(%rdi), %xmm2.
+bool mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
+ const X86Subtarget &Subtarget,
+ bool AssumeSingleUse = false);
+
+/// Check if Op is a value that could be used to fold a store into some
+/// other x86 instruction as a memory operand. Ex: pextrb $0, %xmm0, (%rdi).
+bool mayFoldIntoStore(SDValue Op);
+
+/// Check if Op is an operation that could be folded into a zero extend x86
+/// instruction.
+bool mayFoldIntoZeroExtend(SDValue Op);
+
+/// True if the target supports the extended frame for async Swift
+/// functions.
+bool isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget,
+ const MachineFunction &MF);
+
+/// Convert LLVM rounding mode to X86 rounding mode.
+int getRoundingModeX86(unsigned RM);
+
+} // end namespace X86
+
+//===--------------------------------------------------------------------===//
+// X86 Implementation of the TargetLowering interface
+class X86TargetLowering final : public TargetLowering {
+public:
+ explicit X86TargetLowering(const X86TargetMachine &TM,
+ const X86Subtarget &STI);
+
+ unsigned getJumpTableEncoding() const override;
+ bool useSoftFloat() const override;
+
+ void markLibCallAttributes(MachineFunction *MF, unsigned CC,
+ ArgListTy &Args) const override;
+
+ MVT getScalarShiftAmountTy(const DataLayout &, EVT VT) const override {
+ return MVT::i8;
}
- /// Define some predicates that are used for node matching.
- namespace X86 {
- /// Returns true if Elt is a constant zero or floating point constant +0.0.
- bool isZeroNode(SDValue Elt);
-
- /// Returns true of the given offset can be
- /// fit into displacement field of the instruction.
- bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
- bool hasSymbolicDisplacement);
-
- /// Determines whether the callee is required to pop its
- /// own arguments. Callee pop is necessary to support tail calls.
- bool isCalleePop(CallingConv::ID CallingConv,
- bool is64Bit, bool IsVarArg, bool GuaranteeTCO);
-
- /// If Op is a constant whose elements are all the same constant or
- /// undefined, return true and return the constant value in \p SplatVal.
- /// If we have undef bits that don't cover an entire element, we treat these
- /// as zero if AllowPartialUndefs is set, else we fail and return false.
- bool isConstantSplat(SDValue Op, APInt &SplatVal,
- bool AllowPartialUndefs = true);
-
- /// Check if Op is a load operation that could be folded into some other x86
- /// instruction as a memory operand. Example: vpaddd (%rdi), %xmm0, %xmm0.
- bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
- bool AssumeSingleUse = false);
-
- /// Check if Op is a load operation that could be folded into a vector splat
- /// instruction as a memory operand. Example: vbroadcastss 16(%rdi), %xmm2.
- bool mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
- const X86Subtarget &Subtarget,
- bool AssumeSingleUse = false);
-
- /// Check if Op is a value that could be used to fold a store into some
- /// other x86 instruction as a memory operand. Ex: pextrb $0, %xmm0, (%rdi).
- bool mayFoldIntoStore(SDValue Op);
-
- /// Check if Op is an operation that could be folded into a zero extend x86
- /// instruction.
- bool mayFoldIntoZeroExtend(SDValue Op);
-
- /// True if the target supports the extended frame for async Swift
- /// functions.
- bool isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget,
- const MachineFunction &MF);
-
- /// Convert LLVM rounding mode to X86 rounding mode.
- int getRoundingModeX86(unsigned RM);
-
- } // end namespace X86
-
- //===--------------------------------------------------------------------===//
- // X86 Implementation of the TargetLowering interface
- class X86TargetLowering final : public TargetLowering {
- public:
- explicit X86TargetLowering(const X86TargetMachine &TM,
- const X86Subtarget &STI);
-
- unsigned getJumpTableEncoding() const override;
- bool useSoftFloat() const override;
-
- void markLibCallAttributes(MachineFunction *MF, unsigned CC,
- ArgListTy &Args) const override;
-
- MVT getScalarShiftAmountTy(const DataLayout &, EVT VT) const override {
- return MVT::i8;
- }
-
- const MCExpr *
- LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
- const MachineBasicBlock *MBB, unsigned uid,
- MCContext &Ctx) const override;
-
- /// Returns relocation base for the given PIC jumptable.
- SDValue getPICJumpTableRelocBase(SDValue Table,
- SelectionDAG &DAG) const override;
- const MCExpr *
- getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
- unsigned JTI, MCContext &Ctx) const override;
-
- /// Return the desired alignment for ByVal aggregate
- /// function arguments in the caller parameter area. For X86, aggregates
- /// that contains are placed at 16-byte boundaries while the rest are at
- /// 4-byte boundaries.
- Align getByValTypeAlignment(Type *Ty, const DataLayout &DL) const override;
-
- EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op,
- const AttributeList &FuncAttributes) const override;
-
- /// Returns true if it's safe to use load / store of the
- /// specified type to expand memcpy / memset inline. This is mostly true
- /// for all types except for some special cases. For example, on X86
- /// targets without SSE2 f64 load / store are done with fldl / fstpl which
- /// also does type conversion. Note the specified type doesn't have to be
- /// legal as the hook is used before type legalization.
- bool isSafeMemOpType(MVT VT) const override;
-
- bool isMemoryAccessFast(EVT VT, Align Alignment) const;
-
- /// Returns true if the target allows unaligned memory accesses of the
- /// specified type. Returns whether it is "fast" in the last argument.
- bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, Align Alignment,
- MachineMemOperand::Flags Flags,
- unsigned *Fast) const override;
-
- /// This function returns true if the memory access is aligned or if the
- /// target allows this specific unaligned memory access. If the access is
- /// allowed, the optional final parameter returns a relative speed of the
- /// access (as defined by the target).
- bool allowsMemoryAccess(
- LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace,
- Align Alignment,
- MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
- unsigned *Fast = nullptr) const override;
-
- bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT,
- const MachineMemOperand &MMO,
- unsigned *Fast) const {
- return allowsMemoryAccess(Context, DL, VT, MMO.getAddrSpace(),
- MMO.getAlign(), MMO.getFlags(), Fast);
- }
-
- /// Provide custom lowering hooks for some operations.
- ///
- SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
-
- bool isSelectSupported(SelectSupportKind Kind) const override;
-
- /// Replace the results of node with an illegal result
- /// type with new values built out of custom code.
- ///
- void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
- SelectionDAG &DAG) const override;
-
- SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
-
- bool preferABDSToABSWithNSW(EVT VT) const override;
-
- bool preferSextInRegOfTruncate(EVT TruncVT, EVT VT,
- EVT ExtVT) const override;
-
- bool isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond,
- EVT VT) const override;
-
- /// Return true if the target has native support for
- /// the specified value type and it is 'desirable' to use the type for the
- /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
- /// instruction encodings are longer and some i16 instructions are slow.
- bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override;
-
- /// Return true if the target has native support for the
- /// specified value type and it is 'desirable' to use the type. e.g. On x86
- /// i16 is legal, but undesirable since i16 instruction encodings are longer
- /// and some i16 instructions are slow.
- bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override;
-
- /// Return prefered fold type, Abs if this is a vector, AddAnd if its an
- /// integer, None otherwise.
- TargetLowering::AndOrSETCCFoldKind
- isDesirableToCombineLogicOpOfSETCC(const SDNode *LogicOp,
- const SDNode *SETCC0,
- const SDNode *SETCC1) const override;
-
- /// Return the newly negated expression if the cost is not expensive and
- /// set the cost in \p Cost to indicate that if it is cheaper or neutral to
- /// do the negation.
- SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG,
- bool LegalOperations, bool ForCodeSize,
- NegatibleCost &Cost,
- unsigned Depth) const override;
+ const MCExpr *LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
+ const MachineBasicBlock *MBB,
+ unsigned uid,
+ MCContext &Ctx) const override;
+
+ /// Returns relocation base for the given PIC jumptable.
+ SDValue getPICJumpTableRelocBase(SDValue Table,
+ SelectionDAG &DAG) const override;
+ const MCExpr *getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
+ unsigned JTI,
+ MCContext &Ctx) const override;
+
+ /// Return the desired alignment for ByVal aggregate
+ /// function arguments in the caller parameter area. For X86, aggregates that
+ /// contain vectors are placed at 16-byte boundaries while the rest are at
+ /// 4-byte boundaries.
+ Align getByValTypeAlignment(Type *Ty, const DataLayout &DL) const override;
+
+ EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op,
+ const AttributeList &FuncAttributes) const override;
+
+ /// Returns true if it's safe to use load / store of the
+ /// specified type to expand memcpy / memset inline. This is mostly true
+ /// for all types except for some special cases. For example, on X86
+ /// targets without SSE2 f64 load / store are done with fldl / fstpl which
+ /// also does type conversion. Note the specified type doesn't have to be
+ /// legal as the hook is used before type legalization.
+ bool isSafeMemOpType(MVT VT) const override;
+
+ bool isMemoryAccessFast(EVT VT, Align Alignment) const;
+
+ /// Returns true if the target allows unaligned memory accesses of the
+ /// specified type. Returns whether it is "fast" in the last argument.
+ bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, Align Alignment,
+ MachineMemOperand::Flags Flags,
+ unsigned *Fast) const override;
+
+ /// This function returns true if the memory access is aligned or if the
+ /// target allows this specific unaligned memory access. If the access is
+ /// allowed, the optional final parameter returns a relative speed of the
+ /// access (as defined by the target).
+ bool
+ allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT,
+ unsigned AddrSpace, Align Alignment,
+ MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
+ unsigned *Fast = nullptr) const override;
+
+ bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT,
+ const MachineMemOperand &MMO, unsigned *Fast) const {
+ return allowsMemoryAccess(Context, DL, VT, MMO.getAddrSpace(),
+ MMO.getAlign(), MMO.getFlags(), Fast);
+ }
+
+ /// Provide custom lowering hooks for some operations.
+ ///
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+ bool isSelectSupported(SelectSupportKind Kind) const override;
+
+ /// Replace the results of node with an illegal result
+ /// type with new values built out of custom code.
+ ///
+ void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const override;
+
+ SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+ bool preferABDSToABSWithNSW(EVT VT) const override;
+
+ bool preferSextInRegOfTruncate(EVT TruncVT, EVT VT, EVT ExtVT) const override;
+
+ bool isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond,
+ EVT VT) const override;
+
+ /// Return true if the target has native support for
+ /// the specified value type and it is 'desirable' to use the type for the
+ /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
+ /// instruction encodings are longer and some i16 instructions are slow.
+ bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override;
+
+ /// Return true if the target has native support for the
+ /// specified value type and it is 'desirable' to use the type. e.g. On x86
+ /// i16 is legal, but undesirable since i16 instruction encodings are longer
+ /// and some i16 instructions are slow.
+ bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override;
+
+ /// Return preferred fold type: Abs if this is a vector, AddAnd if it's an
+ /// integer, None otherwise.
+ TargetLowering::AndOrSETCCFoldKind
+ isDesirableToCombineLogicOpOfSETCC(const SDNode *LogicOp,
+ const SDNode *SETCC0,
+ const SDNode *SETCC1) const override;
+
+ /// Return the newly negated expression if the cost is not expensive and
+ /// set the cost in \p Cost to indicate that if it is cheaper or neutral to
+ /// do the negation.
+ SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG,
+ bool LegalOperations, bool ForCodeSize,
+ NegatibleCost &Cost,
+ unsigned Depth) const override;
+
+ MachineBasicBlock *
+ EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *MBB) const override;
+
+ /// This method returns the name of a target specific DAG node.
+ const char *getTargetNodeName(unsigned Opcode) const override;
+
+ /// Do not merge vector stores after legalization because that may conflict
+ /// with x86-specific store splitting optimizations.
+ bool mergeStoresAfterLegalization(EVT MemVT) const override {
+ return !MemVT.isVector();
+ }
+
+ bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
+ const MachineFunction &MF) const override;
+
+ bool isCheapToSpeculateCttz(Type *Ty) const override;
+
+ bool isCheapToSpeculateCtlz(Type *Ty) const override;
+
+ bool isCtlzFast() const override;
+
+ bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const override {
+ // If the pair to store is a mixture of float and int values, we will
+ // save two bitwise instructions and one float-to-int instruction and
+ // increase one store instruction. There is potentially a more
+ // significant benefit because it avoids the float->int domain switch
+ // for the input value, so it is more likely a win.
+ if ((LTy.isFloatingPoint() && HTy.isInteger()) ||
+ (LTy.isInteger() && HTy.isFloatingPoint()))
+ return true;
+ // If the pair only contains int values, we will save two bitwise
+ // instructions and increase one store instruction (costing one more
+ // store buffer). Since the benefit is less clear, we leave such pairs
+ // out until we have a test case proving it is a win.
+ return false;
+ }
+
+ bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
+
+ bool hasAndNotCompare(SDValue Y) const override;
+
+ bool hasAndNot(SDValue Y) const override;
+
+ bool hasBitTest(SDValue X, SDValue Y) const override;
- MachineBasicBlock *
- EmitInstrWithCustomInserter(MachineInstr &MI,
- MachineBasicBlock *MBB) const override;
-
- /// This method returns the name of a target specific DAG node.
- const char *getTargetNodeName(unsigned Opcode) const override;
-
- /// Do not merge vector stores after legalization because that may conflict
- /// with x86-specific store splitting optimizations.
- bool mergeStoresAfterLegalization(EVT MemVT) const override {
- return !MemVT.isVector();
- }
-
- bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
- const MachineFunction &MF) const override;
-
- bool isCheapToSpeculateCttz(Type *Ty) const override;
-
- bool isCheapToSpeculateCtlz(Type *Ty) const override;
-
- bool isCtlzFast() const override;
-
- bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const override {
- // If the pair to store is a mixture of float and int values, we will
- // save two bitwise instructions and one float-to-int instruction and
- // increase one store instruction. There is potentially a more
- // significant benefit because it avoids the float->int domain switch
- // for input value. So It is more likely a win.
- if ((LTy.isFloatingPoint() && HTy.isInteger()) ||
- (LTy.isInteger() && HTy.isFloatingPoint()))
- return true;
- // If the pair only contains int values, we will save two bitwise
- // instructions and increase one store instruction (costing one more
- // store buffer). Since the benefit is more blurred so we leave
- // such pair out until we get testcase to prove it is a win.
+ bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
+ SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
+ unsigned OldShiftOpcode, unsigned NewShiftOpcode,
+ SelectionDAG &DAG) const override;
+
+ unsigned preferedOpcodeForCmpEqPiecesOfOperand(
+ EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
+ const APInt &ShiftOrRotateAmt,
+ const std::optional<APInt> &AndMask) const override;
+
+ bool preferScalarizeSplat(SDNode *N) const override;
+
+ CondMergingParams
+ getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs,
+ const Value *Rhs) const override;
+
+ bool shouldFoldConstantShiftPairToMask(const SDNode *N) const override;
+
+ bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override;
+
+ bool shouldTransformSignedTruncationCheck(EVT XVT,
+ unsigned KeptBits) const override {
+ // For vectors, we don't have a preference.
+ if (XVT.isVector())
return false;
- }
- bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
-
- bool hasAndNotCompare(SDValue Y) const override;
-
- bool hasAndNot(SDValue Y) const override;
-
- bool hasBitTest(SDValue X, SDValue Y) const override;
-
- bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
- SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
- unsigned OldShiftOpcode, unsigned NewShiftOpcode,
- SelectionDAG &DAG) const override;
-
- unsigned preferedOpcodeForCmpEqPiecesOfOperand(
- EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
- const APInt &ShiftOrRotateAmt,
- const std::optional<APInt> &AndMask) const override;
-
- bool preferScalarizeSplat(SDNode *N) const override;
-
- CondMergingParams
- getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs,
- const Value *Rhs) const override;
-
- bool shouldFoldConstantShiftPairToMask(const SDNode *N) const override;
-
- bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override;
-
- bool
- shouldTransformSignedTruncationCheck(EVT XVT,
- unsigned KeptBits) const override {
- // For vectors, we don't have a preference..
- if (XVT.isVector())
- return false;
-
- auto VTIsOk = [](EVT VT) -> bool {
- return VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 ||
- VT == MVT::i64;
- };
-
- // We are ok with KeptBitsVT being byte/word/dword, what MOVS supports.
- // XVT will be larger than KeptBitsVT.
- MVT KeptBitsVT = MVT::getIntegerVT(KeptBits);
- return VTIsOk(XVT) && VTIsOk(KeptBitsVT);
- }
-
- ShiftLegalizationStrategy
- preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N,
- unsigned ExpansionFactor) const override;
-
- bool shouldSplatInsEltVarIndex(EVT VT) const override;
-
- bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override {
- // Converting to sat variants holds little benefit on X86 as we will just
- // need to saturate the value back using fp arithmatic.
- return Op != ISD::FP_TO_UINT_SAT && isOperationLegalOrCustom(Op, VT);
- }
-
- bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
- return VT.isScalarInteger();
- }
-
- /// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
- MVT hasFastEqualityCompare(unsigned NumBits) const override;
-
- /// Return the value type to use for ISD::SETCC.
- EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
- EVT VT) const override;
-
- bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits,
- const APInt &DemandedElts,
- TargetLoweringOpt &TLO) const override;
-
- /// Determine which of the bits specified in Mask are known to be either
- /// zero or one and return them in the KnownZero/KnownOne bitsets.
- void computeKnownBitsForTargetNode(const SDValue Op,
- KnownBits &Known,
- const APInt &DemandedElts,
- const SelectionDAG &DAG,
- unsigned Depth = 0) const override;
-
- /// Determine the number of bits in the operation that are sign bits.
- unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
- const APInt &DemandedElts,
- const SelectionDAG &DAG,
- unsigned Depth) const override;
-
- bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op,
- const APInt &DemandedElts,
- APInt &KnownUndef,
- APInt &KnownZero,
- TargetLoweringOpt &TLO,
- unsigned Depth) const override;
-
- bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op,
- const APInt &DemandedElts,
- unsigned MaskIndex,
- TargetLoweringOpt &TLO,
- unsigned Depth) const;
+ auto VTIsOk = [](EVT VT) -> bool {
+ return VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 ||
+ VT == MVT::i64;
+ };
+
+ // We are ok with KeptBitsVT being byte/word/dword, which is what MOVS
+ // supports. XVT will be larger than KeptBitsVT.
+ MVT KeptBitsVT = MVT::getIntegerVT(KeptBits);
+ return VTIsOk(XVT) && VTIsOk(KeptBitsVT);
+ }
+
+ ShiftLegalizationStrategy
+ preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N,
+ unsigned ExpansionFactor) const override;
- bool SimplifyDemandedBitsForTargetNode(SDValue Op,
- const APInt &DemandedBits,
+ bool shouldSplatInsEltVarIndex(EVT VT) const override;
+
+ bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override {
+ // Converting to sat variants holds little benefit on X86 as we will just
+ // need to saturate the value back using fp arithmetic.
+ return Op != ISD::FP_TO_UINT_SAT && isOperationLegalOrCustom(Op, VT);
+ }
+
+ bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
+ return VT.isScalarInteger();
+ }
+
+ /// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
+ MVT hasFastEqualityCompare(unsigned NumBits) const override;
+
+ /// Return the value type to use for ISD::SETCC.
+ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+ EVT VT) const override;
+
+ bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits,
+ const APInt &DemandedElts,
+ TargetLoweringOpt &TLO) const override;
+
+ /// Determine which of the bits specified in Mask are known to be either
+ /// zero or one and return them in the KnownZero/KnownOne bitsets.
+ void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ unsigned Depth = 0) const override;
+
+ /// Determine the number of bits in the operation that are sign bits.
+ unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
const APInt &DemandedElts,
- KnownBits &Known,
- TargetLoweringOpt &TLO,
+ const SelectionDAG &DAG,
unsigned Depth) const override;
- SDValue SimplifyMultipleUseDemandedBitsForTargetNode(
- SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
- SelectionDAG &DAG, unsigned Depth) const override;
+ bool SimplifyDemandedVectorEltsForTargetNode(
+ SDValue Op, const APInt &DemandedElts, APInt &KnownUndef,
+ APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth) const override;
- bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(
- SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
- bool PoisonOnly, unsigned Depth) const override;
+ bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op,
+ const APInt &DemandedElts,
+ unsigned MaskIndex,
+ TargetLoweringOpt &TLO,
+ unsigned Depth) const;
- bool canCreateUndefOrPoisonForTargetNode(
- SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
- bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override;
+ bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits,
+ const APInt &DemandedElts,
+ KnownBits &Known,
+ TargetLoweringOpt &TLO,
+ unsigned Depth) const override;
- bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts,
- APInt &UndefElts, const SelectionDAG &DAG,
- unsigned Depth) const override;
+ SDValue SimplifyMultipleUseDemandedBitsForTargetNode(
+ SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
+ SelectionDAG &DAG, unsigned Depth) const override;
- bool isTargetCanonicalConstantNode(SDValue Op) const override {
- // Peek through bitcasts/extracts/inserts to see if we have a vector
- // load/broadcast from memory.
- while (Op.getOpcode() == ISD::BITCAST ||
- Op.getOpcode() == ISD::EXTRACT_SUBVECTOR ||
- (Op.getOpcode() == ISD::INSERT_SUBVECTOR &&
- Op.getOperand(0).isUndef()))
- Op = Op.getOperand(Op.getOpcode() == ISD::INSERT_SUBVECTOR ? 1 : 0);
+ bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(
+ SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
+ bool PoisonOnly, unsigned Depth) const override;
- return Op.getOpcode() == X86ISD::VBROADCAST_LOAD ||
- Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
- (Op.getOpcode() == ISD::LOAD &&
- getTargetConstantFromLoad(cast<LoadSDNode>(Op))) ||
- TargetLowering::isTargetCanonicalConstantNode(Op);
- }
+ bool canCreateUndefOrPoisonForTargetNode(SDValue Op,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ bool PoisonOnly, bool ConsiderFlags,
+ unsigned Depth) const override;
+
+ bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts,
+ APInt &UndefElts, const SelectionDAG &DAG,
+ unsigned Depth) const override;
+
+ bool isTargetCanonicalConstantNode(SDValue Op) const override {
+ // Peek through bitcasts/extracts/inserts to see if we have a vector
+ // load/broadcast from memory.
+ while (
+ Op.getOpcode() == ISD::BITCAST ||
+ Op.getOpcode() == ISD::EXTRACT_SUBVECTOR ||
+ (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op.getOperand(0).isUndef()))
+ Op = Op.getOperand(Op.getOpcode() == ISD::INSERT_SUBVECTOR ? 1 : 0);
+
+ return Op.getOpcode() == X86ISD::VBROADCAST_LOAD ||
+ Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
+ (Op.getOpcode() == ISD::LOAD &&
+ getTargetConstantFromLoad(cast<LoadSDNode>(Op))) ||
+ TargetLowering::isTargetCanonicalConstantNode(Op);
+ }
- bool isTargetCanonicalSelect(SDNode *N) const override;
+ bool isTargetCanonicalSelect(SDNode *N) const override;
- const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const override;
+ const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const override;
- SDValue unwrapAddress(SDValue N) const override;
+ SDValue unwrapAddress(SDValue N) const override;
- SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
+ SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
- ConstraintType getConstraintType(StringRef Constraint) const override;
+ ConstraintType getConstraintType(StringRef Constraint) const override;
- /// Examine constraint string and operand type and determine a weight value.
- /// The operand object must already have been set up with the operand type.
- ConstraintWeight
- getSingleConstraintMatchWeight(AsmOperandInfo &Info,
- const char *Constraint) const override;
+ /// Examine constraint string and operand type and determine a weight value.
+ /// The operand object must already have been set up with the operand type.
+ ConstraintWeight
+ getSingleConstraintMatchWeight(AsmOperandInfo &Info,
+ const char *Constraint) const override;
- const char *LowerXConstraint(EVT ConstraintVT) const override;
+ const char *LowerXConstraint(EVT ConstraintVT) const override;
- /// Lower the specified operand into the Ops vector. If it is invalid, don't
- /// add anything to Ops. If hasMemory is true it means one of the asm
- /// constraint of the inline asm instruction being processed is 'm'.
- void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint,
- std::vector<SDValue> &Ops,
+ /// Lower the specified operand into the Ops vector. If it is invalid, don't
+ /// add anything to Ops. If hasMemory is true it means one of the asm
+ /// constraints of the inline asm instruction being processed is 'm'.
+ void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const override;
+
+ InlineAsm::ConstraintCode
+ getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
+ if (ConstraintCode == "v")
+ return InlineAsm::ConstraintCode::v;
+ return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
+ }
+
+ /// Handle Lowering flag assembly outputs.
+ SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag,
+ const SDLoc &DL,
+ const AsmOperandInfo &Constraint,
SelectionDAG &DAG) const override;
- InlineAsm::ConstraintCode
- getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
- if (ConstraintCode == "v")
- return InlineAsm::ConstraintCode::v;
- return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
- }
-
- /// Handle Lowering flag assembly outputs.
- SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag,
- const SDLoc &DL,
- const AsmOperandInfo &Constraint,
- SelectionDAG &DAG) const override;
-
- /// Given a physical register constraint
- /// (e.g. {edx}), return the register number and the register class for the
- /// register. This should only be used for C_Register constraints. On
- /// error, this returns a register number of 0.
- std::pair<unsigned, const TargetRegisterClass *>
- getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
- StringRef Constraint, MVT VT) const override;
-
- /// Return true if the addressing mode represented
- /// by AM is legal for this target, for a load/store of the specified type.
- bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
- Type *Ty, unsigned AS,
- Instruction *I = nullptr) const override;
-
- bool addressingModeSupportsTLS(const GlobalValue &GV) const override;
-
- /// Return true if the specified immediate is legal
- /// icmp immediate, that is the target has icmp instructions which can
- /// compare a register against the immediate without having to materialize
- /// the immediate into a register.
- bool isLegalICmpImmediate(int64_t Imm) const override;
-
- /// Return true if the specified immediate is legal
- /// add immediate, that is the target has add instructions which can
- /// add a register and the immediate without having to materialize
- /// the immediate into a register.
- bool isLegalAddImmediate(int64_t Imm) const override;
-
- bool isLegalStoreImmediate(int64_t Imm) const override;
-
- /// Add x86-specific opcodes to the default list.
- bool isBinOp(unsigned Opcode) const override;
-
- /// Returns true if the opcode is a commutative binary operation.
- bool isCommutativeBinOp(unsigned Opcode) const override;
-
- /// Return true if it's free to truncate a value of
- /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in
- /// register EAX to i16 by referencing its sub-register AX.
- bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
- bool isTruncateFree(EVT VT1, EVT VT2) const override;
-
- bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;
-
- /// Return true if any actual instruction that defines a
- /// value of type Ty1 implicit zero-extends the value to Ty2 in the result
- /// register. This does not necessarily include registers defined in
- /// unknown ways, such as incoming arguments, or copies from unknown
- /// virtual registers. Also, if isTruncateFree(Ty2, Ty1) is true, this
- /// does not necessarily apply to truncate instructions. e.g. on x86-64,
- /// all instructions that define 32-bit values implicit zero-extend the
- /// result out to 64 bits.
- bool isZExtFree(Type *Ty1, Type *Ty2) const override;
- bool isZExtFree(EVT VT1, EVT VT2) const override;
- bool isZExtFree(SDValue Val, EVT VT2) const override;
-
- bool shouldConvertPhiType(Type *From, Type *To) const override;
-
- /// Return true if folding a vector load into ExtVal (a sign, zero, or any
- /// extend node) is profitable.
- bool isVectorLoadExtDesirable(SDValue) const override;
-
- /// Return true if an FMA operation is faster than a pair of fmul and fadd
- /// instructions. fmuladd intrinsics will be expanded to FMAs when this
- /// method returns true, otherwise fmuladd is expanded to fmul + fadd.
- bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
+ /// Given a physical register constraint
+ /// (e.g. {edx}), return the register number and the register class for the
+ /// register. This should only be used for C_Register constraints. On
+ /// error, this returns a register number of 0.
+ std::pair<unsigned, const TargetRegisterClass *>
+ getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint, MVT VT) const override;
+
+ /// Return true if the addressing mode represented
+ /// by AM is legal for this target, for a load/store of the specified type.
+ bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
+ unsigned AS,
+ Instruction *I = nullptr) const override;
+
+ bool addressingModeSupportsTLS(const GlobalValue &GV) const override;
+
+ /// Return true if the specified immediate is legal
+ /// icmp immediate, that is the target has icmp instructions which can
+ /// compare a register against the immediate without having to materialize
+ /// the immediate into a register.
+ bool isLegalICmpImmediate(int64_t Imm) const override;
+
+ /// Return true if the specified immediate is legal
+ /// add immediate, that is the target has add instructions which can
+ /// add a register and the immediate without having to materialize
+ /// the immediate into a register.
+ bool isLegalAddImmediate(int64_t Imm) const override;
+
+ bool isLegalStoreImmediate(int64_t Imm) const override;
+
+ /// Add x86-specific opcodes to the default list.
+ bool isBinOp(unsigned Opcode) const override;
+
+ /// Returns true if the opcode is a commutative binary operation.
+ bool isCommutativeBinOp(unsigned Opcode) const override;
+
+ /// Return true if it's free to truncate a value of
+ /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in
+ /// register EAX to i16 by referencing its sub-register AX.
+ bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
+ bool isTruncateFree(EVT VT1, EVT VT2) const override;
+
+ bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;
+
+ /// Return true if any actual instruction that defines a
+ /// value of type Ty1 implicit zero-extends the value to Ty2 in the result
+ /// register. This does not necessarily include registers defined in
+ /// unknown ways, such as incoming arguments, or copies from unknown
+ /// virtual registers. Also, if isTruncateFree(Ty2, Ty1) is true, this
+ /// does not necessarily apply to truncate instructions. e.g. on x86-64,
+ /// all instructions that define 32-bit values implicit zero-extend the
+ /// result out to 64 bits.
+ bool isZExtFree(Type *Ty1, Type *Ty2) const override;
+ bool isZExtFree(EVT VT1, EVT VT2) const override;
+ bool isZExtFree(SDValue Val, EVT VT2) const override;
+
+ bool shouldConvertPhiType(Type *From, Type *To) const override;
+
+ /// Return true if folding a vector load into ExtVal (a sign, zero, or any
+ /// extend node) is profitable.
+ bool isVectorLoadExtDesirable(SDValue) const override;
+
+ /// Return true if an FMA operation is faster than a pair of fmul and fadd
+ /// instructions. fmuladd intrinsics will be expanded to FMAs when this
+ /// method returns true, otherwise fmuladd is expanded to fmul + fadd.
+ bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
+ EVT VT) const override;
+
+ /// Return true if it's profitable to narrow operations of type SrcVT to
+ /// DestVT. e.g. on x86, it's profitable to narrow from i32 to i8 but not
+ /// from i32 to i16.
+ bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override;
+
+ bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT,
+ unsigned SelectOpcode, SDValue X,
+ SDValue Y) const override;
+
+ /// Given an intrinsic, checks if on the target the intrinsic will need to map
+ /// to a MemIntrinsicNode (touches memory). If this is the case, it returns
+ /// true and stores the intrinsic information into the IntrinsicInfo that was
+ /// passed to the function.
+ bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
+ MachineFunction &MF,
+ unsigned Intrinsic) const override;
+
+ /// Returns true if the target can instruction select the
+ /// specified FP immediate natively. If false, the legalizer will
+ /// materialize the FP immediate as a load from a constant pool.
+ bool isFPImmLegal(const APFloat &Imm, EVT VT,
+ bool ForCodeSize) const override;
+
+ /// Targets can use this to indicate that they only support *some*
+ /// VECTOR_SHUFFLE operations, those with specific masks. By default, if a
+ /// target supports the VECTOR_SHUFFLE node, all mask values are assumed to
+ /// be legal.
+ bool isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const override;
+
+ /// Similar to isShuffleMaskLegal. Targets can use this to indicate if there
+ /// is a suitable VECTOR_SHUFFLE that can be used to replace a VAND with a
+ /// constant pool entry.
+ bool isVectorClearMaskLegal(ArrayRef<int> Mask, EVT VT) const override;
+
+ /// Returns true if lowering to a jump table is allowed.
+ bool areJTsAllowed(const Function *Fn) const override;
+
+ MVT getPreferredSwitchConditionType(LLVMContext &Context,
+ EVT ConditionVT) const override;
+
+ /// If true, then instruction selection should
+ /// seek to shrink the FP constant of the specified type to a smaller type
+ /// in order to save space and / or reduce runtime.
+ bool ShouldShrinkFPConstant(EVT VT) const override;
+
+ /// Return true if we believe it is correct and profitable to reduce the
+ /// load node to a smaller type.
+ bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
+ std::optional<unsigned> ByteOffset) const override;
+
+ /// Return true if the specified scalar FP type is computed in an SSE
+ /// register, not on the X87 floating point stack.
+ bool isScalarFPTypeInSSEReg(EVT VT) const;
+
+ /// Returns true if it is beneficial to convert a load of a constant
+ /// to just the constant itself.
+ bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
+ Type *Ty) const override;
+
+ bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override;
+
+ bool convertSelectOfConstantsToMath(EVT VT) const override;
+
+ bool decomposeMulByConstant(LLVMContext &Context, EVT VT,
+ SDValue C) const override;
+
+ /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
+ /// with this index.
+ bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
+ unsigned Index) const override;
+
+ /// Scalar ops always have equal or better analysis/performance/power than
+ /// the vector equivalent, so this always makes sense if the scalar op is
+ /// supported.
+ bool shouldScalarizeBinop(SDValue) const override;
+
+ /// Extract of a scalar FP value from index 0 of a vector is free.
+ bool isExtractVecEltCheap(EVT VT, unsigned Index) const override {
+ EVT EltVT = VT.getScalarType();
+ return (EltVT == MVT::f32 || EltVT == MVT::f64) && Index == 0;
+ }
+
+ /// Overflow nodes should get combined/lowered to optimal instructions
+ /// (they should allow eliminating explicit compares by getting flags from
+ /// math ops).
+ bool shouldFormOverflowOp(unsigned Opcode, EVT VT,
+ bool MathUsed) const override;
+
+ bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem,
+ unsigned AddrSpace) const override {
+ // If we can replace more than 2 scalar stores, there will be a reduction
+ // in instructions even after we add a vector constant load.
+ return IsZero || NumElem > 2;
+ }
+
+ bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
+ const SelectionDAG &DAG,
+ const MachineMemOperand &MMO) const override;
+
+ Register getRegisterByName(const char *RegName, LLT VT,
+ const MachineFunction &MF) const override;
+
+ /// If a physical register, this returns the register that receives the
+ /// exception address on entry to an EH pad.
+ Register
+ getExceptionPointerRegister(const Constant *PersonalityFn) const override;
+
+ /// If a physical register, this returns the register that receives the
+ /// exception typeid on entry to a landing pad.
+ Register
+ getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
+
+ bool needsFixedCatchObjects() const override;
+
+ /// This method returns a target specific FastISel object,
+ /// or null if the target does not support "fast" ISel.
+ FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo) const override;
+
+ /// If the target has a standard location for the stack protector cookie,
+ /// returns the address of that location. Otherwise, returns nullptr.
+ Value *getIRStackGuard(IRBuilderBase &IRB) const override;
+
+ bool useLoadStackGuardNode(const Module &M) const override;
+ bool useStackGuardXorFP() const override;
+ void insertSSPDeclarations(Module &M) const override;
+ SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
+ const SDLoc &DL) const override;
+
+ /// Return true if the target stores SafeStack pointer at a fixed offset in
+ /// some non-standard address space, and populates the address space and
+ /// offset as appropriate.
+ Value *getSafeStackPointerLocation(IRBuilderBase &IRB) const override;
+
+ std::pair<SDValue, SDValue> BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL,
+ SDValue Chain, SDValue Pointer,
+ MachinePointerInfo PtrInfo,
+ Align Alignment,
+ SelectionDAG &DAG) const;
+
+ /// Customize the preferred legalization strategy for certain types.
+ LegalizeTypeAction getPreferredVectorAction(MVT VT) const override;
+
+ bool softPromoteHalfType() const override { return true; }
+
+ MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
EVT VT) const override;
- /// Return true if it's profitable to narrow operations of type SrcVT to
- /// DestVT. e.g. on x86, it's profitable to narrow from i32 to i8 but not
- /// from i32 to i16.
- bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override;
-
- bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT,
- unsigned SelectOpcode, SDValue X,
- SDValue Y) const override;
-
- /// Given an intrinsic, checks if on the target the intrinsic will need to map
- /// to a MemIntrinsicNode (touches memory). If this is the case, it returns
- /// true and stores the intrinsic information into the IntrinsicInfo that was
- /// passed to the function.
- bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
- MachineFunction &MF,
- unsigned Intrinsic) const override;
-
- /// Returns true if the target can instruction select the
- /// specified FP immediate natively. If false, the legalizer will
- /// materialize the FP immediate as a load from a constant pool.
- bool isFPImmLegal(const APFloat &Imm, EVT VT,
- bool ForCodeSize) const override;
-
- /// Targets can use this to indicate that they only support *some*
- /// VECTOR_SHUFFLE operations, those with specific masks. By default, if a
- /// target supports the VECTOR_SHUFFLE node, all mask values are assumed to
- /// be legal.
- bool isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const override;
-
- /// Similar to isShuffleMaskLegal. Targets can use this to indicate if there
- /// is a suitable VECTOR_SHUFFLE that can be used to replace a VAND with a
- /// constant pool entry.
- bool isVectorClearMaskLegal(ArrayRef<int> Mask, EVT VT) const override;
-
- /// Returns true if lowering to a jump table is allowed.
- bool areJTsAllowed(const Function *Fn) const override;
-
- MVT getPreferredSwitchConditionType(LLVMContext &Context,
- EVT ConditionVT) const override;
-
- /// If true, then instruction selection should
- /// seek to shrink the FP constant of the specified type to a smaller type
- /// in order to save space and / or reduce runtime.
- bool ShouldShrinkFPConstant(EVT VT) const override;
-
- /// Return true if we believe it is correct and profitable to reduce the
- /// load node to a smaller type.
- bool
- shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
- std::optional<unsigned> ByteOffset) const override;
-
- /// Return true if the specified scalar FP type is computed in an SSE
- /// register, not on the X87 floating point stack.
- bool isScalarFPTypeInSSEReg(EVT VT) const;
-
- /// Returns true if it is beneficial to convert a load of a constant
- /// to just the constant itself.
- bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
- Type *Ty) const override;
-
- bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override;
-
- bool convertSelectOfConstantsToMath(EVT VT) const override;
-
- bool decomposeMulByConstant(LLVMContext &Context, EVT VT,
- SDValue C) const override;
-
- /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
- /// with this index.
- bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
- unsigned Index) const override;
-
- /// Scalar ops always have equal or better analysis/performance/power than
- /// the vector equivalent, so this always makes sense if the scalar op is
- /// supported.
- bool shouldScalarizeBinop(SDValue) const override;
-
- /// Extract of a scalar FP value from index 0 of a vector is free.
- bool isExtractVecEltCheap(EVT VT, unsigned Index) const override {
- EVT EltVT = VT.getScalarType();
- return (EltVT == MVT::f32 || EltVT == MVT::f64) && Index == 0;
- }
+ unsigned getNumRegistersForCallingConv(LLVMContext &Context,
+ CallingConv::ID CC,
+ EVT VT) const override;
- /// Overflow nodes should get combined/lowered to optimal instructions
- /// (they should allow eliminating explicit compares by getting flags from
- /// math ops).
- bool shouldFormOverflowOp(unsigned Opcode, EVT VT,
- bool MathUsed) const override;
+ unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context,
+ CallingConv::ID CC, EVT VT,
+ EVT &IntermediateVT,
+ unsigned &NumIntermediates,
+ MVT &RegisterVT) const override;
- bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem,
- unsigned AddrSpace) const override {
- // If we can replace more than 2 scalar stores, there will be a reduction
- // in instructions even after we add a vector constant load.
- return IsZero || NumElem > 2;
- }
-
- bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
- const SelectionDAG &DAG,
- const MachineMemOperand &MMO) const override;
-
- Register getRegisterByName(const char* RegName, LLT VT,
- const MachineFunction &MF) const override;
-
- /// If a physical register, this returns the register that receives the
- /// exception address on entry to an EH pad.
- Register
- getExceptionPointerRegister(const Constant *PersonalityFn) const override;
-
- /// If a physical register, this returns the register that receives the
- /// exception typeid on entry to a landing pad.
- Register
- getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
-
- bool needsFixedCatchObjects() const override;
-
- /// This method returns a target specific FastISel object,
- /// or null if the target does not support "fast" ISel.
- FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
- const TargetLibraryInfo *libInfo) const override;
-
- /// If the target has a standard location for the stack protector cookie,
- /// returns the address of that location. Otherwise, returns nullptr.
- Value *getIRStackGuard(IRBuilderBase &IRB) const override;
-
- bool useLoadStackGuardNode(const Module &M) const override;
- bool useStackGuardXorFP() const override;
- void insertSSPDeclarations(Module &M) const override;
- SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
- const SDLoc &DL) const override;
+ bool functionArgumentNeedsConsecutiveRegisters(
+ Type *Ty, CallingConv::ID CallConv, bool isVarArg,
+ const DataLayout &DL) const override;
+ bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
- /// Return true if the target stores SafeStack pointer at a fixed offset in
- /// some non-standard address space, and populates the address space and
- /// offset as appropriate.
- Value *getSafeStackPointerLocation(IRBuilderBase &IRB) const override;
+ bool supportSwiftError() const override;
- std::pair<SDValue, SDValue> BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL,
- SDValue Chain, SDValue Pointer,
- MachinePointerInfo PtrInfo,
- Align Alignment,
- SelectionDAG &DAG) const;
-
- /// Customize the preferred legalization strategy for certain types.
- LegalizeTypeAction getPreferredVectorAction(MVT VT) const override;
+ bool supportKCFIBundles() const override { return true; }
- bool softPromoteHalfType() const override { return true; }
-
- MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
- EVT VT) const override;
+ MachineInstr *EmitKCFICheck(MachineBasicBlock &MBB,
+ MachineBasicBlock::instr_iterator &MBBI,
+ const TargetInstrInfo *TII) const override;
- unsigned getNumRegistersForCallingConv(LLVMContext &Context,
- CallingConv::ID CC,
- EVT VT) const override;
+ bool hasStackProbeSymbol(const MachineFunction &MF) const override;
+ bool hasInlineStackProbe(const MachineFunction &MF) const override;
+ StringRef getStackProbeSymbolName(const MachineFunction &MF) const override;
- unsigned getVectorTypeBreakdownForCallingConv(
- LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
- unsigned &NumIntermediates, MVT &RegisterVT) const override;
+ unsigned getStackProbeSize(const MachineFunction &MF) const;
- bool functionArgumentNeedsConsecutiveRegisters(
- Type *Ty, CallingConv::ID CallConv, bool isVarArg,
- const DataLayout &DL) const override;
+ bool hasVectorBlend() const override { return true; }
- bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
-
- bool supportSwiftError() const override;
-
- bool supportKCFIBundles() const override { return true; }
-
- MachineInstr *EmitKCFICheck(MachineBasicBlock &MBB,
- MachineBasicBlock::instr_iterator &MBBI,
- const TargetInstrInfo *TII) const override;
-
- bool hasStackProbeSymbol(const MachineFunction &MF) const override;
- bool hasInlineStackProbe(const MachineFunction &MF) const override;
- StringRef getStackProbeSymbolName(const MachineFunction &MF) const override;
-
- unsigned getStackProbeSize(const MachineFunction &MF) const;
-
- bool hasVectorBlend() const override { return true; }
-
- unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
+ unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
- bool isInlineAsmTargetBranch(const SmallVectorImpl<StringRef> &AsmStrs,
- unsigned OpNo) const override;
+ bool isInlineAsmTargetBranch(const SmallVectorImpl<StringRef> &AsmStrs,
+ unsigned OpNo) const override;
- SDValue visitMaskedLoad(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
- MachineMemOperand *MMO, SDValue &NewLoad,
- SDValue Ptr, SDValue PassThru,
- SDValue Mask) const override;
- SDValue visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
- MachineMemOperand *MMO, SDValue Ptr, SDValue Val,
- SDValue Mask) const override;
+ SDValue visitMaskedLoad(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
+ MachineMemOperand *MMO, SDValue &NewLoad, SDValue Ptr,
+ SDValue PassThru, SDValue Mask) const override;
+ SDValue visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
+ MachineMemOperand *MMO, SDValue Ptr, SDValue Val,
+ SDValue Mask) const override;
- /// Lower interleaved load(s) into target specific
- /// instructions/intrinsics.
- bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
- ArrayRef<ShuffleVectorInst *> Shuffles,
- ArrayRef<unsigned> Indices, unsigned Factor,
- const APInt &GapMask) const override;
+ /// Lower interleaved load(s) into target specific
+ /// instructions/intrinsics.
+ bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
+ ArrayRef<ShuffleVectorInst *> Shuffles,
+ ArrayRef<unsigned> Indices, unsigned Factor,
+ const APInt &GapMask) const override;
- /// Lower interleaved store(s) into target specific
- /// instructions/intrinsics.
- bool lowerInterleavedStore(Instruction *Store, Value *Mask,
- ShuffleVectorInst *SVI, unsigned Factor,
- const APInt &GapMask) const override;
+ /// Lower interleaved store(s) into target specific
+ /// instructions/intrinsics.
+ bool lowerInterleavedStore(Instruction *Store, Value *Mask,
+ ShuffleVectorInst *SVI, unsigned Factor,
+ const APInt &GapMask) const override;
- SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr,
- int JTI, SelectionDAG &DAG) const override;
+ SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr,
+ int JTI, SelectionDAG &DAG) const override;
- Align getPrefLoopAlignment(MachineLoop *ML) const override;
+ Align getPrefLoopAlignment(MachineLoop *ML) const override;
- EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const override {
- if (VT == MVT::f80)
- return EVT::getIntegerVT(Context, 96);
- return TargetLoweringBase::getTypeToTransformTo(Context, VT);
- }
+ EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const override {
+ if (VT == MVT::f80)
+ return EVT::getIntegerVT(Context, 96);
+ return TargetLoweringBase::getTypeToTransformTo(Context, VT);
+ }
- protected:
- std::pair<const TargetRegisterClass *, uint8_t>
- findRepresentativeClass(const TargetRegisterInfo *TRI,
- MVT VT) const override;
+protected:
+ std::pair<const TargetRegisterClass *, uint8_t>
+ findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const override;
- private:
- /// Keep a reference to the X86Subtarget around so that we can
- /// make the right decision when generating code for different targets.
- const X86Subtarget &Subtarget;
+private:
+ /// Keep a reference to the X86Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const X86Subtarget &Subtarget;
- /// A list of legal FP immediates.
- std::vector<APFloat> LegalFPImmediates;
+ /// A list of legal FP immediates.
+ std::vector<APFloat> LegalFPImmediates;
- /// Indicate that this x86 target can instruction
- /// select the specified FP immediate natively.
- void addLegalFPImmediate(const APFloat& Imm) {
- LegalFPImmediates.push_back(Imm);
- }
+ /// Indicate that this x86 target can instruction
+ /// select the specified FP immediate natively.
+ void addLegalFPImmediate(const APFloat &Imm) {
+ LegalFPImmediates.push_back(Imm);
+ }
- SDValue LowerCallResult(SDValue Chain, SDValue InGlue,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- const SDLoc &dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals,
- uint32_t *RegMask) const;
- SDValue LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
- const SmallVectorImpl<ISD::InputArg> &ArgInfo,
- const SDLoc &dl, SelectionDAG &DAG,
- const CCValAssign &VA, MachineFrameInfo &MFI,
- unsigned i) const;
- SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
- const SDLoc &dl, SelectionDAG &DAG,
- const CCValAssign &VA,
- ISD::ArgFlagsTy Flags, bool isByval) const;
-
- // Call lowering helpers.
-
- /// Check whether the call is eligible for tail call optimization. Targets
- /// that want to do tail call optimization should implement this function.
- bool IsEligibleForTailCallOptimization(
- TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo,
- SmallVectorImpl<CCValAssign> &ArgLocs, bool IsCalleePopSRet) const;
- SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr,
- SDValue Chain, bool IsTailCall,
- bool Is64Bit, int FPDiff,
- const SDLoc &dl) const;
-
- unsigned GetAlignedArgumentStackSize(unsigned StackSize,
- SelectionDAG &DAG) const;
-
- unsigned getAddressSpace() const;
-
- SDValue FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned,
- SDValue &Chain) const;
- SDValue LRINT_LLRINTHelper(SDNode *N, SelectionDAG &DAG) const;
-
- SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
-
- unsigned getGlobalWrapperKind(const GlobalValue *GV,
- const unsigned char OpFlags) const;
- SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
-
- /// Creates target global address or external symbol nodes for calls or
- /// other uses.
- SDValue LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG, bool ForCall,
- bool *IsImpCall) const;
-
- SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerLRINT_LLRINT(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
- SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
- SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
- SDValue lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerGET_FPENV_MEM(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSET_FPENV_MEM(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerRESET_FPENV(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerWin64_FP_TO_INT128(SDValue Op, SelectionDAG &DAG,
- SDValue &Chain) const;
- SDValue LowerWin64_INT128_TO_FP(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerGC_TRANSITION(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
- SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFP_TO_BF16(SDValue Op, SelectionDAG &DAG) const;
-
- SDValue
- LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- const SDLoc &dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const override;
- SDValue LowerCall(CallLoweringInfo &CLI,
- SmallVectorImpl<SDValue> &InVals) const override;
-
- SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SDLoc &dl, SelectionDAG &DAG) const override;
-
- bool supportSplitCSR(MachineFunction *MF) const override {
- return MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS &&
- MF->getFunction().hasFnAttribute(Attribute::NoUnwind);
- }
- void initializeSplitCSR(MachineBasicBlock *Entry) const override;
- void insertCopiesSplitCSR(
+ SDValue LowerCallResult(SDValue Chain, SDValue InGlue,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals,
+ uint32_t *RegMask) const;
+ SDValue LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
+ const SmallVectorImpl<ISD::InputArg> &ArgInfo,
+ const SDLoc &dl, SelectionDAG &DAG,
+ const CCValAssign &VA, MachineFrameInfo &MFI,
+ unsigned i) const;
+ SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
+ const SDLoc &dl, SelectionDAG &DAG,
+ const CCValAssign &VA, ISD::ArgFlagsTy Flags,
+ bool isByval) const;
+
+ // Call lowering helpers.
+
+ /// Check whether the call is eligible for tail call optimization. Targets
+ /// that want to do tail call optimization should implement this function.
+ bool IsEligibleForTailCallOptimization(TargetLowering::CallLoweringInfo &CLI,
+ CCState &CCInfo,
+ SmallVectorImpl<CCValAssign> &ArgLocs,
+ bool IsCalleePopSRet) const;
+ SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr,
+ SDValue Chain, bool IsTailCall, bool Is64Bit,
+ int FPDiff, const SDLoc &dl) const;
+
+ unsigned GetAlignedArgumentStackSize(unsigned StackSize,
+ SelectionDAG &DAG) const;
+
+ unsigned getAddressSpace() const;
+
+ SDValue FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned,
+ SDValue &Chain) const;
+ SDValue LRINT_LLRINTHelper(SDNode *N, SelectionDAG &DAG) const;
+
+ SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+
+ unsigned getGlobalWrapperKind(const GlobalValue *GV,
+ const unsigned char OpFlags) const;
+ SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
+
+ /// Creates target global address or external symbol nodes for calls or
+ /// other uses.
+ SDValue LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG, bool ForCall,
+ bool *IsImpCall) const;
+
+ SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerLRINT_LLRINT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGET_FPENV_MEM(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSET_FPENV_MEM(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerRESET_FPENV(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerWin64_FP_TO_INT128(SDValue Op, SelectionDAG &DAG,
+ SDValue &Chain) const;
+ SDValue LowerWin64_INT128_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGC_TRANSITION(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFP_TO_BF16(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const override;
+ SDValue LowerCall(CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals, const SDLoc &dl,
+ SelectionDAG &DAG) const override;
+
+ bool supportSplitCSR(MachineFunction *MF) const override {
+ return MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS &&
+ MF->getFunction().hasFnAttribute(Attribute::NoUnwind);
+ }
+ void initializeSplitCSR(MachineBasicBlock *Entry) const override;
+ void insertCopiesSplitCSR(
MachineBasicBlock *Entry,
const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
- bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
+ bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
- bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
+ bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
- EVT getTypeForExtReturn(LLVMContext &Context, EVT VT,
- ISD::NodeType ExtendKind) const override;
+ EVT getTypeForExtReturn(LLVMContext &Context, EVT VT,
+ ISD::NodeType ExtendKind) const override;
- bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
- bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- LLVMContext &Context,
- const Type *RetTy) const override;
+ bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context, const Type *RetTy) const override;
- const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
- ArrayRef<MCPhysReg> getRoundingControlRegisters() const override;
+ const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
+ ArrayRef<MCPhysReg> getRoundingControlRegisters() const override;
- TargetLoweringBase::AtomicExpansionKind
- shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
- TargetLoweringBase::AtomicExpansionKind
- shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
- TargetLoweringBase::AtomicExpansionKind
- shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
- TargetLoweringBase::AtomicExpansionKind
- shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const;
- void emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const override;
- void emitCmpArithAtomicRMWIntrinsic(AtomicRMWInst *AI) const override;
+ TargetLoweringBase::AtomicExpansionKind
+ shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
+ TargetLoweringBase::AtomicExpansionKind
+ shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
+ TargetLoweringBase::AtomicExpansionKind
+ shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
+ TargetLoweringBase::AtomicExpansionKind
+ shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const;
+ void emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const override;
+ void emitCmpArithAtomicRMWIntrinsic(AtomicRMWInst *AI) const override;
- LoadInst *
- lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
+ LoadInst *lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
- bool needsCmpXchgNb(Type *MemType) const;
+ bool needsCmpXchgNb(Type *MemType) const;
- void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB,
- MachineBasicBlock *DispatchBB, int FI) const;
+ void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB,
+ MachineBasicBlock *DispatchBB, int FI) const;
- // Utility function to emit the low-level va_arg code for X86-64.
- MachineBasicBlock *
- EmitVAARGWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const;
+ // Utility function to emit the low-level va_arg code for X86-64.
+ MachineBasicBlock *EmitVAARGWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
+
+ /// Utility function to emit the xmm reg save portion of va_start.
+ MachineBasicBlock *EmitLoweredCascadedSelect(MachineInstr &MI1,
+ MachineInstr &MI2,
+ MachineBasicBlock *BB) const;
- /// Utility function to emit the xmm reg save portion of va_start.
- MachineBasicBlock *EmitLoweredCascadedSelect(MachineInstr &MI1,
- MachineInstr &MI2,
- MachineBasicBlock *BB) const;
+ MachineBasicBlock *EmitLoweredSelect(MachineInstr &I,
+ MachineBasicBlock *BB) const;
- MachineBasicBlock *EmitLoweredSelect(MachineInstr &I,
+ MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
MachineBasicBlock *BB) const;
- MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
- MachineBasicBlock *BB) const;
+ MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
- MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr &MI,
- MachineBasicBlock *BB) const;
+ MachineBasicBlock *EmitLoweredProbedAlloca(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
- MachineBasicBlock *EmitLoweredProbedAlloca(MachineInstr &MI,
- MachineBasicBlock *BB) const;
+ MachineBasicBlock *EmitLoweredTLSCall(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
- MachineBasicBlock *EmitLoweredTLSCall(MachineInstr &MI,
- MachineBasicBlock *BB) const;
+ MachineBasicBlock *EmitLoweredIndirectThunk(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
- MachineBasicBlock *EmitLoweredIndirectThunk(MachineInstr &MI,
- MachineBasicBlock *BB) const;
+ MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
- MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI,
- MachineBasicBlock *MBB) const;
+ void emitSetJmpShadowStackFix(MachineInstr &MI, MachineBasicBlock *MBB) const;
- void emitSetJmpShadowStackFix(MachineInstr &MI,
- MachineBasicBlock *MBB) const;
+ MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
- MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr &MI,
- MachineBasicBlock *MBB) const;
+ MachineBasicBlock *emitLongJmpShadowStackFix(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
- MachineBasicBlock *emitLongJmpShadowStackFix(MachineInstr &MI,
- MachineBasicBlock *MBB) const;
+ MachineBasicBlock *EmitSjLjDispatchBlock(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
+
+ MachineBasicBlock *emitPatchableEventCall(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
+
+ /// Emit flags for the given setcc condition and operands. Also returns the
+ /// corresponding X86 condition code constant in X86CC.
+ SDValue emitFlagsForSetcc(SDValue Op0, SDValue Op1, ISD::CondCode CC,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SDValue &X86CC) const;
+
+ bool optimizeFMulOrFDivAsShiftAddBitcast(SDNode *N, SDValue FPConst,
+ SDValue IntPow2) const override;
+
+ /// Check if replacement of SQRT with RSQRT should be disabled.
+ bool isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const override;
+
+ /// Use rsqrt* to speed up sqrt calculations.
+ SDValue getSqrtEstimate(SDValue Op, SelectionDAG &DAG, int Enabled,
+ int &RefinementSteps, bool &UseOneConstNR,
+ bool Reciprocal) const override;
+
+ /// Use rcp* to speed up fdiv calculations.
+ SDValue getRecipEstimate(SDValue Op, SelectionDAG &DAG, int Enabled,
+ int &RefinementSteps) const override;
+
+ /// Reassociate floating point divisions into multiply by reciprocal.
+ unsigned combineRepeatedFPDivisors() const override;
+
+ SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
+ SmallVectorImpl<SDNode *> &Created) const override;
+
+ SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
+ SDValue V2) const;
+};
+
+namespace X86 {
+FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+ const TargetLibraryInfo *libInfo);
+} // end namespace X86
+
+// X86 specific Gather/Scatter nodes.
+// The class has the same order of operands as MaskedGatherScatterSDNode for
+// convenience.
+class X86MaskedGatherScatterSDNode : public MemIntrinsicSDNode {
+public:
+  // This is intended as a utility and should never be directly created.
+ X86MaskedGatherScatterSDNode() = delete;
+ ~X86MaskedGatherScatterSDNode() = delete;
+
+ const SDValue &getBasePtr() const { return getOperand(3); }
+ const SDValue &getIndex() const { return getOperand(4); }
+ const SDValue &getMask() const { return getOperand(2); }
+ const SDValue &getScale() const { return getOperand(5); }
+
+ static bool classof(const SDNode *N) {
+ return N->getOpcode() == X86ISD::MGATHER ||
+ N->getOpcode() == X86ISD::MSCATTER;
+ }
+};
+
+class X86MaskedGatherSDNode : public X86MaskedGatherScatterSDNode {
+public:
+ const SDValue &getPassThru() const { return getOperand(1); }
+
+ static bool classof(const SDNode *N) {
+ return N->getOpcode() == X86ISD::MGATHER;
+ }
+};
+
+class X86MaskedScatterSDNode : public X86MaskedGatherScatterSDNode {
+public:
+ const SDValue &getValue() const { return getOperand(1); }
+
+ static bool classof(const SDNode *N) {
+ return N->getOpcode() == X86ISD::MSCATTER;
+ }
+};
+
+/// Generate unpacklo/unpackhi shuffle mask.
+void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask, bool Lo,
+ bool Unary);
- MachineBasicBlock *EmitSjLjDispatchBlock(MachineInstr &MI,
- MachineBasicBlock *MBB) const;
-
- MachineBasicBlock *emitPatchableEventCall(MachineInstr &MI,
- MachineBasicBlock *MBB) const;
-
- /// Emit flags for the given setcc condition and operands. Also returns the
- /// corresponding X86 condition code constant in X86CC.
- SDValue emitFlagsForSetcc(SDValue Op0, SDValue Op1, ISD::CondCode CC,
- const SDLoc &dl, SelectionDAG &DAG,
- SDValue &X86CC) const;
-
- bool optimizeFMulOrFDivAsShiftAddBitcast(SDNode *N, SDValue FPConst,
- SDValue IntPow2) const override;
-
- /// Check if replacement of SQRT with RSQRT should be disabled.
- bool isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const override;
-
- /// Use rsqrt* to speed up sqrt calculations.
- SDValue getSqrtEstimate(SDValue Op, SelectionDAG &DAG, int Enabled,
- int &RefinementSteps, bool &UseOneConstNR,
- bool Reciprocal) const override;
-
- /// Use rcp* to speed up fdiv calculations.
- SDValue getRecipEstimate(SDValue Op, SelectionDAG &DAG, int Enabled,
- int &RefinementSteps) const override;
-
- /// Reassociate floating point divisions into multiply by reciprocal.
- unsigned combineRepeatedFPDivisors() const override;
-
- SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
- SmallVectorImpl<SDNode *> &Created) const override;
-
- SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
- SDValue V2) const;
- };
-
- namespace X86 {
- FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
- const TargetLibraryInfo *libInfo);
- } // end namespace X86
-
- // X86 specific Gather/Scatter nodes.
- // The class has the same order of operands as MaskedGatherScatterSDNode for
- // convenience.
- class X86MaskedGatherScatterSDNode : public MemIntrinsicSDNode {
- public:
- // This is a intended as a utility and should never be directly created.
- X86MaskedGatherScatterSDNode() = delete;
- ~X86MaskedGatherScatterSDNode() = delete;
-
- const SDValue &getBasePtr() const { return getOperand(3); }
- const SDValue &getIndex() const { return getOperand(4); }
- const SDValue &getMask() const { return getOperand(2); }
- const SDValue &getScale() const { return getOperand(5); }
-
- static bool classof(const SDNode *N) {
- return N->getOpcode() == X86ISD::MGATHER ||
- N->getOpcode() == X86ISD::MSCATTER;
- }
- };
-
- class X86MaskedGatherSDNode : public X86MaskedGatherScatterSDNode {
- public:
- const SDValue &getPassThru() const { return getOperand(1); }
-
- static bool classof(const SDNode *N) {
- return N->getOpcode() == X86ISD::MGATHER;
- }
- };
-
- class X86MaskedScatterSDNode : public X86MaskedGatherScatterSDNode {
- public:
- const SDValue &getValue() const { return getOperand(1); }
-
- static bool classof(const SDNode *N) {
- return N->getOpcode() == X86ISD::MSCATTER;
- }
- };
-
- /// Generate unpacklo/unpackhi shuffle mask.
- void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask, bool Lo,
- bool Unary);
-
- /// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
- /// imposed by AVX and specific to the unary pattern. Example:
- /// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
- /// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
- void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo);
+/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
+/// imposed by AVX and specific to the unary pattern. Example:
+/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
+/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
+void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo);
} // end namespace llvm
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 765db86ffafb3..d73c3aa0e1e82 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -690,8 +690,7 @@ bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const {
.addImm(31));
} else {
// Negate to convert 1 -> 0xFFFFFFFF, 0 -> 0x00000000 (negl %eax)
- recordInstr(BuildMI(*MBB, MI, DL, get(X86::NEG32r), TmpGPR)
- .addReg(TmpGPR));
+ recordInstr(BuildMI(*MBB, MI, DL, get(X86::NEG32r), TmpGPR).addReg(TmpGPR));
}
// Broadcast to TmpX (vector mask)
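[Editor's note: the NEG-then-broadcast sequence in the hunk above is the usual branch-free blend. Below is a minimal C sketch of the same idea using SSE2 intrinsics; it is illustrative only, and the helper name ct_select_epi32 is made up here, not part of this patch.]

  #include <emmintrin.h>
  #include <stdint.h>

  /* Mirrors the expanded sequence: NEG turns the 0/1 condition into an
   * all-ones/all-zeros mask, the mask is broadcast to a vector register,
   * and the result is (a & mask) | (b & ~mask) with no branches. */
  static __m128i ct_select_epi32(uint32_t cond, __m128i a, __m128i b) {
    int32_t mask = -(int32_t)(cond & 1);             /* 1 -> 0xFFFFFFFF, 0 -> 0 */
    __m128i vmask = _mm_set1_epi32(mask);            /* broadcast, like the TmpX step */
    return _mm_or_si128(_mm_and_si128(vmask, a),     /* a & mask */
                        _mm_andnot_si128(vmask, b)); /* (~mask) & b */
  }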
@@ -848,7 +847,8 @@ bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const {
.setMIFlags(MachineInstr::MIFlag::NoMerge));
}
- assert(FirstInstr && LastInstr && "Expected at least one expanded instruction");
+ assert(FirstInstr && LastInstr &&
+ "Expected at least one expanded instruction");
auto BundleEnd = LastInstr->getIterator();
finalizeBundle(*MBB, FirstInstr->getIterator(), std::next(BundleEnd));
@@ -916,25 +916,28 @@ bool X86InstrInfo::expandCtSelectWithCMOV(MachineInstr &MI) const {
/// Expand i386-specific CTSELECT pseudo instructions (post-RA, constant-time)
/// These internal pseudos receive a pre-materialized condition byte from the
-/// custom inserter, avoiding EFLAGS corruption issues during i64 type legalization.
+/// custom inserter, avoiding EFLAGS corruption issues during i64 type
+/// legalization.
bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const {
MachineBasicBlock *MBB = MI.getParent();
DebugLoc DL = MI.getDebugLoc();
// CTSELECT_I386_INT_GRxxrr has operands: (outs dst, tmp_byte, tmp_mask),
// (ins src1, src2, cond_byte)
- // Note: cond_byte is pre-materialized by custom inserter, not EFLAGS-dependent
+ // Note: cond_byte is pre-materialized by custom inserter, not
+ // EFLAGS-dependent
Register DstReg = MI.getOperand(0).getReg();
Register TmpByteReg = MI.getOperand(1).getReg();
Register TmpMaskReg = MI.getOperand(2).getReg();
Register Src1Reg = MI.getOperand(3).getReg();
Register Src2Reg = MI.getOperand(4).getReg();
- Register CondByteReg = MI.getOperand(5).getReg(); // Pre-materialized condition byte
+ Register CondByteReg =
+ MI.getOperand(5).getReg(); // Pre-materialized condition byte
// Determine instruction opcodes based on register width
unsigned MovZXOp, NegOp, MovOp, AndOp, NotOp, OrOp;
if (MI.getOpcode() == X86::CTSELECT_I386_INT_GR8rr) {
- MovZXOp = 0; // No zero-extend needed for GR8
+ MovZXOp = 0; // No zero-extend needed for GR8
NegOp = X86::NEG8r;
MovOp = X86::MOV8rr;
AndOp = X86::AND8rr;
@@ -963,8 +966,8 @@ bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const {
// Step 1: Copy pre-materialized condition byte to TmpByteReg
// This allows the bundle to work with allocated temporaries
auto I1 = BuildMI(*MBB, MI, DL, get(X86::MOV8rr), TmpByteReg)
- .addReg(CondByteReg)
- .setMIFlag(MachineInstr::MIFlag::NoMerge);
+ .addReg(CondByteReg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
auto BundleStart = I1->getIterator();
// Step 2: Zero-extend condition byte to register width (0 or 1)
@@ -975,7 +978,9 @@ bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const {
}
// Step 3: Convert condition to bitmask (NEG: 1 -> 0xFFFF..., 0 -> 0x0000...)
- Register MaskReg = (MI.getOpcode() == X86::CTSELECT_I386_INT_GR8rr) ? TmpByteReg : TmpMaskReg;
+ Register MaskReg = (MI.getOpcode() == X86::CTSELECT_I386_INT_GR8rr)
+ ? TmpByteReg
+ : TmpMaskReg;
BuildMI(*MBB, MI, DL, get(NegOp), MaskReg)
.addReg(MaskReg)
.setMIFlag(MachineInstr::MIFlag::NoMerge);
@@ -1003,9 +1008,9 @@ bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const {
// Step 8: Final result: (src1 & mask) | (src2 & ~mask)
auto LI = BuildMI(*MBB, MI, DL, get(OrOp), DstReg)
- .addReg(DstReg)
- .addReg(MaskReg)
- .setMIFlag(MachineInstr::MIFlag::NoMerge);
+ .addReg(DstReg)
+ .addReg(MaskReg)
+ .setMIFlag(MachineInstr::MIFlag::NoMerge);
// Bundle all generated instructions for atomic execution before removing MI
auto BundleEnd = std::next(LI->getIterator());
@@ -1014,11 +1019,12 @@ bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const {
finalizeBundle(*MBB, BundleStart, BundleEnd);
}
- // TODO: Optimization opportunity - The register allocator may choose callee-saved
- // registers (e.g., %ebx, %esi) for TmpByteReg/TmpMaskReg, causing unnecessary
- // save/restore overhead. Consider constraining these to caller-saved register
- // classes (e.g., GR8_AL, GR32_CallSaved) in the TableGen definitions to improve
- // constant-time performance by eliminating prologue/epilogue instructions.
+ // TODO: Optimization opportunity - The register allocator may choose
+ // callee-saved registers (e.g., %ebx, %esi) for TmpByteReg/TmpMaskReg,
+ // causing unnecessary save/restore overhead. Consider constraining these to
+ // caller-saved register classes (e.g., GR8_AL, GR32_CallSaved) in the
+ // TableGen definitions to improve constant-time performance by eliminating
+ // prologue/epilogue instructions.
// Remove the original pseudo instruction
MI.eraseFromParent();
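[Editor's note: the numbered steps in the expansion above are the classic constant-time bitmask select. A rough C equivalent follows, assuming cond is the pre-materialized 0/1 condition byte described in the comments; the helper name ct_select_u32 is illustrative only and does not exist in the patch.]

  #include <stdint.h>

  static uint32_t ct_select_u32(uint8_t cond, uint32_t src1, uint32_t src2) {
    uint32_t mask = -(uint32_t)(cond & 1);  /* steps 2-3: zero-extend, then NEG */
    uint32_t lo = src1 & mask;              /* src1 & mask */
    uint32_t hi = src2 & ~mask;             /* src2 & ~mask */
    return lo | hi;                         /* step 8: blend without a branch */
  }

[The GR8 variant skips the zero-extend and reuses the condition byte itself as the mask register, which is why MaskReg is chosen conditionally in the code above.]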
@@ -1306,8 +1312,7 @@ static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) {
return isPICBase;
}
-bool X86InstrInfo::isReMaterializableImpl(
- const MachineInstr &MI) const {
+bool X86InstrInfo::isReMaterializableImpl(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
default:
// This function should only be called for opcodes with the ReMaterializable
@@ -1826,32 +1831,32 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
switch (MIOpc) {
default:
llvm_unreachable("Unreachable!");
- CASE_NF(SHL8ri)
- CASE_NF(SHL16ri) {
- unsigned ShAmt = MI.getOperand(2).getImm();
- MIB.addReg(0)
- .addImm(1LL << ShAmt)
- .addReg(InRegLEA, RegState::Kill)
- .addImm(0)
- .addReg(0);
- break;
- }
- CASE_NF(INC8r)
- CASE_NF(INC16r)
+ CASE_NF(SHL8ri)
+ CASE_NF(SHL16ri) {
+ unsigned ShAmt = MI.getOperand(2).getImm();
+ MIB.addReg(0)
+ .addImm(1LL << ShAmt)
+ .addReg(InRegLEA, RegState::Kill)
+ .addImm(0)
+ .addReg(0);
+ break;
+ }
+ CASE_NF(INC8r)
+ CASE_NF(INC16r)
addRegOffset(MIB, InRegLEA, true, 1);
break;
- CASE_NF(DEC8r)
- CASE_NF(DEC16r)
+ CASE_NF(DEC8r)
+ CASE_NF(DEC16r)
addRegOffset(MIB, InRegLEA, true, -1);
break;
- CASE_NF(ADD8ri)
- CASE_NF(ADD16ri)
+ CASE_NF(ADD8ri)
+ CASE_NF(ADD16ri)
case X86::ADD8ri_DB:
case X86::ADD16ri_DB:
addRegOffset(MIB, InRegLEA, true, MI.getOperand(2).getImm());
break;
- CASE_NF(ADD8rr)
- CASE_NF(ADD16rr)
+ CASE_NF(ADD8rr)
+ CASE_NF(ADD16rr)
case X86::ADD8rr_DB:
case X86::ADD16rr_DB: {
Src2 = MI.getOperand(2).getReg();
@@ -1989,128 +1994,129 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI,
switch (MIOpc) {
default:
llvm_unreachable("Unreachable!");
- CASE_NF(SHL64ri) {
- assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
- unsigned ShAmt = getTruncatedShiftCount(MI, 2);
- if (!isTruncatedShiftCountForLEA(ShAmt))
- return nullptr;
-
- // LEA can't handle RSP.
- if (Src.getReg().isVirtual() && !MF.getRegInfo().constrainRegClass(
- Src.getReg(), &X86::GR64_NOSPRegClass))
- return nullptr;
+ CASE_NF(SHL64ri) {
+ assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
+ unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+ if (!isTruncatedShiftCountForLEA(ShAmt))
+ return nullptr;
- NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
- .add(Dest)
- .addReg(0)
- .addImm(1LL << ShAmt)
- .add(Src)
- .addImm(0)
- .addReg(0);
- break;
- }
- CASE_NF(SHL32ri) {
- assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
- unsigned ShAmt = getTruncatedShiftCount(MI, 2);
- if (!isTruncatedShiftCountForLEA(ShAmt))
- return nullptr;
+ // LEA can't handle RSP.
+ if (Src.getReg().isVirtual() &&
+ !MF.getRegInfo().constrainRegClass(Src.getReg(),
+ &X86::GR64_NOSPRegClass))
+ return nullptr;
- unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
+ NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
+ .add(Dest)
+ .addReg(0)
+ .addImm(1LL << ShAmt)
+ .add(Src)
+ .addImm(0)
+ .addReg(0);
+ break;
+ }
+ CASE_NF(SHL32ri) {
+ assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
+ unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+ if (!isTruncatedShiftCountForLEA(ShAmt))
+ return nullptr;
- // LEA can't handle ESP.
- bool isKill;
- MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
- if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
- isKill, ImplicitOp, LV, LIS))
- return nullptr;
+ unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
- MachineInstrBuilder MIB =
- BuildMI(MF, MI.getDebugLoc(), get(Opc))
- .add(Dest)
- .addReg(0)
- .addImm(1LL << ShAmt)
- .addReg(SrcReg, getKillRegState(isKill), SrcSubReg)
- .addImm(0)
- .addReg(0);
- if (ImplicitOp.getReg() != 0)
- MIB.add(ImplicitOp);
- NewMI = MIB;
+ // LEA can't handle ESP.
+ bool isKill;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
+ isKill, ImplicitOp, LV, LIS))
+ return nullptr;
- // Add kills if classifyLEAReg created a new register.
- if (LV && SrcReg != Src.getReg())
- LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
- break;
- }
- CASE_NF(SHL8ri)
+ MachineInstrBuilder MIB =
+ BuildMI(MF, MI.getDebugLoc(), get(Opc))
+ .add(Dest)
+ .addReg(0)
+ .addImm(1LL << ShAmt)
+ .addReg(SrcReg, getKillRegState(isKill), SrcSubReg)
+ .addImm(0)
+ .addReg(0);
+ if (ImplicitOp.getReg() != 0)
+ MIB.add(ImplicitOp);
+ NewMI = MIB;
+
+ // Add kills if classifyLEAReg created a new register.
+ if (LV && SrcReg != Src.getReg())
+ LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
+ break;
+ }
+ CASE_NF(SHL8ri)
Is8BitOp = true;
[[fallthrough]];
- CASE_NF(SHL16ri) {
- assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
- unsigned ShAmt = getTruncatedShiftCount(MI, 2);
- if (!isTruncatedShiftCountForLEA(ShAmt))
- return nullptr;
- return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
- }
- CASE_NF(INC64r)
- CASE_NF(INC32r) {
- assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
- unsigned Opc = (MIOpc == X86::INC64r || MIOpc == X86::INC64r_NF)
- ? X86::LEA64r
- : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
- bool isKill;
- MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
- if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
- isKill, ImplicitOp, LV, LIS))
- return nullptr;
-
- MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
- .add(Dest)
- .addReg(SrcReg, getKillRegState(isKill));
- if (ImplicitOp.getReg() != 0)
- MIB.add(ImplicitOp);
+ CASE_NF(SHL16ri) {
+ assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
+ unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+ if (!isTruncatedShiftCountForLEA(ShAmt))
+ return nullptr;
+ return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
+ }
+ CASE_NF(INC64r)
+ CASE_NF(INC32r) {
+ assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
+ unsigned Opc = (MIOpc == X86::INC64r || MIOpc == X86::INC64r_NF)
+ ? X86::LEA64r
+ : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
+ bool isKill;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
+ isKill, ImplicitOp, LV, LIS))
+ return nullptr;
- NewMI = addOffset(MIB, 1);
+ MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
+ .add(Dest)
+ .addReg(SrcReg, getKillRegState(isKill));
+ if (ImplicitOp.getReg() != 0)
+ MIB.add(ImplicitOp);
- // Add kills if classifyLEAReg created a new register.
- if (LV && SrcReg != Src.getReg())
- LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
- break;
- }
- CASE_NF(DEC64r)
- CASE_NF(DEC32r) {
- assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
- unsigned Opc = (MIOpc == X86::DEC64r || MIOpc == X86::DEC64r_NF)
- ? X86::LEA64r
- : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
+ NewMI = addOffset(MIB, 1);
- bool isKill;
- MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
- if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
- isKill, ImplicitOp, LV, LIS))
- return nullptr;
+ // Add kills if classifyLEAReg created a new register.
+ if (LV && SrcReg != Src.getReg())
+ LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
+ break;
+ }
+ CASE_NF(DEC64r)
+ CASE_NF(DEC32r) {
+ assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
+ unsigned Opc = (MIOpc == X86::DEC64r || MIOpc == X86::DEC64r_NF)
+ ? X86::LEA64r
+ : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
+
+ bool isKill;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
+ isKill, ImplicitOp, LV, LIS))
+ return nullptr;
- MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
- .add(Dest)
- .addReg(SrcReg, getKillRegState(isKill));
- if (ImplicitOp.getReg() != 0)
- MIB.add(ImplicitOp);
+ MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
+ .add(Dest)
+ .addReg(SrcReg, getKillRegState(isKill));
+ if (ImplicitOp.getReg() != 0)
+ MIB.add(ImplicitOp);
- NewMI = addOffset(MIB, -1);
+ NewMI = addOffset(MIB, -1);
- // Add kills if classifyLEAReg created a new register.
- if (LV && SrcReg != Src.getReg())
- LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
- break;
- }
- CASE_NF(DEC8r)
- CASE_NF(INC8r)
+ // Add kills if classifyLEAReg created a new register.
+ if (LV && SrcReg != Src.getReg())
+ LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
+ break;
+ }
+ CASE_NF(DEC8r)
+ CASE_NF(INC8r)
Is8BitOp = true;
[[fallthrough]];
- CASE_NF(DEC16r)
- CASE_NF(INC16r)
+ CASE_NF(DEC16r)
+ CASE_NF(INC16r)
return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
- CASE_NF(ADD64rr)
- CASE_NF(ADD32rr)
+ CASE_NF(ADD64rr)
+ CASE_NF(ADD32rr)
case X86::ADD64rr_DB:
case X86::ADD32rr_DB: {
assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
@@ -2161,21 +2167,21 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI,
NumRegOperands = 3;
break;
}
- CASE_NF(ADD8rr)
+ CASE_NF(ADD8rr)
case X86::ADD8rr_DB:
Is8BitOp = true;
[[fallthrough]];
- CASE_NF(ADD16rr)
+ CASE_NF(ADD16rr)
case X86::ADD16rr_DB:
return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
- CASE_NF(ADD64ri32)
+ CASE_NF(ADD64ri32)
case X86::ADD64ri32_DB:
assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
NewMI = addOffset(
BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src),
MI.getOperand(2));
break;
- CASE_NF(ADD32ri)
+ CASE_NF(ADD32ri)
case X86::ADD32ri_DB: {
assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
@@ -2200,62 +2206,62 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI,
LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
break;
}
- CASE_NF(ADD8ri)
+ CASE_NF(ADD8ri)
case X86::ADD8ri_DB:
Is8BitOp = true;
[[fallthrough]];
- CASE_NF(ADD16ri)
+ CASE_NF(ADD16ri)
case X86::ADD16ri_DB:
return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
- CASE_NF(SUB8ri)
- CASE_NF(SUB16ri)
+ CASE_NF(SUB8ri)
+ CASE_NF(SUB16ri)
/// FIXME: Support these similar to ADD8ri/ADD16ri*.
return nullptr;
- CASE_NF(SUB32ri) {
- if (!MI.getOperand(2).isImm())
- return nullptr;
- int64_t Imm = MI.getOperand(2).getImm();
- if (!isInt<32>(-Imm))
- return nullptr;
+ CASE_NF(SUB32ri) {
+ if (!MI.getOperand(2).isImm())
+ return nullptr;
+ int64_t Imm = MI.getOperand(2).getImm();
+ if (!isInt<32>(-Imm))
+ return nullptr;
- assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
- unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
+ assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
+ unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
- bool isKill;
- MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
- if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, SrcSubReg,
- isKill, ImplicitOp, LV, LIS))
- return nullptr;
+ bool isKill;
+ MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+ if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, SrcSubReg,
+ isKill, ImplicitOp, LV, LIS))
+ return nullptr;
- MachineInstrBuilder MIB =
- BuildMI(MF, MI.getDebugLoc(), get(Opc))
- .add(Dest)
- .addReg(SrcReg, getKillRegState(isKill), SrcSubReg);
- if (ImplicitOp.getReg() != 0)
- MIB.add(ImplicitOp);
+ MachineInstrBuilder MIB =
+ BuildMI(MF, MI.getDebugLoc(), get(Opc))
+ .add(Dest)
+ .addReg(SrcReg, getKillRegState(isKill), SrcSubReg);
+ if (ImplicitOp.getReg() != 0)
+ MIB.add(ImplicitOp);
- NewMI = addOffset(MIB, -Imm);
+ NewMI = addOffset(MIB, -Imm);
- // Add kills if classifyLEAReg created a new register.
- if (LV && SrcReg != Src.getReg())
- LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
- break;
- }
+ // Add kills if classifyLEAReg created a new register.
+ if (LV && SrcReg != Src.getReg())
+ LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
+ break;
+ }
- CASE_NF(SUB64ri32) {
- if (!MI.getOperand(2).isImm())
- return nullptr;
- int64_t Imm = MI.getOperand(2).getImm();
- if (!isInt<32>(-Imm))
- return nullptr;
+ CASE_NF(SUB64ri32) {
+ if (!MI.getOperand(2).isImm())
+ return nullptr;
+ int64_t Imm = MI.getOperand(2).getImm();
+ if (!isInt<32>(-Imm))
+ return nullptr;
- assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!");
+ assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!");
- MachineInstrBuilder MIB =
- BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src);
- NewMI = addOffset(MIB, -Imm);
- break;
- }
+ MachineInstrBuilder MIB =
+ BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src);
+ NewMI = addOffset(MIB, -Imm);
+ break;
+ }
case X86::VMOVDQU8Z128rmk:
case X86::VMOVDQU8Z256rmk:
@@ -2855,17 +2861,17 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
case X86::OP##_ND:
switch (Opc) {
- // SHLD B, C, I <-> SHRD C, B, (BitWidth - I)
- CASE_ND(SHRD16rri8)
- CASE_ND(SHLD16rri8)
- CASE_ND(SHRD32rri8)
- CASE_ND(SHLD32rri8)
- CASE_ND(SHRD64rri8)
- CASE_ND(SHLD64rri8) {
- unsigned Size;
- switch (Opc) {
- default:
- llvm_unreachable("Unreachable!");
+ // SHLD B, C, I <-> SHRD C, B, (BitWidth - I)
+ CASE_ND(SHRD16rri8)
+ CASE_ND(SHLD16rri8)
+ CASE_ND(SHRD32rri8)
+ CASE_ND(SHLD32rri8)
+ CASE_ND(SHRD64rri8)
+ CASE_ND(SHLD64rri8) {
+ unsigned Size;
+ switch (Opc) {
+ default:
+ llvm_unreachable("Unreachable!");
#define FROM_TO_SIZE(A, B, S) \
case X86::A: \
Opc = X86::B; \
@@ -2884,16 +2890,16 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
Size = S; \
break;
- FROM_TO_SIZE(SHRD16rri8, SHLD16rri8, 16)
- FROM_TO_SIZE(SHRD32rri8, SHLD32rri8, 32)
- FROM_TO_SIZE(SHRD64rri8, SHLD64rri8, 64)
+ FROM_TO_SIZE(SHRD16rri8, SHLD16rri8, 16)
+ FROM_TO_SIZE(SHRD32rri8, SHLD32rri8, 32)
+ FROM_TO_SIZE(SHRD64rri8, SHLD64rri8, 64)
#undef FROM_TO_SIZE
+ }
+ WorkingMI = CloneIfNew(MI);
+ WorkingMI->setDesc(get(Opc));
+ WorkingMI->getOperand(3).setImm(Size - MI.getOperand(3).getImm());
+ break;
}
- WorkingMI = CloneIfNew(MI);
- WorkingMI->setDesc(get(Opc));
- WorkingMI->getOperand(3).setImm(Size - MI.getOperand(3).getImm());
- break;
- }
case X86::PFSUBrr:
case X86::PFSUBRrr:
// PFSUB x, y: x = x - y
@@ -3177,15 +3183,16 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
WorkingMI = CloneIfNew(MI);
WorkingMI->setDesc(get(Opc));
break;
- CASE_ND(CMOV16rr)
- CASE_ND(CMOV32rr)
- CASE_ND(CMOV64rr) {
- WorkingMI = CloneIfNew(MI);
- unsigned OpNo = MI.getDesc().getNumOperands() - 1;
- X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm());
- WorkingMI->getOperand(OpNo).setImm(X86::GetOppositeBranchCondition(CC));
- break;
- }
+ CASE_ND(CMOV16rr)
+ CASE_ND(CMOV32rr)
+ CASE_ND(CMOV64rr) {
+ WorkingMI = CloneIfNew(MI);
+ unsigned OpNo = MI.getDesc().getNumOperands() - 1;
+ X86::CondCode CC =
+ static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm());
+ WorkingMI->getOperand(OpNo).setImm(X86::GetOppositeBranchCondition(CC));
+ break;
+ }
case X86::VPTERNLOGDZrri:
case X86::VPTERNLOGDZrmi:
case X86::VPTERNLOGDZ128rri:
@@ -5393,29 +5400,29 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
CmpMask = CmpValue = 0;
}
return true;
- // A SUB can be used to perform comparison.
- CASE_ND(SUB64rm)
- CASE_ND(SUB32rm)
- CASE_ND(SUB16rm)
- CASE_ND(SUB8rm)
+ // A SUB can be used to perform comparison.
+ CASE_ND(SUB64rm)
+ CASE_ND(SUB32rm)
+ CASE_ND(SUB16rm)
+ CASE_ND(SUB8rm)
SrcReg = MI.getOperand(1).getReg();
SrcReg2 = 0;
CmpMask = 0;
CmpValue = 0;
return true;
- CASE_ND(SUB64rr)
- CASE_ND(SUB32rr)
- CASE_ND(SUB16rr)
- CASE_ND(SUB8rr)
+ CASE_ND(SUB64rr)
+ CASE_ND(SUB32rr)
+ CASE_ND(SUB16rr)
+ CASE_ND(SUB8rr)
SrcReg = MI.getOperand(1).getReg();
SrcReg2 = MI.getOperand(2).getReg();
CmpMask = 0;
CmpValue = 0;
return true;
- CASE_ND(SUB64ri32)
- CASE_ND(SUB32ri)
- CASE_ND(SUB16ri)
- CASE_ND(SUB8ri)
+ CASE_ND(SUB64ri32)
+ CASE_ND(SUB32ri)
+ CASE_ND(SUB16ri)
+ CASE_ND(SUB8ri)
SrcReg = MI.getOperand(1).getReg();
SrcReg2 = 0;
if (MI.getOperand(2).isImm()) {
@@ -5470,27 +5477,27 @@ bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI,
case X86::CMP32rr:
case X86::CMP16rr:
case X86::CMP8rr:
- CASE_ND(SUB64rr)
- CASE_ND(SUB32rr)
- CASE_ND(SUB16rr)
- CASE_ND(SUB8rr) {
- Register OISrcReg;
- Register OISrcReg2;
- int64_t OIMask;
- int64_t OIValue;
- if (!analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) ||
- OIMask != ImmMask || OIValue != ImmValue)
+ CASE_ND(SUB64rr)
+ CASE_ND(SUB32rr)
+ CASE_ND(SUB16rr)
+ CASE_ND(SUB8rr) {
+ Register OISrcReg;
+ Register OISrcReg2;
+ int64_t OIMask;
+ int64_t OIValue;
+ if (!analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) ||
+ OIMask != ImmMask || OIValue != ImmValue)
+ return false;
+ if (SrcReg == OISrcReg && SrcReg2 == OISrcReg2) {
+ *IsSwapped = false;
+ return true;
+ }
+ if (SrcReg == OISrcReg2 && SrcReg2 == OISrcReg) {
+ *IsSwapped = true;
+ return true;
+ }
return false;
- if (SrcReg == OISrcReg && SrcReg2 == OISrcReg2) {
- *IsSwapped = false;
- return true;
- }
- if (SrcReg == OISrcReg2 && SrcReg2 == OISrcReg) {
- *IsSwapped = true;
- return true;
}
- return false;
- }
case X86::CMP64ri32:
case X86::CMP32ri:
case X86::CMP16ri:
@@ -5499,10 +5506,10 @@ bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI,
case X86::TEST32ri:
case X86::TEST16ri:
case X86::TEST8ri:
- CASE_ND(SUB64ri32)
- CASE_ND(SUB32ri)
- CASE_ND(SUB16ri)
- CASE_ND(SUB8ri)
+ CASE_ND(SUB64ri32)
+ CASE_ND(SUB32ri)
+ CASE_ND(SUB16ri)
+ CASE_ND(SUB8ri)
case X86::TEST64rr:
case X86::TEST32rr:
case X86::TEST16rr:
@@ -5559,98 +5566,98 @@ inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag,
default:
return false;
- // The shift instructions only modify ZF if their shift count is non-zero.
- // N.B.: The processor truncates the shift count depending on the encoding.
- CASE_ND(SAR8ri)
- CASE_ND(SAR16ri)
- CASE_ND(SAR32ri)
- CASE_ND(SAR64ri)
- CASE_ND(SHR8ri)
- CASE_ND(SHR16ri)
- CASE_ND(SHR32ri)
- CASE_ND(SHR64ri)
+ // The shift instructions only modify ZF if their shift count is non-zero.
+ // N.B.: The processor truncates the shift count depending on the encoding.
+ CASE_ND(SAR8ri)
+ CASE_ND(SAR16ri)
+ CASE_ND(SAR32ri)
+ CASE_ND(SAR64ri)
+ CASE_ND(SHR8ri)
+ CASE_ND(SHR16ri)
+ CASE_ND(SHR32ri)
+ CASE_ND(SHR64ri)
return getTruncatedShiftCount(MI, 2) != 0;
- // Some left shift instructions can be turned into LEA instructions but only
- // if their flags aren't used. Avoid transforming such instructions.
- CASE_ND(SHL8ri)
- CASE_ND(SHL16ri)
- CASE_ND(SHL32ri)
- CASE_ND(SHL64ri) {
- unsigned ShAmt = getTruncatedShiftCount(MI, 2);
- if (isTruncatedShiftCountForLEA(ShAmt))
- return false;
- return ShAmt != 0;
- }
+ // Some left shift instructions can be turned into LEA instructions but only
+ // if their flags aren't used. Avoid transforming such instructions.
+ CASE_ND(SHL8ri)
+ CASE_ND(SHL16ri)
+ CASE_ND(SHL32ri)
+ CASE_ND(SHL64ri) {
+ unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+ if (isTruncatedShiftCountForLEA(ShAmt))
+ return false;
+ return ShAmt != 0;
+ }
- CASE_ND(SHRD16rri8)
- CASE_ND(SHRD32rri8)
- CASE_ND(SHRD64rri8)
- CASE_ND(SHLD16rri8)
- CASE_ND(SHLD32rri8)
- CASE_ND(SHLD64rri8)
+ CASE_ND(SHRD16rri8)
+ CASE_ND(SHRD32rri8)
+ CASE_ND(SHRD64rri8)
+ CASE_ND(SHLD16rri8)
+ CASE_ND(SHLD32rri8)
+ CASE_ND(SHLD64rri8)
return getTruncatedShiftCount(MI, 3) != 0;
- CASE_ND(SUB64ri32)
- CASE_ND(SUB32ri)
- CASE_ND(SUB16ri)
- CASE_ND(SUB8ri)
- CASE_ND(SUB64rr)
- CASE_ND(SUB32rr)
- CASE_ND(SUB16rr)
- CASE_ND(SUB8rr)
- CASE_ND(SUB64rm)
- CASE_ND(SUB32rm)
- CASE_ND(SUB16rm)
- CASE_ND(SUB8rm)
- CASE_ND(DEC64r)
- CASE_ND(DEC32r)
- CASE_ND(DEC16r)
- CASE_ND(DEC8r)
- CASE_ND(ADD64ri32)
- CASE_ND(ADD32ri)
- CASE_ND(ADD16ri)
- CASE_ND(ADD8ri)
- CASE_ND(ADD64rr)
- CASE_ND(ADD32rr)
- CASE_ND(ADD16rr)
- CASE_ND(ADD8rr)
- CASE_ND(ADD64rm)
- CASE_ND(ADD32rm)
- CASE_ND(ADD16rm)
- CASE_ND(ADD8rm)
- CASE_ND(INC64r)
- CASE_ND(INC32r)
- CASE_ND(INC16r)
- CASE_ND(INC8r)
- CASE_ND(ADC64ri32)
- CASE_ND(ADC32ri)
- CASE_ND(ADC16ri)
- CASE_ND(ADC8ri)
- CASE_ND(ADC64rr)
- CASE_ND(ADC32rr)
- CASE_ND(ADC16rr)
- CASE_ND(ADC8rr)
- CASE_ND(ADC64rm)
- CASE_ND(ADC32rm)
- CASE_ND(ADC16rm)
- CASE_ND(ADC8rm)
- CASE_ND(SBB64ri32)
- CASE_ND(SBB32ri)
- CASE_ND(SBB16ri)
- CASE_ND(SBB8ri)
- CASE_ND(SBB64rr)
- CASE_ND(SBB32rr)
- CASE_ND(SBB16rr)
- CASE_ND(SBB8rr)
- CASE_ND(SBB64rm)
- CASE_ND(SBB32rm)
- CASE_ND(SBB16rm)
- CASE_ND(SBB8rm)
- CASE_ND(NEG8r)
- CASE_ND(NEG16r)
- CASE_ND(NEG32r)
- CASE_ND(NEG64r)
+ CASE_ND(SUB64ri32)
+ CASE_ND(SUB32ri)
+ CASE_ND(SUB16ri)
+ CASE_ND(SUB8ri)
+ CASE_ND(SUB64rr)
+ CASE_ND(SUB32rr)
+ CASE_ND(SUB16rr)
+ CASE_ND(SUB8rr)
+ CASE_ND(SUB64rm)
+ CASE_ND(SUB32rm)
+ CASE_ND(SUB16rm)
+ CASE_ND(SUB8rm)
+ CASE_ND(DEC64r)
+ CASE_ND(DEC32r)
+ CASE_ND(DEC16r)
+ CASE_ND(DEC8r)
+ CASE_ND(ADD64ri32)
+ CASE_ND(ADD32ri)
+ CASE_ND(ADD16ri)
+ CASE_ND(ADD8ri)
+ CASE_ND(ADD64rr)
+ CASE_ND(ADD32rr)
+ CASE_ND(ADD16rr)
+ CASE_ND(ADD8rr)
+ CASE_ND(ADD64rm)
+ CASE_ND(ADD32rm)
+ CASE_ND(ADD16rm)
+ CASE_ND(ADD8rm)
+ CASE_ND(INC64r)
+ CASE_ND(INC32r)
+ CASE_ND(INC16r)
+ CASE_ND(INC8r)
+ CASE_ND(ADC64ri32)
+ CASE_ND(ADC32ri)
+ CASE_ND(ADC16ri)
+ CASE_ND(ADC8ri)
+ CASE_ND(ADC64rr)
+ CASE_ND(ADC32rr)
+ CASE_ND(ADC16rr)
+ CASE_ND(ADC8rr)
+ CASE_ND(ADC64rm)
+ CASE_ND(ADC32rm)
+ CASE_ND(ADC16rm)
+ CASE_ND(ADC8rm)
+ CASE_ND(SBB64ri32)
+ CASE_ND(SBB32ri)
+ CASE_ND(SBB16ri)
+ CASE_ND(SBB8ri)
+ CASE_ND(SBB64rr)
+ CASE_ND(SBB32rr)
+ CASE_ND(SBB16rr)
+ CASE_ND(SBB8rr)
+ CASE_ND(SBB64rm)
+ CASE_ND(SBB32rm)
+ CASE_ND(SBB16rm)
+ CASE_ND(SBB8rm)
+ CASE_ND(NEG8r)
+ CASE_ND(NEG16r)
+ CASE_ND(NEG32r)
+ CASE_ND(NEG64r)
case X86::LZCNT16rr:
case X86::LZCNT16rm:
case X86::LZCNT32rr:
@@ -5670,42 +5677,42 @@ inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag,
case X86::TZCNT64rr:
case X86::TZCNT64rm:
return true;
- CASE_ND(AND64ri32)
- CASE_ND(AND32ri)
- CASE_ND(AND16ri)
- CASE_ND(AND8ri)
- CASE_ND(AND64rr)
- CASE_ND(AND32rr)
- CASE_ND(AND16rr)
- CASE_ND(AND8rr)
- CASE_ND(AND64rm)
- CASE_ND(AND32rm)
- CASE_ND(AND16rm)
- CASE_ND(AND8rm)
- CASE_ND(XOR64ri32)
- CASE_ND(XOR32ri)
- CASE_ND(XOR16ri)
- CASE_ND(XOR8ri)
- CASE_ND(XOR64rr)
- CASE_ND(XOR32rr)
- CASE_ND(XOR16rr)
- CASE_ND(XOR8rr)
- CASE_ND(XOR64rm)
- CASE_ND(XOR32rm)
- CASE_ND(XOR16rm)
- CASE_ND(XOR8rm)
- CASE_ND(OR64ri32)
- CASE_ND(OR32ri)
- CASE_ND(OR16ri)
- CASE_ND(OR8ri)
- CASE_ND(OR64rr)
- CASE_ND(OR32rr)
- CASE_ND(OR16rr)
- CASE_ND(OR8rr)
- CASE_ND(OR64rm)
- CASE_ND(OR32rm)
- CASE_ND(OR16rm)
- CASE_ND(OR8rm)
+ CASE_ND(AND64ri32)
+ CASE_ND(AND32ri)
+ CASE_ND(AND16ri)
+ CASE_ND(AND8ri)
+ CASE_ND(AND64rr)
+ CASE_ND(AND32rr)
+ CASE_ND(AND16rr)
+ CASE_ND(AND8rr)
+ CASE_ND(AND64rm)
+ CASE_ND(AND32rm)
+ CASE_ND(AND16rm)
+ CASE_ND(AND8rm)
+ CASE_ND(XOR64ri32)
+ CASE_ND(XOR32ri)
+ CASE_ND(XOR16ri)
+ CASE_ND(XOR8ri)
+ CASE_ND(XOR64rr)
+ CASE_ND(XOR32rr)
+ CASE_ND(XOR16rr)
+ CASE_ND(XOR8rr)
+ CASE_ND(XOR64rm)
+ CASE_ND(XOR32rm)
+ CASE_ND(XOR16rm)
+ CASE_ND(XOR8rm)
+ CASE_ND(OR64ri32)
+ CASE_ND(OR32ri)
+ CASE_ND(OR16ri)
+ CASE_ND(OR8ri)
+ CASE_ND(OR64rr)
+ CASE_ND(OR32rr)
+ CASE_ND(OR16rr)
+ CASE_ND(OR8rr)
+ CASE_ND(OR64rm)
+ CASE_ND(OR32rm)
+ CASE_ND(OR16rm)
+ CASE_ND(OR8rm)
case X86::ANDN32rr:
case X86::ANDN32rm:
case X86::ANDN64rr:
@@ -5783,15 +5790,17 @@ inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag,
}
/// Check whether the use can be converted to remove a comparison against zero.
-/// Returns the EFLAGS condition and the operand that we are comparing against zero.
-static std::pair<X86::CondCode, unsigned> isUseDefConvertible(const MachineInstr &MI) {
+/// Returns the EFLAGS condition and the operand that we are comparing against
+/// zero.
+static std::pair<X86::CondCode, unsigned>
+isUseDefConvertible(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
return std::make_pair(X86::COND_INVALID, ~0U);
- CASE_ND(NEG8r)
- CASE_ND(NEG16r)
- CASE_ND(NEG32r)
- CASE_ND(NEG64r)
+ CASE_ND(NEG8r)
+ CASE_ND(NEG16r)
+ CASE_ND(NEG32r)
+ CASE_ND(NEG64r)
return std::make_pair(X86::COND_AE, 1U);
case X86::LZCNT16rr:
case X86::LZCNT32rr:
@@ -5835,51 +5844,53 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
switch (CmpInstr.getOpcode()) {
default:
break;
- CASE_ND(SUB64ri32)
- CASE_ND(SUB32ri)
- CASE_ND(SUB16ri)
- CASE_ND(SUB8ri)
- CASE_ND(SUB64rm)
- CASE_ND(SUB32rm)
- CASE_ND(SUB16rm)
- CASE_ND(SUB8rm)
- CASE_ND(SUB64rr)
- CASE_ND(SUB32rr)
- CASE_ND(SUB16rr)
- CASE_ND(SUB8rr) {
- if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
- return false;
- // There is no use of the destination register, we can replace SUB with CMP.
- unsigned NewOpcode = 0;
+ CASE_ND(SUB64ri32)
+ CASE_ND(SUB32ri)
+ CASE_ND(SUB16ri)
+ CASE_ND(SUB8ri)
+ CASE_ND(SUB64rm)
+ CASE_ND(SUB32rm)
+ CASE_ND(SUB16rm)
+ CASE_ND(SUB8rm)
+ CASE_ND(SUB64rr)
+ CASE_ND(SUB32rr)
+ CASE_ND(SUB16rr)
+ CASE_ND(SUB8rr) {
+ if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
+ return false;
+ // There is no use of the destination register, we can replace SUB with
+ // CMP.
+ unsigned NewOpcode = 0;
#define FROM_TO(A, B) \
CASE_ND(A) NewOpcode = X86::B; \
break;
- switch (CmpInstr.getOpcode()) {
- default:
- llvm_unreachable("Unreachable!");
- FROM_TO(SUB64rm, CMP64rm)
- FROM_TO(SUB32rm, CMP32rm)
- FROM_TO(SUB16rm, CMP16rm)
- FROM_TO(SUB8rm, CMP8rm)
- FROM_TO(SUB64rr, CMP64rr)
- FROM_TO(SUB32rr, CMP32rr)
- FROM_TO(SUB16rr, CMP16rr)
- FROM_TO(SUB8rr, CMP8rr)
- FROM_TO(SUB64ri32, CMP64ri32)
- FROM_TO(SUB32ri, CMP32ri)
- FROM_TO(SUB16ri, CMP16ri)
- FROM_TO(SUB8ri, CMP8ri)
- }
+ switch (CmpInstr.getOpcode()) {
+ default:
+ llvm_unreachable("Unreachable!");
+ FROM_TO(SUB64rm, CMP64rm)
+ FROM_TO(SUB32rm, CMP32rm)
+ FROM_TO(SUB16rm, CMP16rm)
+ FROM_TO(SUB8rm, CMP8rm)
+ FROM_TO(SUB64rr, CMP64rr)
+ FROM_TO(SUB32rr, CMP32rr)
+ FROM_TO(SUB16rr, CMP16rr)
+ FROM_TO(SUB8rr, CMP8rr)
+ FROM_TO(SUB64ri32, CMP64ri32)
+ FROM_TO(SUB32ri, CMP32ri)
+ FROM_TO(SUB16ri, CMP16ri)
+ FROM_TO(SUB8ri, CMP8ri)
+ }
#undef FROM_TO
- CmpInstr.setDesc(get(NewOpcode));
- CmpInstr.removeOperand(0);
- // Mutating this instruction invalidates any debug data associated with it.
- CmpInstr.dropDebugNumber();
- // Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
- if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
- NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
- return false;
- }
+ CmpInstr.setDesc(get(NewOpcode));
+ CmpInstr.removeOperand(0);
+ // Mutating this instruction invalidates any debug data associated with
+ // it.
+ CmpInstr.dropDebugNumber();
+ // Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
+ if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
+ NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
+ return false;
+ }
}
// The following code tries to remove the comparison by re-using EFLAGS
@@ -6236,14 +6247,14 @@ static bool canConvert2Copy(unsigned Opc) {
switch (Opc) {
default:
return false;
- CASE_ND(ADD64ri32)
- CASE_ND(SUB64ri32)
- CASE_ND(OR64ri32)
- CASE_ND(XOR64ri32)
- CASE_ND(ADD32ri)
- CASE_ND(SUB32ri)
- CASE_ND(OR32ri)
- CASE_ND(XOR32ri)
+ CASE_ND(ADD64ri32)
+ CASE_ND(SUB64ri32)
+ CASE_ND(OR64ri32)
+ CASE_ND(XOR64ri32)
+ CASE_ND(ADD32ri)
+ CASE_ND(SUB32ri)
+ CASE_ND(OR32ri)
+ CASE_ND(XOR32ri)
return true;
}
}
@@ -9627,7 +9638,7 @@ Register X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
static const uint16_t *lookup(unsigned opcode, unsigned domain,
ArrayRef<uint16_t[3]> Table) {
- for (const uint16_t(&Row)[3] : Table)
+ for (const uint16_t (&Row)[3] : Table)
if (Row[domain - 1] == opcode)
return Row;
return nullptr;
@@ -9636,7 +9647,7 @@ static const uint16_t *lookup(unsigned opcode, unsigned domain,
static const uint16_t *lookupAVX512(unsigned opcode, unsigned domain,
ArrayRef<uint16_t[4]> Table) {
// If this is the integer domain make sure to check both integer columns.
- for (const uint16_t(&Row)[4] : Table)
+ for (const uint16_t (&Row)[4] : Table)
if (Row[domain - 1] == opcode || (domain == 3 && Row[3] == opcode))
return Row;
return nullptr;
@@ -10392,25 +10403,25 @@ bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
if (Invert)
return false;
switch (Inst.getOpcode()) {
- CASE_ND(ADD8rr)
- CASE_ND(ADD16rr)
- CASE_ND(ADD32rr)
- CASE_ND(ADD64rr)
- CASE_ND(AND8rr)
- CASE_ND(AND16rr)
- CASE_ND(AND32rr)
- CASE_ND(AND64rr)
- CASE_ND(OR8rr)
- CASE_ND(OR16rr)
- CASE_ND(OR32rr)
- CASE_ND(OR64rr)
- CASE_ND(XOR8rr)
- CASE_ND(XOR16rr)
- CASE_ND(XOR32rr)
- CASE_ND(XOR64rr)
- CASE_ND(IMUL16rr)
- CASE_ND(IMUL32rr)
- CASE_ND(IMUL64rr)
+ CASE_ND(ADD8rr)
+ CASE_ND(ADD16rr)
+ CASE_ND(ADD32rr)
+ CASE_ND(ADD64rr)
+ CASE_ND(AND8rr)
+ CASE_ND(AND16rr)
+ CASE_ND(AND32rr)
+ CASE_ND(AND64rr)
+ CASE_ND(OR8rr)
+ CASE_ND(OR16rr)
+ CASE_ND(OR32rr)
+ CASE_ND(OR64rr)
+ CASE_ND(XOR8rr)
+ CASE_ND(XOR16rr)
+ CASE_ND(XOR32rr)
+ CASE_ND(XOR64rr)
+ CASE_ND(IMUL16rr)
+ CASE_ND(IMUL32rr)
+ CASE_ND(IMUL64rr)
case X86::PANDrr:
case X86::PORrr:
case X86::PXORrr:
@@ -11451,8 +11462,8 @@ bool X86InstrInfo::getMachineCombinerPatterns(
break;
}
}
- return TargetInstrInfo::getMachineCombinerPatterns(Root,
- Patterns, DoRegPressureReduce);
+ return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
+ DoRegPressureReduce);
}
static void
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index ebd7e070d5fe8..93fcfa2f288f3 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -320,8 +320,7 @@ class X86InstrInfo final : public X86GenInstrInfo {
Register isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
- Register isLoadFromStackSlot(const MachineInstr &MI,
- int &FrameIndex,
+ Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex,
TypeSize &MemBytes) const override;
/// isLoadFromStackSlotPostFE - Check for post-frame ptr elimination
/// stack locations as well. This uses a heuristic so it isn't
@@ -331,8 +330,7 @@ class X86InstrInfo final : public X86GenInstrInfo {
Register isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
- Register isStoreToStackSlot(const MachineInstr &MI,
- int &FrameIndex,
+ Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex,
TypeSize &MemBytes) const override;
/// isStoreToStackSlotPostFE - Check for post-frame ptr elimination
/// stack locations as well. This uses a heuristic so it isn't
@@ -494,12 +492,12 @@ class X86InstrInfo final : public X86GenInstrInfo {
/// is likely that the referenced instruction has been changed.
///
/// \returns true on success.
- MachineInstr *
- foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
- ArrayRef<unsigned> Ops,
- MachineBasicBlock::iterator InsertPt, int FrameIndex,
- LiveIntervals *LIS = nullptr,
- VirtRegMap *VRM = nullptr) const override;
+ MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
+ ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt,
+ int FrameIndex,
+ LiveIntervals *LIS = nullptr,
+ VirtRegMap *VRM = nullptr) const override;
/// Same as the previous version except it allows folding of any load and
/// store from / to any address, not just from a specific stack slot.
@@ -748,8 +746,7 @@ class X86InstrInfo final : public X86GenInstrInfo {
///
/// If IsIntrinsic is set, operand 1 will be ignored for commuting.
bool findThreeSrcCommutedOpIndices(const MachineInstr &MI,
- unsigned &SrcOpIdx1,
- unsigned &SrcOpIdx2,
+ unsigned &SrcOpIdx1, unsigned &SrcOpIdx2,
bool IsIntrinsic = false) const;
/// Returns true when instruction \p FlagI produces the same flags as \p OI.
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 66c9d75053640..33b5ae0eb8f7a 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -54,9 +54,10 @@
using namespace llvm;
-static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner",
- cl::desc("Enable the machine combiner pass"),
- cl::init(true), cl::Hidden);
+static cl::opt<bool>
+ EnableMachineCombinerPass("x86-machine-combiner",
+ cl::desc("Enable the machine combiner pass"),
+ cl::init(true), cl::Hidden);
static cl::opt<bool>
EnableTileRAPass("x86-tile-ra",
@@ -362,7 +363,7 @@ namespace {
class X86PassConfig : public TargetPassConfig {
public:
X86PassConfig(X86TargetMachine &TM, PassManagerBase &PM)
- : TargetPassConfig(TM, PM) {}
+ : TargetPassConfig(TM, PM) {}
X86TargetMachine &getX86TargetMachine() const {
return getTM<X86TargetMachine>();
@@ -401,10 +402,10 @@ char X86ExecutionDomainFix::ID;
} // end anonymous namespace
INITIALIZE_PASS_BEGIN(X86ExecutionDomainFix, "x86-execution-domain-fix",
- "X86 Execution Domain Fix", false, false)
+ "X86 Execution Domain Fix", false, false)
INITIALIZE_PASS_DEPENDENCY(ReachingDefInfoWrapperPass)
INITIALIZE_PASS_END(X86ExecutionDomainFix, "x86-execution-domain-fix",
- "X86 Execution Domain Fix", false, false)
+ "X86 Execution Domain Fix", false, false)
TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) {
return new X86PassConfig(*this, PM);
@@ -621,7 +622,7 @@ void X86PassConfig::addPreEmitPass2() {
(TT.isOSDarwin() &&
(M->getFunction("objc_retainAutoreleasedReturnValue") ||
M->getFunction("objc_unsafeClaimAutoreleasedReturnValue"))) ||
- F.hasFnAttribute("ct-select");
+ F.hasFnAttribute("ct-select");
}));
// Analyzes and emits pseudos to support Win x64 Unwind V2. This pass must run
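
Side note for reviewers (not part of the patch): the X86TargetMachine.cpp hunk above adds functions carrying the "ct-select" string attribute to the condition that forces the late pass to run. As a rough sketch of what such a function looks like in IR, reusing the llvm.ct.select.f80 signature that appears in the tests below; the placement of the "ct-select" attribute here is inferred from the F.hasFnAttribute("ct-select") check and is only illustrative:

  declare x86_fp80 @llvm.ct.select.f80(i1, x86_fp80, x86_fp80)

  ; Hypothetical caller; the "ct-select" string attribute mirrors the
  ; hasFnAttribute("ct-select") check added in addPreEmitPass2 above.
  define x86_fp80 @ctselect_f80_example(i1 %cond, x86_fp80 %a, x86_fp80 %b) "ct-select" {
    %r = call x86_fp80 @llvm.ct.select.f80(i1 %cond, x86_fp80 %a, x86_fp80 %b)
    ret x86_fp80 %r
  }
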
diff --git a/llvm/test/CodeGen/X86/ctselect-i386-fp.ll b/llvm/test/CodeGen/X86/ctselect-i386-fp.ll
index ea943307c644f..eec38fa581c6f 100644
--- a/llvm/test/CodeGen/X86/ctselect-i386-fp.ll
+++ b/llvm/test/CodeGen/X86/ctselect-i386-fp.ll
@@ -209,94 +209,84 @@ define double @test_ctselect_f64_basic(i1 %cond, double %a, double %b) nounwind
define x86_fp80 @test_ctselect_f80_basic(i1 %cond, x86_fp80 %a, x86_fp80 %b) nounwind {
; I386-NOCMOV-LABEL: test_ctselect_f80_basic:
; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebp
+; I386-NOCMOV-NEXT: pushl %ebx
; I386-NOCMOV-NEXT: pushl %edi
; I386-NOCMOV-NEXT: pushl %esi
-; I386-NOCMOV-NEXT: subl $12, %esp
-; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
-; I386-NOCMOV-NEXT: sete %al
-; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: subl $40, %esp
+; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fstpt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fstpt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: testb $1, %cl
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-NOCMOV-NEXT: movl %eax, (%esp) # 4-byte Spill
; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-NOCMOV-NEXT: movb %al, %ah
-; I386-NOCMOV-NEXT: movzbl %ah, %edi
-; I386-NOCMOV-NEXT: negl %edi
-; I386-NOCMOV-NEXT: movl %edx, %esi
-; I386-NOCMOV-NEXT: andl %edi, %esi
-; I386-NOCMOV-NEXT: notl %edi
-; I386-NOCMOV-NEXT: andl %ecx, %edi
-; I386-NOCMOV-NEXT: orl %edi, %esi
-; I386-NOCMOV-NEXT: movl %esi, (%esp)
-; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi
+; I386-NOCMOV-NEXT: sete %ch
+; I386-NOCMOV-NEXT: movb %ch, %al
+; I386-NOCMOV-NEXT: movzbl %al, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %edi, %ebx
+; I386-NOCMOV-NEXT: andl %ebp, %ebx
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %edx, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %ebx
+; I386-NOCMOV-NEXT: testb $1, %cl
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-NOCMOV-NEXT: movb %al, %ah
-; I386-NOCMOV-NEXT: movzbl %ah, %edi
-; I386-NOCMOV-NEXT: negl %edi
-; I386-NOCMOV-NEXT: movl %edx, %esi
-; I386-NOCMOV-NEXT: andl %edi, %esi
-; I386-NOCMOV-NEXT: notl %edi
-; I386-NOCMOV-NEXT: andl %ecx, %edi
-; I386-NOCMOV-NEXT: orl %edi, %esi
-; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-NOCMOV-NEXT: movb %al, %ah
-; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: sete %ch
+; I386-NOCMOV-NEXT: movb %ch, %cl
+; I386-NOCMOV-NEXT: movzbl %cl, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %edx, %edi
+; I386-NOCMOV-NEXT: andl %ebp, %edi
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %eax, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %edi
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl (%esp), %ebx # 4-byte Reload
+; I386-NOCMOV-NEXT: movb %al, %dl
+; I386-NOCMOV-NEXT: movzbl %dl, %edi
; I386-NOCMOV-NEXT: negl %edi
-; I386-NOCMOV-NEXT: movl %edx, %esi
-; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, %ecx
+; I386-NOCMOV-NEXT: andl %edi, %ecx
; I386-NOCMOV-NEXT: notl %edi
-; I386-NOCMOV-NEXT: andl %ecx, %edi
-; I386-NOCMOV-NEXT: orl %edi, %esi
-; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; I386-NOCMOV-NEXT: fldt (%esp)
-; I386-NOCMOV-NEXT: addl $12, %esp
+; I386-NOCMOV-NEXT: andl %ebx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %ecx
+; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: addl $40, %esp
; I386-NOCMOV-NEXT: popl %esi
; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: popl %ebp
; I386-NOCMOV-NEXT: retl
;
; I386-CMOV-LABEL: test_ctselect_f80_basic:
; I386-CMOV: # %bb.0:
-; I386-CMOV-NEXT: pushl %edi
-; I386-CMOV-NEXT: pushl %esi
-; I386-CMOV-NEXT: subl $12, %esp
+; I386-CMOV-NEXT: subl $36, %esp
+; I386-CMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fstpt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fstpt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
-; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-CMOV-NEXT: movb %al, %ah
-; I386-CMOV-NEXT: movzbl %ah, %edi
-; I386-CMOV-NEXT: negl %edi
-; I386-CMOV-NEXT: movl %edx, %esi
-; I386-CMOV-NEXT: andl %edi, %esi
-; I386-CMOV-NEXT: notl %edi
-; I386-CMOV-NEXT: andl %ecx, %edi
-; I386-CMOV-NEXT: orl %edi, %esi
-; I386-CMOV-NEXT: movl %esi, (%esp)
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-CMOV-NEXT: movb %al, %ah
-; I386-CMOV-NEXT: movzbl %ah, %edi
-; I386-CMOV-NEXT: negl %edi
-; I386-CMOV-NEXT: movl %edx, %esi
-; I386-CMOV-NEXT: andl %edi, %esi
-; I386-CMOV-NEXT: notl %edi
-; I386-CMOV-NEXT: andl %ecx, %edi
-; I386-CMOV-NEXT: orl %edi, %esi
-; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-CMOV-NEXT: movb %al, %ah
-; I386-CMOV-NEXT: movzbl %ah, %edi
-; I386-CMOV-NEXT: negl %edi
-; I386-CMOV-NEXT: movl %edx, %esi
-; I386-CMOV-NEXT: andl %edi, %esi
-; I386-CMOV-NEXT: notl %edi
-; I386-CMOV-NEXT: andl %ecx, %edi
-; I386-CMOV-NEXT: orl %edi, %esi
-; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl %eax, (%esp)
; I386-CMOV-NEXT: fldt (%esp)
-; I386-CMOV-NEXT: addl $12, %esp
-; I386-CMOV-NEXT: popl %esi
-; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: addl $36, %esp
; I386-CMOV-NEXT: retl
%result = call x86_fp80 @llvm.ct.select.f80(i1 %cond, x86_fp80 %a, x86_fp80 %b)
ret x86_fp80 %result
@@ -543,94 +533,84 @@ define float @test_ctselect_f32_nan(i1 %cond) nounwind {
define x86_fp80 @test_ctselect_f80_alignment(i1 %cond, x86_fp80 %a, x86_fp80 %b) nounwind {
; I386-NOCMOV-LABEL: test_ctselect_f80_alignment:
; I386-NOCMOV: # %bb.0:
+; I386-NOCMOV-NEXT: pushl %ebp
+; I386-NOCMOV-NEXT: pushl %ebx
; I386-NOCMOV-NEXT: pushl %edi
; I386-NOCMOV-NEXT: pushl %esi
-; I386-NOCMOV-NEXT: subl $12, %esp
-; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
-; I386-NOCMOV-NEXT: sete %al
-; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: subl $40, %esp
+; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fstpt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fstpt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: testb $1, %cl
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; I386-NOCMOV-NEXT: movl %eax, (%esp) # 4-byte Spill
; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-NOCMOV-NEXT: movb %al, %ah
-; I386-NOCMOV-NEXT: movzbl %ah, %edi
-; I386-NOCMOV-NEXT: negl %edi
-; I386-NOCMOV-NEXT: movl %edx, %esi
-; I386-NOCMOV-NEXT: andl %edi, %esi
-; I386-NOCMOV-NEXT: notl %edi
-; I386-NOCMOV-NEXT: andl %ecx, %edi
-; I386-NOCMOV-NEXT: orl %edi, %esi
-; I386-NOCMOV-NEXT: movl %esi, (%esp)
-; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %esi
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edi
+; I386-NOCMOV-NEXT: sete %ch
+; I386-NOCMOV-NEXT: movb %ch, %al
+; I386-NOCMOV-NEXT: movzbl %al, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %edi, %ebx
+; I386-NOCMOV-NEXT: andl %ebp, %ebx
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %edx, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %ebx
+; I386-NOCMOV-NEXT: testb $1, %cl
+; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-NOCMOV-NEXT: movb %al, %ah
-; I386-NOCMOV-NEXT: movzbl %ah, %edi
-; I386-NOCMOV-NEXT: negl %edi
-; I386-NOCMOV-NEXT: movl %edx, %esi
-; I386-NOCMOV-NEXT: andl %edi, %esi
-; I386-NOCMOV-NEXT: notl %edi
-; I386-NOCMOV-NEXT: andl %ecx, %edi
-; I386-NOCMOV-NEXT: orl %edi, %esi
-; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-NOCMOV-NEXT: movb %al, %ah
-; I386-NOCMOV-NEXT: movzbl %ah, %edi
+; I386-NOCMOV-NEXT: sete %ch
+; I386-NOCMOV-NEXT: movb %ch, %cl
+; I386-NOCMOV-NEXT: movzbl %cl, %ebp
+; I386-NOCMOV-NEXT: negl %ebp
+; I386-NOCMOV-NEXT: movl %edx, %edi
+; I386-NOCMOV-NEXT: andl %ebp, %edi
+; I386-NOCMOV-NEXT: notl %ebp
+; I386-NOCMOV-NEXT: andl %eax, %ebp
+; I386-NOCMOV-NEXT: orl %ebp, %edi
+; I386-NOCMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: sete %al
+; I386-NOCMOV-NEXT: movl (%esp), %ebx # 4-byte Reload
+; I386-NOCMOV-NEXT: movb %al, %dl
+; I386-NOCMOV-NEXT: movzbl %dl, %edi
; I386-NOCMOV-NEXT: negl %edi
-; I386-NOCMOV-NEXT: movl %edx, %esi
-; I386-NOCMOV-NEXT: andl %edi, %esi
+; I386-NOCMOV-NEXT: movl %esi, %ecx
+; I386-NOCMOV-NEXT: andl %edi, %ecx
; I386-NOCMOV-NEXT: notl %edi
-; I386-NOCMOV-NEXT: andl %ecx, %edi
-; I386-NOCMOV-NEXT: orl %edi, %esi
-; I386-NOCMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; I386-NOCMOV-NEXT: fldt (%esp)
-; I386-NOCMOV-NEXT: addl $12, %esp
+; I386-NOCMOV-NEXT: andl %ebx, %edi
+; I386-NOCMOV-NEXT: orl %edi, %ecx
+; I386-NOCMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT: addl $40, %esp
; I386-NOCMOV-NEXT: popl %esi
; I386-NOCMOV-NEXT: popl %edi
+; I386-NOCMOV-NEXT: popl %ebx
+; I386-NOCMOV-NEXT: popl %ebp
; I386-NOCMOV-NEXT: retl
;
; I386-CMOV-LABEL: test_ctselect_f80_alignment:
; I386-CMOV: # %bb.0:
-; I386-CMOV-NEXT: pushl %edi
-; I386-CMOV-NEXT: pushl %esi
-; I386-CMOV-NEXT: subl $12, %esp
+; I386-CMOV-NEXT: subl $36, %esp
+; I386-CMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fldt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fstpt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: fstpt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; I386-CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
-; I386-CMOV-NEXT: sete %al
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-CMOV-NEXT: movb %al, %ah
-; I386-CMOV-NEXT: movzbl %ah, %edi
-; I386-CMOV-NEXT: negl %edi
-; I386-CMOV-NEXT: movl %edx, %esi
-; I386-CMOV-NEXT: andl %edi, %esi
-; I386-CMOV-NEXT: notl %edi
-; I386-CMOV-NEXT: andl %ecx, %edi
-; I386-CMOV-NEXT: orl %edi, %esi
-; I386-CMOV-NEXT: movl %esi, (%esp)
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-CMOV-NEXT: movb %al, %ah
-; I386-CMOV-NEXT: movzbl %ah, %edi
-; I386-CMOV-NEXT: negl %edi
-; I386-CMOV-NEXT: movl %edx, %esi
-; I386-CMOV-NEXT: andl %edi, %esi
-; I386-CMOV-NEXT: notl %edi
-; I386-CMOV-NEXT: andl %ecx, %edi
-; I386-CMOV-NEXT: orl %edi, %esi
-; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %edx
-; I386-CMOV-NEXT: movb %al, %ah
-; I386-CMOV-NEXT: movzbl %ah, %edi
-; I386-CMOV-NEXT: negl %edi
-; I386-CMOV-NEXT: movl %edx, %esi
-; I386-CMOV-NEXT: andl %edi, %esi
-; I386-CMOV-NEXT: notl %edi
-; I386-CMOV-NEXT: andl %ecx, %edi
-; I386-CMOV-NEXT: orl %edi, %esi
-; I386-CMOV-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT: movl %eax, (%esp)
; I386-CMOV-NEXT: fldt (%esp)
-; I386-CMOV-NEXT: addl $12, %esp
-; I386-CMOV-NEXT: popl %esi
-; I386-CMOV-NEXT: popl %edi
+; I386-CMOV-NEXT: addl $36, %esp
; I386-CMOV-NEXT: retl
%result = call x86_fp80 @llvm.ct.select.f80(i1 %cond, x86_fp80 %a, x86_fp80 %b)
ret x86_fp80 %result
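
For readers skimming the updated CHECK lines: the I386-NOCMOV sequences above (sete, movzbl, negl, andl, notl, andl, orl) are the branchless masking idiom used when CMOV is unavailable. A minimal IR sketch of that idiom, written only to clarify what the expansion computes (operand order and register choices in the actual output differ), is:

  ; Select %a when %cond is 1 and %b when it is 0, with no branch or cmov.
  define i32 @ct_select_mask_sketch(i1 %cond, i32 %a, i32 %b) {
    %mask = sext i1 %cond to i32        ; all-ones when %cond is 1, else 0
    %notmask = xor i32 %mask, -1
    %keep_a = and i32 %a, %mask
    %keep_b = and i32 %b, %notmask
    %r = or i32 %keep_a, %keep_b
    ret i32 %r
  }

In the x86_fp80 tests above, each 32-bit chunk of the value is selected with this same and/not/and/or pattern before the result is reassembled on the stack and reloaded with fldt.
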
More information about the llvm-branch-commits mailing list