[llvm-branch-commits] [llvm] [ConstantTime] Native ct.select support for X86 and i386 (PR #166704)

Julius Alexandre via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Thu Nov 6 10:27:04 PST 2025


https://github.com/wizardengineer updated https://github.com/llvm/llvm-project/pull/166704

>From cee41562976955a1e4c7b911a304b989a73be16d Mon Sep 17 00:00:00 2001
From: wizardengineer <juliuswoosebert at gmail.com>
Date: Wed, 5 Nov 2025 17:09:23 -0500
Subject: [PATCH 1/2] [LLVM][X86] Add native ct.select support for X86 and i386

Add a native X86 implementation using CMOV instructions, with comprehensive tests:
- X86 ISelLowering with CMOV for x86_64 and i386
- Fallback bitwise operations for i386 targets without CMOV
- Post-RA expansion for pseudo-instructions
- Comprehensive test coverage:
  - Edge cases (zero conditions, large integers)
  - i386-specific tests (FP, MMX, non-CMOV fallback)
  - Vector operations
  - Optimization patterns

The basic test demonstrating fallback is in the core infrastructure PR.
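
For context, a minimal IR sketch of what this lowering targets (the exact
llvm.ct.select intrinsic signature is defined in the core infrastructure PR;
the shape below is illustrative, not copied from the tests in this patch):

  declare i32 @llvm.ct.select.i32(i1, i32, i32)

  define i32 @example(i1 %cond, i32 %a, i32 %b) {
    %r = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
    ret i32 %r
  }

On x86_64 (and i386 with CMOV) this selects through a test/cmov sequence;
on i386 without CMOV it falls back to a branch-free setcc/mask bitwise
sequence expanded after register allocation.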
---
 llvm/lib/Target/X86/X86.td                    |    8 +-
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  791 +++++++++-
 llvm/lib/Target/X86/X86ISelLowering.h         |    7 +
 llvm/lib/Target/X86/X86InstrCMovSetCC.td      |  205 +++
 llvm/lib/Target/X86/X86InstrCompiler.td       |   81 ++
 llvm/lib/Target/X86/X86InstrFragments.td      |    5 +
 llvm/lib/Target/X86/X86InstrInfo.cpp          |  609 +++++++-
 llvm/lib/Target/X86/X86InstrInfo.h            |    6 +
 llvm/lib/Target/X86/X86InstrPredicates.td     |    5 +
 llvm/lib/Target/X86/X86TargetMachine.cpp      |    5 +-
 llvm/test/CodeGen/X86/ctselect-edge-cases.ll  |  409 ++++++
 llvm/test/CodeGen/X86/ctselect-i386-fp.ll     |  722 ++++++++++
 llvm/test/CodeGen/X86/ctselect-i386-mmx.ll    |  428 ++++++
 llvm/test/CodeGen/X86/ctselect-i386.ll        |  267 ++++
 .../test/CodeGen/X86/ctselect-optimization.ll |  304 ++++
 llvm/test/CodeGen/X86/ctselect-vector.ll      | 1274 +++++++++++++++++
 llvm/test/CodeGen/X86/ctselect.ll             |  996 +++++++------
 17 files changed, 5671 insertions(+), 451 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/ctselect-edge-cases.ll
 create mode 100644 llvm/test/CodeGen/X86/ctselect-i386-fp.ll
 create mode 100644 llvm/test/CodeGen/X86/ctselect-i386-mmx.ll
 create mode 100644 llvm/test/CodeGen/X86/ctselect-i386.ll
 create mode 100644 llvm/test/CodeGen/X86/ctselect-optimization.ll
 create mode 100644 llvm/test/CodeGen/X86/ctselect-vector.ll

diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 9e291a6ae431f..21826d8289bb9 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -825,9 +825,10 @@ include "X86SchedSapphireRapids.td"
 
 def ProcessorFeatures {
   // x86-64 micro-architecture levels: x86-64 and x86-64-v[234]
-  list<SubtargetFeature> X86_64V1Features = [
-    FeatureX87, FeatureCX8, FeatureCMOV, FeatureMMX, FeatureSSE2,
-    FeatureFXSR, FeatureNOPL, FeatureX86_64,
+  list<SubtargetFeature> X86_64V1Features = [FeatureX87, FeatureCX8,
+                                             FeatureCMOV, FeatureMMX,
+                                             FeatureSSE2, FeatureFXSR,
+                                             FeatureNOPL, FeatureX86_64,
   ];
   list<SubtargetFeature> X86_64V1Tuning = [
     TuningMacroFusion,
@@ -1161,6 +1162,7 @@ def ProcessorFeatures {
                                                   FeatureAVXNECONVERT,
                                                   FeatureAVXVNNIINT8,
                                                   FeatureAVXVNNIINT16,
+                                                  FeatureUSERMSR,
                                                   FeatureSHA512,
                                                   FeatureSM3,
                                                   FeatureEGPR,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6edf0185df813..833afa717c32c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "X86ISelLowering.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
 #include "MCTargetDesc/X86ShuffleDecode.h"
 #include "X86.h"
 #include "X86FrameLowering.h"
@@ -29,6 +30,8 @@
 #include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/IntrinsicLowering.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -48,6 +51,7 @@
 #include "llvm/IR/GlobalAlias.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/PatternMatch.h"
@@ -488,6 +492,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   // X86 wants to expand cmov itself.
   for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
     setOperationAction(ISD::SELECT, VT, Custom);
+    setOperationAction(ISD::CTSELECT, VT, Custom);
     setOperationAction(ISD::SETCC, VT, Custom);
     setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
     setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
@@ -496,11 +501,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     if (VT == MVT::i64 && !Subtarget.is64Bit())
       continue;
     setOperationAction(ISD::SELECT, VT, Custom);
+    setOperationAction(ISD::CTSELECT, VT, Custom);
     setOperationAction(ISD::SETCC,  VT, Custom);
   }
 
   // Custom action for SELECT MMX and expand action for SELECT_CC MMX
   setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
+  setOperationAction(ISD::CTSELECT, MVT::x86mmx, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
 
   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
@@ -630,6 +637,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::BR_CC, VT, Action);
     setOperationAction(ISD::SETCC, VT, Action);
     setOperationAction(ISD::SELECT, VT, Custom);
+    setOperationAction(ISD::CTSELECT, VT, Custom);
     setOperationAction(ISD::SELECT_CC, VT, Action);
     setOperationAction(ISD::FROUND, VT, Action);
     setOperationAction(ISD::FROUNDEVEN, VT, Action);
@@ -1067,6 +1075,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
+    setOperationAction(ISD::CTSELECT, MVT::v4f32, Custom);
     setOperationAction(ISD::FCANONICALIZE, MVT::v4f32, Custom);
 
     setOperationAction(ISD::LOAD,               MVT::v2f32, Custom);
@@ -1220,6 +1229,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::SELECT,             MVT::v8f16, Custom);
     setOperationAction(ISD::SELECT,             MVT::v16i8, Custom);
 
+    setOperationAction(ISD::CTSELECT, MVT::v2f64, Custom);
+    setOperationAction(ISD::CTSELECT, MVT::v2i64, Custom);
+    setOperationAction(ISD::CTSELECT, MVT::v4i32, Custom);
+    setOperationAction(ISD::CTSELECT, MVT::v8i16, Custom);
+    setOperationAction(ISD::CTSELECT, MVT::v8f16, Custom);
+    setOperationAction(ISD::CTSELECT, MVT::v16i8, Custom);
+
     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Custom);
     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Custom);
     setOperationAction(ISD::FP_TO_SINT,         MVT::v2i32, Custom);
@@ -1541,6 +1557,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::SELECT,            MVT::v32i8, Custom);
     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
 
+    setOperationAction(ISD::CTSELECT, MVT::v4f64, Custom);
+    setOperationAction(ISD::CTSELECT, MVT::v4i64, Custom);
+    setOperationAction(ISD::CTSELECT, MVT::v8i32, Custom);
+    setOperationAction(ISD::CTSELECT, MVT::v16i16, Custom);
+    setOperationAction(ISD::CTSELECT, MVT::v16f16, Custom);
+    setOperationAction(ISD::CTSELECT, MVT::v32i8, Custom);
+    setOperationAction(ISD::CTSELECT, MVT::v8f32, Custom);
+
     for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
       setOperationAction(ISD::SIGN_EXTEND,     VT, Custom);
       setOperationAction(ISD::ZERO_EXTEND,     VT, Custom);
@@ -1727,6 +1751,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
 
     setOperationAction(ISD::SELECT,             MVT::v1i1, Custom);
+    setOperationAction(ISD::CTSELECT, MVT::v1i1, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
     setOperationAction(ISD::BUILD_VECTOR,       MVT::v1i1, Custom);
 
@@ -1772,6 +1797,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
       setOperationAction(ISD::SETCC,            VT, Custom);
       setOperationAction(ISD::SELECT,           VT, Custom);
+      setOperationAction(ISD::CTSELECT, VT, Custom);
       setOperationAction(ISD::TRUNCATE,         VT, Custom);
 
       setOperationAction(ISD::BUILD_VECTOR,     VT, Custom);
@@ -2038,6 +2064,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Legal);
       setOperationAction(ISD::SELECT,             VT, Custom);
+      setOperationAction(ISD::CTSELECT, VT, Custom);
       setOperationAction(ISD::VSELECT,            VT, Custom);
       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
@@ -2203,6 +2230,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
       setOperationAction(ISD::SELECT,             VT, Custom);
+      setOperationAction(ISD::CTSELECT, VT, Custom);
       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
@@ -2269,6 +2297,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::VSELECT,            VT, Legal);
       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
       setOperationAction(ISD::SELECT,             VT, Custom);
+      setOperationAction(ISD::CTSELECT, VT, Custom);
 
       setOperationAction(ISD::FNEG,               VT, Custom);
       setOperationAction(ISD::FABS,               VT, Custom);
@@ -2538,6 +2567,22 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     addRegisterClass(MVT::x86amx, &X86::TILERegClass);
   }
 
+  // Handle 512-bit vector CTSELECT without AVX512 by setting them to Expand
+  // This allows type legalization to split them into smaller vectors
+  for (auto VT : {MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64, MVT::v32f16,
+                  MVT::v16f32, MVT::v8f64}) {
+    setOperationAction(ISD::CTSELECT, VT, Expand);
+  }
+
+  // Handle 256-bit vector CTSELECT without AVX by setting them to Expand
+  // This allows type legalization to split them into 128-bit vectors
+  if (!Subtarget.hasAVX()) {
+    for (auto VT : {MVT::v4f64, MVT::v4i64, MVT::v8i32, MVT::v16i16,
+                    MVT::v16f16, MVT::v32i8, MVT::v8f32}) {
+      setOperationAction(ISD::CTSELECT, VT, Expand);
+    }
+  }
+
   // We want to custom lower some of our intrinsics.
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
@@ -2644,6 +2689,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                        ISD::BITCAST,
                        ISD::VSELECT,
                        ISD::SELECT,
+                       ISD::CTSELECT,
                        ISD::SHL,
                        ISD::SRA,
                        ISD::SRL,
@@ -25325,6 +25371,174 @@ static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl,
   return V;
 }
 
+SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Cond = Op.getOperand(0); // condition
+  SDValue TrueOp = Op.getOperand(1);  // true_value
+  SDValue FalseOp = Op.getOperand(2); // false_value
+  SDLoc DL(Op);
+  MVT VT = TrueOp.getSimpleValueType();
+
+  // Special handling for i386 targets (no CMOV): route to post-RA expansion
+  // pseudos. Standard type legalization handles i64 automatically (it is
+  // split into EDX:EAX).
+
+  // Handle soft float16 by converting to integer operations
+  if (isSoftF16(VT, Subtarget)) {
+    MVT NVT = VT.changeTypeToInteger();
+    SDValue CtSelect =
+        DAG.getNode(ISD::CTSELECT, DL, NVT, Cond, DAG.getBitcast(NVT, FalseOp),
+                    DAG.getBitcast(NVT, TrueOp));
+    return DAG.getBitcast(VT, CtSelect);
+  }
+
+  // Handle vector types
+  if (VT.isVector()) {
+    // Handle soft float16 vectors
+    if (isSoftF16(VT, Subtarget)) {
+      MVT NVT = VT.changeVectorElementTypeToInteger();
+      SDValue CtSelect = DAG.getNode(ISD::CTSELECT, DL, NVT, Cond,
+                                     DAG.getBitcast(NVT, FalseOp),
+                                     DAG.getBitcast(NVT, TrueOp));
+      return DAG.getBitcast(VT, CtSelect);
+    }
+
+    unsigned VectorWidth = VT.getSizeInBits();
+    MVT EltVT = VT.getVectorElementType();
+
+    // 512-bit vectors without AVX512 and 256-bit vectors without AVX are
+    // handled by type legalization (Expand action), which splits them into
+    // smaller vectors.
+
+    if (VectorWidth == 128 && !Subtarget.hasSSE1())
+      return SDValue();
+
+    // Handle special cases for floating point vectors
+    if (EltVT.isFloatingPoint()) {
+      // For vector floating point with AVX, use VBLENDV-style operations
+      if (Subtarget.hasAVX() && (VectorWidth == 256 || VectorWidth == 128)) {
+        // Convert to bitwise operations using the condition
+        MVT IntVT = VT.changeVectorElementTypeToInteger();
+        SDValue IntOp1 = DAG.getBitcast(IntVT, TrueOp);
+        SDValue IntOp2 = DAG.getBitcast(IntVT, FalseOp);
+
+        // Create the CTSELECT node with integer types
+        SDValue IntResult =
+            DAG.getNode(X86ISD::CTSELECT, DL, IntVT, IntOp2, IntOp1,
+                        DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8),
+                        EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget));
+        return DAG.getBitcast(VT, IntResult);
+      }
+    }
+
+    // For integer vectors or when we don't have advanced SIMD support,
+    // use the generic X86 CTSELECT node which will be matched by the patterns
+    SDValue CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
+    SDValue EFLAGS = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
+    // Create the X86 CTSELECT node - note operand order: false, true, cc, flags
+    return DAG.getNode(X86ISD::CTSELECT, DL, VT, FalseOp, TrueOp, CC, EFLAGS);
+  }
+
+  // Look past (and (setcc_carry (cmp ...)), 1)
+  if (Cond.getOpcode() == ISD::AND &&
+      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
+      isOneConstant(Cond.getOperand(1)))
+    Cond = Cond.getOperand(0);
+
+  // Process condition flags and prepare for CTSELECT node creation.
+  auto ProcessConditionFlags =
+      [&](SDValue Cond, MVT VT, SDLoc DL, SelectionDAG &DAG,
+          const X86Subtarget &Subtarget) -> std::pair<SDValue, SDValue> {
+    SDValue CC;
+    bool AddTest = true;
+
+    unsigned CondOpcode = Cond.getOpcode();
+    if (CondOpcode == X86ISD::SETCC || CondOpcode == X86ISD::SETCC_CARRY) {
+      CC = Cond.getOperand(0);
+      SDValue Cmp = Cond.getOperand(1);
+
+      if ((isX86LogicalCmp(Cmp)) || Cmp.getOpcode() == X86ISD::BT) {
+        Cond = Cmp;
+        AddTest = false;
+      }
+    } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
+               CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
+               CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
+      SDValue Value;
+      X86::CondCode X86Cond;
+      std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
+      CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
+      AddTest = false;
+    }
+
+    if (AddTest) {
+      // Look past the truncate if the high bits are known zero
+      if (isTruncWithZeroHighBitsInput(Cond, DAG))
+        Cond = Cond.getOperand(0);
+
+      // Try to match AND to BT instruction
+      if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
+        X86::CondCode X86CondCode;
+        if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
+          CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
+          Cond = BT;
+          AddTest = false;
+        }
+      }
+    }
+
+    if (AddTest) {
+      CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
+      Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
+    }
+
+    return {CC, Cond};
+  };
+
+  // Process condition flags and prepare for CTSELECT
+  auto [CC, ProcessedCond] =
+      ProcessConditionFlags(Cond, VT, DL, DAG, Subtarget);
+
+  // Handle i8 CTSELECT with truncate optimization
+  if (Op.getValueType() == MVT::i8 && TrueOp.getOpcode() == ISD::TRUNCATE &&
+      FalseOp.getOpcode() == ISD::TRUNCATE) {
+    SDValue T1 = TrueOp.getOperand(0), T2 = FalseOp.getOperand(0);
+    if (T1.getValueType() == T2.getValueType() &&
+        T1.getOpcode() != ISD::CopyFromReg &&
+        T2.getOpcode() != ISD::CopyFromReg) {
+      SDValue CtSelect = DAG.getNode(X86ISD::CTSELECT, DL, T1.getValueType(),
+                                     T2, T1, CC, ProcessedCond);
+      return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), CtSelect);
+    }
+  }
+
+  // Promote small integer types to avoid partial register stalls
+  // Exception: For i8 without CMOV, we can generate a shorter instruction
+  // sequence without movzx so keep it as is.
+  if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMOV()) ||
+      (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(TrueOp, Subtarget) &&
+       !X86::mayFoldLoad(FalseOp, Subtarget))) {
+    TrueOp = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, TrueOp);
+    FalseOp = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, FalseOp);
+    SDValue Ops[] = {FalseOp, TrueOp, CC, ProcessedCond};
+    SDValue CtSelect = DAG.getNode(X86ISD::CTSELECT, DL, MVT::i32, Ops);
+    return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), CtSelect);
+  }
+
+  if (isScalarFPTypeInSSEReg(VT)) {
+    MVT IntVT = (VT == MVT::f32) ? MVT::i32 : MVT::i64;
+    TrueOp = DAG.getBitcast(IntVT, TrueOp);
+    FalseOp = DAG.getBitcast(IntVT, FalseOp);
+    SDValue Ops[] = {FalseOp, TrueOp, CC, ProcessedCond};
+    SDValue CtSelect = DAG.getNode(X86ISD::CTSELECT, DL, IntVT, Ops);
+    return DAG.getBitcast(VT, CtSelect);
+  }
+
+  // Create final CTSELECT node
+  SDValue Ops[] = {FalseOp, TrueOp, CC, ProcessedCond};
+  return DAG.getNode(X86ISD::CTSELECT, DL, Op.getValueType(), Ops,
+                     Op->getFlags());
+}
+
 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
   SDValue In = Op->getOperand(0);
@@ -29695,30 +29909,65 @@ static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
                                      const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG,
                                      SDValue *Low = nullptr) {
+  unsigned NumElts = VT.getVectorNumElements();
+
   // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
   // to a vXi16 type. Do the multiplies, shift the results and pack the half
   // lane results back together.
 
   // We'll take different approaches for signed and unsigned.
-  // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes to
-  // words and use pmullw to calculate the full 16-bit product.
+  // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes to
+  // words and use pmullw to calculate the full 16-bit product.
   // For signed we'll use punpcklbw/punpckbw to extend the bytes to words and
   // shift them left into the upper byte of each word. This allows us to use
   // pmulhw to calculate the full 16-bit product. This trick means we don't
   // need to sign extend the bytes to use pmullw.
-  MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
+
+  MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
   SDValue Zero = DAG.getConstant(0, dl, VT);
 
-  SDValue ALo, AHi, BLo, BHi;
+  SDValue ALo, AHi;
   if (IsSigned) {
     ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
-    BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
     AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
-    BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
   } else {
     ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
-    BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
     AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
+  }
+
+  SDValue BLo, BHi;
+  if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
+    // If the RHS is a constant, manually unpackl/unpackh and extend.
+    SmallVector<SDValue, 16> LoOps, HiOps;
+    for (unsigned i = 0; i != NumElts; i += 16) {
+      for (unsigned j = 0; j != 8; ++j) {
+        SDValue LoOp = B.getOperand(i + j);
+        SDValue HiOp = B.getOperand(i + j + 8);
+
+        if (IsSigned) {
+          LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
+          HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
+          LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
+                             DAG.getConstant(8, dl, MVT::i16));
+          HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
+                             DAG.getConstant(8, dl, MVT::i16));
+        } else {
+          LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
+          HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
+        }
+
+        LoOps.push_back(LoOp);
+        HiOps.push_back(HiOp);
+      }
+    }
+
+    BLo = DAG.getBuildVector(ExVT, dl, LoOps);
+    BHi = DAG.getBuildVector(ExVT, dl, HiOps);
+  } else if (IsSigned) {
+    BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
+    BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
+  } else {
+    BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
     BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
   }
 
@@ -29731,7 +29980,7 @@ static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
   if (Low)
     *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
 
-  return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf=*/true);
+  return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
 }
 
 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
@@ -33594,6 +33843,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::STRICT_FSETCCS:     return LowerSETCC(Op, DAG);
   case ISD::SETCCCARRY:         return LowerSETCCCARRY(Op, DAG);
   case ISD::SELECT:             return LowerSELECT(Op, DAG);
+  case ISD::CTSELECT:           return LowerCTSELECT(Op, DAG);
   case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
   case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
   case ISD::VASTART:            return LowerVASTART(Op, DAG);
@@ -33677,6 +33927,12 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   }
 }
 
+bool X86TargetLowering::isSelectSupported(SelectSupportKind Kind) const {
+  if (Kind == SelectSupportKind::CtSelect) {
+    return true;
+  }
+  return TargetLoweringBase::isSelectSupported(Kind);
+}
 /// Replace a node with an illegal result type with a new node built out of
 /// custom code.
 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
@@ -34904,6 +35160,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(STRICT_CMPM)
   NODE_NAME_CASE(CMPMM_SAE)
   NODE_NAME_CASE(SETCC)
+  NODE_NAME_CASE(CTSELECT)
   NODE_NAME_CASE(SETCC_CARRY)
   NODE_NAME_CASE(FSETCC)
   NODE_NAME_CASE(FSETCCM)
@@ -37677,6 +37934,480 @@ X86TargetLowering::emitPatchableEventCall(MachineInstr &MI,
   return BB;
 }
 
+/// Helper function to emit i386 CTSELECT with condition materialization.
+/// This converts EFLAGS-based CTSELECT into a condition byte that can be
+/// shared across multiple operations (critical for i64 type legalization).
+///
+/// Phase 1: Materialize condition byte from EFLAGS using SETCC
+/// Phase 2: Create internal pseudo with condition byte for post-RA expansion
+///
+/// This approach ensures that when i64 is type-legalized into two i32
+/// operations, both operations share the same condition byte rather than
+/// each independently reading (and destroying) EFLAGS.
+static MachineBasicBlock *
+emitCTSelectI386WithConditionMaterialization(MachineInstr &MI,
+                                              MachineBasicBlock *BB,
+                                              unsigned InternalPseudoOpcode) {
+  const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
+  const MIMetadata MIMD(MI);
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  // Original pseudo operands: (outs dst), (ins src1, src2, cond)
+  Register Src1Reg = MI.getOperand(1).getReg();
+  Register Src2Reg = MI.getOperand(2).getReg();
+  X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(3).getImm());
+
+  // Get opposite condition (SETCC sets to 1 when condition is TRUE,
+  // but we want to select src1 when condition is FALSE for X86 semantics)
+  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
+
+  // Step 1: Materialize condition byte from EFLAGS
+  // This is done OUTSIDE the constant-time bundle, before any EFLAGS corruption
+  Register CondByteReg = MRI.createVirtualRegister(&X86::GR8RegClass);
+  BuildMI(*BB, MI, MIMD, TII->get(X86::SETCCr), CondByteReg).addImm(OppCC);
+
+  // Step 2: Create internal pseudo that takes condition byte as input
+  // This pseudo will be expanded post-RA into the actual constant-time bundle
+  // The condition byte can now be safely shared between multiple pseudos
+
+  // Internal pseudo has operands: (outs dst, tmp_byte, tmp_mask), (ins src1,
+  // src2, cond_byte)
+  Register DstReg = MI.getOperand(0).getReg();
+
+  // Create virtual registers for the temporary outputs
+  Register TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass);
+  Register TmpMaskReg;
+
+  // Determine the register class for tmp_mask based on the data type
+  if (InternalPseudoOpcode == X86::CTSELECT_I386_INT_GR8rr) {
+    TmpMaskReg = MRI.createVirtualRegister(&X86::GR8RegClass);
+  } else if (InternalPseudoOpcode == X86::CTSELECT_I386_INT_GR16rr) {
+    TmpMaskReg = MRI.createVirtualRegister(&X86::GR16RegClass);
+  } else if (InternalPseudoOpcode == X86::CTSELECT_I386_INT_GR32rr) {
+    TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+  } else {
+    llvm_unreachable("Unknown internal pseudo opcode");
+  }
+
+  BuildMI(*BB, MI, MIMD, TII->get(InternalPseudoOpcode))
+      .addDef(DstReg)         // dst (output)
+      .addDef(TmpByteReg)     // tmp_byte (output)
+      .addDef(TmpMaskReg)     // tmp_mask (output)
+      .addReg(Src1Reg)        // src1 (input)
+      .addReg(Src2Reg)        // src2 (input)
+      .addReg(CondByteReg);   // pre-materialized condition byte (input)
+
+  MI.eraseFromParent();
+  return BB;
+}
+
+// Helper structure to hold memory operand information for FP loads
+struct FPLoadMemOperands {
+  bool IsValid = false;
+  unsigned BaseReg = 0;
+  int64_t ScaleVal = 1;
+  unsigned IndexReg = 0;
+  int64_t Disp = 0;
+  unsigned SegReg = 0;
+  int FrameIndex = -1;
+  bool IsFrameIndex = false;
+  int ConstantPoolIndex = -1;
+  bool IsConstantPool = false;
+  const GlobalValue *Global = nullptr;
+  int64_t GlobalOffset = 0;
+  bool IsGlobal = false;
+};
+
+// Check if a virtual register is defined by a simple FP load instruction
+// Returns the memory operands if it's a simple load, otherwise returns invalid
+static FPLoadMemOperands getFPLoadMemOperands(Register Reg,
+                                               MachineRegisterInfo &MRI,
+                                               unsigned ExpectedLoadOpcode) {
+  FPLoadMemOperands Result;
+
+  if (!Reg.isVirtual())
+    return Result;
+
+  MachineInstr *DefMI = MRI.getVRegDef(Reg);
+  if (!DefMI)
+    return Result;
+
+  // Check if it's the expected load opcode (e.g., LD_Fp32m, LD_Fp64m, LD_Fp80m)
+  if (DefMI->getOpcode() != ExpectedLoadOpcode)
+    return Result;
+
+  // Check that this is a simple load - not volatile, not atomic, etc.
+  // FP loads have hasSideEffects = 0 in their definition for simple loads
+  if (DefMI->hasOrderedMemoryRef())
+    return Result;
+
+  // The load should have a single def (the destination register) and memory operands
+  // Format: %reg = LD_Fpxxm <fi#N>, 1, %noreg, 0, %noreg
+  // or: %reg = LD_Fpxxm %base, scale, %index, disp, %segment
+  if (DefMI->getNumOperands() < 6)
+    return Result;
+
+  // Operand 0 is the destination, operands 1-5 are the memory reference
+  MachineOperand &BaseMO = DefMI->getOperand(1);
+  MachineOperand &ScaleMO = DefMI->getOperand(2);
+  MachineOperand &IndexMO = DefMI->getOperand(3);
+  MachineOperand &DispMO = DefMI->getOperand(4);
+  MachineOperand &SegMO = DefMI->getOperand(5);
+
+  // Check if this is a frame index load
+  if (BaseMO.isFI()) {
+    Result.IsValid = true;
+    Result.IsFrameIndex = true;
+    Result.FrameIndex = BaseMO.getIndex();
+    Result.ScaleVal = ScaleMO.getImm();
+    Result.IndexReg = IndexMO.getReg();
+    Result.Disp = DispMO.getImm();
+    Result.SegReg = SegMO.getReg();
+    return Result;
+  }
+
+  // Check if this is a constant pool load
+  // Format: %reg = LD_Fpxxm $noreg, 1, $noreg, %const.N, $noreg
+  if (BaseMO.isReg() && BaseMO.getReg() == X86::NoRegister &&
+      ScaleMO.isImm() && IndexMO.isReg() &&
+      IndexMO.getReg() == X86::NoRegister &&
+      DispMO.isCPI() && SegMO.isReg()) {
+    Result.IsValid = true;
+    Result.IsConstantPool = true;
+    Result.ConstantPoolIndex = DispMO.getIndex();
+    Result.ScaleVal = ScaleMO.getImm();
+    Result.IndexReg = IndexMO.getReg();
+    Result.Disp = 0;
+    Result.SegReg = SegMO.getReg();
+    return Result;
+  }
+
+  // Check if this is a global variable load
+  // Format: %reg = LD_Fpxxm $noreg, 1, $noreg, @global_name, $noreg
+  if (BaseMO.isReg() && BaseMO.getReg() == X86::NoRegister &&
+      ScaleMO.isImm() && IndexMO.isReg() &&
+      IndexMO.getReg() == X86::NoRegister &&
+      DispMO.isGlobal() && SegMO.isReg()) {
+    Result.IsValid = true;
+    Result.IsGlobal = true;
+    Result.Global = DispMO.getGlobal();
+    Result.GlobalOffset = DispMO.getOffset();
+    Result.ScaleVal = ScaleMO.getImm();
+    Result.IndexReg = IndexMO.getReg();
+    Result.Disp = 0;
+    Result.SegReg = SegMO.getReg();
+    return Result;
+  }
+
+  // Regular memory operands (e.g., pointer loads)
+  if (BaseMO.isReg() && ScaleMO.isImm() && IndexMO.isReg() &&
+      DispMO.isImm() && SegMO.isReg()) {
+    Result.IsValid = true;
+    Result.IsFrameIndex = false;
+    Result.IsConstantPool = false;
+    Result.BaseReg = BaseMO.getReg();
+    Result.ScaleVal = ScaleMO.getImm();
+    Result.IndexReg = IndexMO.getReg();
+    Result.Disp = DispMO.getImm();
+    Result.SegReg = SegMO.getReg();
+    return Result;
+  }
+
+  return Result;
+}
+
+static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI,
+                                                     MachineBasicBlock *BB,
+                                                     unsigned pseudoInstr) {
+  const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
+  const MIMetadata MIMD(MI);
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  MachineFrameInfo &MFI = MF->getFrameInfo();
+  unsigned RegSizeInByte = 4;
+
+  // Get operands
+  // MI operands: %result:rfp80 = CTSELECT_I386 %false:rfp80, %true:rfp80, %cond:i8imm
+  unsigned DestReg = MI.getOperand(0).getReg();
+  unsigned FalseReg = MI.getOperand(1).getReg();
+  unsigned TrueReg = MI.getOperand(2).getReg();
+  X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(3).getImm());
+  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
+
+  // Materialize condition byte from EFLAGS
+  Register CondByteReg = MRI.createVirtualRegister(&X86::GR8RegClass);
+  BuildMI(*BB, MI, MIMD, TII->get(X86::SETCCr), CondByteReg).addImm(OppCC);
+
+  auto storeFpToSlot = [&](unsigned Opcode, int Slot, Register Reg) {
+    addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(Opcode)), Slot)
+        .addReg(Reg, RegState::Kill);
+  };
+
+  // Helper to load integer from memory operands
+  auto loadIntFromMemOperands = [&](const FPLoadMemOperands &MemOps,
+                                     unsigned Offset) -> unsigned {
+    unsigned IntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+    MachineInstrBuilder MIB =
+        BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), IntReg);
+
+    if (MemOps.IsFrameIndex) {
+      // Frame index: addFrameIndex + scale + index + disp + segment
+      MIB.addFrameIndex(MemOps.FrameIndex)
+          .addImm(MemOps.ScaleVal)
+          .addReg(MemOps.IndexReg)
+          .addImm(MemOps.Disp + Offset)
+          .addReg(MemOps.SegReg);
+    } else if (MemOps.IsConstantPool) {
+      // Constant pool: base_reg + scale + index + CP_index + segment
+      // MOV32rm format: base, scale, index, displacement, segment
+      MIB.addReg(X86::NoRegister)  // Base register
+          .addImm(MemOps.ScaleVal)  // Scale
+          .addReg(MemOps.IndexReg)  // Index register
+          .addConstantPoolIndex(MemOps.ConstantPoolIndex, Offset)  // Displacement (CP index)
+          .addReg(MemOps.SegReg);  // Segment
+    } else if (MemOps.IsGlobal) {
+      // Global variable: base_reg + scale + index + global + segment
+      // MOV32rm format: base, scale, index, displacement, segment
+      MIB.addReg(X86::NoRegister)  // Base register
+          .addImm(MemOps.ScaleVal)  // Scale
+          .addReg(MemOps.IndexReg)  // Index register
+          .addGlobalAddress(MemOps.Global, MemOps.GlobalOffset + Offset)  // Displacement (global address)
+          .addReg(MemOps.SegReg);  // Segment
+    } else {
+      // Regular memory: base_reg + scale + index + disp + segment
+      MIB.addReg(MemOps.BaseReg)
+          .addImm(MemOps.ScaleVal)
+          .addReg(MemOps.IndexReg)
+          .addImm(MemOps.Disp + Offset)
+          .addReg(MemOps.SegReg);
+    }
+
+    return IntReg;
+  };
+
+  // Optimized path: load integers directly from memory when both operands are
+  // memory loads, avoiding FP register round-trip
+  auto emitCtSelectFromMemory = [&](unsigned NumValues,
+                                     const FPLoadMemOperands &TrueMemOps,
+                                     const FPLoadMemOperands &FalseMemOps,
+                                     int ResultSlot) {
+    for (unsigned Val = 0; Val < NumValues; ++Val) {
+      unsigned Offset = Val * RegSizeInByte;
+
+      // Load true and false values directly from their memory locations as integers
+      unsigned TrueIntReg = loadIntFromMemOperands(TrueMemOps, Offset);
+      unsigned FalseIntReg = loadIntFromMemOperands(FalseMemOps, Offset);
+
+      // Use CTSELECT_I386_INT_GR32 pseudo instruction for constant-time selection
+      unsigned ResultIntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+      unsigned TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass);
+      unsigned TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+
+      BuildMI(*BB, MI, MIMD, TII->get(X86::CTSELECT_I386_INT_GR32rr))
+          .addDef(ResultIntReg)    // dst (output)
+          .addDef(TmpByteReg)      // tmp_byte (output)
+          .addDef(TmpMaskReg)      // tmp_mask (output)
+          .addReg(FalseIntReg)     // src1 (input) - false value
+          .addReg(TrueIntReg)      // src2 (input) - true value
+          .addReg(CondByteReg);    // pre-materialized condition byte (input)
+
+      // Store result back to result slot
+      BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32mr))
+          .addFrameIndex(ResultSlot)
+          .addImm(1)
+          .addReg(0)
+          .addImm(Offset)
+          .addReg(0)
+          .addReg(ResultIntReg, RegState::Kill);
+    }
+  };
+
+  auto emitCtSelectWithPseudo = [&](unsigned NumValues, int TrueSlot, int FalseSlot, int ResultSlot) {
+    for (unsigned Val = 0; Val < NumValues; ++Val) {
+      unsigned Offset = Val * RegSizeInByte;
+
+      // Load true and false values from stack as 32-bit integers
+      unsigned TrueIntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+      BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), TrueIntReg)
+          .addFrameIndex(TrueSlot)
+          .addImm(1)
+          .addReg(0)
+          .addImm(Offset)
+          .addReg(0);
+
+      unsigned FalseIntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+      BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), FalseIntReg)
+          .addFrameIndex(FalseSlot)
+          .addImm(1)
+          .addReg(0)
+          .addImm(Offset)
+          .addReg(0);
+
+      // Use CTSELECT_I386_INT_GR32 pseudo instruction for constant-time selection
+      unsigned ResultIntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+      unsigned TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass);
+      unsigned TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+
+      BuildMI(*BB, MI, MIMD, TII->get(X86::CTSELECT_I386_INT_GR32rr))
+          .addDef(ResultIntReg)     // dst (output)
+          .addDef(TmpByteReg)       // tmp_byte (output)
+          .addDef(TmpMaskReg)       // tmp_mask (output)
+          .addReg(FalseIntReg)      // src1 (input) - false value
+          .addReg(TrueIntReg)       // src2 (input) - true value
+          .addReg(CondByteReg);     // pre-materialized condition byte (input)
+
+      // Store result back to result slot
+      BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32mr))
+          .addFrameIndex(ResultSlot)
+          .addImm(1)
+          .addReg(0)
+          .addImm(Offset)
+          .addReg(0)
+          .addReg(ResultIntReg, RegState::Kill);
+    }
+  };
+
+  switch (pseudoInstr) {
+  case X86::CTSELECT_I386_FP32rr: {
+    // Check if both operands are simple memory loads
+    FPLoadMemOperands TrueMemOps =
+        getFPLoadMemOperands(TrueReg, MRI, X86::LD_Fp32m);
+    FPLoadMemOperands FalseMemOps =
+        getFPLoadMemOperands(FalseReg, MRI, X86::LD_Fp32m);
+
+    int ResultSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false);
+
+    if (TrueMemOps.IsValid && FalseMemOps.IsValid) {
+      // Optimized path: load directly from memory as integers
+      // Works for both frame index loads (stack parameters) and
+      // constant pool loads (constants)
+      emitCtSelectFromMemory(1, TrueMemOps, FalseMemOps, ResultSlot);
+
+      // Erase the original FP load instructions since we're not using them
+      // and have loaded the data directly as integers instead
+      if (MRI.hasOneUse(TrueReg)) {
+        if (MachineInstr *TrueDefMI = MRI.getVRegDef(TrueReg))
+          TrueDefMI->eraseFromParent();
+      }
+      if (MRI.hasOneUse(FalseReg)) {
+        if (MachineInstr *FalseDefMI = MRI.getVRegDef(FalseReg))
+          FalseDefMI->eraseFromParent();
+      }
+    } else {
+      // General path: spill FP registers to stack first
+      int TrueSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false);
+      int FalseSlot = MFI.CreateStackObject(RegSizeInByte, Align(4), false);
+
+      storeFpToSlot(X86::ST_Fp32m, TrueSlot, TrueReg);
+      storeFpToSlot(X86::ST_Fp32m, FalseSlot, FalseReg);
+
+      emitCtSelectWithPseudo(1, TrueSlot, FalseSlot, ResultSlot);
+    }
+
+    // Load result back as f32
+    addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp32m), DestReg),
+                      ResultSlot);
+    break;
+  }
+  case X86::CTSELECT_I386_FP64rr: {
+    unsigned StackSlotSize = 8;
+
+    // Check if both operands are simple memory loads
+    FPLoadMemOperands TrueMemOps =
+        getFPLoadMemOperands(TrueReg, MRI, X86::LD_Fp64m);
+    FPLoadMemOperands FalseMemOps =
+        getFPLoadMemOperands(FalseReg, MRI, X86::LD_Fp64m);
+
+    int ResultSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false);
+
+    if (TrueMemOps.IsValid && FalseMemOps.IsValid) {
+      // Optimized path: load directly from memory as integers
+      // Works for both frame index loads (stack parameters) and
+      // constant pool loads (constants)
+      emitCtSelectFromMemory(StackSlotSize / RegSizeInByte, TrueMemOps,
+                             FalseMemOps, ResultSlot);
+
+      // Erase the original FP load instructions since we're not using them
+      if (MRI.hasOneUse(TrueReg)) {
+        if (MachineInstr *TrueDefMI = MRI.getVRegDef(TrueReg))
+          TrueDefMI->eraseFromParent();
+      }
+      if (MRI.hasOneUse(FalseReg)) {
+        if (MachineInstr *FalseDefMI = MRI.getVRegDef(FalseReg))
+          FalseDefMI->eraseFromParent();
+      }
+    } else {
+      // General path: spill FP registers to stack first
+      int TrueSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false);
+      int FalseSlot = MFI.CreateStackObject(StackSlotSize, Align(4), false);
+
+      storeFpToSlot(X86::ST_Fp64m, TrueSlot, TrueReg);
+      storeFpToSlot(X86::ST_Fp64m, FalseSlot, FalseReg);
+
+      emitCtSelectWithPseudo(StackSlotSize / RegSizeInByte, TrueSlot, FalseSlot,
+                             ResultSlot);
+    }
+
+    // Load result back as f64
+    addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp64m), DestReg),
+                      ResultSlot);
+    break;
+  }
+  case X86::CTSELECT_I386_FP80rr: {
+    // f80 is 80 bits (10 bytes), but we store it in a 12-byte stack slot
+    unsigned StackObjectSize = 12;
+
+    // Check if both operands are simple memory loads
+    FPLoadMemOperands TrueMemOps =
+        getFPLoadMemOperands(TrueReg, MRI, X86::LD_Fp80m);
+    FPLoadMemOperands FalseMemOps =
+        getFPLoadMemOperands(FalseReg, MRI, X86::LD_Fp80m);
+
+    int ResultSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false);
+
+    if (TrueMemOps.IsValid && FalseMemOps.IsValid) {
+      // Optimized path: load directly from memory as integers
+      // Works for both frame index loads (stack parameters) and
+      // constant pool loads (constants)
+      emitCtSelectFromMemory(StackObjectSize / RegSizeInByte, TrueMemOps,
+                             FalseMemOps, ResultSlot);
+
+      // Erase the original FP load instructions since we're not using them
+      if (MRI.hasOneUse(TrueReg)) {
+        if (MachineInstr *TrueDefMI = MRI.getVRegDef(TrueReg))
+          TrueDefMI->eraseFromParent();
+      }
+      if (MRI.hasOneUse(FalseReg)) {
+        if (MachineInstr *FalseDefMI = MRI.getVRegDef(FalseReg))
+          FalseDefMI->eraseFromParent();
+      }
+    } else {
+      // General path: spill FP registers to stack first
+      int TrueSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false);
+      int FalseSlot = MFI.CreateStackObject(StackObjectSize, Align(4), false);
+
+      storeFpToSlot(X86::ST_FpP80m, TrueSlot, TrueReg);
+      storeFpToSlot(X86::ST_FpP80m, FalseSlot, FalseReg);
+
+      emitCtSelectWithPseudo(StackObjectSize / RegSizeInByte, TrueSlot,
+                             FalseSlot, ResultSlot);
+    }
+
+    // Load result back as f80
+    addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::LD_Fp80m), DestReg),
+                      ResultSlot);
+    break;
+  }
+  default:
+    llvm_unreachable("Invalid CTSELECT opcode");
+  }
+
+  MI.eraseFromParent();
+
+  return BB;
+}
+
 MachineBasicBlock *
 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                                MachineBasicBlock *BB) const {
@@ -37734,6 +38465,25 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   case X86::CMOV_VK64:
     return EmitLoweredSelect(MI, BB);
 
+  case X86::CTSELECT_I386_GR8rr:
+    return emitCTSelectI386WithConditionMaterialization(
+        MI, BB, X86::CTSELECT_I386_INT_GR8rr);
+
+  case X86::CTSELECT_I386_GR16rr:
+    return emitCTSelectI386WithConditionMaterialization(
+        MI, BB, X86::CTSELECT_I386_INT_GR16rr);
+
+  case X86::CTSELECT_I386_GR32rr:
+    return emitCTSelectI386WithConditionMaterialization(
+        MI, BB, X86::CTSELECT_I386_INT_GR32rr);
+
+  case X86::CTSELECT_I386_FP32rr:
+    return emitCTSelectI386WithFpType(MI, BB, X86::CTSELECT_I386_FP32rr);
+  case X86::CTSELECT_I386_FP64rr:
+    return emitCTSelectI386WithFpType(MI, BB, X86::CTSELECT_I386_FP64rr);
+  case X86::CTSELECT_I386_FP80rr:
+    return emitCTSelectI386WithFpType(MI, BB, X86::CTSELECT_I386_FP80rr);
+
   case X86::FP80_ADDr:
   case X86::FP80_ADDm32: {
     // Change the floating point control register to use double extended
@@ -41695,7 +42445,7 @@ static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
     if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
         X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
       return SDValue();
-    Imm = llvm::rotl<uint8_t>(Imm, 4);
+    Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
     return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
                        DAG.getTargetConstant(Imm, DL, MVT::i8));
   };
@@ -44662,16 +45412,10 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
   }
   case X86ISD::PCMPGT:
     // icmp sgt(0, R) == ashr(R, BitWidth-1).
-    if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode())) {
-      // iff we only need the signbit then we can use R directly.
-      if (OriginalDemandedBits.isSignMask())
-        return TLO.CombineTo(Op, Op.getOperand(1));
-      // otherwise we just need R's signbit for the comparison.
-      APInt SignMask = APInt::getSignMask(BitWidth);
-      if (SimplifyDemandedBits(Op.getOperand(1), SignMask, OriginalDemandedElts,
-                               Known, TLO, Depth + 1))
-        return true;
-    }
+    // iff we only need the sign bit then we can use R directly.
+    if (OriginalDemandedBits.isSignMask() &&
+        ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
+      return TLO.CombineTo(Op, Op.getOperand(1));
     break;
   case X86ISD::MOVMSK: {
     SDValue Src = Op.getOperand(0);
@@ -47581,15 +48325,6 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
                                                            DL, DAG, Subtarget))
       return V;
 
-  // If the sign bit is known then BLENDV can be folded away.
-  if (N->getOpcode() == X86ISD::BLENDV) {
-    KnownBits KnownCond = DAG.computeKnownBits(Cond);
-    if (KnownCond.isNegative())
-      return LHS;
-    if (KnownCond.isNonNegative())
-      return RHS;
-  }
-
   if (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV) {
     SmallVector<int, 64> CondMask;
     if (createShuffleMaskFromVSELECT(CondMask, Cond,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index b7151f65942b4..d759895719388 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -114,6 +114,10 @@ namespace llvm {
     /// X86 Select
     SELECTS,
 
+    /// X86 constant-time select. Lowered to CMOV where available, or to a
+    /// branch-free bitwise sequence on targets without CMOV.
+    CTSELECT,
+
     // Same as SETCC except it's materialized with a sbb and the value is all
     // one's or all zero's.
     SETCC_CARRY, // R = carry_bit ? ~0 : 0
@@ -1139,6 +1143,8 @@ namespace llvm {
     ///
     SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
 
+    bool isSelectSupported(SelectSupportKind Kind) const override;
+
     /// Replace the results of node with an illegal result
     /// type with new values built out of custom code.
     ///
@@ -1765,6 +1771,7 @@ namespace llvm {
     SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/llvm/lib/Target/X86/X86InstrCMovSetCC.td
index 7d5d7cf4a83ab..9c34889f03354 100644
--- a/llvm/lib/Target/X86/X86InstrCMovSetCC.td
+++ b/llvm/lib/Target/X86/X86InstrCMovSetCC.td
@@ -106,6 +106,211 @@ let Predicates = [HasCMOV, HasNDD] in {
   def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, timm:$cond, EFLAGS),
             (CMOV64rm_ND GR64:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>;
 }
+
+// Create pseudo instructions and pattern-match to them. We use a machine
+// pass to lower these pseudos into CMOV, in order to keep backend
+// optimizations from rewriting them.
+let Uses = [EFLAGS], isNotDuplicable = 1, isPseudo = 1 in {
+
+  multiclass CTSELECT<X86TypeInfo t> {
+    // register-only
+    let isCommutable = 0, SchedRW = [WriteCMOV], Predicates = [HasNativeCMOV],
+        AsmString = "ctselect\\t$dst, $src1, $src2, $cond" in {
+      def rr : PseudoI<(outs t.RegClass:$dst),
+                       (ins t.RegClass:$src1, t.RegClass:$src2, i8imm:$cond),
+                       [(set t.RegClass:$dst, (X86ctselect t.RegClass:$src1, t.RegClass:$src2, timm:$cond, EFLAGS))]>;
+    }
+
+    // register-memory
+    let SchedRW = [WriteCMOV.Folded, WriteCMOV.ReadAfterFold], Predicates = [HasNativeCMOV],
+        AsmString = "ctselect\\t$dst, $src1, $src2, $cond" in {
+      def rm : PseudoI<(outs t.RegClass:$dst),
+                       (ins t.RegClass:$src1, t.MemOperand:$src2, i8imm:$cond),
+                       [(set t.RegClass:$dst, (X86ctselect t.RegClass:$src1, (t.LoadNode addr:$src2), timm:$cond, EFLAGS))]>;
+    }
+  }
+}
+
+let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in {
+  let Constraints = "$dst = $src1" in {
+    defm CTSELECT16 : CTSELECT<Xi16>;
+    defm CTSELECT32 : CTSELECT<Xi32>;
+    defm CTSELECT64 : CTSELECT<Xi64>;
+  }
+}
+
+// CTSELECT_VEC base class
+class CTSELECT_VEC<RegisterClass VRc, RegisterClass GRc>
+    : PseudoI<
+        (outs VRc:$dst, VRc:$tmpx, GRc:$tmpg),
+        (ins  VRc:$t,   VRc:$f,   i8imm:$cond),
+        []
+      > {
+  let Uses            = [EFLAGS];
+  let isPseudo        = 1;
+  let isNotDuplicable = 1;
+  let hasSideEffects  = 1;
+  let AsmString       = "ctselect\t$dst, $f, $t, $cond";
+  let SchedRW         = [];
+}
+
+// Width-specific class aliases
+class CTSELECT_VEC128  : CTSELECT_VEC<VR128,  GR32>;
+class CTSELECT_VEC128X : CTSELECT_VEC<VR128X, GR32>;
+class CTSELECT_VEC256  : CTSELECT_VEC<VR256,  GR32>;
+class CTSELECT_VEC512  : CTSELECT_VEC<VR512,  GR32>;
+
+
+//===----------------------------------------------------------------------===//
+// 128-bit pseudos (SSE2 baseline; we use PXOR/PAND/MOVD/PSHUFD in the expander)
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasSSE1] in {
+
+  def CTSELECT_V4F32 : CTSELECT_VEC128 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+}
+
+let Predicates = [HasSSE2] in {
+
+  def CTSELECT_V2F64 : CTSELECT_VEC128 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CTSELECT_V4I32 : CTSELECT_VEC128 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CTSELECT_V2I64 : CTSELECT_VEC128 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CTSELECT_V8I16 : CTSELECT_VEC128 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CTSELECT_V16I8 : CTSELECT_VEC128 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+
+  // If your build has v8f16, keep this; otherwise comment it out.
+  def CTSELECT_V8F16 : CTSELECT_VEC128 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+}
+
+let Predicates = [HasAVX] in {
+
+  def CTSELECT_V4F32X : CTSELECT_VEC128X {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CTSELECT_V2F64X : CTSELECT_VEC128X {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CTSELECT_V4I32X : CTSELECT_VEC128X {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CTSELECT_V2I64X : CTSELECT_VEC128X {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CTSELECT_V8I16X : CTSELECT_VEC128X {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CTSELECT_V16I8X : CTSELECT_VEC128X {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+
+  // If your build has v8f16, keep this; otherwise comment it out.
+  def CTSELECT_V8F16X : CTSELECT_VEC128X {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// 256-bit pseudos
+//===----------------------------------------------------------------------===//
+let Predicates = [HasAVX] in {
+
+  def CTSELECT_V8F32  : CTSELECT_VEC256 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CTSELECT_V4F64  : CTSELECT_VEC256 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CTSELECT_V8I32  : CTSELECT_VEC256 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CTSELECT_V4I64  : CTSELECT_VEC256 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CTSELECT_V16I16 : CTSELECT_VEC256 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+  def CTSELECT_V32I8  : CTSELECT_VEC256 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+
+  // If your build has v16f16, keep this; otherwise comment it out.
+  def CTSELECT_V16F16 : CTSELECT_VEC256 {
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmpx,@earlyclobber $tmpg";
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Selection patterns: X86ctselect(...), EFLAGS -> CTSELECT_V*
+//
+// NOTE:
+//  * The SDNode carries Glue from CMP/TEST (due to SDNPInGlue).
+//  * We list EFLAGS explicitly in the pattern (X86 style) to model the arch read.
+//  * Temps (tmpx, tmpg) are not in the pattern; they're outs allocated by RA.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasSSE1] in {
+
+  // 128-bit float (bitwise-equivalent ops in expander)
+  def : Pat<(v4f32 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
+            (CTSELECT_V4F32 VR128:$t, VR128:$f, timm:$cc)>;
+}
+
+let Predicates = [HasSSE2] in {
+
+  // 128-bit integer
+  def : Pat<(v4i32 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
+            (CTSELECT_V4I32 VR128:$t, VR128:$f, timm:$cc)>;
+  def : Pat<(v2i64 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
+            (CTSELECT_V2I64 VR128:$t, VR128:$f, timm:$cc)>;
+  def : Pat<(v8i16 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
+            (CTSELECT_V8I16 VR128:$t, VR128:$f, timm:$cc)>;
+  def : Pat<(v16i8 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
+            (CTSELECT_V16I8 VR128:$t, VR128:$f, timm:$cc)>;
+  def : Pat<(v2f64 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
+            (CTSELECT_V2F64 VR128:$t, VR128:$f, timm:$cc)>;
+
+  // 128-bit f16 (optional)
+  def : Pat<(v8f16 (X86ctselect VR128:$t, VR128:$f, (i8 timm:$cc), EFLAGS)),
+            (CTSELECT_V8F16 VR128:$t, VR128:$f, timm:$cc)>;
+}
+
+let Predicates = [HasAVX] in {
+
+  // 256-bit integer
+  def : Pat<(v8i32  (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
+            (CTSELECT_V8I32  VR256:$t, VR256:$f, timm:$cc)>;
+  def : Pat<(v4i64  (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
+            (CTSELECT_V4I64  VR256:$t, VR256:$f, timm:$cc)>;
+  def : Pat<(v16i16 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
+            (CTSELECT_V16I16 VR256:$t, VR256:$f, timm:$cc)>;
+  def : Pat<(v32i8  (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
+            (CTSELECT_V32I8  VR256:$t, VR256:$f, timm:$cc)>;
+
+  // 256-bit float (bitwise-equivalent ops in expander)
+  def : Pat<(v8f32 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
+            (CTSELECT_V8F32 VR256:$t, VR256:$f, timm:$cc)>;
+  def : Pat<(v4f64 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
+            (CTSELECT_V4F64 VR256:$t, VR256:$f, timm:$cc)>;
+
+  // 256-bit f16 (optional)
+  def : Pat<(v16f16 (X86ctselect VR256:$t, VR256:$f, (i8 timm:$cc), EFLAGS)),
+            (CTSELECT_V16F16 VR256:$t, VR256:$f, timm:$cc)>;
+}
+
 let Predicates = [HasCMOV, HasCF] in {
   def : Pat<(X86cmov GR16:$src1, 0, timm:$cond, EFLAGS),
             (CFCMOV16rr GR16:$src1, (inv_cond_XFORM timm:$cond))>;
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index ec31675731b79..d40c91b52c808 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -693,6 +693,87 @@ def : Pat<(v16f32 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
 def : Pat<(v8f64 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)),
           (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>;
 
+// CTSELECT
+// Enhanced CTSELECT pseudos for i386 with temporary register allocation
+// These use a two-phase approach:
+// 1. Custom inserter materializes condition byte from EFLAGS
+// 2. Post-RA expansion generates constant-time instruction bundles
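+//
+// Illustrative flow (assuming an i32 ct.select on an i386 target without
+// CMOV): the pattern below selects CTSELECT_I386_GR32rr, the custom inserter
+// rewrites it to CTSELECT_I386_INT_GR32rr with a SETcc-materialized condition
+// byte, and the post-RA expansion then emits a branch-free
+// MOVZX/NEG/AND/NOT/AND/OR mask-and-select bundle.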
+
+let isPseudo = 1, isNotDuplicable = 1 in {
+  // Phase 1: Initial pseudos that consume EFLAGS (via custom inserter)
+  // These are matched by patterns and convert EFLAGS to condition byte
+  class CTSELECT_I386_INITIAL<RegisterClass RC, ValueType VT>
+      : PseudoI<(outs RC:$dst),
+                (ins RC:$src1, RC:$src2, i8imm:$cond),
+                [(set RC:$dst, (VT(X86ctselect RC:$src1, RC:$src2, timm:$cond,
+                                        EFLAGS)))]> {
+    let Uses = [EFLAGS];
+    let Defs = [EFLAGS];
+    let usesCustomInserter = 1;
+    let hasNoSchedulingInfo = 1;
+  }
+
+  // Phase 2: Internal pseudos with pre-materialized condition byte (post-RA expansion)
+  // These generate the actual constant-time instruction bundles
+  class CTSELECT_I386_INTERNAL<RegisterClass RC, RegisterClass ByteRC>
+      : PseudoI<(outs RC:$dst, ByteRC:$tmp_byte, RC:$tmp_mask),
+                (ins RC:$src1, RC:$src2, ByteRC:$cond_byte), []> {
+    let hasNoSchedulingInfo = 1;
+    let Constraints = "@earlyclobber $dst,@earlyclobber $tmp_byte,@earlyclobber $tmp_mask";
+    let Defs = [EFLAGS];  // NEG instruction in post-RA expansion clobbers EFLAGS
+  }
+}
+
+// Phase 1 pseudos for non-CMOV targets (custom inserter materializes condition)
+let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in {
+  let Predicates = [NoNativeCMOV] in {
+    def CTSELECT_I386_GR8rr : CTSELECT_I386_INITIAL<GR8, i8>;
+    def CTSELECT_I386_GR16rr : CTSELECT_I386_INITIAL<GR16, i16>;
+    def CTSELECT_I386_GR32rr : CTSELECT_I386_INITIAL<GR32, i32>;
+  }
+}
+
+// Phase 2 pseudos (post-RA expansion with pre-materialized condition byte)
+let isCodeGenOnly = 1, hasSideEffects = 1, ForceDisassemble = 1 in {
+  let Predicates = [NoNativeCMOV] in {
+    def CTSELECT_I386_INT_GR8rr :
+        CTSELECT_I386_INTERNAL<GR8, GR8>;
+    def CTSELECT_I386_INT_GR16rr :
+        CTSELECT_I386_INTERNAL<GR16, GR8>;
+    def CTSELECT_I386_INT_GR32rr :
+        CTSELECT_I386_INTERNAL<GR32, GR8>;
+  }
+}
+
+let hasSideEffects = 1,
+    ForceDisassemble = 1,
+    Constraints = "$dst = $src1" in {
+
+  let Predicates = [FPStackf32] in
+    def CTSELECT_I386_FP32rr : CTSELECT_I386_INITIAL<RFP32, f32>;
+
+  let Predicates = [FPStackf64] in
+    def CTSELECT_I386_FP64rr : CTSELECT_I386_INITIAL<RFP64, f64>;
+
+  def CTSELECT_I386_FP80rr : CTSELECT_I386_INITIAL<RFP80, f80>;
+}
+
+// Pattern matching for non-native-CMOV CTSELECT (routes to the custom inserter
+// for condition materialization). NoNativeCMOV ensures these patterns are used
+// when the actual CMOV instruction is unavailable, even if canUseCMOV() is
+// true (e.g., i386 with SSE, which can emulate CMOV).
+let Predicates = [NoNativeCMOV] in {
+  def : Pat<(i8(X86ctselect GR8:$src1, GR8:$src2, timm:$cond, EFLAGS)),
+            (CTSELECT_I386_GR8rr GR8:$src1, GR8:$src2, timm:$cond)>;
+
+  def : Pat<(i16(X86ctselect GR16:$src1, GR16:$src2, timm:$cond, EFLAGS)),
+            (CTSELECT_I386_GR16rr GR16:$src1, GR16:$src2, timm:$cond)>;
+
+  def : Pat<(i32(X86ctselect GR32:$src1, GR32:$src2, timm:$cond, EFLAGS)),
+            (CTSELECT_I386_GR32rr GR32:$src1, GR32:$src2, timm:$cond)>;
+
+  // i64 patterns handled automatically by type legalization
+}
+
 //===----------------------------------------------------------------------===//
 // Normal-Instructions-With-Lock-Prefix Pseudo Instructions
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86InstrFragments.td b/llvm/lib/Target/X86/X86InstrFragments.td
index 116986a0fffea..4c9e5bae3b46c 100644
--- a/llvm/lib/Target/X86/X86InstrFragments.td
+++ b/llvm/lib/Target/X86/X86InstrFragments.td
@@ -28,6 +28,10 @@ def SDTX86Cmov    : SDTypeProfile<1, 4,
                                   [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
                                    SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
 
+def SDTX86CtSelect : SDTypeProfile<1, 4,
+                                  [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
+                                   SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
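+// Same operand shape as SDTX86Cmov: (result, true value, false value,
+// condition code, EFLAGS); the ctselect node additionally takes glue from the
+// flag-producing CMP/TEST (see SDNPInGlue on X86ctselect below).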
+
 // Unary and binary operator instructions that set EFLAGS as a side-effect.
 def SDTUnaryArithWithFlags : SDTypeProfile<2, 1,
                                            [SDTCisSameAs<0, 2>,
@@ -151,6 +155,7 @@ def X86ctest   : SDNode<"X86ISD::CTEST",    SDTX86Ccmp>;
 def X86cload    : SDNode<"X86ISD::CLOAD",   SDTX86Cload, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
 def X86cstore   : SDNode<"X86ISD::CSTORE",  SDTX86Cstore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 
+def X86ctselect : SDNode<"X86ISD::CTSELECT", SDTX86CtSelect, [SDNPInGlue]>;
 def X86cmov    : SDNode<"X86ISD::CMOV",     SDTX86Cmov>;
 def X86brcond  : SDNode<"X86ISD::BRCOND",   SDTX86BrCond,
                         [SDNPHasChain]>;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 6b2a7a4ec3583..765db86ffafb3 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -475,6 +475,556 @@ bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op,
   return false;
 }
 
+struct CtSelectInstructions {
+  unsigned PAndOpc;
+  unsigned PAndnOpc;
+  unsigned POrOpc;
+  unsigned BroadcastOpc;
+  unsigned IntMoveOpc;
+  unsigned MoveOpc;
+  bool Use256;
+  bool UseBlendInstr;
+};
+
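+// Select the opcodes used by expandCtSelectVector() for a given CTSELECT_V*
+// pseudo: the bitwise ops (AND/ANDN/OR), the broadcast, the scalar-to-vector
+// move and the register move, plus whether the 256-bit or BLENDV-based path
+// applies.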
+static CtSelectInstructions
+getCtSelectInstructions(unsigned Opcode, const X86Subtarget &Subtarget) {
+  CtSelectInstructions Instructions = {};
+
+  switch (Opcode) {
+  case X86::CTSELECT_V2F64:
+    if (Subtarget.hasSSE2()) {
+      Instructions.PAndOpc = X86::PANDrr;
+      Instructions.PAndnOpc = X86::PANDNrr;
+      Instructions.POrOpc = X86::PORrr;
+      Instructions.BroadcastOpc = X86::PSHUFDri;
+      Instructions.IntMoveOpc = X86::MOVDI2PDIrr;
+      Instructions.MoveOpc = X86::MOVAPDrr;
+      Instructions.UseBlendInstr = true;
+    } else {
+      llvm_unreachable("Double precision vectors require SSE2");
+    }
+    break;
+  case X86::CTSELECT_V4F32:
+    if (Subtarget.hasSSE41()) {
+      Instructions.PAndOpc = X86::PANDrr;
+      Instructions.PAndnOpc = X86::PANDNrr;
+      Instructions.POrOpc = X86::PORrr;
+      Instructions.BroadcastOpc = X86::PSHUFDri;
+      Instructions.IntMoveOpc = X86::MOVDI2PDIrr;
+      Instructions.MoveOpc = X86::MOVAPSrr;
+      Instructions.UseBlendInstr = true;
+    } else if (Subtarget.hasSSE2()) {
+      Instructions.PAndOpc = X86::PANDrr;
+      Instructions.PAndnOpc = X86::PANDNrr;
+      Instructions.POrOpc = X86::PORrr;
+      Instructions.BroadcastOpc = X86::PSHUFDri;
+      Instructions.IntMoveOpc = X86::MOVDI2PDIrr;
+      Instructions.MoveOpc = X86::MOVAPSrr;
+    } else {
+      // Fall back to SSE1, which only supports four 32-bit single-precision
+      // floating-point values
+      Instructions.PAndOpc = X86::ANDPSrr;
+      Instructions.PAndnOpc = X86::ANDNPSrr;
+      Instructions.POrOpc = X86::ORPSrr;
+      Instructions.BroadcastOpc = X86::SHUFPSrri;
+      Instructions.IntMoveOpc = X86::MOVSS2DIrr;
+      Instructions.MoveOpc = X86::MOVAPSrr;
+    }
+    break;
+  case X86::CTSELECT_V4I32:
+  case X86::CTSELECT_V2I64:
+  case X86::CTSELECT_V8I16:
+  case X86::CTSELECT_V16I8:
+    if (Subtarget.hasSSE2()) {
+      Instructions.PAndOpc = X86::PANDrr;
+      Instructions.PAndnOpc = X86::PANDNrr;
+      Instructions.POrOpc = X86::PORrr;
+      Instructions.BroadcastOpc = X86::PSHUFDri;
+      Instructions.IntMoveOpc = X86::MOVDI2PDIrr;
+      Instructions.MoveOpc = X86::MOVDQArr;
+    } else {
+      llvm_unreachable("Integer vector operations require SSE2");
+    }
+    break;
+  case X86::CTSELECT_V8F16:
+    if (Subtarget.hasSSE2()) {
+      Instructions.PAndOpc = X86::PANDrr;
+      Instructions.PAndnOpc = X86::PANDNrr;
+      Instructions.POrOpc = X86::PORrr;
+      Instructions.BroadcastOpc = X86::PSHUFDri;
+      Instructions.IntMoveOpc = X86::MOVDI2PDIrr;
+      Instructions.MoveOpc = X86::MOVDQArr;
+    } else {
+      llvm_unreachable("FP16 vector operations require SSE2");
+    }
+    break;
+  case X86::CTSELECT_V4F32X:
+  case X86::CTSELECT_V4I32X:
+  case X86::CTSELECT_V2F64X:
+  case X86::CTSELECT_V2I64X:
+  case X86::CTSELECT_V8I16X:
+  case X86::CTSELECT_V16I8X:
+  case X86::CTSELECT_V8F16X:
+    if (Subtarget.hasAVX()) {
+      Instructions.PAndOpc = X86::VPANDrr;
+      Instructions.PAndnOpc = X86::VPANDNrr;
+      Instructions.POrOpc = X86::VPORrr;
+      Instructions.BroadcastOpc = X86::VPSHUFDri;
+      Instructions.IntMoveOpc = X86::VMOVDI2PDIrr;
+      Instructions.MoveOpc = (Opcode == X86::CTSELECT_V4F32X) ? X86::VMOVAPSrr
+                             : (Opcode == X86::CTSELECT_V2F64X)
+                                 ? X86::VMOVAPDrr
+                                 : X86::VMOVDQArr;
+    } else {
+      llvm_unreachable("AVX variants require AVX support");
+    }
+    break;
+  case X86::CTSELECT_V8F32:
+  case X86::CTSELECT_V8I32:
+    if (Subtarget.hasAVX()) {
+      Instructions.PAndOpc = X86::VPANDYrr;
+      Instructions.PAndnOpc = X86::VPANDNYrr;
+      Instructions.POrOpc = X86::VPORYrr;
+      Instructions.BroadcastOpc = X86::VPERMILPSYri;
+      Instructions.IntMoveOpc = X86::VMOVDI2PDIrr;
+      Instructions.MoveOpc =
+          (Opcode == X86::CTSELECT_V8F32) ? X86::VMOVAPSYrr : X86::VMOVDQAYrr;
+      Instructions.Use256 = true;
+    } else {
+      llvm_unreachable("256-bit vectors require AVX");
+    }
+    break;
+  case X86::CTSELECT_V4F64:
+  case X86::CTSELECT_V4I64:
+    if (Subtarget.hasAVX()) {
+      Instructions.PAndOpc = X86::VPANDYrr;
+      Instructions.PAndnOpc = X86::VPANDNYrr;
+      Instructions.POrOpc = X86::VPORYrr;
+      Instructions.BroadcastOpc = X86::VPERMILPDYri;
+      Instructions.IntMoveOpc = X86::VMOVDI2PDIrr;
+      Instructions.MoveOpc =
+          (Opcode == X86::CTSELECT_V4F64) ? X86::VMOVAPDYrr : X86::VMOVDQAYrr;
+      Instructions.Use256 = true;
+    } else {
+      llvm_unreachable("256-bit vectors require AVX");
+    }
+    break;
+  case X86::CTSELECT_V16I16:
+  case X86::CTSELECT_V32I8:
+  case X86::CTSELECT_V16F16:
+    if (Subtarget.hasAVX2()) {
+      Instructions.PAndOpc = X86::VPANDYrr;
+      Instructions.PAndnOpc = X86::VPANDNYrr;
+      Instructions.POrOpc = X86::VPORYrr;
+      Instructions.BroadcastOpc = X86::VPERMILPSYri;
+      Instructions.IntMoveOpc = X86::VMOVDI2PDIrr;
+      Instructions.MoveOpc = X86::VMOVDQAYrr;
+      Instructions.Use256 = true;
+    } else if (Subtarget.hasAVX()) {
+      Instructions.PAndOpc = X86::VPANDYrr;
+      Instructions.PAndnOpc = X86::VPANDNYrr;
+      Instructions.POrOpc = X86::VPORYrr;
+      Instructions.BroadcastOpc = X86::VPERMILPSYri;
+      Instructions.IntMoveOpc = X86::VMOVDI2PDIrr;
+      Instructions.MoveOpc = X86::VMOVDQAYrr;
+      Instructions.Use256 = true;
+    } else {
+      llvm_unreachable("256-bit integer vectors require AVX");
+    }
+    break;
+  default:
+    llvm_unreachable("Unexpected CTSELECT opcode");
+  }
+
+  return Instructions;
+}
+
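+/// Expand a vector CTSELECT pseudo into a branch-free, constant-time bundle:
+/// materialize the condition as an all-zeros/all-ones lane mask, broadcast it,
+/// and combine the two operands with AND/ANDN/OR (or BLENDV on the SSE4.1
+/// path).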
+bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const {
+  unsigned Opcode = MI.getOpcode();
+  const DebugLoc &DL = MI.getDebugLoc();
+  auto Instruction = getCtSelectInstructions(Opcode, Subtarget);
+
+  MachineBasicBlock *MBB = MI.getParent();
+
+  // Operand layout matches the TableGen definition:
+  // (outs VR128:$dst, VR128:$tmpx, GR32:$tmpg),
+  // (ins  VR128:$t, VR128:$f, i8imm:$cond)
+  Register Dst = MI.getOperand(0).getReg();
+  Register MaskReg = MI.getOperand(1).getReg();  // vector mask temp
+  Register TmpGPR = MI.getOperand(2).getReg();   // scalar mask temp (GPR32)
+  Register FalseVal = MI.getOperand(3).getReg(); // true_value
+  Register TrueVal = MI.getOperand(4).getReg();  // false_value
+  X86::CondCode CC = X86::CondCode(MI.getOperand(5).getImm()); // condition
+
+  MachineInstr *FirstInstr = nullptr;
+  MachineInstr *LastInstr = nullptr;
+  auto recordInstr = [&](MachineInstrBuilder MIB) {
+    MachineInstr *NewMI = MIB.getInstr();
+    LastInstr = NewMI;
+    if (!FirstInstr)
+      FirstInstr = NewMI;
+  };
+
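+  // Illustrative shape of the plain (non-BLENDV) 128-bit expansion; register
+  // names below are only for readability:
+  //   movl   $0, %eax            ; zero the scalar temp
+  //   setCC  %al                 ; materialize the condition byte
+  //   movzbl %al, %eax
+  //   negl   %eax                ; 1 -> 0xFFFFFFFF, 0 -> 0x00000000
+  //   pxor   %xmm2, %xmm2
+  //   movd   %eax, %xmm2
+  //   pshufd $0, %xmm2, %xmm2    ; broadcast the mask to every lane
+  //   movdqa %xmm2, %xmm_dst
+  //   pand   %xmm_a, %xmm2       ; mask & first operand
+  //   pandn  %xmm_b, %xmm_dst    ; ~mask & second operand
+  //   por    %xmm2, %xmm_dst
+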
+  // Create the scalar mask in TmpGPR, then broadcast it into the vector mask
+  recordInstr(BuildMI(*MBB, MI, DL, get(X86::MOV32ri), TmpGPR)
+                  .addImm(0)
+                  .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  auto SubReg = TRI->getSubReg(TmpGPR, X86::sub_8bit);
+  recordInstr(BuildMI(*MBB, MI, DL, get(X86::SETCCr))
+                  .addReg(SubReg)
+                  .addImm(CC)
+                  .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+  // Zero-extend byte to 32-bit register (movzbl %al, %eax)
+  recordInstr(BuildMI(*MBB, MI, DL, get(X86::MOVZX32rr8), TmpGPR)
+                  .addReg(SubReg)
+                  .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+  if (Instruction.UseBlendInstr && Subtarget.hasSSE41()) {
+    // Shift left 31 bits to convert 1 -> 0x80000000, 0 -> 0x00000000 (shll $31,
+    // %eax)
+    recordInstr(BuildMI(*MBB, MI, DL, get(X86::SHL32ri), TmpGPR)
+                    .addReg(TmpGPR)
+                    .addImm(31));
+  } else {
+    // Negate to convert 1 -> 0xFFFFFFFF, 0 -> 0x00000000 (negl %eax)
+    recordInstr(BuildMI(*MBB, MI, DL, get(X86::NEG32r), TmpGPR)
+                    .addReg(TmpGPR));
+  }
+
+  // Zero the vector mask register before inserting the scalar mask
+  recordInstr(BuildMI(*MBB, MI, DL, get(X86::PXORrr), MaskReg)
+                  .addReg(MaskReg)
+                  .addReg(MaskReg)
+                  .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+  // Move scalar mask to vector register
+  recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.IntMoveOpc), MaskReg)
+                  .addReg(TmpGPR)
+                  .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+  if (Instruction.Use256) {
+    // Broadcast to 256-bit vector register
+    recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.BroadcastOpc), MaskReg)
+                    .addReg(MaskReg)
+                    .addImm(0)
+                    .setMIFlags(MachineInstr::MIFlag::NoMerge));
+  } else {
+    if (Subtarget.hasSSE2() || Subtarget.hasAVX()) {
+      recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.BroadcastOpc), MaskReg)
+                      .addReg(MaskReg)
+                      .addImm(0x00)
+                      .setMIFlags(MachineInstr::MIFlag::NoMerge));
+    } else {
+      recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.BroadcastOpc), MaskReg)
+                      .addReg(MaskReg)
+                      .addReg(MaskReg)
+                      .addImm(0x00)
+                      .setMIFlags(MachineInstr::MIFlag::NoMerge));
+    }
+  }
+
+  if (Instruction.UseBlendInstr && Subtarget.hasSSE41()) {
+    // Use dedicated blend instructions for SSE4.1+
+    unsigned BlendOpc;
+    switch (Opcode) {
+    case X86::CTSELECT_V4F32:
+      BlendOpc = X86::BLENDVPSrr0;
+      break;
+    case X86::CTSELECT_V2F64:
+      BlendOpc = X86::BLENDVPDrr0;
+      break;
+    default:
+      // alias for pblendvb that takes xmm0 as implicit mask register
+      BlendOpc = X86::PBLENDVBrr0;
+      break;
+    }
+
+    // If XMM0 is used as one of the source registers, save it into the Dst
+    // register and redirect FalseVal/TrueVal to Dst
+    bool DidSaveXMM0 = false;
+    Register SavedXMM0 = X86::XMM0;
+    if (FalseVal == X86::XMM0 || TrueVal == X86::XMM0) {
+      Register SrcXMM0 = (FalseVal == X86::XMM0) ? FalseVal : TrueVal;
+
+      // If XMM0 is one of the source registers, it cannot also be the
+      // (earlyclobber) Dst register, so copy its value into Dst first
+      recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+                      .addReg(SrcXMM0)
+                      .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+      // update FalseVal and TrueVal to Dst register
+      if (FalseVal == X86::XMM0)
+        FalseVal = Dst;
+      if (TrueVal == X86::XMM0)
+        TrueVal = Dst;
+
+      // update SavedXMM0 to Dst register
+      SavedXMM0 = Dst;
+
+      // set DidSaveXMM0 to true to indicate that we saved XMM0 into Dst
+      // register
+      DidSaveXMM0 = true;
+    } else if (MaskReg != X86::XMM0 && Dst != X86::XMM0) {
+
+      // If XMM0 was not allocated to any of the operands, we still need to
+      // save it (and restore it later) before using it as the implicit mask
+      // register
+      recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+                      .addReg(X86::XMM0)
+                      .setMIFlags(MachineInstr::MIFlag::NoMerge));
+      SavedXMM0 = Dst;
+      DidSaveXMM0 = true;
+    }
+
+    if (MaskReg != X86::XMM0) {
+      // BLENDV uses XMM0 as implicit mask register
+      // https://www.felixcloutier.com/x86/pblendvb
+      recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), X86::XMM0)
+                      .addReg(MaskReg)
+                      .setMIFlag(MachineInstr::MIFlag::NoMerge));
+
+      // move FalseVal to mask (use MaskReg as the dst of the blend)
+      recordInstr(BuildMI(*MBB, MI, DL, get(X86::MOVAPSrr), MaskReg)
+                      .addReg(FalseVal)
+                      .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+      // MaskReg := blend(MaskReg /*false*/, TrueVal /*true*/)  ; mask in
+      // xmm0
+      recordInstr(BuildMI(*MBB, MI, DL, get(BlendOpc), MaskReg)
+                      .addReg(MaskReg)
+                      .addReg(TrueVal)
+                      .addReg(X86::XMM0)
+                      .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+      // restore XMM0 from SavedXMM0 if we saved it into Dst
+      if (DidSaveXMM0) {
+        recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), X86::XMM0)
+                        .addReg(SavedXMM0)
+                        .setMIFlags(MachineInstr::MIFlag::NoMerge));
+      }
+      // dst = result (now in MaskReg)
+      recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+                      .addReg(MaskReg)
+                      .setMIFlags(MachineInstr::MIFlag::NoMerge));
+    } else {
+      // move FalseVal to Dst register since MaskReg is XMM0 and Dst is not
+      recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+                      .addReg(FalseVal)
+                      .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+      // Dst := blend(Dst /*false*/, TrueVal /*true*/)  ; mask in
+      // xmm0
+      recordInstr(BuildMI(*MBB, MI, DL, get(BlendOpc), Dst)
+                      .addReg(Dst)
+                      .addReg(TrueVal)
+                      .addReg(X86::XMM0)
+                      .setMIFlags(MachineInstr::MIFlag::NoMerge));
+    }
+  } else {
+
+    // dst = mask
+    recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.MoveOpc), Dst)
+                    .addReg(MaskReg)
+                    .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+    // mask &= true_val
+    recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.PAndOpc), MaskReg)
+                    .addReg(MaskReg)
+                    .addReg(TrueVal)
+                    .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+    // dst = ~mask & false_val
+    recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.PAndnOpc), Dst)
+                    .addReg(Dst)
+                    .addReg(FalseVal)
+                    .setMIFlags(MachineInstr::MIFlag::NoMerge));
+
+    // dst |= mask; (mask & t) | (~mask & f)
+    recordInstr(BuildMI(*MBB, MI, DL, get(Instruction.POrOpc), Dst)
+                    .addReg(Dst)
+                    .addReg(MaskReg)
+                    .setMIFlags(MachineInstr::MIFlag::NoMerge));
+  }
+
+  assert(FirstInstr && LastInstr && "Expected at least one expanded instruction");
+  auto BundleEnd = LastInstr->getIterator();
+  finalizeBundle(*MBB, FirstInstr->getIterator(), std::next(BundleEnd));
+
+  MI.eraseFromParent();
+
+  return true;
+}
+
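+/// Expand a scalar CTSELECT pseudo on targets with native CMOV by rewriting it
+/// into the corresponding CMOV{16,32,64}{rr,rm} instruction.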
+bool X86InstrInfo::expandCtSelectWithCMOV(MachineInstr &MI) const {
+  MachineBasicBlock *MBB = MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  // CTSELECT pseudo has: (outs dst), (ins true_val, false_val, cond)
+  MachineOperand &OperandRes = MI.getOperand(0);  // destination register
+  MachineOperand &OperandTrue = MI.getOperand(1); // true value
+  MachineOperand &OperandCond = MI.getOperand(3); // condition code
+
+  assert(OperandTrue.isReg() && OperandRes.isReg() && OperandCond.isImm() &&
+         "Invalid operand types");
+  assert(OperandTrue.getReg() == OperandRes.getReg() &&
+         "Result register different from True register");
+
+  assert(Subtarget.hasCMOV() && "target does not support CMOV instructions");
+
+  unsigned Opcode = 0;
+
+  switch (MI.getOpcode()) {
+  case X86::CTSELECT16rr:
+    Opcode = X86::CMOV16rr;
+    break;
+  case X86::CTSELECT32rr:
+    Opcode = X86::CMOV32rr;
+    break;
+  case X86::CTSELECT64rr:
+    Opcode = X86::CMOV64rr;
+    break;
+  case X86::CTSELECT16rm:
+    Opcode = X86::CMOV16rm;
+    break;
+  case X86::CTSELECT32rm:
+    Opcode = X86::CMOV32rm;
+    break;
+  case X86::CTSELECT64rm:
+    Opcode = X86::CMOV64rm;
+    break;
+  default:
+    llvm_unreachable("Invalid CTSELECT opcode");
+  }
+
+  if (!Subtarget.hasCMOV()) {
+    llvm_unreachable("target does not support cmov");
+  }
+
+  // Build the CMOV instruction by copying all operands from the pseudo
+  // (dst, true value, false value or memory operands, and the condition code)
+  MachineInstrBuilder CmovBuilder = BuildMI(*MBB, MI, DL, get(Opcode));
+  for (unsigned i = 0u; i < MI.getNumOperands(); ++i)
+    CmovBuilder.add(MI.getOperand(i));
+
+  // Remove the original CTSELECT instruction
+  MI.eraseFromParent();
+  return true;
+}
+
+/// Expand i386-specific CTSELECT pseudo instructions (post-RA, constant-time)
+/// These internal pseudos receive a pre-materialized condition byte from the
+/// custom inserter, avoiding EFLAGS corruption issues during i64 type legalization.
+bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const {
+  MachineBasicBlock *MBB = MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  // CTSELECT_I386_INT_GRxxrr has operands: (outs dst, tmp_byte, tmp_mask),
+  // (ins src1, src2, cond_byte)
+  // Note: cond_byte is pre-materialized by custom inserter, not EFLAGS-dependent
+  Register DstReg = MI.getOperand(0).getReg();
+  Register TmpByteReg = MI.getOperand(1).getReg();
+  Register TmpMaskReg = MI.getOperand(2).getReg();
+  Register Src1Reg = MI.getOperand(3).getReg();
+  Register Src2Reg = MI.getOperand(4).getReg();
+  Register CondByteReg = MI.getOperand(5).getReg();  // Pre-materialized condition byte
+
+  // Determine instruction opcodes based on register width
+  unsigned MovZXOp, NegOp, MovOp, AndOp, NotOp, OrOp;
+  if (MI.getOpcode() == X86::CTSELECT_I386_INT_GR8rr) {
+    MovZXOp = 0;  // No zero-extend needed for GR8
+    NegOp = X86::NEG8r;
+    MovOp = X86::MOV8rr;
+    AndOp = X86::AND8rr;
+    NotOp = X86::NOT8r;
+    OrOp = X86::OR8rr;
+  } else if (MI.getOpcode() == X86::CTSELECT_I386_INT_GR16rr) {
+    MovZXOp = X86::MOVZX16rr8;
+    NegOp = X86::NEG16r;
+    MovOp = X86::MOV16rr;
+    AndOp = X86::AND16rr;
+    NotOp = X86::NOT16r;
+    OrOp = X86::OR16rr;
+  } else { // X86::CTSELECT_I386_INT_GR32rr
+    MovZXOp = X86::MOVZX32rr8;
+    NegOp = X86::NEG32r;
+    MovOp = X86::MOV32rr;
+    AndOp = X86::AND32rr;
+    NotOp = X86::NOT32r;
+    OrOp = X86::OR32rr;
+  }
+
+  // Constant-time selection bundle (no SETCC inside):
+  //   result = (true_val & mask) | (false_val & ~mask)
+  // The condition byte is already materialized, avoiding any EFLAGS dependency.
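+  //
+  // For GR32 the bundle has roughly this shape (register names are
+  // illustrative only):
+  //   movb   %cond, %tmp_byte
+  //   movzbl %tmp_byte, %tmp_mask
+  //   negl   %tmp_mask           ; 1 -> 0xFFFFFFFF, 0 -> 0x00000000
+  //   movl   %src1, %dst
+  //   andl   %tmp_mask, %dst     ; src1 & mask
+  //   notl   %tmp_mask
+  //   andl   %src2, %tmp_mask    ; src2 & ~mask
+  //   orl    %tmp_mask, %dst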
+
+  // Step 1: Copy pre-materialized condition byte to TmpByteReg
+  // This allows the bundle to work with allocated temporaries
+  auto I1 = BuildMI(*MBB, MI, DL, get(X86::MOV8rr), TmpByteReg)
+      .addReg(CondByteReg)
+      .setMIFlag(MachineInstr::MIFlag::NoMerge);
+  auto BundleStart = I1->getIterator();
+
+  // Step 2: Zero-extend condition byte to register width (0 or 1)
+  if (MI.getOpcode() != X86::CTSELECT_I386_INT_GR8rr) {
+    BuildMI(*MBB, MI, DL, get(MovZXOp), TmpMaskReg)
+        .addReg(TmpByteReg)
+        .setMIFlag(MachineInstr::MIFlag::NoMerge);
+  }
+
+  // Step 3: Convert condition to bitmask (NEG: 1 -> 0xFFFF..., 0 -> 0x0000...)
+  Register MaskReg = (MI.getOpcode() == X86::CTSELECT_I386_INT_GR8rr) ? TmpByteReg : TmpMaskReg;
+  BuildMI(*MBB, MI, DL, get(NegOp), MaskReg)
+      .addReg(MaskReg)
+      .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+  // Step 4,5: Apply mask to true value - copy src1 to dest, then AND with mask
+  BuildMI(*MBB, MI, DL, get(MovOp), DstReg)
+      .addReg(Src1Reg)
+      .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+  BuildMI(*MBB, MI, DL, get(AndOp), DstReg)
+      .addReg(DstReg)
+      .addReg(MaskReg)
+      .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+  // Step 6: Create inverted mask inline (~mask)
+  BuildMI(*MBB, MI, DL, get(NotOp), MaskReg)
+      .addReg(MaskReg)
+      .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+  // Step 7: Apply inverted mask to false value - reuse mask register directly
+  BuildMI(*MBB, MI, DL, get(AndOp), MaskReg)
+      .addReg(MaskReg)
+      .addReg(Src2Reg)
+      .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+  // Step 8: Final result: (src1 & mask) | (src2 & ~mask)
+  auto LI = BuildMI(*MBB, MI, DL, get(OrOp), DstReg)
+      .addReg(DstReg)
+      .addReg(MaskReg)
+      .setMIFlag(MachineInstr::MIFlag::NoMerge);
+
+  // Bundle all generated instructions for atomic execution before removing MI
+  auto BundleEnd = std::next(LI->getIterator());
+  if (BundleStart != BundleEnd) {
+    // Only bundle if at least one instruction was emitted
+    finalizeBundle(*MBB, BundleStart, BundleEnd);
+  }
+
+  // TODO: Optimization opportunity - The register allocator may choose callee-saved
+  // registers (e.g., %ebx, %esi) for TmpByteReg/TmpMaskReg, causing unnecessary
+  // save/restore overhead. Consider constraining these to caller-saved register
+  // classes (e.g., GR8_AL, GR32_CallSaved) in the TableGen definitions to improve
+  // constant-time performance by eliminating prologue/epilogue instructions.
+
+  // Remove the original pseudo instruction
+  MI.eraseFromParent();
+  return true;
+}
+
 static bool isFrameLoadOpcode(int Opcode, TypeSize &MemBytes) {
   switch (Opcode) {
   default:
@@ -6402,6 +6952,43 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   case X86::ADD64ri32_DB:
     MIB->setDesc(get(X86::OR64ri32));
     break;
+
+  case X86::CTSELECT64rr:
+  case X86::CTSELECT32rr:
+  case X86::CTSELECT16rr:
+  case X86::CTSELECT64rm:
+  case X86::CTSELECT32rm:
+  case X86::CTSELECT16rm:
+    // These CTSELECT pseudos are only selected when CMOV is available
+    // Pattern matching ensures we use CTSELECT_I386 when CMOV is not available
+    return expandCtSelectWithCMOV(MI);
+
+  // non-cmov CTSELECT expansion (post-RA, constant-time)
+  // These are the internal pseudos with pre-materialized condition byte
+  case X86::CTSELECT_I386_INT_GR8rr:
+  case X86::CTSELECT_I386_INT_GR16rr:
+  case X86::CTSELECT_I386_INT_GR32rr:
+    return expandCtSelectIntWithoutCMOV(MI);
+
+  case X86::CTSELECT_V2F64:
+  case X86::CTSELECT_V4F32:
+  case X86::CTSELECT_V2I64:
+  case X86::CTSELECT_V4I32:
+  case X86::CTSELECT_V8I16:
+  case X86::CTSELECT_V16I8:
+  case X86::CTSELECT_V2F64X:
+  case X86::CTSELECT_V4F32X:
+  case X86::CTSELECT_V2I64X:
+  case X86::CTSELECT_V4I32X:
+  case X86::CTSELECT_V8I16X:
+  case X86::CTSELECT_V16I8X:
+  case X86::CTSELECT_V4I64:
+  case X86::CTSELECT_V8I32:
+  case X86::CTSELECT_V16I16:
+  case X86::CTSELECT_V32I8:
+  case X86::CTSELECT_V4F64:
+  case X86::CTSELECT_V8F32:
+  case X86::CTSELECT_V8F16:
+  case X86::CTSELECT_V8F16X:
+  case X86::CTSELECT_V16F16:
+    return expandCtSelectVector(MI);
   }
   return false;
 }
@@ -10800,27 +11387,39 @@ void X86InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
     if (!ST.hasSSE1())
       return;
 
-    BuildMI(MBB, Iter, DL, get(X86::V_SET0), Reg);
+    // PXOR is safe to use because it doesn't affect flags.
+    BuildMI(MBB, Iter, DL, get(X86::PXORrr), Reg)
+        .addReg(Reg, RegState::Undef)
+        .addReg(Reg, RegState::Undef);
   } else if (X86::VR256RegClass.contains(Reg)) {
     // YMM#
     if (!ST.hasAVX())
       return;
 
-    BuildMI(MBB, Iter, DL, get(X86::AVX_SET0), Reg);
+    // VPXOR is safe to use because it doesn't affect flags.
+    BuildMI(MBB, Iter, DL, get(X86::VPXORrr), Reg)
+        .addReg(Reg, RegState::Undef)
+        .addReg(Reg, RegState::Undef);
   } else if (X86::VR512RegClass.contains(Reg)) {
     // ZMM#
     if (!ST.hasAVX512())
       return;
 
-    BuildMI(MBB, Iter, DL, get(X86::AVX512_512_SET0), Reg);
+    // VPXORY is safe to use because it doesn't affect flags.
+    BuildMI(MBB, Iter, DL, get(X86::VPXORYrr), Reg)
+        .addReg(Reg, RegState::Undef)
+        .addReg(Reg, RegState::Undef);
   } else if (X86::VK1RegClass.contains(Reg) || X86::VK2RegClass.contains(Reg) ||
              X86::VK4RegClass.contains(Reg) || X86::VK8RegClass.contains(Reg) ||
              X86::VK16RegClass.contains(Reg)) {
     if (!ST.hasVLX())
       return;
 
-    unsigned Op = ST.hasBWI() ? X86::KSET0Q : X86::KSET0W;
-    BuildMI(MBB, Iter, DL, get(Op), Reg);
+    // KXOR is safe to use because it doesn't affect flags.
+    unsigned Op = ST.hasBWI() ? X86::KXORQkk : X86::KXORWkk;
+    BuildMI(MBB, Iter, DL, get(Op), Reg)
+        .addReg(Reg, RegState::Undef)
+        .addReg(Reg, RegState::Undef);
   }
 }
 
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index 5f75559bd9598..ebd7e070d5fe8 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -724,6 +724,12 @@ class X86InstrInfo final : public X86GenInstrInfo {
   bool isFrameOperand(const MachineInstr &MI, unsigned int Op,
                       int &FrameIndex) const;
 
+  /// Expand the CTSELECT pseudo-instructions.
+  bool expandCtSelectWithCMOV(MachineInstr &MI) const;
+  bool expandCtSelectIntWithoutCMOV(MachineInstr &MI) const;
+
+  bool expandCtSelectVector(MachineInstr &MI) const;
+
   /// Returns true iff the routine could find two commutable operands in the
   /// given machine instruction with 3 vector inputs.
   /// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index 98104a6fad1a9..6b585a5b0b436 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -49,6 +49,11 @@ def HasZU        : Predicate<"Subtarget->hasZU()">;
 def HasCF        : Predicate<"Subtarget->hasCF()">;
 def HasCMOV      : Predicate<"Subtarget->canUseCMOV()">;
 def NoCMOV       : Predicate<"!Subtarget->canUseCMOV()">;
+// Predicates for native CMOV instruction (checks hasCMOV(), not canUseCMOV())
+// HasCMOV may be true even without native CMOV (e.g., via SSE emulation)
+// Use HasNativeCMOV/NoNativeCMOV for constant-time code that requires actual CMOV
+def HasNativeCMOV : Predicate<"Subtarget->hasCMOV()">;
+def NoNativeCMOV  : Predicate<"!Subtarget->hasCMOV()">;
 def HasNOPL      : Predicate<"Subtarget->hasNOPL()">;
 def HasMMX       : Predicate<"Subtarget->hasMMX()">;
 def HasSSE1      : Predicate<"Subtarget->hasSSE1()">;
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 9a76abcd351bf..66c9d75053640 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -617,10 +617,11 @@ void X86PassConfig::addPreEmitPass2() {
     // ObjC runtime functions present in the module.
     const Function &F = MF.getFunction();
     const Module *M = F.getParent();
-    return M->getModuleFlag("kcfi") ||
+    return M->getModuleFlag("kcfi") || F.hasFnAttribute("ct-select") ||
            (TT.isOSDarwin() &&
             (M->getFunction("objc_retainAutoreleasedReturnValue") ||
              M->getFunction("objc_unsafeClaimAutoreleasedReturnValue")));
   }));
 
   // Analyzes and emits pseudos to support Win x64 Unwind V2. This pass must run
diff --git a/llvm/test/CodeGen/X86/ctselect-edge-cases.ll b/llvm/test/CodeGen/X86/ctselect-edge-cases.ll
new file mode 100644
index 0000000000000..0797265972a1f
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ctselect-edge-cases.ll
@@ -0,0 +1,409 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=X32
+
+; Test ct.select edge cases and corner cases
+
+; Test with very large integers
+define i128 @test_ctselect_i128(i1 %cond, i128 %a, i128 %b) {
+; X64-LABEL: test_ctselect_i128:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    cmovneq %rsi, %rax
+; X64-NEXT:    cmovneq %rdx, %r8
+; X64-NEXT:    movq %r8, %rdx
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_i128:
+; X32:       # %bb.0:
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    .cfi_def_cfa_offset 16
+; X32-NEXT:    .cfi_offset %esi, -12
+; X32-NEXT:    .cfi_offset %edi, -8
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT:    cmovnel {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    cmovnel {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    cmovnel {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    cmovnel {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl %ecx, 12(%eax)
+; X32-NEXT:    movl %edx, 8(%eax)
+; X32-NEXT:    movl %edi, 4(%eax)
+; X32-NEXT:    movl %esi, (%eax)
+; X32-NEXT:    addl $4, %esp
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    popl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    popl %edi
+; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    retl $4
+  %result = call i128 @llvm.ct.select.i128(i1 %cond, i128 %a, i128 %b)
+  ret i128 %result
+}
+
+; Test with small integer types
+define i1 @test_ctselect_i1(i1 %cond, i1 %a, i1 %b) {
+; X64-LABEL: test_ctselect_i1:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edx, %eax
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    cmovnel %esi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_i1:
+; X32:       # %bb.0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT:    cmovnel {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    # kill: def $al killed $al killed $eax
+; X32-NEXT:    retl
+  %result = call i1 @llvm.ct.select.i1(i1 %cond, i1 %a, i1 %b)
+  ret i1 %result
+}
+
+; Test with extremal values
+define i32 @test_ctselect_extremal_values(i1 %cond) {
+; X64-LABEL: test_ctselect_extremal_values:
+; X64:       # %bb.0:
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    movl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X64-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
+; X64-NEXT:    cmovnel %ecx, %eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_extremal_values:
+; X32:       # %bb.0:
+; X32-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X32-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
+; X32-NEXT:    cmovnel %ecx, %eax
+; X32-NEXT:    retl
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 2147483647, i32 -2147483648)
+  ret i32 %result
+}
+
+; Test with floating point special values
+define float @test_ctselect_f32_special_values(i1 %cond) {
+; X64-LABEL: test_ctselect_f32_special_values:
+; X64:       # %bb.0:
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    movl $2143289344, %eax # imm = 0x7FC00000
+; X64-NEXT:    movl $2139095040, %ecx # imm = 0x7F800000
+; X64-NEXT:    cmovnel %eax, %ecx
+; X64-NEXT:    movd %ecx, %xmm0
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_f32_special_values:
+; X32:       # %bb.0:
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    .cfi_def_cfa_offset 16
+; X32-NEXT:    .cfi_offset %esi, -12
+; X32-NEXT:    .cfi_offset %edi, -8
+; X32-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT:    sete %al
+; X32-NEXT:    movl {{\.?LCPI[0-9]+_[0-9]+}}, %ecx
+; X32-NEXT:    movl {{\.?LCPI[0-9]+_[0-9]+}}, %edx
+; X32-NEXT:    movb %al, %ah
+; X32-NEXT:    movzbl %ah, %edi
+; X32-NEXT:    negl %edi
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    andl %edi, %esi
+; X32-NEXT:    notl %edi
+; X32-NEXT:    andl %ecx, %edi
+; X32-NEXT:    orl %edi, %esi
+; X32-NEXT:    movl %esi, (%esp)
+; X32-NEXT:    flds (%esp)
+; X32-NEXT:    addl $4, %esp
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    popl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    popl %edi
+; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    retl
+  %result = call float @llvm.ct.select.f32(i1 %cond, float 0x7FF8000000000000, float 0x7FF0000000000000)
+  ret float %result
+}
+
+define double @test_ctselect_f64_special_values(i1 %cond) {
+; X64-LABEL: test_ctselect_f64_special_values:
+; X64:       # %bb.0:
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; X64-NEXT:    movabsq $9218868437227405312, %rcx # imm = 0x7FF0000000000000
+; X64-NEXT:    cmovneq %rax, %rcx
+; X64-NEXT:    movq %rcx, %xmm0
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_f64_special_values:
+; X32:       # %bb.0:
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    subl $24, %esp
+; X32-NEXT:    .cfi_def_cfa_offset 36
+; X32-NEXT:    .cfi_offset %esi, -12
+; X32-NEXT:    .cfi_offset %edi, -8
+; X32-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
+; X32-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
+; X32-NEXT:    sete %al
+; X32-NEXT:    fxch %st(1)
+; X32-NEXT:    fstpl {{[0-9]+}}(%esp)
+; X32-NEXT:    fstpl (%esp)
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl (%esp), %edx
+; X32-NEXT:    movb %al, %ah
+; X32-NEXT:    movzbl %ah, %edi
+; X32-NEXT:    negl %edi
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    andl %edi, %esi
+; X32-NEXT:    notl %edi
+; X32-NEXT:    andl %ecx, %edi
+; X32-NEXT:    orl %edi, %esi
+; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movb %al, %ah
+; X32-NEXT:    movzbl %ah, %edi
+; X32-NEXT:    negl %edi
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    andl %edi, %esi
+; X32-NEXT:    notl %edi
+; X32-NEXT:    andl %ecx, %edi
+; X32-NEXT:    orl %edi, %esi
+; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-NEXT:    fldl {{[0-9]+}}(%esp)
+; X32-NEXT:    addl $24, %esp
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    popl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    popl %edi
+; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    retl
+  %result = call double @llvm.ct.select.f64(i1 %cond, double 0x7FF8000000000000, double 0x7FF0000000000000)
+  ret double %result
+}
+
+; Test with null pointers
+define ptr @test_ctselect_null_ptr(i1 %cond, ptr %ptr) {
+; X64-LABEL: test_ctselect_null_ptr:
+; X64:       # %bb.0:
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    cmovneq %rsi, %rax
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_null_ptr:
+; X32:       # %bb.0:
+; X32-NEXT:    xorl %eax, %eax
+; X32-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT:    cmovnel {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    retl
+  %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %ptr, ptr null)
+  ret ptr %result
+}
+
+; Test with function pointers
+define ptr @test_ctselect_function_ptr(i1 %cond, ptr %func1, ptr %func2) {
+; X64-LABEL: test_ctselect_function_ptr:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdx, %rax
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    cmovneq %rsi, %rax
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_function_ptr:
+; X32:       # %bb.0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT:    cmovnel {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    retl
+  %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %func1, ptr %func2)
+  ret ptr %result
+}
+
+; Test with volatile loads
+define i32 @test_ctselect_volatile_load(i1 %cond, ptr %p1, ptr %p2) {
+; X64-LABEL: test_ctselect_volatile_load:
+; X64:       # %bb.0:
+; X64-NEXT:    movl (%rsi), %ecx
+; X64-NEXT:    movl (%rdx), %eax
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    cmovnel %ecx, %eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_volatile_load:
+; X32:       # %bb.0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl (%ecx), %ecx
+; X32-NEXT:    movl (%eax), %eax
+; X32-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT:    cmovnel %ecx, %eax
+; X32-NEXT:    retl
+  %a = load volatile i32, ptr %p1
+  %b = load volatile i32, ptr %p2
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+  ret i32 %result
+}
+
+; Test with atomic loads
+define i32 @test_ctselect_atomic_load(i1 %cond, ptr %p1, ptr %p2) {
+; X64-LABEL: test_ctselect_atomic_load:
+; X64:       # %bb.0:
+; X64-NEXT:    movl (%rsi), %ecx
+; X64-NEXT:    movl (%rdx), %eax
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    cmovnel %ecx, %eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_atomic_load:
+; X32:       # %bb.0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl (%ecx), %ecx
+; X32-NEXT:    movl (%eax), %eax
+; X32-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT:    cmovnel %ecx, %eax
+; X32-NEXT:    retl
+  %a = load atomic i32, ptr %p1 acquire, align 4
+  %b = load atomic i32, ptr %p2 acquire, align 4
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+  ret i32 %result
+}
+
+; Test with condition from icmp on pointers
+define ptr @test_ctselect_ptr_cmp(ptr %p1, ptr %p2, ptr %a, ptr %b) {
+; X64-LABEL: test_ctselect_ptr_cmp:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    cmpq %rsi, %rdi
+; X64-NEXT:    sete %cl
+; X64-NEXT:    testb %cl, %cl
+; X64-NEXT:    cmovneq %rdx, %rax
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_ptr_cmp:
+; X32:       # %bb.0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    sete %cl
+; X32-NEXT:    testb %cl, %cl
+; X32-NEXT:    cmovnel {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    retl
+  %cmp = icmp eq ptr %p1, %p2
+  %result = call ptr @llvm.ct.select.p0(i1 %cmp, ptr %a, ptr %b)
+  ret ptr %result
+}
+
+; Test with struct pointer types (struct types themselves may not be directly supported)
+%struct.pair = type { i32, i32 }
+
+define ptr @test_ctselect_struct_ptr(i1 %cond, ptr %a, ptr %b) {
+; X64-LABEL: test_ctselect_struct_ptr:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdx, %rax
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    cmovneq %rsi, %rax
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_struct_ptr:
+; X32:       # %bb.0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT:    cmovnel {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    retl
+  %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b)
+  ret ptr %result
+}
+
+; Test with deeply nested conditions (stress test for instruction selection)
+define i32 @test_ctselect_deeply_nested(i1 %c1, i1 %c2, i1 %c3, i1 %c4, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
+; X64-LABEL: test_ctselect_deeply_nested:
+; X64:       # %bb.0:
+; X64-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
+; X64-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    cmovnel %r8d, %r9d
+; X64-NEXT:    testb $1, %sil
+; X64-NEXT:    cmovnel %r9d, %r11d
+; X64-NEXT:    testb $1, %dl
+; X64-NEXT:    cmovnel %r11d, %r10d
+; X64-NEXT:    testb $1, %cl
+; X64-NEXT:    cmovnel %r10d, %eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_deeply_nested:
+; X32:       # %bb.0:
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    .cfi_offset %esi, -8
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT:    cmovnel {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT:    cmovnel %esi, %edx
+; X32-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT:    cmovnel %edx, %ecx
+; X32-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT:    cmovnel %ecx, %eax
+; X32-NEXT:    popl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    retl
+  %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b)
+  %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c)
+  %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d)
+  %sel4 = call i32 @llvm.ct.select.i32(i1 %c4, i32 %sel3, i32 %e)
+  ret i32 %sel4
+}
+
+; Test with misaligned loads
+define i32 @test_ctselect_misaligned_load(i1 %cond, ptr %p1, ptr %p2) {
+; X64-LABEL: test_ctselect_misaligned_load:
+; X64:       # %bb.0:
+; X64-NEXT:    movl (%rdx), %eax
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    cmovnel (%rsi), %eax
+; X64-NEXT:    retq
+;
+; X32-LABEL: test_ctselect_misaligned_load:
+; X32:       # %bb.0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl (%eax), %eax
+; X32-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT:    cmovnel (%ecx), %eax
+; X32-NEXT:    retl
+  %a = load i32, ptr %p1, align 1
+  %b = load i32, ptr %p2, align 1
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+  ret i32 %result
+}
+
+; Declare the intrinsics
+declare i1 @llvm.ct.select.i1(i1, i1, i1)
+declare i128 @llvm.ct.select.i128(i1, i128, i128)
+declare i32 @llvm.ct.select.i32(i1, i32, i32)
+declare float @llvm.ct.select.f32(i1, float, float)
+declare double @llvm.ct.select.f64(i1, double, double)
+declare ptr @llvm.ct.select.p0(i1, ptr, ptr)
diff --git a/llvm/test/CodeGen/X86/ctselect-i386-fp.ll b/llvm/test/CodeGen/X86/ctselect-i386-fp.ll
new file mode 100644
index 0000000000000..ea943307c644f
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ctselect-i386-fp.ll
@@ -0,0 +1,722 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov | FileCheck %s --check-prefix=I386-NOCMOV
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=I386-CMOV
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov -verify-machineinstrs | FileCheck %s --check-prefix=I386-NOCMOV
+
+; Comprehensive CTSELECT tests for i386 targets with floating-point types
+; - Without CMOV: constant-time implementation using FP->int conversion + existing post-RA CTSELECT
+; - With CMOV: CMOV-based implementation
+; - Verifies security properties: no conditional branches, constant execution time
+; Strategy: FP values stored to memory, converted to integers, CTSELECT on integers, converted back to FP
+
+; Test basic f32 functionality
+define float @test_ctselect_f32_basic(i1 %cond, float %a, float %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f32_basic:
+; I386-NOCMOV:       # %bb.0:
+; I386-NOCMOV-NEXT:    pushl %edi
+; I386-NOCMOV-NEXT:    pushl %esi
+; I386-NOCMOV-NEXT:    pushl %eax
+; I386-NOCMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    sete %al
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT:    movb %al, %ah
+; I386-NOCMOV-NEXT:    movzbl %ah, %edi
+; I386-NOCMOV-NEXT:    negl %edi
+; I386-NOCMOV-NEXT:    movl %edx, %esi
+; I386-NOCMOV-NEXT:    andl %edi, %esi
+; I386-NOCMOV-NEXT:    notl %edi
+; I386-NOCMOV-NEXT:    andl %ecx, %edi
+; I386-NOCMOV-NEXT:    orl %edi, %esi
+; I386-NOCMOV-NEXT:    movl %esi, (%esp)
+; I386-NOCMOV-NEXT:    flds (%esp)
+; I386-NOCMOV-NEXT:    addl $4, %esp
+; I386-NOCMOV-NEXT:    popl %esi
+; I386-NOCMOV-NEXT:    popl %edi
+; I386-NOCMOV-NEXT:    retl
+;
+; I386-CMOV-LABEL: test_ctselect_f32_basic:
+; I386-CMOV:       # %bb.0:
+; I386-CMOV-NEXT:    pushl %edi
+; I386-CMOV-NEXT:    pushl %esi
+; I386-CMOV-NEXT:    pushl %eax
+; I386-CMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    sete %al
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT:    movb %al, %ah
+; I386-CMOV-NEXT:    movzbl %ah, %edi
+; I386-CMOV-NEXT:    negl %edi
+; I386-CMOV-NEXT:    movl %edx, %esi
+; I386-CMOV-NEXT:    andl %edi, %esi
+; I386-CMOV-NEXT:    notl %edi
+; I386-CMOV-NEXT:    andl %ecx, %edi
+; I386-CMOV-NEXT:    orl %edi, %esi
+; I386-CMOV-NEXT:    movl %esi, (%esp)
+; I386-CMOV-NEXT:    flds (%esp)
+; I386-CMOV-NEXT:    addl $4, %esp
+; I386-CMOV-NEXT:    popl %esi
+; I386-CMOV-NEXT:    popl %edi
+; I386-CMOV-NEXT:    retl
+  %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b)
+  ret float %result
+}
+
+; Test f32 with different condition codes
+define float @test_ctselect_f32_eq(float %x, float %y, float %a, float %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f32_eq:
+; I386-NOCMOV:       # %bb.0:
+; I386-NOCMOV-NEXT:    pushl %edi
+; I386-NOCMOV-NEXT:    pushl %esi
+; I386-NOCMOV-NEXT:    pushl %eax
+; I386-NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    fucompp
+; I386-NOCMOV-NEXT:    fnstsw %ax
+; I386-NOCMOV-NEXT:    # kill: def $ah killed $ah killed $ax
+; I386-NOCMOV-NEXT:    sahf
+; I386-NOCMOV-NEXT:    setnp %al
+; I386-NOCMOV-NEXT:    sete %cl
+; I386-NOCMOV-NEXT:    testb %al, %cl
+; I386-NOCMOV-NEXT:    sete %al
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT:    movb %al, %ah
+; I386-NOCMOV-NEXT:    movzbl %ah, %edi
+; I386-NOCMOV-NEXT:    negl %edi
+; I386-NOCMOV-NEXT:    movl %edx, %esi
+; I386-NOCMOV-NEXT:    andl %edi, %esi
+; I386-NOCMOV-NEXT:    notl %edi
+; I386-NOCMOV-NEXT:    andl %ecx, %edi
+; I386-NOCMOV-NEXT:    orl %edi, %esi
+; I386-NOCMOV-NEXT:    movl %esi, (%esp)
+; I386-NOCMOV-NEXT:    flds (%esp)
+; I386-NOCMOV-NEXT:    addl $4, %esp
+; I386-NOCMOV-NEXT:    popl %esi
+; I386-NOCMOV-NEXT:    popl %edi
+; I386-NOCMOV-NEXT:    retl
+;
+; I386-CMOV-LABEL: test_ctselect_f32_eq:
+; I386-CMOV:       # %bb.0:
+; I386-CMOV-NEXT:    pushl %edi
+; I386-CMOV-NEXT:    pushl %esi
+; I386-CMOV-NEXT:    pushl %eax
+; I386-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    fucompi %st(1), %st
+; I386-CMOV-NEXT:    fstp %st(0)
+; I386-CMOV-NEXT:    setnp %al
+; I386-CMOV-NEXT:    sete %cl
+; I386-CMOV-NEXT:    testb %al, %cl
+; I386-CMOV-NEXT:    sete %al
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT:    movb %al, %ah
+; I386-CMOV-NEXT:    movzbl %ah, %edi
+; I386-CMOV-NEXT:    negl %edi
+; I386-CMOV-NEXT:    movl %edx, %esi
+; I386-CMOV-NEXT:    andl %edi, %esi
+; I386-CMOV-NEXT:    notl %edi
+; I386-CMOV-NEXT:    andl %ecx, %edi
+; I386-CMOV-NEXT:    orl %edi, %esi
+; I386-CMOV-NEXT:    movl %esi, (%esp)
+; I386-CMOV-NEXT:    flds (%esp)
+; I386-CMOV-NEXT:    addl $4, %esp
+; I386-CMOV-NEXT:    popl %esi
+; I386-CMOV-NEXT:    popl %edi
+; I386-CMOV-NEXT:    retl
+  %cmp = fcmp oeq float %x, %y
+  %result = call float @llvm.ct.select.f32(i1 %cmp, float %a, float %b)
+  ret float %result
+}
+
+; Test basic f64 functionality
+define double @test_ctselect_f64_basic(i1 %cond, double %a, double %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f64_basic:
+; I386-NOCMOV:       # %bb.0:
+; I386-NOCMOV-NEXT:    pushl %edi
+; I386-NOCMOV-NEXT:    pushl %esi
+; I386-NOCMOV-NEXT:    subl $8, %esp
+; I386-NOCMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    sete %al
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT:    movb %al, %ah
+; I386-NOCMOV-NEXT:    movzbl %ah, %edi
+; I386-NOCMOV-NEXT:    negl %edi
+; I386-NOCMOV-NEXT:    movl %edx, %esi
+; I386-NOCMOV-NEXT:    andl %edi, %esi
+; I386-NOCMOV-NEXT:    notl %edi
+; I386-NOCMOV-NEXT:    andl %ecx, %edi
+; I386-NOCMOV-NEXT:    orl %edi, %esi
+; I386-NOCMOV-NEXT:    movl %esi, (%esp)
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT:    movb %al, %ah
+; I386-NOCMOV-NEXT:    movzbl %ah, %edi
+; I386-NOCMOV-NEXT:    negl %edi
+; I386-NOCMOV-NEXT:    movl %edx, %esi
+; I386-NOCMOV-NEXT:    andl %edi, %esi
+; I386-NOCMOV-NEXT:    notl %edi
+; I386-NOCMOV-NEXT:    andl %ecx, %edi
+; I386-NOCMOV-NEXT:    orl %edi, %esi
+; I386-NOCMOV-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    fldl (%esp)
+; I386-NOCMOV-NEXT:    addl $8, %esp
+; I386-NOCMOV-NEXT:    popl %esi
+; I386-NOCMOV-NEXT:    popl %edi
+; I386-NOCMOV-NEXT:    retl
+;
+; I386-CMOV-LABEL: test_ctselect_f64_basic:
+; I386-CMOV:       # %bb.0:
+; I386-CMOV-NEXT:    pushl %edi
+; I386-CMOV-NEXT:    pushl %esi
+; I386-CMOV-NEXT:    subl $8, %esp
+; I386-CMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    sete %al
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT:    movb %al, %ah
+; I386-CMOV-NEXT:    movzbl %ah, %edi
+; I386-CMOV-NEXT:    negl %edi
+; I386-CMOV-NEXT:    movl %edx, %esi
+; I386-CMOV-NEXT:    andl %edi, %esi
+; I386-CMOV-NEXT:    notl %edi
+; I386-CMOV-NEXT:    andl %ecx, %edi
+; I386-CMOV-NEXT:    orl %edi, %esi
+; I386-CMOV-NEXT:    movl %esi, (%esp)
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT:    movb %al, %ah
+; I386-CMOV-NEXT:    movzbl %ah, %edi
+; I386-CMOV-NEXT:    negl %edi
+; I386-CMOV-NEXT:    movl %edx, %esi
+; I386-CMOV-NEXT:    andl %edi, %esi
+; I386-CMOV-NEXT:    notl %edi
+; I386-CMOV-NEXT:    andl %ecx, %edi
+; I386-CMOV-NEXT:    orl %edi, %esi
+; I386-CMOV-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    fldl (%esp)
+; I386-CMOV-NEXT:    addl $8, %esp
+; I386-CMOV-NEXT:    popl %esi
+; I386-CMOV-NEXT:    popl %edi
+; I386-CMOV-NEXT:    retl
+  %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b)
+  ret double %result
+}
+
+; Test basic x86_fp80 functionality
+define x86_fp80 @test_ctselect_f80_basic(i1 %cond, x86_fp80 %a, x86_fp80 %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f80_basic:
+; I386-NOCMOV:       # %bb.0:
+; I386-NOCMOV-NEXT:    pushl %edi
+; I386-NOCMOV-NEXT:    pushl %esi
+; I386-NOCMOV-NEXT:    subl $12, %esp
+; I386-NOCMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    sete %al
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT:    movb %al, %ah
+; I386-NOCMOV-NEXT:    movzbl %ah, %edi
+; I386-NOCMOV-NEXT:    negl %edi
+; I386-NOCMOV-NEXT:    movl %edx, %esi
+; I386-NOCMOV-NEXT:    andl %edi, %esi
+; I386-NOCMOV-NEXT:    notl %edi
+; I386-NOCMOV-NEXT:    andl %ecx, %edi
+; I386-NOCMOV-NEXT:    orl %edi, %esi
+; I386-NOCMOV-NEXT:    movl %esi, (%esp)
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT:    movb %al, %ah
+; I386-NOCMOV-NEXT:    movzbl %ah, %edi
+; I386-NOCMOV-NEXT:    negl %edi
+; I386-NOCMOV-NEXT:    movl %edx, %esi
+; I386-NOCMOV-NEXT:    andl %edi, %esi
+; I386-NOCMOV-NEXT:    notl %edi
+; I386-NOCMOV-NEXT:    andl %ecx, %edi
+; I386-NOCMOV-NEXT:    orl %edi, %esi
+; I386-NOCMOV-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT:    movb %al, %ah
+; I386-NOCMOV-NEXT:    movzbl %ah, %edi
+; I386-NOCMOV-NEXT:    negl %edi
+; I386-NOCMOV-NEXT:    movl %edx, %esi
+; I386-NOCMOV-NEXT:    andl %edi, %esi
+; I386-NOCMOV-NEXT:    notl %edi
+; I386-NOCMOV-NEXT:    andl %ecx, %edi
+; I386-NOCMOV-NEXT:    orl %edi, %esi
+; I386-NOCMOV-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    fldt (%esp)
+; I386-NOCMOV-NEXT:    addl $12, %esp
+; I386-NOCMOV-NEXT:    popl %esi
+; I386-NOCMOV-NEXT:    popl %edi
+; I386-NOCMOV-NEXT:    retl
+;
+; I386-CMOV-LABEL: test_ctselect_f80_basic:
+; I386-CMOV:       # %bb.0:
+; I386-CMOV-NEXT:    pushl %edi
+; I386-CMOV-NEXT:    pushl %esi
+; I386-CMOV-NEXT:    subl $12, %esp
+; I386-CMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    sete %al
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT:    movb %al, %ah
+; I386-CMOV-NEXT:    movzbl %ah, %edi
+; I386-CMOV-NEXT:    negl %edi
+; I386-CMOV-NEXT:    movl %edx, %esi
+; I386-CMOV-NEXT:    andl %edi, %esi
+; I386-CMOV-NEXT:    notl %edi
+; I386-CMOV-NEXT:    andl %ecx, %edi
+; I386-CMOV-NEXT:    orl %edi, %esi
+; I386-CMOV-NEXT:    movl %esi, (%esp)
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT:    movb %al, %ah
+; I386-CMOV-NEXT:    movzbl %ah, %edi
+; I386-CMOV-NEXT:    negl %edi
+; I386-CMOV-NEXT:    movl %edx, %esi
+; I386-CMOV-NEXT:    andl %edi, %esi
+; I386-CMOV-NEXT:    notl %edi
+; I386-CMOV-NEXT:    andl %ecx, %edi
+; I386-CMOV-NEXT:    orl %edi, %esi
+; I386-CMOV-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT:    movb %al, %ah
+; I386-CMOV-NEXT:    movzbl %ah, %edi
+; I386-CMOV-NEXT:    negl %edi
+; I386-CMOV-NEXT:    movl %edx, %esi
+; I386-CMOV-NEXT:    andl %edi, %esi
+; I386-CMOV-NEXT:    notl %edi
+; I386-CMOV-NEXT:    andl %ecx, %edi
+; I386-CMOV-NEXT:    orl %edi, %esi
+; I386-CMOV-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    fldt (%esp)
+; I386-CMOV-NEXT:    addl $12, %esp
+; I386-CMOV-NEXT:    popl %esi
+; I386-CMOV-NEXT:    popl %edi
+; I386-CMOV-NEXT:    retl
+  %result = call x86_fp80 @llvm.ct.select.f80(i1 %cond, x86_fp80 %a, x86_fp80 %b)
+  ret x86_fp80 %result
+}
+
+; Test f32 with complex conditions
+define float @test_ctselect_f32_gt(float %x, float %y, float %a, float %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f32_gt:
+; I386-NOCMOV:       # %bb.0:
+; I386-NOCMOV-NEXT:    pushl %edi
+; I386-NOCMOV-NEXT:    pushl %esi
+; I386-NOCMOV-NEXT:    pushl %eax
+; I386-NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    fucompp
+; I386-NOCMOV-NEXT:    fnstsw %ax
+; I386-NOCMOV-NEXT:    # kill: def $ah killed $ah killed $ax
+; I386-NOCMOV-NEXT:    sahf
+; I386-NOCMOV-NEXT:    seta %al
+; I386-NOCMOV-NEXT:    testb %al, %al
+; I386-NOCMOV-NEXT:    sete %al
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT:    movb %al, %ah
+; I386-NOCMOV-NEXT:    movzbl %ah, %edi
+; I386-NOCMOV-NEXT:    negl %edi
+; I386-NOCMOV-NEXT:    movl %edx, %esi
+; I386-NOCMOV-NEXT:    andl %edi, %esi
+; I386-NOCMOV-NEXT:    notl %edi
+; I386-NOCMOV-NEXT:    andl %ecx, %edi
+; I386-NOCMOV-NEXT:    orl %edi, %esi
+; I386-NOCMOV-NEXT:    movl %esi, (%esp)
+; I386-NOCMOV-NEXT:    flds (%esp)
+; I386-NOCMOV-NEXT:    addl $4, %esp
+; I386-NOCMOV-NEXT:    popl %esi
+; I386-NOCMOV-NEXT:    popl %edi
+; I386-NOCMOV-NEXT:    retl
+;
+; I386-CMOV-LABEL: test_ctselect_f32_gt:
+; I386-CMOV:       # %bb.0:
+; I386-CMOV-NEXT:    pushl %edi
+; I386-CMOV-NEXT:    pushl %esi
+; I386-CMOV-NEXT:    pushl %eax
+; I386-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    fucompi %st(1), %st
+; I386-CMOV-NEXT:    fstp %st(0)
+; I386-CMOV-NEXT:    seta %al
+; I386-CMOV-NEXT:    testb %al, %al
+; I386-CMOV-NEXT:    sete %al
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT:    movb %al, %ah
+; I386-CMOV-NEXT:    movzbl %ah, %edi
+; I386-CMOV-NEXT:    negl %edi
+; I386-CMOV-NEXT:    movl %edx, %esi
+; I386-CMOV-NEXT:    andl %edi, %esi
+; I386-CMOV-NEXT:    notl %edi
+; I386-CMOV-NEXT:    andl %ecx, %edi
+; I386-CMOV-NEXT:    orl %edi, %esi
+; I386-CMOV-NEXT:    movl %esi, (%esp)
+; I386-CMOV-NEXT:    flds (%esp)
+; I386-CMOV-NEXT:    addl $4, %esp
+; I386-CMOV-NEXT:    popl %esi
+; I386-CMOV-NEXT:    popl %edi
+; I386-CMOV-NEXT:    retl
+  %cmp = fcmp ogt float %x, %y
+  %result = call float @llvm.ct.select.f32(i1 %cmp, float %a, float %b)
+  ret float %result
+}
+
+; Test constant-time properties: verify no branches in generated code
+define float @test_ctselect_f32_no_branches(i1 %cond, float %a, float %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f32_no_branches:
+; I386-NOCMOV:       # %bb.0:
+; I386-NOCMOV-NEXT:    pushl %edi
+; I386-NOCMOV-NEXT:    pushl %esi
+; I386-NOCMOV-NEXT:    pushl %eax
+; I386-NOCMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    sete %al
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT:    movb %al, %ah
+; I386-NOCMOV-NEXT:    movzbl %ah, %edi
+; I386-NOCMOV-NEXT:    negl %edi
+; I386-NOCMOV-NEXT:    movl %edx, %esi
+; I386-NOCMOV-NEXT:    andl %edi, %esi
+; I386-NOCMOV-NEXT:    notl %edi
+; I386-NOCMOV-NEXT:    andl %ecx, %edi
+; I386-NOCMOV-NEXT:    orl %edi, %esi
+; I386-NOCMOV-NEXT:    movl %esi, (%esp)
+; I386-NOCMOV-NEXT:    flds (%esp)
+; I386-NOCMOV-NEXT:    addl $4, %esp
+; I386-NOCMOV-NEXT:    popl %esi
+; I386-NOCMOV-NEXT:    popl %edi
+; I386-NOCMOV-NEXT:    retl
+;
+; I386-CMOV-LABEL: test_ctselect_f32_no_branches:
+; I386-CMOV:       # %bb.0:
+; I386-CMOV-NEXT:    pushl %edi
+; I386-CMOV-NEXT:    pushl %esi
+; I386-CMOV-NEXT:    pushl %eax
+; I386-CMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    sete %al
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT:    movb %al, %ah
+; I386-CMOV-NEXT:    movzbl %ah, %edi
+; I386-CMOV-NEXT:    negl %edi
+; I386-CMOV-NEXT:    movl %edx, %esi
+; I386-CMOV-NEXT:    andl %edi, %esi
+; I386-CMOV-NEXT:    notl %edi
+; I386-CMOV-NEXT:    andl %ecx, %edi
+; I386-CMOV-NEXT:    orl %edi, %esi
+; I386-CMOV-NEXT:    movl %esi, (%esp)
+; I386-CMOV-NEXT:    flds (%esp)
+; I386-CMOV-NEXT:    addl $4, %esp
+; I386-CMOV-NEXT:    popl %esi
+; I386-CMOV-NEXT:    popl %edi
+; I386-CMOV-NEXT:    retl
+  %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b)
+  ret float %result
+}
+
+; Test the bundled post-RA expansion path; BUNDLEs are flattened before assembly
+; emission, so only the resulting straight-line sequence is visible in the checks
+define float @test_ctselect_f32_bundled(i1 %cond, float %a, float %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f32_bundled:
+; I386-NOCMOV:       # %bb.0:
+; I386-NOCMOV-NEXT:    pushl %edi
+; I386-NOCMOV-NEXT:    pushl %esi
+; I386-NOCMOV-NEXT:    pushl %eax
+; I386-NOCMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    sete %al
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT:    movb %al, %ah
+; I386-NOCMOV-NEXT:    movzbl %ah, %edi
+; I386-NOCMOV-NEXT:    negl %edi
+; I386-NOCMOV-NEXT:    movl %edx, %esi
+; I386-NOCMOV-NEXT:    andl %edi, %esi
+; I386-NOCMOV-NEXT:    notl %edi
+; I386-NOCMOV-NEXT:    andl %ecx, %edi
+; I386-NOCMOV-NEXT:    orl %edi, %esi
+; I386-NOCMOV-NEXT:    movl %esi, (%esp)
+; I386-NOCMOV-NEXT:    flds (%esp)
+; I386-NOCMOV-NEXT:    addl $4, %esp
+; I386-NOCMOV-NEXT:    popl %esi
+; I386-NOCMOV-NEXT:    popl %edi
+; I386-NOCMOV-NEXT:    retl
+;
+; I386-CMOV-LABEL: test_ctselect_f32_bundled:
+; I386-CMOV:       # %bb.0:
+; I386-CMOV-NEXT:    pushl %edi
+; I386-CMOV-NEXT:    pushl %esi
+; I386-CMOV-NEXT:    pushl %eax
+; I386-CMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    sete %al
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT:    movb %al, %ah
+; I386-CMOV-NEXT:    movzbl %ah, %edi
+; I386-CMOV-NEXT:    negl %edi
+; I386-CMOV-NEXT:    movl %edx, %esi
+; I386-CMOV-NEXT:    andl %edi, %esi
+; I386-CMOV-NEXT:    notl %edi
+; I386-CMOV-NEXT:    andl %ecx, %edi
+; I386-CMOV-NEXT:    orl %edi, %esi
+; I386-CMOV-NEXT:    movl %esi, (%esp)
+; I386-CMOV-NEXT:    flds (%esp)
+; I386-CMOV-NEXT:    addl $4, %esp
+; I386-CMOV-NEXT:    popl %esi
+; I386-CMOV-NEXT:    popl %edi
+; I386-CMOV-NEXT:    retl
+  %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b)
+  ret float %result
+}
+
+; Test edge case: selecting the special value +inf (0x7F800000) against zero
+define float @test_ctselect_f32_nan(i1 %cond) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f32_nan:
+; I386-NOCMOV:       # %bb.0:
+; I386-NOCMOV-NEXT:    pushl %edi
+; I386-NOCMOV-NEXT:    pushl %esi
+; I386-NOCMOV-NEXT:    subl $12, %esp
+; I386-NOCMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
+; I386-NOCMOV-NEXT:    fldz
+; I386-NOCMOV-NEXT:    sete %al
+; I386-NOCMOV-NEXT:    fxch %st(1)
+; I386-NOCMOV-NEXT:    fstps {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    fstps (%esp)
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movl (%esp), %edx
+; I386-NOCMOV-NEXT:    movb %al, %ah
+; I386-NOCMOV-NEXT:    movzbl %ah, %edi
+; I386-NOCMOV-NEXT:    negl %edi
+; I386-NOCMOV-NEXT:    movl %edx, %esi
+; I386-NOCMOV-NEXT:    andl %edi, %esi
+; I386-NOCMOV-NEXT:    notl %edi
+; I386-NOCMOV-NEXT:    andl %ecx, %edi
+; I386-NOCMOV-NEXT:    orl %edi, %esi
+; I386-NOCMOV-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    addl $12, %esp
+; I386-NOCMOV-NEXT:    popl %esi
+; I386-NOCMOV-NEXT:    popl %edi
+; I386-NOCMOV-NEXT:    retl
+;
+; I386-CMOV-LABEL: test_ctselect_f32_nan:
+; I386-CMOV:       # %bb.0:
+; I386-CMOV-NEXT:    pushl %edi
+; I386-CMOV-NEXT:    pushl %esi
+; I386-CMOV-NEXT:    subl $12, %esp
+; I386-CMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
+; I386-CMOV-NEXT:    fldz
+; I386-CMOV-NEXT:    sete %al
+; I386-CMOV-NEXT:    fxch %st(1)
+; I386-CMOV-NEXT:    fstps {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    fstps (%esp)
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    movl (%esp), %edx
+; I386-CMOV-NEXT:    movb %al, %ah
+; I386-CMOV-NEXT:    movzbl %ah, %edi
+; I386-CMOV-NEXT:    negl %edi
+; I386-CMOV-NEXT:    movl %edx, %esi
+; I386-CMOV-NEXT:    andl %edi, %esi
+; I386-CMOV-NEXT:    notl %edi
+; I386-CMOV-NEXT:    andl %ecx, %edi
+; I386-CMOV-NEXT:    orl %edi, %esi
+; I386-CMOV-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    flds {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    addl $12, %esp
+; I386-CMOV-NEXT:    popl %esi
+; I386-CMOV-NEXT:    popl %edi
+; I386-CMOV-NEXT:    retl
+  %nan = bitcast i32 2139095040 to float  ; 0x7F800000 = +inf
+  %zero = bitcast i32 0 to float
+  %result = call float @llvm.ct.select.f32(i1 %cond, float %nan, float %zero)
+  ret float %result
+}
+
+; Test memory alignment for f80
+define x86_fp80 @test_ctselect_f80_alignment(i1 %cond, x86_fp80 %a, x86_fp80 %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f80_alignment:
+; I386-NOCMOV:       # %bb.0:
+; I386-NOCMOV-NEXT:    pushl %edi
+; I386-NOCMOV-NEXT:    pushl %esi
+; I386-NOCMOV-NEXT:    subl $12, %esp
+; I386-NOCMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    sete %al
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT:    movb %al, %ah
+; I386-NOCMOV-NEXT:    movzbl %ah, %edi
+; I386-NOCMOV-NEXT:    negl %edi
+; I386-NOCMOV-NEXT:    movl %edx, %esi
+; I386-NOCMOV-NEXT:    andl %edi, %esi
+; I386-NOCMOV-NEXT:    notl %edi
+; I386-NOCMOV-NEXT:    andl %ecx, %edi
+; I386-NOCMOV-NEXT:    orl %edi, %esi
+; I386-NOCMOV-NEXT:    movl %esi, (%esp)
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT:    movb %al, %ah
+; I386-NOCMOV-NEXT:    movzbl %ah, %edi
+; I386-NOCMOV-NEXT:    negl %edi
+; I386-NOCMOV-NEXT:    movl %edx, %esi
+; I386-NOCMOV-NEXT:    andl %edi, %esi
+; I386-NOCMOV-NEXT:    notl %edi
+; I386-NOCMOV-NEXT:    andl %ecx, %edi
+; I386-NOCMOV-NEXT:    orl %edi, %esi
+; I386-NOCMOV-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT:    movb %al, %ah
+; I386-NOCMOV-NEXT:    movzbl %ah, %edi
+; I386-NOCMOV-NEXT:    negl %edi
+; I386-NOCMOV-NEXT:    movl %edx, %esi
+; I386-NOCMOV-NEXT:    andl %edi, %esi
+; I386-NOCMOV-NEXT:    notl %edi
+; I386-NOCMOV-NEXT:    andl %ecx, %edi
+; I386-NOCMOV-NEXT:    orl %edi, %esi
+; I386-NOCMOV-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    fldt (%esp)
+; I386-NOCMOV-NEXT:    addl $12, %esp
+; I386-NOCMOV-NEXT:    popl %esi
+; I386-NOCMOV-NEXT:    popl %edi
+; I386-NOCMOV-NEXT:    retl
+;
+; I386-CMOV-LABEL: test_ctselect_f80_alignment:
+; I386-CMOV:       # %bb.0:
+; I386-CMOV-NEXT:    pushl %edi
+; I386-CMOV-NEXT:    pushl %esi
+; I386-CMOV-NEXT:    subl $12, %esp
+; I386-CMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    sete %al
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT:    movb %al, %ah
+; I386-CMOV-NEXT:    movzbl %ah, %edi
+; I386-CMOV-NEXT:    negl %edi
+; I386-CMOV-NEXT:    movl %edx, %esi
+; I386-CMOV-NEXT:    andl %edi, %esi
+; I386-CMOV-NEXT:    notl %edi
+; I386-CMOV-NEXT:    andl %ecx, %edi
+; I386-CMOV-NEXT:    orl %edi, %esi
+; I386-CMOV-NEXT:    movl %esi, (%esp)
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT:    movb %al, %ah
+; I386-CMOV-NEXT:    movzbl %ah, %edi
+; I386-CMOV-NEXT:    negl %edi
+; I386-CMOV-NEXT:    movl %edx, %esi
+; I386-CMOV-NEXT:    andl %edi, %esi
+; I386-CMOV-NEXT:    notl %edi
+; I386-CMOV-NEXT:    andl %ecx, %edi
+; I386-CMOV-NEXT:    orl %edi, %esi
+; I386-CMOV-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT:    movb %al, %ah
+; I386-CMOV-NEXT:    movzbl %ah, %edi
+; I386-CMOV-NEXT:    negl %edi
+; I386-CMOV-NEXT:    movl %edx, %esi
+; I386-CMOV-NEXT:    andl %edi, %esi
+; I386-CMOV-NEXT:    notl %edi
+; I386-CMOV-NEXT:    andl %ecx, %edi
+; I386-CMOV-NEXT:    orl %edi, %esi
+; I386-CMOV-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    fldt (%esp)
+; I386-CMOV-NEXT:    addl $12, %esp
+; I386-CMOV-NEXT:    popl %esi
+; I386-CMOV-NEXT:    popl %edi
+; I386-CMOV-NEXT:    retl
+  %result = call x86_fp80 @llvm.ct.select.f80(i1 %cond, x86_fp80 %a, x86_fp80 %b)
+  ret x86_fp80 %result
+}
+
+; Stress test: multiple CTSELECT operations
+define float @test_ctselect_f32_multiple(i1 %cond1, i1 %cond2, float %a, float %b, float %c, float %d) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_f32_multiple:
+; I386-NOCMOV:       # %bb.0:
+; I386-NOCMOV-NEXT:    pushl %edi
+; I386-NOCMOV-NEXT:    pushl %esi
+; I386-NOCMOV-NEXT:    subl $8, %esp
+; I386-NOCMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    sete %al
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT:    movb %al, %ah
+; I386-NOCMOV-NEXT:    movzbl %ah, %edi
+; I386-NOCMOV-NEXT:    negl %edi
+; I386-NOCMOV-NEXT:    movl %edx, %esi
+; I386-NOCMOV-NEXT:    andl %edi, %esi
+; I386-NOCMOV-NEXT:    notl %edi
+; I386-NOCMOV-NEXT:    andl %ecx, %edi
+; I386-NOCMOV-NEXT:    orl %edi, %esi
+; I386-NOCMOV-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    sete %al
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT:    movb %al, %ah
+; I386-NOCMOV-NEXT:    movzbl %ah, %edi
+; I386-NOCMOV-NEXT:    negl %edi
+; I386-NOCMOV-NEXT:    movl %edx, %esi
+; I386-NOCMOV-NEXT:    andl %edi, %esi
+; I386-NOCMOV-NEXT:    notl %edi
+; I386-NOCMOV-NEXT:    andl %ecx, %edi
+; I386-NOCMOV-NEXT:    orl %edi, %esi
+; I386-NOCMOV-NEXT:    movl %esi, (%esp)
+; I386-NOCMOV-NEXT:    flds (%esp)
+; I386-NOCMOV-NEXT:    addl $8, %esp
+; I386-NOCMOV-NEXT:    popl %esi
+; I386-NOCMOV-NEXT:    popl %edi
+; I386-NOCMOV-NEXT:    retl
+;
+; I386-CMOV-LABEL: test_ctselect_f32_multiple:
+; I386-CMOV:       # %bb.0:
+; I386-CMOV-NEXT:    pushl %edi
+; I386-CMOV-NEXT:    pushl %esi
+; I386-CMOV-NEXT:    subl $8, %esp
+; I386-CMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    sete %al
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT:    movb %al, %ah
+; I386-CMOV-NEXT:    movzbl %ah, %edi
+; I386-CMOV-NEXT:    negl %edi
+; I386-CMOV-NEXT:    movl %edx, %esi
+; I386-CMOV-NEXT:    andl %edi, %esi
+; I386-CMOV-NEXT:    notl %edi
+; I386-CMOV-NEXT:    andl %ecx, %edi
+; I386-CMOV-NEXT:    orl %edi, %esi
+; I386-CMOV-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    sete %al
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT:    movb %al, %ah
+; I386-CMOV-NEXT:    movzbl %ah, %edi
+; I386-CMOV-NEXT:    negl %edi
+; I386-CMOV-NEXT:    movl %edx, %esi
+; I386-CMOV-NEXT:    andl %edi, %esi
+; I386-CMOV-NEXT:    notl %edi
+; I386-CMOV-NEXT:    andl %ecx, %edi
+; I386-CMOV-NEXT:    orl %edi, %esi
+; I386-CMOV-NEXT:    movl %esi, (%esp)
+; I386-CMOV-NEXT:    flds (%esp)
+; I386-CMOV-NEXT:    addl $8, %esp
+; I386-CMOV-NEXT:    popl %esi
+; I386-CMOV-NEXT:    popl %edi
+; I386-CMOV-NEXT:    retl
+  %sel1 = call float @llvm.ct.select.f32(i1 %cond1, float %a, float %b)
+  %sel2 = call float @llvm.ct.select.f32(i1 %cond2, float %sel1, float %c)
+  ret float %sel2
+}
+
+; Declare intrinsics
+declare float @llvm.ct.select.f32(i1, float, float)
+declare double @llvm.ct.select.f64(i1, double, double)
+declare x86_fp80 @llvm.ct.select.f80(i1, x86_fp80, x86_fp80)
diff --git a/llvm/test/CodeGen/X86/ctselect-i386-mmx.ll b/llvm/test/CodeGen/X86/ctselect-i386-mmx.ll
new file mode 100644
index 0000000000000..bc7980c357e0e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ctselect-i386-mmx.ll
@@ -0,0 +1,428 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=-cmov,+mmx < %s | FileCheck %s --check-prefix=I386-NOCMOV
+; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+cmov,+mmx < %s | FileCheck %s --check-prefix=I386-CMOV
+; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=-cmov,+mmx -verify-machineinstrs < %s | FileCheck %s --check-prefix=I386-NOCMOV
+
+; Test constant-time selection with MMX intrinsics to exercise VR64 CTSELECT
+; These tests use MMX intrinsics to create <1 x i64> values that get allocated to VR64 registers
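+;
+; Because i386 has no 64-bit GPRs, each <1 x i64> select below is legalized as two
+; independent 32-bit selects (CMOV or the bitwise mask pattern), and the combined
+; result is stored to the stack and reloaded into %mm0 with movq.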
+
+; Test MMX ct.select using paddd intrinsic to force VR64 allocation
+define <1 x i64> @test_mmx_ctselect_with_paddd(i32 %cond, i64 %a, i64 %b) {
+; I386-NOCMOV-LABEL: test_mmx_ctselect_with_paddd:
+; I386-NOCMOV:       # %bb.0:
+; I386-NOCMOV-NEXT:    pushl %ebp
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT:    pushl %ebx
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT:    pushl %edi
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT:    pushl %esi
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT:    subl $20, %esp
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 40
+; I386-NOCMOV-NEXT:    .cfi_offset %esi, -20
+; I386-NOCMOV-NEXT:    .cfi_offset %edi, -16
+; I386-NOCMOV-NEXT:    .cfi_offset %ebx, -12
+; I386-NOCMOV-NEXT:    .cfi_offset %ebp, -8
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; I386-NOCMOV-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    setne %bl
+; I386-NOCMOV-NEXT:    testb %bl, %bl
+; I386-NOCMOV-NEXT:    sete %bh
+; I386-NOCMOV-NEXT:    movb %bh, %al
+; I386-NOCMOV-NEXT:    movzbl %al, %ebp
+; I386-NOCMOV-NEXT:    negl %ebp
+; I386-NOCMOV-NEXT:    movl %esi, %edi
+; I386-NOCMOV-NEXT:    andl %ebp, %edi
+; I386-NOCMOV-NEXT:    notl %ebp
+; I386-NOCMOV-NEXT:    andl %ecx, %ebp
+; I386-NOCMOV-NEXT:    orl %ebp, %edi
+; I386-NOCMOV-NEXT:    testb %bl, %bl
+; I386-NOCMOV-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    sete %al
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; I386-NOCMOV-NEXT:    movb %al, %ah
+; I386-NOCMOV-NEXT:    movzbl %ah, %esi
+; I386-NOCMOV-NEXT:    negl %esi
+; I386-NOCMOV-NEXT:    movl %edx, %ecx
+; I386-NOCMOV-NEXT:    andl %esi, %ecx
+; I386-NOCMOV-NEXT:    notl %esi
+; I386-NOCMOV-NEXT:    andl %edi, %esi
+; I386-NOCMOV-NEXT:    orl %esi, %ecx
+; I386-NOCMOV-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    movq {{[0-9]+}}(%esp), %mm0
+; I386-NOCMOV-NEXT:    paddd %mm0, %mm0
+; I386-NOCMOV-NEXT:    movq %mm0, (%esp)
+; I386-NOCMOV-NEXT:    movl (%esp), %eax
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT:    addl $20, %esp
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT:    popl %esi
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT:    popl %edi
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT:    popl %ebx
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT:    popl %ebp
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
+; I386-NOCMOV-NEXT:    retl
+;
+; I386-CMOV-LABEL: test_mmx_ctselect_with_paddd:
+; I386-CMOV:       # %bb.0:
+; I386-CMOV-NEXT:    subl $20, %esp
+; I386-CMOV-NEXT:    .cfi_def_cfa_offset 24
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    setne %dl
+; I386-CMOV-NEXT:    testb %dl, %dl
+; I386-CMOV-NEXT:    cmovnel {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    movq {{[0-9]+}}(%esp), %mm0
+; I386-CMOV-NEXT:    paddd %mm0, %mm0
+; I386-CMOV-NEXT:    movq %mm0, (%esp)
+; I386-CMOV-NEXT:    movl (%esp), %eax
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT:    addl $20, %esp
+; I386-CMOV-NEXT:    .cfi_def_cfa_offset 4
+; I386-CMOV-NEXT:    retl
+  %mmx_a = bitcast i64 %a to <1 x i64>
+  %mmx_b = bitcast i64 %b to <1 x i64>
+  %cmp = icmp ne i32 %cond, 0
+  %sel = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp, <1 x i64> %mmx_a, <1 x i64> %mmx_b)
+  %result = call <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64> %sel, <1 x i64> %sel)
+  ret <1 x i64> %result
+}
+
+; Test MMX ct.select using psllw intrinsic
+define <1 x i64> @test_mmx_ctselect_with_psllw(i32 %cond, i64 %a, i64 %b) {
+; I386-NOCMOV-LABEL: test_mmx_ctselect_with_psllw:
+; I386-NOCMOV:       # %bb.0:
+; I386-NOCMOV-NEXT:    pushl %ebp
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT:    pushl %ebx
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT:    pushl %edi
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT:    pushl %esi
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT:    subl $20, %esp
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 40
+; I386-NOCMOV-NEXT:    .cfi_offset %esi, -20
+; I386-NOCMOV-NEXT:    .cfi_offset %edi, -16
+; I386-NOCMOV-NEXT:    .cfi_offset %ebx, -12
+; I386-NOCMOV-NEXT:    .cfi_offset %ebp, -8
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; I386-NOCMOV-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    setne %bl
+; I386-NOCMOV-NEXT:    testb %bl, %bl
+; I386-NOCMOV-NEXT:    sete %bh
+; I386-NOCMOV-NEXT:    movb %bh, %al
+; I386-NOCMOV-NEXT:    movzbl %al, %ebp
+; I386-NOCMOV-NEXT:    negl %ebp
+; I386-NOCMOV-NEXT:    movl %esi, %edi
+; I386-NOCMOV-NEXT:    andl %ebp, %edi
+; I386-NOCMOV-NEXT:    notl %ebp
+; I386-NOCMOV-NEXT:    andl %ecx, %ebp
+; I386-NOCMOV-NEXT:    orl %ebp, %edi
+; I386-NOCMOV-NEXT:    testb %bl, %bl
+; I386-NOCMOV-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    sete %al
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; I386-NOCMOV-NEXT:    movb %al, %ah
+; I386-NOCMOV-NEXT:    movzbl %ah, %esi
+; I386-NOCMOV-NEXT:    negl %esi
+; I386-NOCMOV-NEXT:    movl %edx, %ecx
+; I386-NOCMOV-NEXT:    andl %esi, %ecx
+; I386-NOCMOV-NEXT:    notl %esi
+; I386-NOCMOV-NEXT:    andl %edi, %esi
+; I386-NOCMOV-NEXT:    orl %esi, %ecx
+; I386-NOCMOV-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    movq {{[0-9]+}}(%esp), %mm0
+; I386-NOCMOV-NEXT:    psllw %mm0, %mm0
+; I386-NOCMOV-NEXT:    movq %mm0, (%esp)
+; I386-NOCMOV-NEXT:    movl (%esp), %eax
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT:    addl $20, %esp
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT:    popl %esi
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT:    popl %edi
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT:    popl %ebx
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT:    popl %ebp
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
+; I386-NOCMOV-NEXT:    retl
+;
+; I386-CMOV-LABEL: test_mmx_ctselect_with_psllw:
+; I386-CMOV:       # %bb.0:
+; I386-CMOV-NEXT:    subl $20, %esp
+; I386-CMOV-NEXT:    .cfi_def_cfa_offset 24
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    setne %dl
+; I386-CMOV-NEXT:    testb %dl, %dl
+; I386-CMOV-NEXT:    cmovnel {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    movq {{[0-9]+}}(%esp), %mm0
+; I386-CMOV-NEXT:    psllw %mm0, %mm0
+; I386-CMOV-NEXT:    movq %mm0, (%esp)
+; I386-CMOV-NEXT:    movl (%esp), %eax
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT:    addl $20, %esp
+; I386-CMOV-NEXT:    .cfi_def_cfa_offset 4
+; I386-CMOV-NEXT:    retl
+  %mmx_a = bitcast i64 %a to <1 x i64>
+  %mmx_b = bitcast i64 %b to <1 x i64>
+  %cmp = icmp ne i32 %cond, 0
+  %sel = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp, <1 x i64> %mmx_a, <1 x i64> %mmx_b)
+  %result = call <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64> %sel, <1 x i64> %sel)
+  ret <1 x i64> %result
+}
+
+; Test nested MMX ct.selects with pand intrinsic
+define <1 x i64> @test_mmx_nested_ctselect_with_pand(i32 %cond1, i32 %cond2, i64 %a, i64 %b, i64 %c) {
+; I386-NOCMOV-LABEL: test_mmx_nested_ctselect_with_pand:
+; I386-NOCMOV:       # %bb.0:
+; I386-NOCMOV-NEXT:    pushl %ebp
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT:    pushl %ebx
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT:    pushl %edi
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT:    pushl %esi
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT:    subl $20, %esp
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 40
+; I386-NOCMOV-NEXT:    .cfi_offset %esi, -20
+; I386-NOCMOV-NEXT:    .cfi_offset %edi, -16
+; I386-NOCMOV-NEXT:    .cfi_offset %ebx, -12
+; I386-NOCMOV-NEXT:    .cfi_offset %ebp, -8
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; I386-NOCMOV-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    setne %bl
+; I386-NOCMOV-NEXT:    testb %bl, %bl
+; I386-NOCMOV-NEXT:    sete %bh
+; I386-NOCMOV-NEXT:    movb %bh, %cl
+; I386-NOCMOV-NEXT:    movzbl %cl, %ebp
+; I386-NOCMOV-NEXT:    negl %ebp
+; I386-NOCMOV-NEXT:    movl %edx, %edi
+; I386-NOCMOV-NEXT:    andl %ebp, %edi
+; I386-NOCMOV-NEXT:    notl %ebp
+; I386-NOCMOV-NEXT:    andl %eax, %ebp
+; I386-NOCMOV-NEXT:    orl %ebp, %edi
+; I386-NOCMOV-NEXT:    testb %bl, %bl
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    sete %dl
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; I386-NOCMOV-NEXT:    movb %dl, %dh
+; I386-NOCMOV-NEXT:    movzbl %dh, %ebp
+; I386-NOCMOV-NEXT:    negl %ebp
+; I386-NOCMOV-NEXT:    movl %esi, %ebx
+; I386-NOCMOV-NEXT:    andl %ebp, %ebx
+; I386-NOCMOV-NEXT:    notl %ebp
+; I386-NOCMOV-NEXT:    andl %eax, %ebp
+; I386-NOCMOV-NEXT:    orl %ebp, %ebx
+; I386-NOCMOV-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    setne %dl
+; I386-NOCMOV-NEXT:    testb %dl, %dl
+; I386-NOCMOV-NEXT:    sete %dh
+; I386-NOCMOV-NEXT:    movb %dh, %al
+; I386-NOCMOV-NEXT:    movzbl %al, %ebp
+; I386-NOCMOV-NEXT:    negl %ebp
+; I386-NOCMOV-NEXT:    movl %ecx, %esi
+; I386-NOCMOV-NEXT:    andl %ebp, %esi
+; I386-NOCMOV-NEXT:    notl %ebp
+; I386-NOCMOV-NEXT:    andl %ebx, %ebp
+; I386-NOCMOV-NEXT:    orl %ebp, %esi
+; I386-NOCMOV-NEXT:    testb %dl, %dl
+; I386-NOCMOV-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    sete %al
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; I386-NOCMOV-NEXT:    movb %al, %dl
+; I386-NOCMOV-NEXT:    movzbl %dl, %esi
+; I386-NOCMOV-NEXT:    negl %esi
+; I386-NOCMOV-NEXT:    movl %ebx, %ecx
+; I386-NOCMOV-NEXT:    andl %esi, %ecx
+; I386-NOCMOV-NEXT:    notl %esi
+; I386-NOCMOV-NEXT:    andl %edi, %esi
+; I386-NOCMOV-NEXT:    orl %esi, %ecx
+; I386-NOCMOV-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    movq {{[0-9]+}}(%esp), %mm0
+; I386-NOCMOV-NEXT:    pand %mm0, %mm0
+; I386-NOCMOV-NEXT:    movq %mm0, (%esp)
+; I386-NOCMOV-NEXT:    movl (%esp), %eax
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT:    addl $20, %esp
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT:    popl %esi
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT:    popl %edi
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT:    popl %ebx
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT:    popl %ebp
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
+; I386-NOCMOV-NEXT:    retl
+;
+; I386-CMOV-LABEL: test_mmx_nested_ctselect_with_pand:
+; I386-CMOV:       # %bb.0:
+; I386-CMOV-NEXT:    pushl %ebx
+; I386-CMOV-NEXT:    .cfi_def_cfa_offset 8
+; I386-CMOV-NEXT:    pushl %esi
+; I386-CMOV-NEXT:    .cfi_def_cfa_offset 12
+; I386-CMOV-NEXT:    subl $20, %esp
+; I386-CMOV-NEXT:    .cfi_def_cfa_offset 32
+; I386-CMOV-NEXT:    .cfi_offset %esi, -12
+; I386-CMOV-NEXT:    .cfi_offset %ebx, -8
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; I386-CMOV-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    setne %bl
+; I386-CMOV-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    setne %bh
+; I386-CMOV-NEXT:    testb %bh, %bh
+; I386-CMOV-NEXT:    cmovnel {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    cmovnel {{[0-9]+}}(%esp), %esi
+; I386-CMOV-NEXT:    testb %bl, %bl
+; I386-CMOV-NEXT:    cmovnel %esi, %edx
+; I386-CMOV-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    cmovnel %ecx, %eax
+; I386-CMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    movq {{[0-9]+}}(%esp), %mm0
+; I386-CMOV-NEXT:    pand %mm0, %mm0
+; I386-CMOV-NEXT:    movq %mm0, (%esp)
+; I386-CMOV-NEXT:    movl (%esp), %eax
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT:    addl $20, %esp
+; I386-CMOV-NEXT:    .cfi_def_cfa_offset 12
+; I386-CMOV-NEXT:    popl %esi
+; I386-CMOV-NEXT:    .cfi_def_cfa_offset 8
+; I386-CMOV-NEXT:    popl %ebx
+; I386-CMOV-NEXT:    .cfi_def_cfa_offset 4
+; I386-CMOV-NEXT:    retl
+  %mmx_a = bitcast i64 %a to <1 x i64>
+  %mmx_b = bitcast i64 %b to <1 x i64>
+  %mmx_c = bitcast i64 %c to <1 x i64>
+  %cmp1 = icmp ne i32 %cond1, 0
+  %cmp2 = icmp ne i32 %cond2, 0
+  %sel1 = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp2, <1 x i64> %mmx_a, <1 x i64> %mmx_b)
+  %sel2 = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp1, <1 x i64> %sel1, <1 x i64> %mmx_c)
+  %result = call <1 x i64> @llvm.x86.mmx.pand(<1 x i64> %sel2, <1 x i64> %sel2)
+  ret <1 x i64> %result
+}
+
+; Test MMX ct.select with por intrinsic
+define <1 x i64> @test_mmx_ctselect_with_por(i32 %cond, i64 %a, i64 %b) {
+; I386-NOCMOV-LABEL: test_mmx_ctselect_with_por:
+; I386-NOCMOV:       # %bb.0:
+; I386-NOCMOV-NEXT:    pushl %ebp
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT:    pushl %ebx
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT:    pushl %edi
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT:    pushl %esi
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT:    subl $20, %esp
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 40
+; I386-NOCMOV-NEXT:    .cfi_offset %esi, -20
+; I386-NOCMOV-NEXT:    .cfi_offset %edi, -16
+; I386-NOCMOV-NEXT:    .cfi_offset %ebx, -12
+; I386-NOCMOV-NEXT:    .cfi_offset %ebp, -8
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; I386-NOCMOV-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    setne %bl
+; I386-NOCMOV-NEXT:    testb %bl, %bl
+; I386-NOCMOV-NEXT:    sete %bh
+; I386-NOCMOV-NEXT:    movb %bh, %al
+; I386-NOCMOV-NEXT:    movzbl %al, %ebp
+; I386-NOCMOV-NEXT:    negl %ebp
+; I386-NOCMOV-NEXT:    movl %esi, %edi
+; I386-NOCMOV-NEXT:    andl %ebp, %edi
+; I386-NOCMOV-NEXT:    notl %ebp
+; I386-NOCMOV-NEXT:    andl %ecx, %ebp
+; I386-NOCMOV-NEXT:    orl %ebp, %edi
+; I386-NOCMOV-NEXT:    testb %bl, %bl
+; I386-NOCMOV-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    sete %al
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; I386-NOCMOV-NEXT:    movb %al, %ah
+; I386-NOCMOV-NEXT:    movzbl %ah, %esi
+; I386-NOCMOV-NEXT:    negl %esi
+; I386-NOCMOV-NEXT:    movl %edx, %ecx
+; I386-NOCMOV-NEXT:    andl %esi, %ecx
+; I386-NOCMOV-NEXT:    notl %esi
+; I386-NOCMOV-NEXT:    andl %edi, %esi
+; I386-NOCMOV-NEXT:    orl %esi, %ecx
+; I386-NOCMOV-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    movq {{[0-9]+}}(%esp), %mm0
+; I386-NOCMOV-NEXT:    por %mm0, %mm0
+; I386-NOCMOV-NEXT:    movq %mm0, (%esp)
+; I386-NOCMOV-NEXT:    movl (%esp), %eax
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT:    addl $20, %esp
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 20
+; I386-NOCMOV-NEXT:    popl %esi
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 16
+; I386-NOCMOV-NEXT:    popl %edi
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; I386-NOCMOV-NEXT:    popl %ebx
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; I386-NOCMOV-NEXT:    popl %ebp
+; I386-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
+; I386-NOCMOV-NEXT:    retl
+;
+; I386-CMOV-LABEL: test_mmx_ctselect_with_por:
+; I386-CMOV:       # %bb.0:
+; I386-CMOV-NEXT:    subl $20, %esp
+; I386-CMOV-NEXT:    .cfi_def_cfa_offset 24
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    setne %dl
+; I386-CMOV-NEXT:    testb %dl, %dl
+; I386-CMOV-NEXT:    cmovnel {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    movq {{[0-9]+}}(%esp), %mm0
+; I386-CMOV-NEXT:    por %mm0, %mm0
+; I386-CMOV-NEXT:    movq %mm0, (%esp)
+; I386-CMOV-NEXT:    movl (%esp), %eax
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-CMOV-NEXT:    addl $20, %esp
+; I386-CMOV-NEXT:    .cfi_def_cfa_offset 4
+; I386-CMOV-NEXT:    retl
+  %mmx_a = bitcast i64 %a to <1 x i64>
+  %mmx_b = bitcast i64 %b to <1 x i64>
+  %cmp = icmp ne i32 %cond, 0
+  %sel = call <1 x i64> @llvm.ct.select.v1i64(i1 %cmp, <1 x i64> %mmx_a, <1 x i64> %mmx_b)
+  %result = call <1 x i64> @llvm.x86.mmx.por(<1 x i64> %sel, <1 x i64> %sel)
+  ret <1 x i64> %result
+}
+
+; Declare MMX intrinsics
+declare <1 x i64> @llvm.x86.mmx.padd.d(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.psll.w(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.pand(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.x86.mmx.por(<1 x i64>, <1 x i64>)
+
+; Declare constant-time selection intrinsic
+declare <1 x i64> @llvm.ct.select.v1i64(i1, <1 x i64>, <1 x i64>)
diff --git a/llvm/test/CodeGen/X86/ctselect-i386.ll b/llvm/test/CodeGen/X86/ctselect-i386.ll
new file mode 100644
index 0000000000000..d7345f1121540
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ctselect-i386.ll
@@ -0,0 +1,267 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov | FileCheck %s --check-prefix=I386-NOCMOV
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+cmov | FileCheck %s --check-prefix=I386-CMOV
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=-cmov -verify-machineinstrs | FileCheck %s --check-prefix=I386-NOCMOV
+
+; Comprehensive CTSELECT tests for i386 targets with scalar integer types
+; - Without CMOV: constant-time implementation using post-RA expansion with bundled instructions
+; - With CMOV: CMOV-based implementation
+; - Verifies security properties: no conditional branches, constant execution time
+; All expansion happens post-RA for better optimization control and constant-time guarantees
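+;
+; For reference, the non-CMOV expansion checked below is the classic branchless
+; mask select; in C-like pseudocode (illustrative only, not part of the lowering code):
+;   mask   = 0 - (uint32_t)(cond == 0);   /* all-ones when cond is false */
+;   result = (b & mask) | (a & ~mask);    /* b when cond is false, a otherwise */
+; This is the sete/neg/and/not/and/or sequence that appears in the CHECK lines.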
+
+; Test basic i32 functionality
+define i32 @test_ctselect_i32_basic(i1 %cond, i32 %a, i32 %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_i32_basic:
+; I386-NOCMOV:       # %bb.0:
+; I386-NOCMOV-NEXT:    pushl %ebx
+; I386-NOCMOV-NEXT:    pushl %esi
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    sete %bl
+; I386-NOCMOV-NEXT:    movb %bl, %bh
+; I386-NOCMOV-NEXT:    movzbl %bh, %esi
+; I386-NOCMOV-NEXT:    negl %esi
+; I386-NOCMOV-NEXT:    movl %edx, %eax
+; I386-NOCMOV-NEXT:    andl %esi, %eax
+; I386-NOCMOV-NEXT:    notl %esi
+; I386-NOCMOV-NEXT:    andl %ecx, %esi
+; I386-NOCMOV-NEXT:    orl %esi, %eax
+; I386-NOCMOV-NEXT:    popl %esi
+; I386-NOCMOV-NEXT:    popl %ebx
+; I386-NOCMOV-NEXT:    retl
+;
+; I386-CMOV-LABEL: test_ctselect_i32_basic:
+; I386-CMOV:       # %bb.0:
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    retl
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
+  ret i32 %result
+}
+
+; Test i16 functionality
+define i16 @test_ctselect_i16_basic(i1 %cond, i16 %a, i16 %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_i16_basic:
+; I386-NOCMOV:       # %bb.0:
+; I386-NOCMOV-NEXT:    pushl %ebx
+; I386-NOCMOV-NEXT:    pushl %esi
+; I386-NOCMOV-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    sete %bl
+; I386-NOCMOV-NEXT:    movb %bl, %bh
+; I386-NOCMOV-NEXT:    movzbw %bh, %si
+; I386-NOCMOV-NEXT:    negw %si
+; I386-NOCMOV-NEXT:    movw %dx, %ax
+; I386-NOCMOV-NEXT:    andw %si, %ax
+; I386-NOCMOV-NEXT:    notw %si
+; I386-NOCMOV-NEXT:    andw %cx, %si
+; I386-NOCMOV-NEXT:    orw %si, %ax
+; I386-NOCMOV-NEXT:    popl %esi
+; I386-NOCMOV-NEXT:    popl %ebx
+; I386-NOCMOV-NEXT:    retl
+;
+; I386-CMOV-LABEL: test_ctselect_i16_basic:
+; I386-CMOV:       # %bb.0:
+; I386-CMOV-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    cmovnew {{[0-9]+}}(%esp), %ax
+; I386-CMOV-NEXT:    retl
+  %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b)
+  ret i16 %result
+}
+
+; Test i8 functionality
+define i8 @test_ctselect_i8_basic(i1 %cond, i8 %a, i8 %b) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_i8_basic:
+; I386-NOCMOV:       # %bb.0:
+; I386-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    sete %ah
+; I386-NOCMOV-NEXT:    movb %ah, %ch
+; I386-NOCMOV-NEXT:    negb %ch
+; I386-NOCMOV-NEXT:    movb %dl, %al
+; I386-NOCMOV-NEXT:    andb %ch, %al
+; I386-NOCMOV-NEXT:    notb %ch
+; I386-NOCMOV-NEXT:    andb %cl, %ch
+; I386-NOCMOV-NEXT:    orb %ch, %al
+; I386-NOCMOV-NEXT:    retl
+;
+; I386-CMOV-LABEL: test_ctselect_i8_basic:
+; I386-CMOV:       # %bb.0:
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    # kill: def $al killed $al killed $eax
+; I386-CMOV-NEXT:    retl
+  %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b)
+  ret i8 %result
+}
+
+; Test security property: constant-time execution for cryptographic use case
+define i32 @test_crypto_key_select(i32 %secret_bit, i32 %key1, i32 %key2) nounwind {
+; I386-NOCMOV-LABEL: test_crypto_key_select:
+; I386-NOCMOV:       # %bb.0:
+; I386-NOCMOV-NEXT:    pushl %ebx
+; I386-NOCMOV-NEXT:    pushl %esi
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    setne %al
+; I386-NOCMOV-NEXT:    testb %al, %al
+; I386-NOCMOV-NEXT:    sete %bl
+; I386-NOCMOV-NEXT:    movb %bl, %bh
+; I386-NOCMOV-NEXT:    movzbl %bh, %esi
+; I386-NOCMOV-NEXT:    negl %esi
+; I386-NOCMOV-NEXT:    movl %edx, %eax
+; I386-NOCMOV-NEXT:    andl %esi, %eax
+; I386-NOCMOV-NEXT:    notl %esi
+; I386-NOCMOV-NEXT:    andl %ecx, %esi
+; I386-NOCMOV-NEXT:    orl %esi, %eax
+; I386-NOCMOV-NEXT:    popl %esi
+; I386-NOCMOV-NEXT:    popl %ebx
+; I386-NOCMOV-NEXT:    retl
+;
+; I386-CMOV-LABEL: test_crypto_key_select:
+; I386-CMOV:       # %bb.0:
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    setne %cl
+; I386-CMOV-NEXT:    testb %cl, %cl
+; I386-CMOV-NEXT:    cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    retl
+  %cond = icmp ne i32 %secret_bit, 0
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %key1, i32 %key2)
+  ret i32 %result
+}
+
+; Test that no conditional branches appear in constant-time path
+define i32 @test_no_conditional_branches(i32 %secret, i32 %val1, i32 %val2) nounwind {
+; I386-NOCMOV-LABEL: test_no_conditional_branches:
+; I386-NOCMOV:       # %bb.0:
+; I386-NOCMOV-NEXT:    pushl %ebx
+; I386-NOCMOV-NEXT:    pushl %esi
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    setne %al
+; I386-NOCMOV-NEXT:    testb %al, %al
+; I386-NOCMOV-NEXT:    sete %bl
+; I386-NOCMOV-NEXT:    movb %bl, %bh
+; I386-NOCMOV-NEXT:    movzbl %bh, %esi
+; I386-NOCMOV-NEXT:    negl %esi
+; I386-NOCMOV-NEXT:    movl %edx, %eax
+; I386-NOCMOV-NEXT:    andl %esi, %eax
+; I386-NOCMOV-NEXT:    notl %esi
+; I386-NOCMOV-NEXT:    andl %ecx, %esi
+; I386-NOCMOV-NEXT:    orl %esi, %eax
+; I386-NOCMOV-NEXT:    popl %esi
+; I386-NOCMOV-NEXT:    popl %ebx
+; I386-NOCMOV-NEXT:    retl
+;
+; I386-CMOV-LABEL: test_no_conditional_branches:
+; I386-CMOV:       # %bb.0:
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    setne %cl
+; I386-CMOV-NEXT:    testb %cl, %cl
+; I386-CMOV-NEXT:    cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    retl
+  %cond = icmp ne i32 %secret, 0
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %val1, i32 %val2)
+  ret i32 %result
+}
+
+; Test with comparison condition
+define i32 @test_ctselect_i32_cmp(i32 %a, i32 %b, i32 %c) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_i32_cmp:
+; I386-NOCMOV:       # %bb.0:
+; I386-NOCMOV-NEXT:    pushl %ebx
+; I386-NOCMOV-NEXT:    pushl %esi
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT:    cmpl %edx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    sete %al
+; I386-NOCMOV-NEXT:    testb %al, %al
+; I386-NOCMOV-NEXT:    sete %bl
+; I386-NOCMOV-NEXT:    movb %bl, %bh
+; I386-NOCMOV-NEXT:    movzbl %bh, %esi
+; I386-NOCMOV-NEXT:    negl %esi
+; I386-NOCMOV-NEXT:    movl %edx, %eax
+; I386-NOCMOV-NEXT:    andl %esi, %eax
+; I386-NOCMOV-NEXT:    notl %esi
+; I386-NOCMOV-NEXT:    andl %ecx, %esi
+; I386-NOCMOV-NEXT:    orl %esi, %eax
+; I386-NOCMOV-NEXT:    popl %esi
+; I386-NOCMOV-NEXT:    popl %ebx
+; I386-NOCMOV-NEXT:    retl
+;
+; I386-CMOV-LABEL: test_ctselect_i32_cmp:
+; I386-CMOV:       # %bb.0:
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    sete %cl
+; I386-CMOV-NEXT:    testb %cl, %cl
+; I386-CMOV-NEXT:    cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    retl
+  %cond = icmp eq i32 %a, %c
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %b, i32 %c)
+  ret i32 %result
+}
+
+; Test nested selects
+define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) nounwind {
+; I386-NOCMOV-LABEL: test_ctselect_nested:
+; I386-NOCMOV:       # %bb.0:
+; I386-NOCMOV-NEXT:    pushl %ebx
+; I386-NOCMOV-NEXT:    pushl %edi
+; I386-NOCMOV-NEXT:    pushl %esi
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; I386-NOCMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    sete %bl
+; I386-NOCMOV-NEXT:    movb %bl, %bh
+; I386-NOCMOV-NEXT:    movzbl %bh, %edi
+; I386-NOCMOV-NEXT:    negl %edi
+; I386-NOCMOV-NEXT:    movl %edx, %esi
+; I386-NOCMOV-NEXT:    andl %edi, %esi
+; I386-NOCMOV-NEXT:    notl %edi
+; I386-NOCMOV-NEXT:    andl %eax, %edi
+; I386-NOCMOV-NEXT:    orl %edi, %esi
+; I386-NOCMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    sete %dl
+; I386-NOCMOV-NEXT:    movb %dl, %dh
+; I386-NOCMOV-NEXT:    movzbl %dh, %edi
+; I386-NOCMOV-NEXT:    negl %edi
+; I386-NOCMOV-NEXT:    movl %ecx, %eax
+; I386-NOCMOV-NEXT:    andl %edi, %eax
+; I386-NOCMOV-NEXT:    notl %edi
+; I386-NOCMOV-NEXT:    andl %esi, %edi
+; I386-NOCMOV-NEXT:    orl %edi, %eax
+; I386-NOCMOV-NEXT:    popl %esi
+; I386-NOCMOV-NEXT:    popl %edi
+; I386-NOCMOV-NEXT:    popl %ebx
+; I386-NOCMOV-NEXT:    retl
+;
+; I386-CMOV-LABEL: test_ctselect_nested:
+; I386-CMOV:       # %bb.0:
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    cmovnel {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    cmovnel %ecx, %eax
+; I386-CMOV-NEXT:    retl
+  %sel1 = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b)
+  %sel2 = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %sel1, i32 %c)
+  ret i32 %sel2
+}
+
+; Declare ct.select intrinsics
+declare i8 @llvm.ct.select.i8(i1, i8, i8)
+declare i16 @llvm.ct.select.i16(i1, i16, i16)
+declare i32 @llvm.ct.select.i32(i1, i32, i32)
diff --git a/llvm/test/CodeGen/X86/ctselect-optimization.ll b/llvm/test/CodeGen/X86/ctselect-optimization.ll
new file mode 100644
index 0000000000000..481d49971a937
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ctselect-optimization.ll
@@ -0,0 +1,304 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+cmov | FileCheck %s
+
+; Test ct.select optimization patterns
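+; The CHECK lines below confirm that each pattern is emitted as a branch-free
+; test + cmov sequence on the condition, rather than being folded into a branch
+; or a plain min/max idiom.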
+
+; Test smin(x, 0) pattern optimization
+define i32 @test_ctselect_smin_zero(i32 %x) {
+; CHECK-LABEL: test_ctselect_smin_zero:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    sets %cl
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    testb %cl, %cl
+; CHECK-NEXT:    cmovnel %edi, %eax
+; CHECK-NEXT:    retq
+  %cmp = icmp slt i32 %x, 0
+  %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0)
+  ret i32 %result
+}
+
+; Test smax(x, 0) pattern optimization
+define i32 @test_ctselect_smax_zero(i32 %x) {
+; CHECK-LABEL: test_ctselect_smax_zero:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    setg %cl
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    testb %cl, %cl
+; CHECK-NEXT:    cmovnel %edi, %eax
+; CHECK-NEXT:    retq
+  %cmp = icmp sgt i32 %x, 0
+  %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 0)
+  ret i32 %result
+}
+
+; Test generic smin pattern
+define i32 @test_ctselect_smin_generic(i32 %x, i32 %y) {
+; CHECK-LABEL: test_ctselect_smin_generic:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    cmpl %esi, %edi
+; CHECK-NEXT:    setl %cl
+; CHECK-NEXT:    testb %cl, %cl
+; CHECK-NEXT:    cmovnel %edi, %eax
+; CHECK-NEXT:    retq
+  %cmp = icmp slt i32 %x, %y
+  %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y)
+  ret i32 %result
+}
+
+; Test generic smax pattern
+define i32 @test_ctselect_smax_generic(i32 %x, i32 %y) {
+; CHECK-LABEL: test_ctselect_smax_generic:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    cmpl %esi, %edi
+; CHECK-NEXT:    setg %cl
+; CHECK-NEXT:    testb %cl, %cl
+; CHECK-NEXT:    cmovnel %edi, %eax
+; CHECK-NEXT:    retq
+  %cmp = icmp sgt i32 %x, %y
+  %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y)
+  ret i32 %result
+}
+
+; Test umin pattern
+define i32 @test_ctselect_umin_generic(i32 %x, i32 %y) {
+; CHECK-LABEL: test_ctselect_umin_generic:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    cmpl %esi, %edi
+; CHECK-NEXT:    setb %cl
+; CHECK-NEXT:    testb %cl, %cl
+; CHECK-NEXT:    cmovnel %edi, %eax
+; CHECK-NEXT:    retq
+  %cmp = icmp ult i32 %x, %y
+  %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y)
+  ret i32 %result
+}
+
+; Test umax pattern
+define i32 @test_ctselect_umax_generic(i32 %x, i32 %y) {
+; CHECK-LABEL: test_ctselect_umax_generic:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    cmpl %esi, %edi
+; CHECK-NEXT:    seta %cl
+; CHECK-NEXT:    testb %cl, %cl
+; CHECK-NEXT:    cmovnel %edi, %eax
+; CHECK-NEXT:    retq
+  %cmp = icmp ugt i32 %x, %y
+  %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %y)
+  ret i32 %result
+}
+
+; Test abs pattern
+define i32 @test_ctselect_abs(i32 %x) {
+; CHECK-LABEL: test_ctselect_abs:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    movl %edi, %ecx
+; CHECK-NEXT:    negl %ecx
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    sets %dl
+; CHECK-NEXT:    testb %dl, %dl
+; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    retq
+  %neg = sub i32 0, %x
+  %cmp = icmp slt i32 %x, 0
+  %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %neg, i32 %x)
+  ret i32 %result
+}
+
+; Test nabs pattern (negative abs)
+define i32 @test_ctselect_nabs(i32 %x) {
+; CHECK-LABEL: test_ctselect_nabs:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    negl %eax
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    sets %cl
+; CHECK-NEXT:    testb %cl, %cl
+; CHECK-NEXT:    cmovnel %edi, %eax
+; CHECK-NEXT:    retq
+  %neg = sub i32 0, %x
+  %cmp = icmp slt i32 %x, 0
+  %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 %x, i32 %neg)
+  ret i32 %result
+}
+
+; Test sign extension pattern
+define i32 @test_ctselect_sign_extend(i32 %x) {
+; CHECK-LABEL: test_ctselect_sign_extend:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    sets %cl
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    testb %cl, %cl
+; CHECK-NEXT:    movl $-1, %ecx
+; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    retq
+  %cmp = icmp slt i32 %x, 0
+  %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 -1, i32 0)
+  ret i32 %result
+}
+
+; Test zero extension pattern
+define i32 @test_ctselect_zero_extend(i32 %x) {
+; CHECK-LABEL: test_ctselect_zero_extend:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    setne %cl
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    testb %cl, %cl
+; CHECK-NEXT:    movl $1, %ecx
+; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    retq
+  %cmp = icmp ne i32 %x, 0
+  %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 1, i32 0)
+  ret i32 %result
+}
+
+; Test mask generation pattern
+define i32 @test_ctselect_mask_generation(i32 %x) {
+; CHECK-LABEL: test_ctselect_mask_generation:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    sets %cl
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    testb %cl, %cl
+; CHECK-NEXT:    movl $-1, %ecx
+; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    retq
+  %cmp = icmp slt i32 %x, 0
+  %result = call i32 @llvm.ct.select.i32(i1 %cmp, i32 -1, i32 0)
+  ret i32 %result
+}
+
+; Test constant folding with known condition
+define i32 @test_ctselect_constant_folding_true(i32 %a, i32 %b) {
+; CHECK-LABEL: test_ctselect_constant_folding_true:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    movb $1, %cl
+; CHECK-NEXT:    testb %cl, %cl
+; CHECK-NEXT:    cmovnel %edi, %eax
+; CHECK-NEXT:    retq
+  %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b)
+  ret i32 %result
+}
+
+define i32 @test_ctselect_constant_folding_false(i32 %a, i32 %b) {
+; CHECK-LABEL: test_ctselect_constant_folding_false:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    testb %cl, %cl
+; CHECK-NEXT:    cmovnel %edi, %eax
+; CHECK-NEXT:    retq
+  %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b)
+  ret i32 %result
+}
+
+; Test with identical operands
+define i32 @test_ctselect_identical_operands(i1 %cond, i32 %x) {
+; CHECK-LABEL: test_ctselect_identical_operands:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    testb $1, %dil
+; CHECK-NEXT:    cmovnel %esi, %eax
+; CHECK-NEXT:    retq
+  %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %x, i32 %x)
+  ret i32 %result
+}
+
+; Test with inverted condition
+define i32 @test_ctselect_inverted_condition(i32 %x, i32 %y, i32 %a, i32 %b) {
+; CHECK-LABEL: test_ctselect_inverted_condition:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    cmpl %esi, %edi
+; CHECK-NEXT:    sete %dl
+; CHECK-NEXT:    testb %dl, %dl
+; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    retq
+  %cmp = icmp eq i32 %x, %y
+  %not_cmp = xor i1 %cmp, true
+  %result = call i32 @llvm.ct.select.i32(i1 %not_cmp, i32 %a, i32 %b)
+  ret i32 %result
+}
+
+; Test for 64-bit specific optimizations
+define i64 @test_ctselect_i64_smin_zero(i64 %x) {
+; CHECK-LABEL: test_ctselect_i64_smin_zero:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    testq %rdi, %rdi
+; CHECK-NEXT:    sets %cl
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    testb %cl, %cl
+; CHECK-NEXT:    cmovneq %rdi, %rax
+; CHECK-NEXT:    retq
+  %cmp = icmp slt i64 %x, 0
+  %result = call i64 @llvm.ct.select.i64(i1 %cmp, i64 %x, i64 0)
+  ret i64 %result
+}
+
+; Test floating-point optimizations
+define float @test_ctselect_f32_zero_positive(float %x) {
+; CHECK-LABEL: test_ctselect_f32_zero_positive:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movd %xmm0, %eax
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    ucomiss %xmm1, %xmm0
+; CHECK-NEXT:    seta %cl
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    testb %cl, %cl
+; CHECK-NEXT:    cmovnel %eax, %edx
+; CHECK-NEXT:    movd %edx, %xmm0
+; CHECK-NEXT:    retq
+  %cmp = fcmp ogt float %x, 0.0
+  %result = call float @llvm.ct.select.f32(i1 %cmp, float %x, float 0.0)
+  ret float %result
+}
+
+define double @test_ctselect_f64_zero_positive(double %x) {
+; CHECK-LABEL: test_ctselect_f64_zero_positive:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %xmm0, %rax
+; CHECK-NEXT:    xorpd %xmm1, %xmm1
+; CHECK-NEXT:    ucomisd %xmm1, %xmm0
+; CHECK-NEXT:    seta %cl
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    testb %cl, %cl
+; CHECK-NEXT:    cmovneq %rax, %rdx
+; CHECK-NEXT:    movq %rdx, %xmm0
+; CHECK-NEXT:    retq
+  %cmp = fcmp ogt double %x, 0.0
+  %result = call double @llvm.ct.select.f64(i1 %cmp, double %x, double 0.0)
+  ret double %result
+}
+
+; Test chain of ct.select operations
+define i32 @test_ctselect_chain(i1 %c1, i1 %c2, i1 %c3, i32 %a, i32 %b, i32 %c, i32 %d) {
+; CHECK-LABEL: test_ctselect_chain:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    testb $1, %dil
+; CHECK-NEXT:    cmovnel %ecx, %r8d
+; CHECK-NEXT:    testb $1, %sil
+; CHECK-NEXT:    cmovnel %r8d, %r9d
+; CHECK-NEXT:    testb $1, %dl
+; CHECK-NEXT:    cmovnel %r9d, %eax
+; CHECK-NEXT:    retq
+  %sel1 = call i32 @llvm.ct.select.i32(i1 %c1, i32 %a, i32 %b)
+  %sel2 = call i32 @llvm.ct.select.i32(i1 %c2, i32 %sel1, i32 %c)
+  %sel3 = call i32 @llvm.ct.select.i32(i1 %c3, i32 %sel2, i32 %d)
+  ret i32 %sel3
+}
+
+; Declare the intrinsics
+declare i32 @llvm.ct.select.i32(i1, i32, i32)
+declare i64 @llvm.ct.select.i64(i1, i64, i64)
+declare float @llvm.ct.select.f32(i1, float, float)
+declare double @llvm.ct.select.f64(i1, double, double)
diff --git a/llvm/test/CodeGen/X86/ctselect-vector.ll b/llvm/test/CodeGen/X86/ctselect-vector.ll
new file mode 100644
index 0000000000000..2206e32cd6d34
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ctselect-vector.ll
@@ -0,0 +1,1274 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+
+; Test ct.select functionality for vector types
+
+; 128-bit vectors
+define <4 x i32> @test_ctselect_v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test_ctselect_v4i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    testb $1, %dil
+; SSE2-NEXT:    movl $0, %eax
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    negl %eax
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    movd %eax, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    pand %xmm0, %xmm3
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm3, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_ctselect_v4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    testb $1, %dil
+; AVX-NEXT:    movl $0, %eax
+; AVX-NEXT:    setne %al
+; AVX-NEXT:    movzbl %al, %eax
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    pxor %xmm3, %xmm3
+; AVX-NEXT:    movd %eax, %xmm3
+; AVX-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX-NEXT:    movdqa %xmm3, %xmm2
+; AVX-NEXT:    pand %xmm0, %xmm3
+; AVX-NEXT:    pandn %xmm1, %xmm2
+; AVX-NEXT:    por %xmm3, %xmm2
+; AVX-NEXT:    vmovaps %xmm2, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: test_ctselect_v4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    testb $1, %dil
+; AVX2-NEXT:    movl $0, %eax
+; AVX2-NEXT:    setne %al
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    negl %eax
+; AVX2-NEXT:    pxor %xmm3, %xmm3
+; AVX2-NEXT:    movd %eax, %xmm3
+; AVX2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX2-NEXT:    movdqa %xmm3, %xmm2
+; AVX2-NEXT:    pand %xmm0, %xmm3
+; AVX2-NEXT:    pandn %xmm1, %xmm2
+; AVX2-NEXT:    por %xmm3, %xmm2
+; AVX2-NEXT:    vmovaps %xmm2, %xmm0
+; AVX2-NEXT:    retq
+; AVX512-LABEL: test_ctselect_v4i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    testb %dil, %dil
+; AVX512-NEXT:    je .LBB0_2
+; AVX512-NEXT:  # %bb.1:
+; AVX512-NEXT:    vmovaps %xmm0, %xmm1
+; AVX512-NEXT:  .LBB0_2:
+; AVX512-NEXT:    vmovaps %xmm1, %xmm0
+; AVX512-NEXT:    retq
+  %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b)
+  ret <4 x i32> %result
+}
+
+define <4 x float> @test_ctselect_v4f32(i1 %cond, <4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: test_ctselect_v4f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    testb $1, %dil
+; SSE2-NEXT:    movl $0, %eax
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    negl %eax
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    movd %eax, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT:    movaps %xmm3, %xmm2
+; SSE2-NEXT:    pand %xmm0, %xmm3
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm3, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_ctselect_v4f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    testb $1, %dil
+; AVX-NEXT:    movl $0, %eax
+; AVX-NEXT:    setne %al
+; AVX-NEXT:    movzbl %al, %eax
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    pxor %xmm3, %xmm3
+; AVX-NEXT:    movd %eax, %xmm3
+; AVX-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX-NEXT:    movdqa %xmm3, %xmm2
+; AVX-NEXT:    pand %xmm0, %xmm3
+; AVX-NEXT:    pandn %xmm1, %xmm2
+; AVX-NEXT:    por %xmm3, %xmm2
+; AVX-NEXT:    vmovaps %xmm2, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: test_ctselect_v4f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    testb $1, %dil
+; AVX2-NEXT:    movl $0, %eax
+; AVX2-NEXT:    setne %al
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    negl %eax
+; AVX2-NEXT:    pxor %xmm3, %xmm3
+; AVX2-NEXT:    movd %eax, %xmm3
+; AVX2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX2-NEXT:    movdqa %xmm3, %xmm2
+; AVX2-NEXT:    pand %xmm0, %xmm3
+; AVX2-NEXT:    pandn %xmm1, %xmm2
+; AVX2-NEXT:    por %xmm3, %xmm2
+; AVX2-NEXT:    vmovaps %xmm2, %xmm0
+; AVX2-NEXT:    retq
+; AVX512-LABEL: test_ctselect_v4f32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    testb %dil, %dil
+; AVX512-NEXT:    je .LBB1_2
+; AVX512-NEXT:  # %bb.1:
+; AVX512-NEXT:    vmovaps %xmm0, %xmm1
+; AVX512-NEXT:  .LBB1_2:
+; AVX512-NEXT:    vmovaps %xmm1, %xmm0
+; AVX512-NEXT:    retq
+  %result = call <4 x float> @llvm.ct.select.v4f32(i1 %cond, <4 x float> %a, <4 x float> %b)
+  ret <4 x float> %result
+}
+
+define <2 x i64> @test_ctselect_v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b) {
+; SSE2-LABEL: test_ctselect_v2i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    testb $1, %dil
+; SSE2-NEXT:    movl $0, %eax
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    negl %eax
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    movd %eax, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    pand %xmm0, %xmm3
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm3, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_ctselect_v2i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    testb $1, %dil
+; AVX-NEXT:    movl $0, %eax
+; AVX-NEXT:    setne %al
+; AVX-NEXT:    movzbl %al, %eax
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    pxor %xmm3, %xmm3
+; AVX-NEXT:    movd %eax, %xmm3
+; AVX-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX-NEXT:    movdqa %xmm3, %xmm2
+; AVX-NEXT:    pand %xmm0, %xmm3
+; AVX-NEXT:    pandn %xmm1, %xmm2
+; AVX-NEXT:    por %xmm3, %xmm2
+; AVX-NEXT:    vmovaps %xmm2, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: test_ctselect_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    testb $1, %dil
+; AVX2-NEXT:    movl $0, %eax
+; AVX2-NEXT:    setne %al
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    negl %eax
+; AVX2-NEXT:    pxor %xmm3, %xmm3
+; AVX2-NEXT:    movd %eax, %xmm3
+; AVX2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX2-NEXT:    movdqa %xmm3, %xmm2
+; AVX2-NEXT:    pand %xmm0, %xmm3
+; AVX2-NEXT:    pandn %xmm1, %xmm2
+; AVX2-NEXT:    por %xmm3, %xmm2
+; AVX2-NEXT:    vmovaps %xmm2, %xmm0
+; AVX2-NEXT:    retq
+; AVX512-LABEL: test_ctselect_v2i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    testb %dil, %dil
+; AVX512-NEXT:    je .LBB2_2
+; AVX512-NEXT:  # %bb.1:
+; AVX512-NEXT:    vmovaps %xmm0, %xmm1
+; AVX512-NEXT:  .LBB2_2:
+; AVX512-NEXT:    vmovaps %xmm1, %xmm0
+; AVX512-NEXT:    retq
+  %result = call <2 x i64> @llvm.ct.select.v2i64(i1 %cond, <2 x i64> %a, <2 x i64> %b)
+  ret <2 x i64> %result
+}
+
+define <2 x double> @test_ctselect_v2f64(i1 %cond, <2 x double> %a, <2 x double> %b) {
+; SSE2-LABEL: test_ctselect_v2f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    testb $1, %dil
+; SSE2-NEXT:    movl $0, %eax
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    negl %eax
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    movd %eax, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT:    movapd %xmm3, %xmm2
+; SSE2-NEXT:    pand %xmm0, %xmm3
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm3, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_ctselect_v2f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    testb $1, %dil
+; AVX-NEXT:    movl $0, %eax
+; AVX-NEXT:    setne %al
+; AVX-NEXT:    movzbl %al, %eax
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    pxor %xmm3, %xmm3
+; AVX-NEXT:    movd %eax, %xmm3
+; AVX-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX-NEXT:    movdqa %xmm3, %xmm2
+; AVX-NEXT:    pand %xmm0, %xmm3
+; AVX-NEXT:    pandn %xmm1, %xmm2
+; AVX-NEXT:    por %xmm3, %xmm2
+; AVX-NEXT:    vmovaps %xmm2, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: test_ctselect_v2f64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    testb $1, %dil
+; AVX2-NEXT:    movl $0, %eax
+; AVX2-NEXT:    setne %al
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    negl %eax
+; AVX2-NEXT:    pxor %xmm3, %xmm3
+; AVX2-NEXT:    movd %eax, %xmm3
+; AVX2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX2-NEXT:    movdqa %xmm3, %xmm2
+; AVX2-NEXT:    pand %xmm0, %xmm3
+; AVX2-NEXT:    pandn %xmm1, %xmm2
+; AVX2-NEXT:    por %xmm3, %xmm2
+; AVX2-NEXT:    vmovaps %xmm2, %xmm0
+; AVX2-NEXT:    retq
+; AVX512-LABEL: test_ctselect_v2f64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    testb %dil, %dil
+; AVX512-NEXT:    je .LBB3_2
+; AVX512-NEXT:  # %bb.1:
+; AVX512-NEXT:    vmovapd %xmm0, %xmm1
+; AVX512-NEXT:  .LBB3_2:
+; AVX512-NEXT:    vmovapd %xmm1, %xmm0
+; AVX512-NEXT:    retq
+  %result = call <2 x double> @llvm.ct.select.v2f64(i1 %cond, <2 x double> %a, <2 x double> %b)
+  ret <2 x double> %result
+}
+
+; 256-bit vectors
+define <8 x i32> @test_ctselect_v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b) {
+; SSE2-LABEL: test_ctselect_v8i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    testb $1, %dil
+; SSE2-NEXT:    movl $0, %eax
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    negl %eax
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    movd %eax, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0]
+; SSE2-NEXT:    movdqa %xmm5, %xmm4
+; SSE2-NEXT:    pand %xmm0, %xmm5
+; SSE2-NEXT:    pandn %xmm2, %xmm4
+; SSE2-NEXT:    por %xmm5, %xmm4
+; SSE2-NEXT:    movl $0, %eax
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    negl %eax
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pandn %xmm3, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    movaps %xmm4, %xmm0
+; SSE2-NEXT:    movaps %xmm2, %xmm1
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_ctselect_v8i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    testb $1, %dil
+; AVX-NEXT:    movl $0, %eax
+; AVX-NEXT:    setne %al
+; AVX-NEXT:    movzbl %al, %eax
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    pxor %ymm3, %ymm3
+; AVX-NEXT:    vmovd %eax, %ymm3
+; AVX-NEXT:    vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX-NEXT:    vmovdqa %ymm3, %ymm2
+; AVX-NEXT:    pand %ymm0, %ymm3
+; AVX-NEXT:    pandn %ymm1, %ymm2
+; AVX-NEXT:    por %ymm3, %ymm2
+; AVX-NEXT:    vmovaps %ymm2, %ymm0
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: test_ctselect_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    testb $1, %dil
+; AVX2-NEXT:    movl $0, %eax
+; AVX2-NEXT:    setne %al
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    negl %eax
+; AVX2-NEXT:    pxor %ymm3, %ymm3
+; AVX2-NEXT:    vmovd %eax, %ymm3
+; AVX2-NEXT:    vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX2-NEXT:    vmovdqa %ymm3, %ymm2
+; AVX2-NEXT:    pand %ymm0, %ymm3
+; AVX2-NEXT:    pandn %ymm1, %ymm2
+; AVX2-NEXT:    por %ymm3, %ymm2
+; AVX2-NEXT:    vmovaps %ymm2, %ymm0
+; AVX2-NEXT:    retq
+; AVX512-LABEL: test_ctselect_v8i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    testb %dil, %dil
+; AVX512-NEXT:    je .LBB4_2
+; AVX512-NEXT:  # %bb.1:
+; AVX512-NEXT:    vmovaps %ymm0, %ymm1
+; AVX512-NEXT:  .LBB4_2:
+; AVX512-NEXT:    vmovaps %ymm1, %ymm0
+; AVX512-NEXT:    retq
+  %result = call <8 x i32> @llvm.ct.select.v8i32(i1 %cond, <8 x i32> %a, <8 x i32> %b)
+  ret <8 x i32> %result
+}
+
+define <8 x float> @test_ctselect_v8f32(i1 %cond, <8 x float> %a, <8 x float> %b) {
+; SSE2-LABEL: test_ctselect_v8f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    testb $1, %dil
+; SSE2-NEXT:    movl $0, %eax
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    negl %eax
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    movd %eax, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0]
+; SSE2-NEXT:    movaps %xmm5, %xmm4
+; SSE2-NEXT:    pand %xmm0, %xmm5
+; SSE2-NEXT:    pandn %xmm2, %xmm4
+; SSE2-NEXT:    por %xmm5, %xmm4
+; SSE2-NEXT:    movl $0, %eax
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    negl %eax
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT:    movaps %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pandn %xmm3, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    movaps %xmm4, %xmm0
+; SSE2-NEXT:    movaps %xmm2, %xmm1
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_ctselect_v8f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    testb $1, %dil
+; AVX-NEXT:    movl $0, %eax
+; AVX-NEXT:    setne %al
+; AVX-NEXT:    movzbl %al, %eax
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    pxor %ymm3, %ymm3
+; AVX-NEXT:    vmovd %eax, %ymm3
+; AVX-NEXT:    vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX-NEXT:    vmovdqa %ymm3, %ymm2
+; AVX-NEXT:    pand %ymm0, %ymm3
+; AVX-NEXT:    pandn %ymm1, %ymm2
+; AVX-NEXT:    por %ymm3, %ymm2
+; AVX-NEXT:    vmovaps %ymm2, %ymm0
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: test_ctselect_v8f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    testb $1, %dil
+; AVX2-NEXT:    movl $0, %eax
+; AVX2-NEXT:    setne %al
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    negl %eax
+; AVX2-NEXT:    pxor %ymm3, %ymm3
+; AVX2-NEXT:    vmovd %eax, %ymm3
+; AVX2-NEXT:    vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX2-NEXT:    vmovdqa %ymm3, %ymm2
+; AVX2-NEXT:    pand %ymm0, %ymm3
+; AVX2-NEXT:    pandn %ymm1, %ymm2
+; AVX2-NEXT:    por %ymm3, %ymm2
+; AVX2-NEXT:    vmovaps %ymm2, %ymm0
+; AVX2-NEXT:    retq
+; AVX512-LABEL: test_ctselect_v8f32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    testb %dil, %dil
+; AVX512-NEXT:    je .LBB5_2
+; AVX512-NEXT:  # %bb.1:
+; AVX512-NEXT:    vmovaps %ymm0, %ymm1
+; AVX512-NEXT:  .LBB5_2:
+; AVX512-NEXT:    vmovaps %ymm1, %ymm0
+; AVX512-NEXT:    retq
+  %result = call <8 x float> @llvm.ct.select.v8f32(i1 %cond, <8 x float> %a, <8 x float> %b)
+  ret <8 x float> %result
+}
+
+define <4 x i64> @test_ctselect_v4i64(i1 %cond, <4 x i64> %a, <4 x i64> %b) {
+; SSE2-LABEL: test_ctselect_v4i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    testb $1, %dil
+; SSE2-NEXT:    movl $0, %eax
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    negl %eax
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    movd %eax, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0]
+; SSE2-NEXT:    movdqa %xmm5, %xmm4
+; SSE2-NEXT:    pand %xmm0, %xmm5
+; SSE2-NEXT:    pandn %xmm2, %xmm4
+; SSE2-NEXT:    por %xmm5, %xmm4
+; SSE2-NEXT:    movl $0, %eax
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    negl %eax
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pandn %xmm3, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    movaps %xmm4, %xmm0
+; SSE2-NEXT:    movaps %xmm2, %xmm1
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_ctselect_v4i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    testb $1, %dil
+; AVX-NEXT:    movl $0, %eax
+; AVX-NEXT:    setne %al
+; AVX-NEXT:    movzbl %al, %eax
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    pxor %ymm3, %ymm3
+; AVX-NEXT:    vmovd %eax, %ymm3
+; AVX-NEXT:    vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2]
+; AVX-NEXT:    vmovdqa %ymm3, %ymm2
+; AVX-NEXT:    pand %ymm0, %ymm3
+; AVX-NEXT:    pandn %ymm1, %ymm2
+; AVX-NEXT:    por %ymm3, %ymm2
+; AVX-NEXT:    vmovaps %ymm2, %ymm0
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: test_ctselect_v4i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    testb $1, %dil
+; AVX2-NEXT:    movl $0, %eax
+; AVX2-NEXT:    setne %al
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    negl %eax
+; AVX2-NEXT:    pxor %ymm3, %ymm3
+; AVX2-NEXT:    vmovd %eax, %ymm3
+; AVX2-NEXT:    vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2]
+; AVX2-NEXT:    vmovdqa %ymm3, %ymm2
+; AVX2-NEXT:    pand %ymm0, %ymm3
+; AVX2-NEXT:    pandn %ymm1, %ymm2
+; AVX2-NEXT:    por %ymm3, %ymm2
+; AVX2-NEXT:    vmovaps %ymm2, %ymm0
+; AVX2-NEXT:    retq
+; AVX512-LABEL: test_ctselect_v4i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    testb %dil, %dil
+; AVX512-NEXT:    je .LBB6_2
+; AVX512-NEXT:  # %bb.1:
+; AVX512-NEXT:    vmovaps %ymm0, %ymm1
+; AVX512-NEXT:  .LBB6_2:
+; AVX512-NEXT:    vmovaps %ymm1, %ymm0
+; AVX512-NEXT:    retq
+  %result = call <4 x i64> @llvm.ct.select.v4i64(i1 %cond, <4 x i64> %a, <4 x i64> %b)
+  ret <4 x i64> %result
+}
+
+define <4 x double> @test_ctselect_v4f64(i1 %cond, <4 x double> %a, <4 x double> %b) {
+; SSE2-LABEL: test_ctselect_v4f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    testb $1, %dil
+; SSE2-NEXT:    movl $0, %eax
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    negl %eax
+; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    movd %eax, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0]
+; SSE2-NEXT:    movapd %xmm5, %xmm4
+; SSE2-NEXT:    pand %xmm0, %xmm5
+; SSE2-NEXT:    pandn %xmm2, %xmm4
+; SSE2-NEXT:    por %xmm5, %xmm4
+; SSE2-NEXT:    movl $0, %eax
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    negl %eax
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT:    movapd %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pandn %xmm3, %xmm2
+; SSE2-NEXT:    por %xmm0, %xmm2
+; SSE2-NEXT:    movaps %xmm4, %xmm0
+; SSE2-NEXT:    movaps %xmm2, %xmm1
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_ctselect_v4f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    testb $1, %dil
+; AVX-NEXT:    movl $0, %eax
+; AVX-NEXT:    setne %al
+; AVX-NEXT:    movzbl %al, %eax
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    pxor %ymm3, %ymm3
+; AVX-NEXT:    vmovd %eax, %ymm3
+; AVX-NEXT:    vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2]
+; AVX-NEXT:    vmovdqa %ymm3, %ymm2
+; AVX-NEXT:    pand %ymm0, %ymm3
+; AVX-NEXT:    pandn %ymm1, %ymm2
+; AVX-NEXT:    por %ymm3, %ymm2
+; AVX-NEXT:    vmovaps %ymm2, %ymm0
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: test_ctselect_v4f64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    testb $1, %dil
+; AVX2-NEXT:    movl $0, %eax
+; AVX2-NEXT:    setne %al
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    negl %eax
+; AVX2-NEXT:    pxor %ymm3, %ymm3
+; AVX2-NEXT:    vmovd %eax, %ymm3
+; AVX2-NEXT:    vpermilpd {{.*#+}} ymm3 = ymm3[0,0,2,2]
+; AVX2-NEXT:    vmovdqa %ymm3, %ymm2
+; AVX2-NEXT:    pand %ymm0, %ymm3
+; AVX2-NEXT:    pandn %ymm1, %ymm2
+; AVX2-NEXT:    por %ymm3, %ymm2
+; AVX2-NEXT:    vmovaps %ymm2, %ymm0
+; AVX2-NEXT:    retq
+; AVX512-LABEL: test_ctselect_v4f64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    testb %dil, %dil
+; AVX512-NEXT:    je .LBB7_2
+; AVX512-NEXT:  # %bb.1:
+; AVX512-NEXT:    vmovapd %ymm0, %ymm1
+; AVX512-NEXT:  .LBB7_2:
+; AVX512-NEXT:    vmovapd %ymm1, %ymm0
+; AVX512-NEXT:    retq
+  %result = call <4 x double> @llvm.ct.select.v4f64(i1 %cond, <4 x double> %a, <4 x double> %b)
+  ret <4 x double> %result
+}
+
+; 512-bit vectors (native only with AVX-512; split into narrower halves otherwise)
+define <16 x i32> @test_ctselect_v16i32(i1 %cond, <16 x i32> %a, <16 x i32> %b) {
+; SSE2-LABEL: test_ctselect_v16i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    testb $1, %dil
+; SSE2-NEXT:    movl $0, %eax
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    negl %eax
+; SSE2-NEXT:    pxor %xmm9, %xmm9
+; SSE2-NEXT:    movd %eax, %xmm9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0]
+; SSE2-NEXT:    movdqa %xmm9, %xmm8
+; SSE2-NEXT:    pand %xmm0, %xmm9
+; SSE2-NEXT:    pandn %xmm4, %xmm8
+; SSE2-NEXT:    por %xmm9, %xmm8
+; SSE2-NEXT:    movl $0, %eax
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    negl %eax
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pandn %xmm5, %xmm4
+; SSE2-NEXT:    por %xmm0, %xmm4
+; SSE2-NEXT:    movl $0, %eax
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    negl %eax
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm6, %xmm5
+; SSE2-NEXT:    por %xmm0, %xmm5
+; SSE2-NEXT:    movl $0, %eax
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    negl %eax
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT:    movdqa %xmm0, %xmm6
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn %xmm7, %xmm6
+; SSE2-NEXT:    por %xmm0, %xmm6
+; SSE2-NEXT:    movaps %xmm8, %xmm0
+; SSE2-NEXT:    movaps %xmm4, %xmm1
+; SSE2-NEXT:    movaps %xmm5, %xmm2
+; SSE2-NEXT:    movaps %xmm6, %xmm3
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_ctselect_v16i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    testb $1, %dil
+; AVX-NEXT:    movl $0, %eax
+; AVX-NEXT:    setne %al
+; AVX-NEXT:    movzbl %al, %eax
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    pxor %ymm5, %ymm5
+; AVX-NEXT:    vmovd %eax, %ymm5
+; AVX-NEXT:    vpermilps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
+; AVX-NEXT:    vmovdqa %ymm5, %ymm4
+; AVX-NEXT:    pand %ymm0, %ymm5
+; AVX-NEXT:    pandn %ymm2, %ymm4
+; AVX-NEXT:    por %ymm5, %ymm4
+; AVX-NEXT:    movl $0, %eax
+; AVX-NEXT:    setne %al
+; AVX-NEXT:    movzbl %al, %eax
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    pxor %ymm0, %ymm0
+; AVX-NEXT:    vmovd %eax, %ymm0
+; AVX-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX-NEXT:    vmovdqa %ymm0, %ymm2
+; AVX-NEXT:    pand %ymm1, %ymm0
+; AVX-NEXT:    pandn %ymm3, %ymm2
+; AVX-NEXT:    por %ymm0, %ymm2
+; AVX-NEXT:    vmovaps %ymm4, %ymm0
+; AVX-NEXT:    vmovaps %ymm2, %ymm1
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: test_ctselect_v16i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    testb $1, %dil
+; AVX2-NEXT:    movl $0, %eax
+; AVX2-NEXT:    setne %al
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    negl %eax
+; AVX2-NEXT:    pxor %ymm5, %ymm5
+; AVX2-NEXT:    vmovd %eax, %ymm5
+; AVX2-NEXT:    vpermilps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
+; AVX2-NEXT:    vmovdqa %ymm5, %ymm4
+; AVX2-NEXT:    pand %ymm0, %ymm5
+; AVX2-NEXT:    pandn %ymm2, %ymm4
+; AVX2-NEXT:    por %ymm5, %ymm4
+; AVX2-NEXT:    movl $0, %eax
+; AVX2-NEXT:    setne %al
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    negl %eax
+; AVX2-NEXT:    pxor %ymm0, %ymm0
+; AVX2-NEXT:    vmovd %eax, %ymm0
+; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX2-NEXT:    vmovdqa %ymm0, %ymm2
+; AVX2-NEXT:    pand %ymm1, %ymm0
+; AVX2-NEXT:    pandn %ymm3, %ymm2
+; AVX2-NEXT:    por %ymm0, %ymm2
+; AVX2-NEXT:    vmovaps %ymm4, %ymm0
+; AVX2-NEXT:    vmovaps %ymm2, %ymm1
+; AVX2-NEXT:    retq
+; AVX512-LABEL: test_ctselect_v16i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    testb %dil, %dil
+; AVX512-NEXT:    je .LBB8_2
+; AVX512-NEXT:  # %bb.1:
+; AVX512-NEXT:    vmovaps %zmm0, %zmm1
+; AVX512-NEXT:  .LBB8_2:
+; AVX512-NEXT:    vmovaps %zmm1, %zmm0
+; AVX512-NEXT:    retq
+  %result = call <16 x i32> @llvm.ct.select.v16i32(i1 %cond, <16 x i32> %a, <16 x i32> %b)
+  ret <16 x i32> %result
+}
+
+define <16 x float> @test_ctselect_v16f32(i1 %cond, <16 x float> %a, <16 x float> %b) {
+; SSE2-LABEL: test_ctselect_v16f32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    testb $1, %dil
+; SSE2-NEXT:    movl $0, %eax
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    negl %eax
+; SSE2-NEXT:    pxor %xmm9, %xmm9
+; SSE2-NEXT:    movd %eax, %xmm9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0]
+; SSE2-NEXT:    movaps %xmm9, %xmm8
+; SSE2-NEXT:    pand %xmm0, %xmm9
+; SSE2-NEXT:    pandn %xmm4, %xmm8
+; SSE2-NEXT:    por %xmm9, %xmm8
+; SSE2-NEXT:    movl $0, %eax
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    negl %eax
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT:    movaps %xmm0, %xmm4
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pandn %xmm5, %xmm4
+; SSE2-NEXT:    por %xmm0, %xmm4
+; SSE2-NEXT:    movl $0, %eax
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    negl %eax
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT:    movaps %xmm0, %xmm5
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm6, %xmm5
+; SSE2-NEXT:    por %xmm0, %xmm5
+; SSE2-NEXT:    movl $0, %eax
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    negl %eax
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT:    movaps %xmm0, %xmm6
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn %xmm7, %xmm6
+; SSE2-NEXT:    por %xmm0, %xmm6
+; SSE2-NEXT:    movaps %xmm8, %xmm0
+; SSE2-NEXT:    movaps %xmm4, %xmm1
+; SSE2-NEXT:    movaps %xmm5, %xmm2
+; SSE2-NEXT:    movaps %xmm6, %xmm3
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_ctselect_v16f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    testb $1, %dil
+; AVX-NEXT:    movl $0, %eax
+; AVX-NEXT:    setne %al
+; AVX-NEXT:    movzbl %al, %eax
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    pxor %ymm5, %ymm5
+; AVX-NEXT:    vmovd %eax, %ymm5
+; AVX-NEXT:    vpermilps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
+; AVX-NEXT:    vmovdqa %ymm5, %ymm4
+; AVX-NEXT:    pand %ymm0, %ymm5
+; AVX-NEXT:    pandn %ymm2, %ymm4
+; AVX-NEXT:    por %ymm5, %ymm4
+; AVX-NEXT:    movl $0, %eax
+; AVX-NEXT:    setne %al
+; AVX-NEXT:    movzbl %al, %eax
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    pxor %ymm0, %ymm0
+; AVX-NEXT:    vmovd %eax, %ymm0
+; AVX-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX-NEXT:    vmovdqa %ymm0, %ymm2
+; AVX-NEXT:    pand %ymm1, %ymm0
+; AVX-NEXT:    pandn %ymm3, %ymm2
+; AVX-NEXT:    por %ymm0, %ymm2
+; AVX-NEXT:    vmovaps %ymm4, %ymm0
+; AVX-NEXT:    vmovaps %ymm2, %ymm1
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: test_ctselect_v16f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    testb $1, %dil
+; AVX2-NEXT:    movl $0, %eax
+; AVX2-NEXT:    setne %al
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    negl %eax
+; AVX2-NEXT:    pxor %ymm5, %ymm5
+; AVX2-NEXT:    vmovd %eax, %ymm5
+; AVX2-NEXT:    vpermilps {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
+; AVX2-NEXT:    vmovdqa %ymm5, %ymm4
+; AVX2-NEXT:    pand %ymm0, %ymm5
+; AVX2-NEXT:    pandn %ymm2, %ymm4
+; AVX2-NEXT:    por %ymm5, %ymm4
+; AVX2-NEXT:    movl $0, %eax
+; AVX2-NEXT:    setne %al
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    negl %eax
+; AVX2-NEXT:    pxor %ymm0, %ymm0
+; AVX2-NEXT:    vmovd %eax, %ymm0
+; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX2-NEXT:    vmovdqa %ymm0, %ymm2
+; AVX2-NEXT:    pand %ymm1, %ymm0
+; AVX2-NEXT:    pandn %ymm3, %ymm2
+; AVX2-NEXT:    por %ymm0, %ymm2
+; AVX2-NEXT:    vmovaps %ymm4, %ymm0
+; AVX2-NEXT:    vmovaps %ymm2, %ymm1
+; AVX2-NEXT:    retq
+; AVX512-LABEL: test_ctselect_v16f32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    testb %dil, %dil
+; AVX512-NEXT:    je .LBB9_2
+; AVX512-NEXT:  # %bb.1:
+; AVX512-NEXT:    vmovaps %zmm0, %zmm1
+; AVX512-NEXT:  .LBB9_2:
+; AVX512-NEXT:    vmovaps %zmm1, %zmm0
+; AVX512-NEXT:    retq
+  %result = call <16 x float> @llvm.ct.select.v16f32(i1 %cond, <16 x float> %a, <16 x float> %b)
+  ret <16 x float> %result
+}
+
+define <8 x i64> @test_ctselect_v8i64(i1 %cond, <8 x i64> %a, <8 x i64> %b) {
+; SSE2-LABEL: test_ctselect_v8i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    testb $1, %dil
+; SSE2-NEXT:    movl $0, %eax
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    negl %eax
+; SSE2-NEXT:    pxor %xmm9, %xmm9
+; SSE2-NEXT:    movd %eax, %xmm9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0]
+; SSE2-NEXT:    movdqa %xmm9, %xmm8
+; SSE2-NEXT:    pand %xmm0, %xmm9
+; SSE2-NEXT:    pandn %xmm4, %xmm8
+; SSE2-NEXT:    por %xmm9, %xmm8
+; SSE2-NEXT:    movl $0, %eax
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    negl %eax
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pandn %xmm5, %xmm4
+; SSE2-NEXT:    por %xmm0, %xmm4
+; SSE2-NEXT:    movl $0, %eax
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    negl %eax
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm6, %xmm5
+; SSE2-NEXT:    por %xmm0, %xmm5
+; SSE2-NEXT:    movl $0, %eax
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    negl %eax
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT:    movdqa %xmm0, %xmm6
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn %xmm7, %xmm6
+; SSE2-NEXT:    por %xmm0, %xmm6
+; SSE2-NEXT:    movaps %xmm8, %xmm0
+; SSE2-NEXT:    movaps %xmm4, %xmm1
+; SSE2-NEXT:    movaps %xmm5, %xmm2
+; SSE2-NEXT:    movaps %xmm6, %xmm3
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_ctselect_v8i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    testb $1, %dil
+; AVX-NEXT:    movl $0, %eax
+; AVX-NEXT:    setne %al
+; AVX-NEXT:    movzbl %al, %eax
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    pxor %ymm5, %ymm5
+; AVX-NEXT:    vmovd %eax, %ymm5
+; AVX-NEXT:    vpermilpd {{.*#+}} ymm5 = ymm5[0,0,2,2]
+; AVX-NEXT:    vmovdqa %ymm5, %ymm4
+; AVX-NEXT:    pand %ymm0, %ymm5
+; AVX-NEXT:    pandn %ymm2, %ymm4
+; AVX-NEXT:    por %ymm5, %ymm4
+; AVX-NEXT:    movl $0, %eax
+; AVX-NEXT:    setne %al
+; AVX-NEXT:    movzbl %al, %eax
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    pxor %ymm0, %ymm0
+; AVX-NEXT:    vmovd %eax, %ymm0
+; AVX-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX-NEXT:    vmovdqa %ymm0, %ymm2
+; AVX-NEXT:    pand %ymm1, %ymm0
+; AVX-NEXT:    pandn %ymm3, %ymm2
+; AVX-NEXT:    por %ymm0, %ymm2
+; AVX-NEXT:    vmovaps %ymm4, %ymm0
+; AVX-NEXT:    vmovaps %ymm2, %ymm1
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: test_ctselect_v8i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    testb $1, %dil
+; AVX2-NEXT:    movl $0, %eax
+; AVX2-NEXT:    setne %al
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    negl %eax
+; AVX2-NEXT:    pxor %ymm5, %ymm5
+; AVX2-NEXT:    vmovd %eax, %ymm5
+; AVX2-NEXT:    vpermilpd {{.*#+}} ymm5 = ymm5[0,0,2,2]
+; AVX2-NEXT:    vmovdqa %ymm5, %ymm4
+; AVX2-NEXT:    pand %ymm0, %ymm5
+; AVX2-NEXT:    pandn %ymm2, %ymm4
+; AVX2-NEXT:    por %ymm5, %ymm4
+; AVX2-NEXT:    movl $0, %eax
+; AVX2-NEXT:    setne %al
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    negl %eax
+; AVX2-NEXT:    pxor %ymm0, %ymm0
+; AVX2-NEXT:    vmovd %eax, %ymm0
+; AVX2-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX2-NEXT:    vmovdqa %ymm0, %ymm2
+; AVX2-NEXT:    pand %ymm1, %ymm0
+; AVX2-NEXT:    pandn %ymm3, %ymm2
+; AVX2-NEXT:    por %ymm0, %ymm2
+; AVX2-NEXT:    vmovaps %ymm4, %ymm0
+; AVX2-NEXT:    vmovaps %ymm2, %ymm1
+; AVX2-NEXT:    retq
+; AVX512-LABEL: test_ctselect_v8i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    testb %dil, %dil
+; AVX512-NEXT:    je .LBB10_2
+; AVX512-NEXT:  # %bb.1:
+; AVX512-NEXT:    vmovaps %zmm0, %zmm1
+; AVX512-NEXT:  .LBB10_2:
+; AVX512-NEXT:    vmovaps %zmm1, %zmm0
+; AVX512-NEXT:    retq
+  %result = call <8 x i64> @llvm.ct.select.v8i64(i1 %cond, <8 x i64> %a, <8 x i64> %b)
+  ret <8 x i64> %result
+}
+
+define <8 x double> @test_ctselect_v8f64(i1 %cond, <8 x double> %a, <8 x double> %b) {
+; SSE2-LABEL: test_ctselect_v8f64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    testb $1, %dil
+; SSE2-NEXT:    movl $0, %eax
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    negl %eax
+; SSE2-NEXT:    pxor %xmm9, %xmm9
+; SSE2-NEXT:    movd %eax, %xmm9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0]
+; SSE2-NEXT:    movapd %xmm9, %xmm8
+; SSE2-NEXT:    pand %xmm0, %xmm9
+; SSE2-NEXT:    pandn %xmm4, %xmm8
+; SSE2-NEXT:    por %xmm9, %xmm8
+; SSE2-NEXT:    movl $0, %eax
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    negl %eax
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT:    movapd %xmm0, %xmm4
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    pandn %xmm5, %xmm4
+; SSE2-NEXT:    por %xmm0, %xmm4
+; SSE2-NEXT:    movl $0, %eax
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    negl %eax
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT:    movapd %xmm0, %xmm5
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pandn %xmm6, %xmm5
+; SSE2-NEXT:    por %xmm0, %xmm5
+; SSE2-NEXT:    movl $0, %eax
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    negl %eax
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT:    movapd %xmm0, %xmm6
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pandn %xmm7, %xmm6
+; SSE2-NEXT:    por %xmm0, %xmm6
+; SSE2-NEXT:    movaps %xmm8, %xmm0
+; SSE2-NEXT:    movaps %xmm4, %xmm1
+; SSE2-NEXT:    movaps %xmm5, %xmm2
+; SSE2-NEXT:    movaps %xmm6, %xmm3
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_ctselect_v8f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    testb $1, %dil
+; AVX-NEXT:    movl $0, %eax
+; AVX-NEXT:    setne %al
+; AVX-NEXT:    movzbl %al, %eax
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    pxor %ymm5, %ymm5
+; AVX-NEXT:    vmovd %eax, %ymm5
+; AVX-NEXT:    vpermilpd {{.*#+}} ymm5 = ymm5[0,0,2,2]
+; AVX-NEXT:    vmovdqa %ymm5, %ymm4
+; AVX-NEXT:    pand %ymm0, %ymm5
+; AVX-NEXT:    pandn %ymm2, %ymm4
+; AVX-NEXT:    por %ymm5, %ymm4
+; AVX-NEXT:    movl $0, %eax
+; AVX-NEXT:    setne %al
+; AVX-NEXT:    movzbl %al, %eax
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    pxor %ymm0, %ymm0
+; AVX-NEXT:    vmovd %eax, %ymm0
+; AVX-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX-NEXT:    vmovdqa %ymm0, %ymm2
+; AVX-NEXT:    pand %ymm1, %ymm0
+; AVX-NEXT:    pandn %ymm3, %ymm2
+; AVX-NEXT:    por %ymm0, %ymm2
+; AVX-NEXT:    vmovaps %ymm4, %ymm0
+; AVX-NEXT:    vmovaps %ymm2, %ymm1
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: test_ctselect_v8f64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    testb $1, %dil
+; AVX2-NEXT:    movl $0, %eax
+; AVX2-NEXT:    setne %al
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    negl %eax
+; AVX2-NEXT:    pxor %ymm5, %ymm5
+; AVX2-NEXT:    vmovd %eax, %ymm5
+; AVX2-NEXT:    vpermilpd {{.*#+}} ymm5 = ymm5[0,0,2,2]
+; AVX2-NEXT:    vmovdqa %ymm5, %ymm4
+; AVX2-NEXT:    pand %ymm0, %ymm5
+; AVX2-NEXT:    pandn %ymm2, %ymm4
+; AVX2-NEXT:    por %ymm5, %ymm4
+; AVX2-NEXT:    movl $0, %eax
+; AVX2-NEXT:    setne %al
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    negl %eax
+; AVX2-NEXT:    pxor %ymm0, %ymm0
+; AVX2-NEXT:    vmovd %eax, %ymm0
+; AVX2-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX2-NEXT:    vmovdqa %ymm0, %ymm2
+; AVX2-NEXT:    pand %ymm1, %ymm0
+; AVX2-NEXT:    pandn %ymm3, %ymm2
+; AVX2-NEXT:    por %ymm0, %ymm2
+; AVX2-NEXT:    vmovaps %ymm4, %ymm0
+; AVX2-NEXT:    vmovaps %ymm2, %ymm1
+; AVX2-NEXT:    retq
+; AVX512-LABEL: test_ctselect_v8f64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    testb %dil, %dil
+; AVX512-NEXT:    je .LBB11_2
+; AVX512-NEXT:  # %bb.1:
+; AVX512-NEXT:    vmovapd %zmm0, %zmm1
+; AVX512-NEXT:  .LBB11_2:
+; AVX512-NEXT:    vmovapd %zmm1, %zmm0
+; AVX512-NEXT:    retq
+  %result = call <8 x double> @llvm.ct.select.v8f64(i1 %cond, <8 x double> %a, <8 x double> %b)
+  ret <8 x double> %result
+}
+
+; Test with constant conditions for vector types
+define <4 x i32> @test_ctselect_v4i32_const_true(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test_ctselect_v4i32_const_true:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movb $1, %al
+; SSE2-NEXT:    testb %al, %al
+; SSE2-NEXT:    movl $0, %eax
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    negl %eax
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    movd %eax, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    pand %xmm0, %xmm3
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm3, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_ctselect_v4i32_const_true:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movb $1, %al
+; AVX-NEXT:    testb %al, %al
+; AVX-NEXT:    movl $0, %eax
+; AVX-NEXT:    setne %al
+; AVX-NEXT:    movzbl %al, %eax
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    pxor %xmm3, %xmm3
+; AVX-NEXT:    movd %eax, %xmm3
+; AVX-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX-NEXT:    movdqa %xmm3, %xmm2
+; AVX-NEXT:    pand %xmm0, %xmm3
+; AVX-NEXT:    pandn %xmm1, %xmm2
+; AVX-NEXT:    por %xmm3, %xmm2
+; AVX-NEXT:    vmovaps %xmm2, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: test_ctselect_v4i32_const_true:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movb $1, %al
+; AVX2-NEXT:    testb %al, %al
+; AVX2-NEXT:    movl $0, %eax
+; AVX2-NEXT:    setne %al
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    negl %eax
+; AVX2-NEXT:    pxor %xmm3, %xmm3
+; AVX2-NEXT:    movd %eax, %xmm3
+; AVX2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX2-NEXT:    movdqa %xmm3, %xmm2
+; AVX2-NEXT:    pand %xmm0, %xmm3
+; AVX2-NEXT:    pandn %xmm1, %xmm2
+; AVX2-NEXT:    por %xmm3, %xmm2
+; AVX2-NEXT:    vmovaps %xmm2, %xmm0
+; AVX2-NEXT:    retq
+; AVX512-LABEL: test_ctselect_v4i32_const_true:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    retq
+  %result = call <4 x i32> @llvm.ct.select.v4i32(i1 true, <4 x i32> %a, <4 x i32> %b)
+  ret <4 x i32> %result
+}
+
+define <4 x i32> @test_ctselect_v4i32_const_false(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test_ctselect_v4i32_const_false:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    testb %al, %al
+; SSE2-NEXT:    movl $0, %eax
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    negl %eax
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    movd %eax, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    pand %xmm0, %xmm3
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm3, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_ctselect_v4i32_const_false:
+; AVX:       # %bb.0:
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    testb %al, %al
+; AVX-NEXT:    movl $0, %eax
+; AVX-NEXT:    setne %al
+; AVX-NEXT:    movzbl %al, %eax
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    pxor %xmm3, %xmm3
+; AVX-NEXT:    movd %eax, %xmm3
+; AVX-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX-NEXT:    movdqa %xmm3, %xmm2
+; AVX-NEXT:    pand %xmm0, %xmm3
+; AVX-NEXT:    pandn %xmm1, %xmm2
+; AVX-NEXT:    por %xmm3, %xmm2
+; AVX-NEXT:    vmovaps %xmm2, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: test_ctselect_v4i32_const_false:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    testb %al, %al
+; AVX2-NEXT:    movl $0, %eax
+; AVX2-NEXT:    setne %al
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    negl %eax
+; AVX2-NEXT:    pxor %xmm3, %xmm3
+; AVX2-NEXT:    movd %eax, %xmm3
+; AVX2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX2-NEXT:    movdqa %xmm3, %xmm2
+; AVX2-NEXT:    pand %xmm0, %xmm3
+; AVX2-NEXT:    pandn %xmm1, %xmm2
+; AVX2-NEXT:    por %xmm3, %xmm2
+; AVX2-NEXT:    vmovaps %xmm2, %xmm0
+; AVX2-NEXT:    retq
+; AVX512-LABEL: test_ctselect_v4i32_const_false:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vmovaps %xmm1, %xmm0
+; AVX512-NEXT:    retq
+  %result = call <4 x i32> @llvm.ct.select.v4i32(i1 false, <4 x i32> %a, <4 x i32> %b)
+  ret <4 x i32> %result
+}
+
+; Test with comparison conditions for vector types
+define <4 x i32> @test_ctselect_v4i32_icmp(i32 %x, i32 %y, <4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: test_ctselect_v4i32_icmp:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    cmpl %esi, %edi
+; SSE2-NEXT:    sete %al
+; SSE2-NEXT:    testb %al, %al
+; SSE2-NEXT:    movl $0, %eax
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    movzbl %al, %eax
+; SSE2-NEXT:    negl %eax
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    movd %eax, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    pand %xmm0, %xmm3
+; SSE2-NEXT:    pandn %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm3, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; AVX-LABEL: test_ctselect_v4i32_icmp:
+; AVX:       # %bb.0:
+; AVX-NEXT:    cmpl %esi, %edi
+; AVX-NEXT:    sete %al
+; AVX-NEXT:    testb %al, %al
+; AVX-NEXT:    movl $0, %eax
+; AVX-NEXT:    setne %al
+; AVX-NEXT:    movzbl %al, %eax
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    pxor %xmm3, %xmm3
+; AVX-NEXT:    movd %eax, %xmm3
+; AVX-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX-NEXT:    movdqa %xmm3, %xmm2
+; AVX-NEXT:    pand %xmm0, %xmm3
+; AVX-NEXT:    pandn %xmm1, %xmm2
+; AVX-NEXT:    por %xmm3, %xmm2
+; AVX-NEXT:    vmovaps %xmm2, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: test_ctselect_v4i32_icmp:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    cmpl %esi, %edi
+; AVX2-NEXT:    sete %al
+; AVX2-NEXT:    testb %al, %al
+; AVX2-NEXT:    movl $0, %eax
+; AVX2-NEXT:    setne %al
+; AVX2-NEXT:    movzbl %al, %eax
+; AVX2-NEXT:    negl %eax
+; AVX2-NEXT:    pxor %xmm3, %xmm3
+; AVX2-NEXT:    movd %eax, %xmm3
+; AVX2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; AVX2-NEXT:    movdqa %xmm3, %xmm2
+; AVX2-NEXT:    pand %xmm0, %xmm3
+; AVX2-NEXT:    pandn %xmm1, %xmm2
+; AVX2-NEXT:    por %xmm3, %xmm2
+; AVX2-NEXT:    vmovaps %xmm2, %xmm0
+; AVX2-NEXT:    retq
+; AVX512-LABEL: test_ctselect_v4i32_icmp:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    cmpl %esi, %edi
+; AVX512-NEXT:    je .LBB14_2
+; AVX512-NEXT:  # %bb.1:
+; AVX512-NEXT:    vmovaps %xmm1, %xmm0
+; AVX512-NEXT:  .LBB14_2:
+; AVX512-NEXT:    retq
+  %cond = icmp eq i32 %x, %y
+  %result = call <4 x i32> @llvm.ct.select.v4i32(i1 %cond, <4 x i32> %a, <4 x i32> %b)
+  ret <4 x i32> %result
+}
+
+; Declare the intrinsics
+declare <4 x i32> @llvm.ct.select.v4i32(i1, <4 x i32>, <4 x i32>)
+declare <4 x float> @llvm.ct.select.v4f32(i1, <4 x float>, <4 x float>)
+declare <2 x i64> @llvm.ct.select.v2i64(i1, <2 x i64>, <2 x i64>)
+declare <2 x double> @llvm.ct.select.v2f64(i1, <2 x double>, <2 x double>)
+declare <8 x i32> @llvm.ct.select.v8i32(i1, <8 x i32>, <8 x i32>)
+declare <8 x float> @llvm.ct.select.v8f32(i1, <8 x float>, <8 x float>)
+declare <4 x i64> @llvm.ct.select.v4i64(i1, <4 x i64>, <4 x i64>)
+declare <4 x double> @llvm.ct.select.v4f64(i1, <4 x double>, <4 x double>)
+declare <16 x i32> @llvm.ct.select.v16i32(i1, <16 x i32>, <16 x i32>)
+declare <16 x float> @llvm.ct.select.v16f32(i1, <16 x float>, <16 x float>)
+declare <8 x i64> @llvm.ct.select.v8i64(i1, <8 x i64>, <8 x i64>)
+declare <8 x double> @llvm.ct.select.v8f64(i1, <8 x double>, <8 x double>)
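
For reference, the setne/neg/pshufd/pand/pandn/por sequences checked above are the standard branch-free mask select: broadcast 0 or all-ones from the condition, AND each operand with the mask or its complement, then OR the halves. A minimal IR sketch of that pattern follows; it is an illustration only, with a made-up function name, not code from this patch:

  define <4 x i32> @ctselect_v4i32_mask_sketch(i1 %cond, <4 x i32> %a, <4 x i32> %b) {
    %c = zext i1 %cond to i32
    %m = sub i32 0, %c                                   ; 0 when false, -1 (all-ones) when true
    %m.ins = insertelement <4 x i32> poison, i32 %m, i32 0
    %mask = shufflevector <4 x i32> %m.ins, <4 x i32> poison, <4 x i32> zeroinitializer
    %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
    %ta = and <4 x i32> %a, %mask                        ; keeps %a when %cond is true
    %fb = and <4 x i32> %b, %notmask                     ; keeps %b when %cond is false
    %r = or <4 x i32> %ta, %fb
    ret <4 x i32> %r
  }
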
diff --git a/llvm/test/CodeGen/X86/ctselect.ll b/llvm/test/CodeGen/X86/ctselect.ll
index 095787a5e2a4b..d76ae0365f28c 100644
--- a/llvm/test/CodeGen/X86/ctselect.ll
+++ b/llvm/test/CodeGen/X86/ctselect.ll
@@ -8,39 +8,33 @@
 define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) {
 ; X64-LABEL: test_ctselect_i8:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    andb $1, %dil
-; X64-NEXT:    leal -1(%rdi), %eax
-; X64-NEXT:    movl %edi, %ecx
-; X64-NEXT:    negb %cl
-; X64-NEXT:    andb %sil, %cl
-; X64-NEXT:    andb %dl, %al
-; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    movl %edx, %eax
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    cmovnel %esi, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: test_ctselect_i8:
 ; X32:       # %bb.0:
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    andb $1, %al
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    negb %cl
-; X32-NEXT:    andb {{[0-9]+}}(%esp), %cl
-; X32-NEXT:    decb %al
-; X32-NEXT:    andb {{[0-9]+}}(%esp), %al
-; X32-NEXT:    orb %cl, %al
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT:    cmovnel {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    # kill: def $al killed $al killed $eax
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_i8:
 ; X32-NOCMOV:       # %bb.0:
-; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    andb $1, %al
-; X32-NOCMOV-NEXT:    movl %eax, %ecx
-; X32-NOCMOV-NEXT:    negb %cl
-; X32-NOCMOV-NEXT:    andb {{[0-9]+}}(%esp), %cl
-; X32-NOCMOV-NEXT:    decb %al
-; X32-NOCMOV-NEXT:    andb {{[0-9]+}}(%esp), %al
-; X32-NOCMOV-NEXT:    orb %cl, %al
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT:    sete %ah
+; X32-NOCMOV-NEXT:    movb %ah, %ch
+; X32-NOCMOV-NEXT:    negb %ch
+; X32-NOCMOV-NEXT:    movb %dl, %al
+; X32-NOCMOV-NEXT:    andb %ch, %al
+; X32-NOCMOV-NEXT:    notb %ch
+; X32-NOCMOV-NEXT:    andb %cl, %ch
+; X32-NOCMOV-NEXT:    orb %ch, %al
 ; X32-NOCMOV-NEXT:    retl
   %result = call i8 @llvm.ct.select.i8(i1 %cond, i8 %a, i8 %b)
   ret i8 %result
@@ -49,39 +43,43 @@ define i8 @test_ctselect_i8(i1 %cond, i8 %a, i8 %b) {
 define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) {
 ; X64-LABEL: test_ctselect_i16:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    andl $1, %edi
-; X64-NEXT:    leal -1(%rdi), %ecx
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    negl %eax
-; X64-NEXT:    andl %esi, %eax
-; X64-NEXT:    andl %edx, %ecx
-; X64-NEXT:    orl %ecx, %eax
+; X64-NEXT:    movl %edx, %eax
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    cmovnel %esi, %eax
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: test_ctselect_i16:
 ; X32:       # %bb.0:
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    andl $1, %eax
-; X32-NEXT:    leal -1(%eax), %ecx
-; X32-NEXT:    andw {{[0-9]+}}(%esp), %cx
-; X32-NEXT:    negl %eax
-; X32-NEXT:    andw {{[0-9]+}}(%esp), %ax
-; X32-NEXT:    orl %ecx, %eax
-; X32-NEXT:    # kill: def $ax killed $ax killed $eax
+; X32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT:    cmovnew {{[0-9]+}}(%esp), %ax
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_i16:
 ; X32-NOCMOV:       # %bb.0:
-; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    andl $1, %eax
-; X32-NOCMOV-NEXT:    leal -1(%eax), %ecx
-; X32-NOCMOV-NEXT:    andw {{[0-9]+}}(%esp), %cx
-; X32-NOCMOV-NEXT:    negl %eax
-; X32-NOCMOV-NEXT:    andw {{[0-9]+}}(%esp), %ax
-; X32-NOCMOV-NEXT:    orl %ecx, %eax
-; X32-NOCMOV-NEXT:    # kill: def $ax killed $ax killed $eax
+; X32-NOCMOV-NEXT:    pushl %ebx
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    pushl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT:    .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT:    .cfi_offset %ebx, -8
+; X32-NOCMOV-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT:    sete %bl
+; X32-NOCMOV-NEXT:    movb %bl, %bh
+; X32-NOCMOV-NEXT:    movzbw %bh, %si
+; X32-NOCMOV-NEXT:    negw %si
+; X32-NOCMOV-NEXT:    movw %dx, %ax
+; X32-NOCMOV-NEXT:    andw %si, %ax
+; X32-NOCMOV-NEXT:    notw %si
+; X32-NOCMOV-NEXT:    andw %cx, %si
+; X32-NOCMOV-NEXT:    orw %si, %ax
+; X32-NOCMOV-NEXT:    popl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    popl %ebx
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
 ; X32-NOCMOV-NEXT:    retl
   %result = call i16 @llvm.ct.select.i16(i1 %cond, i16 %a, i16 %b)
   ret i16 %result
@@ -90,38 +88,42 @@ define i16 @test_ctselect_i16(i1 %cond, i16 %a, i16 %b) {
 define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) {
 ; X64-LABEL: test_ctselect_i32:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    andl $1, %edi
-; X64-NEXT:    leal -1(%rdi), %eax
-; X64-NEXT:    movl %edi, %ecx
-; X64-NEXT:    negl %ecx
-; X64-NEXT:    andl %esi, %ecx
-; X64-NEXT:    andl %edx, %eax
-; X64-NEXT:    orl %ecx, %eax
+; X64-NEXT:    movl %edx, %eax
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    cmovnel %esi, %eax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: test_ctselect_i32:
 ; X32:       # %bb.0:
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    andl $1, %eax
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    negl %ecx
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    decl %eax
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    orl %ecx, %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT:    cmovnel {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_i32:
 ; X32-NOCMOV:       # %bb.0:
-; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    andl $1, %eax
-; X32-NOCMOV-NEXT:    movl %eax, %ecx
-; X32-NOCMOV-NEXT:    negl %ecx
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    decl %eax
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    orl %ecx, %eax
+; X32-NOCMOV-NEXT:    pushl %ebx
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    pushl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT:    .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT:    .cfi_offset %ebx, -8
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT:    sete %bl
+; X32-NOCMOV-NEXT:    movb %bl, %bh
+; X32-NOCMOV-NEXT:    movzbl %bh, %esi
+; X32-NOCMOV-NEXT:    negl %esi
+; X32-NOCMOV-NEXT:    movl %edx, %eax
+; X32-NOCMOV-NEXT:    andl %esi, %eax
+; X32-NOCMOV-NEXT:    notl %esi
+; X32-NOCMOV-NEXT:    andl %ecx, %esi
+; X32-NOCMOV-NEXT:    orl %esi, %eax
+; X32-NOCMOV-NEXT:    popl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    popl %ebx
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
 ; X32-NOCMOV-NEXT:    retl
   %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
   ret i32 %result
@@ -130,56 +132,66 @@ define i32 @test_ctselect_i32(i1 %cond, i32 %a, i32 %b) {
 define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) {
 ; X64-LABEL: test_ctselect_i64:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    andl $1, %edi
-; X64-NEXT:    leaq -1(%rdi), %rax
-; X64-NEXT:    negq %rdi
-; X64-NEXT:    andq %rsi, %rdi
-; X64-NEXT:    andq %rdx, %rax
-; X64-NEXT:    orq %rdi, %rax
+; X64-NEXT:    movq %rdx, %rax
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    cmovneq %rsi, %rax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: test_ctselect_i64:
 ; X32:       # %bb.0:
-; X32-NEXT:    pushl %esi
-; X32-NEXT:    .cfi_def_cfa_offset 8
-; X32-NEXT:    .cfi_offset %esi, -8
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    xorl %edx, %eax
-; X32-NEXT:    andl $1, %esi
-; X32-NEXT:    negl %esi
-; X32-NEXT:    andl %esi, %eax
-; X32-NEXT:    xorl %edx, %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    xorl %ecx, %edx
-; X32-NEXT:    andl %esi, %edx
-; X32-NEXT:    xorl %ecx, %edx
-; X32-NEXT:    popl %esi
-; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT:    cmovnel {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    cmovnel {{[0-9]+}}(%esp), %edx
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_i64:
 ; X32-NOCMOV:       # %bb.0:
-; X32-NOCMOV-NEXT:    pushl %esi
+; X32-NOCMOV-NEXT:    pushl %ebp
 ; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
-; X32-NOCMOV-NEXT:    .cfi_offset %esi, -8
-; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %esi
+; X32-NOCMOV-NEXT:    pushl %ebx
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT:    pushl %edi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT:    pushl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 20
+; X32-NOCMOV-NEXT:    .cfi_offset %esi, -20
+; X32-NOCMOV-NEXT:    .cfi_offset %edi, -16
+; X32-NOCMOV-NEXT:    .cfi_offset %ebx, -12
+; X32-NOCMOV-NEXT:    .cfi_offset %ebp, -8
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    xorl %edx, %eax
-; X32-NOCMOV-NEXT:    andl $1, %esi
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; X32-NOCMOV-NEXT:    testb $1, %bl
+; X32-NOCMOV-NEXT:    sete %bh
+; X32-NOCMOV-NEXT:    movb %bh, %cl
+; X32-NOCMOV-NEXT:    movzbl %cl, %esi
 ; X32-NOCMOV-NEXT:    negl %esi
+; X32-NOCMOV-NEXT:    movl %edx, %eax
 ; X32-NOCMOV-NEXT:    andl %esi, %eax
-; X32-NOCMOV-NEXT:    xorl %edx, %eax
-; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT:    xorl %ecx, %edx
+; X32-NOCMOV-NEXT:    notl %esi
+; X32-NOCMOV-NEXT:    andl %ebp, %esi
+; X32-NOCMOV-NEXT:    orl %esi, %eax
+; X32-NOCMOV-NEXT:    testb $1, %bl
+; X32-NOCMOV-NEXT:    sete %cl
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NOCMOV-NEXT:    movb %cl, %ch
+; X32-NOCMOV-NEXT:    movzbl %ch, %esi
+; X32-NOCMOV-NEXT:    negl %esi
+; X32-NOCMOV-NEXT:    movl %edi, %edx
 ; X32-NOCMOV-NEXT:    andl %esi, %edx
-; X32-NOCMOV-NEXT:    xorl %ecx, %edx
+; X32-NOCMOV-NEXT:    notl %esi
+; X32-NOCMOV-NEXT:    andl %ebx, %esi
+; X32-NOCMOV-NEXT:    orl %esi, %edx
 ; X32-NOCMOV-NEXT:    popl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT:    popl %edi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT:    popl %ebx
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    popl %ebp
 ; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
 ; X32-NOCMOV-NEXT:    retl
   %result = call i64 @llvm.ct.select.i64(i1 %cond, i64 %a, i64 %b)
@@ -189,51 +201,74 @@ define i64 @test_ctselect_i64(i1 %cond, i64 %a, i64 %b) {
 define float @test_ctselect_f32(i1 %cond, float %a, float %b) {
 ; X64-LABEL: test_ctselect_f32:
 ; X64:       # %bb.0:
-; X64-NEXT:    movd %xmm1, %eax
-; X64-NEXT:    movd %xmm0, %ecx
-; X64-NEXT:    andl $1, %edi
-; X64-NEXT:    movl %edi, %edx
-; X64-NEXT:    negl %edx
-; X64-NEXT:    andl %ecx, %edx
-; X64-NEXT:    decl %edi
-; X64-NEXT:    andl %eax, %edi
-; X64-NEXT:    orl %edx, %edi
-; X64-NEXT:    movd %edi, %xmm0
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    movd %xmm1, %ecx
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    cmovnel %eax, %ecx
+; X64-NEXT:    movd %ecx, %xmm0
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: test_ctselect_f32:
 ; X32:       # %bb.0:
-; X32-NEXT:    pushl %eax
+; X32-NEXT:    pushl %edi
 ; X32-NEXT:    .cfi_def_cfa_offset 8
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    andl $1, %eax
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    negl %ecx
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    decl %eax
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    orl %ecx, %eax
-; X32-NEXT:    movl %eax, (%esp)
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    .cfi_def_cfa_offset 16
+; X32-NEXT:    .cfi_offset %esi, -12
+; X32-NEXT:    .cfi_offset %edi, -8
+; X32-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT:    sete %al
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movb %al, %ah
+; X32-NEXT:    movzbl %ah, %edi
+; X32-NEXT:    negl %edi
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    andl %edi, %esi
+; X32-NEXT:    notl %edi
+; X32-NEXT:    andl %ecx, %edi
+; X32-NEXT:    orl %edi, %esi
+; X32-NEXT:    movl %esi, (%esp)
 ; X32-NEXT:    flds (%esp)
-; X32-NEXT:    popl %eax
+; X32-NEXT:    addl $4, %esp
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    popl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    popl %edi
 ; X32-NEXT:    .cfi_def_cfa_offset 4
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_f32:
 ; X32-NOCMOV:       # %bb.0:
-; X32-NOCMOV-NEXT:    pushl %eax
+; X32-NOCMOV-NEXT:    pushl %edi
 ; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
-; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    andl $1, %eax
-; X32-NOCMOV-NEXT:    movl %eax, %ecx
-; X32-NOCMOV-NEXT:    negl %ecx
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    decl %eax
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    orl %ecx, %eax
-; X32-NOCMOV-NEXT:    movl %eax, (%esp)
+; X32-NOCMOV-NEXT:    pushl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT:    pushl %eax
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT:    .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT:    .cfi_offset %edi, -8
+; X32-NOCMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT:    sete %al
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    movb %al, %ah
+; X32-NOCMOV-NEXT:    movzbl %ah, %edi
+; X32-NOCMOV-NEXT:    negl %edi
+; X32-NOCMOV-NEXT:    movl %edx, %esi
+; X32-NOCMOV-NEXT:    andl %edi, %esi
+; X32-NOCMOV-NEXT:    notl %edi
+; X32-NOCMOV-NEXT:    andl %ecx, %edi
+; X32-NOCMOV-NEXT:    orl %edi, %esi
+; X32-NOCMOV-NEXT:    movl %esi, (%esp)
 ; X32-NOCMOV-NEXT:    flds (%esp)
-; X32-NOCMOV-NEXT:    popl %eax
+; X32-NOCMOV-NEXT:    addl $4, %esp
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT:    popl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    popl %edi
 ; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
 ; X32-NOCMOV-NEXT:    retl
   %result = call float @llvm.ct.select.f32(i1 %cond, float %a, float %b)
@@ -243,74 +278,96 @@ define float @test_ctselect_f32(i1 %cond, float %a, float %b) {
 define double @test_ctselect_f64(i1 %cond, double %a, double %b) {
 ; X64-LABEL: test_ctselect_f64:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    movq %xmm1, %rax
-; X64-NEXT:    movq %xmm0, %rcx
-; X64-NEXT:    andl $1, %edi
-; X64-NEXT:    movq %rdi, %rdx
-; X64-NEXT:    negq %rdx
-; X64-NEXT:    andq %rcx, %rdx
-; X64-NEXT:    decq %rdi
-; X64-NEXT:    andq %rax, %rdi
-; X64-NEXT:    orq %rdx, %rdi
-; X64-NEXT:    movq %rdi, %xmm0
+; X64-NEXT:    movq %xmm0, %rax
+; X64-NEXT:    movq %xmm1, %rcx
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    cmovneq %rax, %rcx
+; X64-NEXT:    movq %rcx, %xmm0
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: test_ctselect_f64:
 ; X32:       # %bb.0:
-; X32-NEXT:    pushl %esi
+; X32-NEXT:    pushl %edi
 ; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 12
 ; X32-NEXT:    subl $8, %esp
-; X32-NEXT:    .cfi_def_cfa_offset 16
-; X32-NEXT:    .cfi_offset %esi, -8
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    .cfi_def_cfa_offset 20
+; X32-NEXT:    .cfi_offset %esi, -12
+; X32-NEXT:    .cfi_offset %edi, -8
+; X32-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT:    sete %al
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    xorl %edx, %esi
-; X32-NEXT:    andl $1, %ecx
-; X32-NEXT:    negl %ecx
-; X32-NEXT:    andl %ecx, %esi
-; X32-NEXT:    xorl %edx, %esi
-; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-NEXT:    movb %al, %ah
+; X32-NEXT:    movzbl %ah, %edi
+; X32-NEXT:    negl %edi
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    andl %edi, %esi
+; X32-NEXT:    notl %edi
+; X32-NEXT:    andl %ecx, %edi
+; X32-NEXT:    orl %edi, %esi
+; X32-NEXT:    movl %esi, (%esp)
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    xorl %eax, %edx
-; X32-NEXT:    andl %ecx, %edx
-; X32-NEXT:    xorl %eax, %edx
-; X32-NEXT:    movl %edx, (%esp)
+; X32-NEXT:    movb %al, %ah
+; X32-NEXT:    movzbl %ah, %edi
+; X32-NEXT:    negl %edi
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    andl %edi, %esi
+; X32-NEXT:    notl %edi
+; X32-NEXT:    andl %ecx, %edi
+; X32-NEXT:    orl %edi, %esi
+; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X32-NEXT:    fldl (%esp)
 ; X32-NEXT:    addl $8, %esp
-; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    .cfi_def_cfa_offset 12
 ; X32-NEXT:    popl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    popl %edi
 ; X32-NEXT:    .cfi_def_cfa_offset 4
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_f64:
 ; X32-NOCMOV:       # %bb.0:
-; X32-NOCMOV-NEXT:    pushl %esi
+; X32-NOCMOV-NEXT:    pushl %edi
 ; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    pushl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
 ; X32-NOCMOV-NEXT:    subl $8, %esp
-; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 16
-; X32-NOCMOV-NEXT:    .cfi_offset %esi, -8
-; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 20
+; X32-NOCMOV-NEXT:    .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT:    .cfi_offset %edi, -8
+; X32-NOCMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT:    sete %al
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NOCMOV-NEXT:    xorl %edx, %esi
-; X32-NOCMOV-NEXT:    andl $1, %ecx
-; X32-NOCMOV-NEXT:    negl %ecx
-; X32-NOCMOV-NEXT:    andl %ecx, %esi
-; X32-NOCMOV-NEXT:    xorl %edx, %esi
-; X32-NOCMOV-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT:    movb %al, %ah
+; X32-NOCMOV-NEXT:    movzbl %ah, %edi
+; X32-NOCMOV-NEXT:    negl %edi
+; X32-NOCMOV-NEXT:    movl %edx, %esi
+; X32-NOCMOV-NEXT:    andl %edi, %esi
+; X32-NOCMOV-NEXT:    notl %edi
+; X32-NOCMOV-NEXT:    andl %ecx, %edi
+; X32-NOCMOV-NEXT:    orl %edi, %esi
+; X32-NOCMOV-NEXT:    movl %esi, (%esp)
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT:    xorl %eax, %edx
-; X32-NOCMOV-NEXT:    andl %ecx, %edx
-; X32-NOCMOV-NEXT:    xorl %eax, %edx
-; X32-NOCMOV-NEXT:    movl %edx, (%esp)
+; X32-NOCMOV-NEXT:    movb %al, %ah
+; X32-NOCMOV-NEXT:    movzbl %ah, %edi
+; X32-NOCMOV-NEXT:    negl %edi
+; X32-NOCMOV-NEXT:    movl %edx, %esi
+; X32-NOCMOV-NEXT:    andl %edi, %esi
+; X32-NOCMOV-NEXT:    notl %edi
+; X32-NOCMOV-NEXT:    andl %ecx, %edi
+; X32-NOCMOV-NEXT:    orl %edi, %esi
+; X32-NOCMOV-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X32-NOCMOV-NEXT:    fldl (%esp)
 ; X32-NOCMOV-NEXT:    addl $8, %esp
-; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
 ; X32-NOCMOV-NEXT:    popl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    popl %edi
 ; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
 ; X32-NOCMOV-NEXT:    retl
   %result = call double @llvm.ct.select.f64(i1 %cond, double %a, double %b)
@@ -320,37 +377,42 @@ define double @test_ctselect_f64(i1 %cond, double %a, double %b) {
 define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) {
 ; X64-LABEL: test_ctselect_ptr:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    andl $1, %edi
-; X64-NEXT:    leaq -1(%rdi), %rax
-; X64-NEXT:    negq %rdi
-; X64-NEXT:    andq %rsi, %rdi
-; X64-NEXT:    andq %rdx, %rax
-; X64-NEXT:    orq %rdi, %rax
+; X64-NEXT:    movq %rdx, %rax
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    cmovneq %rsi, %rax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: test_ctselect_ptr:
 ; X32:       # %bb.0:
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    andl $1, %eax
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    negl %ecx
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    decl %eax
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    orl %ecx, %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT:    cmovnel {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_ptr:
 ; X32-NOCMOV:       # %bb.0:
-; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    andl $1, %eax
-; X32-NOCMOV-NEXT:    movl %eax, %ecx
-; X32-NOCMOV-NEXT:    negl %ecx
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    decl %eax
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    orl %ecx, %eax
+; X32-NOCMOV-NEXT:    pushl %ebx
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    pushl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT:    .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT:    .cfi_offset %ebx, -8
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT:    sete %bl
+; X32-NOCMOV-NEXT:    movb %bl, %bh
+; X32-NOCMOV-NEXT:    movzbl %bh, %esi
+; X32-NOCMOV-NEXT:    negl %esi
+; X32-NOCMOV-NEXT:    movl %edx, %eax
+; X32-NOCMOV-NEXT:    andl %esi, %eax
+; X32-NOCMOV-NEXT:    notl %esi
+; X32-NOCMOV-NEXT:    andl %ecx, %esi
+; X32-NOCMOV-NEXT:    orl %esi, %eax
+; X32-NOCMOV-NEXT:    popl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    popl %ebx
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
 ; X32-NOCMOV-NEXT:    retl
   %result = call ptr @llvm.ct.select.p0(i1 %cond, ptr %a, ptr %b)
   ret ptr %result
@@ -360,17 +422,45 @@ define ptr @test_ctselect_ptr(i1 %cond, ptr %a, ptr %b) {
 define i32 @test_ctselect_const_true(i32 %a, i32 %b) {
 ; X64-LABEL: test_ctselect_const_true:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    movb $1, %cl
+; X64-NEXT:    testb %cl, %cl
+; X64-NEXT:    cmovnel %edi, %eax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: test_ctselect_const_true:
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movb $1, %cl
+; X32-NEXT:    testb %cl, %cl
+; X32-NEXT:    cmovnel {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_const_true:
 ; X32-NOCMOV:       # %bb.0:
-; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    pushl %ebx
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    pushl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT:    .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT:    .cfi_offset %ebx, -8
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    movb $1, %al
+; X32-NOCMOV-NEXT:    testb %al, %al
+; X32-NOCMOV-NEXT:    sete %bl
+; X32-NOCMOV-NEXT:    movb %bl, %bh
+; X32-NOCMOV-NEXT:    movzbl %bh, %esi
+; X32-NOCMOV-NEXT:    negl %esi
+; X32-NOCMOV-NEXT:    movl %edx, %eax
+; X32-NOCMOV-NEXT:    andl %esi, %eax
+; X32-NOCMOV-NEXT:    notl %esi
+; X32-NOCMOV-NEXT:    andl %ecx, %esi
+; X32-NOCMOV-NEXT:    orl %esi, %eax
+; X32-NOCMOV-NEXT:    popl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    popl %ebx
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
 ; X32-NOCMOV-NEXT:    retl
   %result = call i32 @llvm.ct.select.i32(i1 true, i32 %a, i32 %b)
   ret i32 %result
@@ -380,18 +470,44 @@ define i32 @test_ctselect_const_false(i32 %a, i32 %b) {
 ; X64-LABEL: test_ctselect_const_false:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    testb %cl, %cl
+; X64-NEXT:    cmovnel %edi, %eax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: test_ctselect_const_false:
 ; X32:       # %bb.0:
-; X32-NEXT:    xorl %eax, %eax
-; X32-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    xorl %ecx, %ecx
+; X32-NEXT:    testb %cl, %cl
+; X32-NEXT:    cmovnel {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_const_false:
 ; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    pushl %ebx
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    pushl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT:    .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT:    .cfi_offset %ebx, -8
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-NOCMOV-NEXT:    xorl %eax, %eax
-; X32-NOCMOV-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    testb %al, %al
+; X32-NOCMOV-NEXT:    sete %bl
+; X32-NOCMOV-NEXT:    movb %bl, %bh
+; X32-NOCMOV-NEXT:    movzbl %bh, %esi
+; X32-NOCMOV-NEXT:    negl %esi
+; X32-NOCMOV-NEXT:    movl %edx, %eax
+; X32-NOCMOV-NEXT:    andl %esi, %eax
+; X32-NOCMOV-NEXT:    notl %esi
+; X32-NOCMOV-NEXT:    andl %ecx, %esi
+; X32-NOCMOV-NEXT:    orl %esi, %eax
+; X32-NOCMOV-NEXT:    popl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    popl %ebx
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
 ; X32-NOCMOV-NEXT:    retl
   %result = call i32 @llvm.ct.select.i32(i1 false, i32 %a, i32 %b)
   ret i32 %result
@@ -401,43 +517,50 @@ define i32 @test_ctselect_const_false(i32 %a, i32 %b) {
 define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) {
 ; X64-LABEL: test_ctselect_icmp_eq:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    movl %ecx, %eax
 ; X64-NEXT:    cmpl %esi, %edi
-; X64-NEXT:    sete %al
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    negl %esi
-; X64-NEXT:    andl %edx, %esi
-; X64-NEXT:    decl %eax
-; X64-NEXT:    andl %ecx, %eax
-; X64-NEXT:    orl %esi, %eax
+; X64-NEXT:    sete %cl
+; X64-NEXT:    testb %cl, %cl
+; X64-NEXT:    cmovnel %edx, %eax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: test_ctselect_icmp_eq:
 ; X32:       # %bb.0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    xorl %eax, %eax
 ; X32-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    sete %al
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    negl %ecx
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    decl %eax
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    orl %ecx, %eax
+; X32-NEXT:    sete %cl
+; X32-NEXT:    testb %cl, %cl
+; X32-NEXT:    cmovnel {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_icmp_eq:
 ; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    pushl %ebx
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    pushl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT:    .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT:    .cfi_offset %ebx, -8
 ; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    xorl %eax, %eax
-; X32-NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; X32-NOCMOV-NEXT:    sete %al
-; X32-NOCMOV-NEXT:    movl %eax, %ecx
-; X32-NOCMOV-NEXT:    negl %ecx
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    decl %eax
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    orl %ecx, %eax
+; X32-NOCMOV-NEXT:    testb %al, %al
+; X32-NOCMOV-NEXT:    sete %bl
+; X32-NOCMOV-NEXT:    movb %bl, %bh
+; X32-NOCMOV-NEXT:    movzbl %bh, %esi
+; X32-NOCMOV-NEXT:    negl %esi
+; X32-NOCMOV-NEXT:    movl %edx, %eax
+; X32-NOCMOV-NEXT:    andl %esi, %eax
+; X32-NOCMOV-NEXT:    notl %esi
+; X32-NOCMOV-NEXT:    andl %ecx, %esi
+; X32-NOCMOV-NEXT:    orl %esi, %eax
+; X32-NOCMOV-NEXT:    popl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    popl %ebx
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
 ; X32-NOCMOV-NEXT:    retl
   %cond = icmp eq i32 %x, %y
   %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
@@ -447,43 +570,50 @@ define i32 @test_ctselect_icmp_eq(i32 %x, i32 %y, i32 %a, i32 %b) {
 define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) {
 ; X64-LABEL: test_ctselect_icmp_ne:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    movl %ecx, %eax
 ; X64-NEXT:    cmpl %esi, %edi
-; X64-NEXT:    setne %al
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    negl %esi
-; X64-NEXT:    andl %edx, %esi
-; X64-NEXT:    decl %eax
-; X64-NEXT:    andl %ecx, %eax
-; X64-NEXT:    orl %esi, %eax
+; X64-NEXT:    setne %cl
+; X64-NEXT:    testb %cl, %cl
+; X64-NEXT:    cmovnel %edx, %eax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: test_ctselect_icmp_ne:
 ; X32:       # %bb.0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    xorl %eax, %eax
 ; X32-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    setne %al
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    negl %ecx
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    decl %eax
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    orl %ecx, %eax
+; X32-NEXT:    setne %cl
+; X32-NEXT:    testb %cl, %cl
+; X32-NEXT:    cmovnel {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_icmp_ne:
 ; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    pushl %ebx
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    pushl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT:    .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT:    .cfi_offset %ebx, -8
 ; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    xorl %eax, %eax
-; X32-NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; X32-NOCMOV-NEXT:    setne %al
-; X32-NOCMOV-NEXT:    movl %eax, %ecx
-; X32-NOCMOV-NEXT:    negl %ecx
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    decl %eax
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    orl %ecx, %eax
+; X32-NOCMOV-NEXT:    testb %al, %al
+; X32-NOCMOV-NEXT:    sete %bl
+; X32-NOCMOV-NEXT:    movb %bl, %bh
+; X32-NOCMOV-NEXT:    movzbl %bh, %esi
+; X32-NOCMOV-NEXT:    negl %esi
+; X32-NOCMOV-NEXT:    movl %edx, %eax
+; X32-NOCMOV-NEXT:    andl %esi, %eax
+; X32-NOCMOV-NEXT:    notl %esi
+; X32-NOCMOV-NEXT:    andl %ecx, %esi
+; X32-NOCMOV-NEXT:    orl %esi, %eax
+; X32-NOCMOV-NEXT:    popl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    popl %ebx
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
 ; X32-NOCMOV-NEXT:    retl
   %cond = icmp ne i32 %x, %y
   %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
@@ -493,43 +623,50 @@ define i32 @test_ctselect_icmp_ne(i32 %x, i32 %y, i32 %a, i32 %b) {
 define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) {
 ; X64-LABEL: test_ctselect_icmp_slt:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    movl %ecx, %eax
 ; X64-NEXT:    cmpl %esi, %edi
-; X64-NEXT:    setl %al
-; X64-NEXT:    movl %eax, %esi
-; X64-NEXT:    negl %esi
-; X64-NEXT:    andl %edx, %esi
-; X64-NEXT:    decl %eax
-; X64-NEXT:    andl %ecx, %eax
-; X64-NEXT:    orl %esi, %eax
+; X64-NEXT:    setl %cl
+; X64-NEXT:    testb %cl, %cl
+; X64-NEXT:    cmovnel %edx, %eax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: test_ctselect_icmp_slt:
 ; X32:       # %bb.0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    xorl %eax, %eax
 ; X32-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    setl %al
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    negl %ecx
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    decl %eax
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    orl %ecx, %eax
+; X32-NEXT:    setl %cl
+; X32-NEXT:    testb %cl, %cl
+; X32-NEXT:    cmovnel {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_icmp_slt:
 ; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    pushl %ebx
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    pushl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT:    .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT:    .cfi_offset %ebx, -8
 ; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    xorl %eax, %eax
-; X32-NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; X32-NOCMOV-NEXT:    setl %al
-; X32-NOCMOV-NEXT:    movl %eax, %ecx
-; X32-NOCMOV-NEXT:    negl %ecx
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    decl %eax
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    orl %ecx, %eax
+; X32-NOCMOV-NEXT:    testb %al, %al
+; X32-NOCMOV-NEXT:    sete %bl
+; X32-NOCMOV-NEXT:    movb %bl, %bh
+; X32-NOCMOV-NEXT:    movzbl %bh, %esi
+; X32-NOCMOV-NEXT:    negl %esi
+; X32-NOCMOV-NEXT:    movl %edx, %eax
+; X32-NOCMOV-NEXT:    andl %esi, %eax
+; X32-NOCMOV-NEXT:    notl %esi
+; X32-NOCMOV-NEXT:    andl %ecx, %esi
+; X32-NOCMOV-NEXT:    orl %esi, %eax
+; X32-NOCMOV-NEXT:    popl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    popl %ebx
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
 ; X32-NOCMOV-NEXT:    retl
   %cond = icmp slt i32 %x, %y
   %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
@@ -539,39 +676,50 @@ define i32 @test_ctselect_icmp_slt(i32 %x, i32 %y, i32 %a, i32 %b) {
 define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) {
 ; X64-LABEL: test_ctselect_icmp_ult:
 ; X64:       # %bb.0:
-; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    movl %ecx, %eax
 ; X64-NEXT:    cmpl %esi, %edi
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    andl %eax, %edx
-; X64-NEXT:    notl %eax
-; X64-NEXT:    andl %ecx, %eax
-; X64-NEXT:    orl %edx, %eax
+; X64-NEXT:    setb %cl
+; X64-NEXT:    testb %cl, %cl
+; X64-NEXT:    cmovnel %edx, %eax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: test_ctselect_icmp_ult:
 ; X32:       # %bb.0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    xorl %eax, %eax
 ; X32-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    sbbl %eax, %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    andl %eax, %ecx
-; X32-NEXT:    notl %eax
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    orl %ecx, %eax
+; X32-NEXT:    setb %cl
+; X32-NEXT:    testb %cl, %cl
+; X32-NEXT:    cmovnel {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_icmp_ult:
 ; X32-NOCMOV:       # %bb.0:
+; X32-NOCMOV-NEXT:    pushl %ebx
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    pushl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT:    .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT:    .cfi_offset %ebx, -8
 ; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    xorl %eax, %eax
-; X32-NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    sbbl %eax, %eax
-; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    andl %eax, %ecx
-; X32-NOCMOV-NEXT:    notl %eax
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    orl %ecx, %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    setb %al
+; X32-NOCMOV-NEXT:    testb %al, %al
+; X32-NOCMOV-NEXT:    sete %bl
+; X32-NOCMOV-NEXT:    movb %bl, %bh
+; X32-NOCMOV-NEXT:    movzbl %bh, %esi
+; X32-NOCMOV-NEXT:    negl %esi
+; X32-NOCMOV-NEXT:    movl %edx, %eax
+; X32-NOCMOV-NEXT:    andl %esi, %eax
+; X32-NOCMOV-NEXT:    notl %esi
+; X32-NOCMOV-NEXT:    andl %ecx, %esi
+; X32-NOCMOV-NEXT:    orl %esi, %eax
+; X32-NOCMOV-NEXT:    popl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    popl %ebx
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
 ; X32-NOCMOV-NEXT:    retl
   %cond = icmp ult i32 %x, %y
   %result = call i32 @llvm.ct.select.i32(i1 %cond, i32 %a, i32 %b)
@@ -581,45 +729,64 @@ define i32 @test_ctselect_icmp_ult(i32 %x, i32 %y, i32 %a, i32 %b) {
 define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) {
 ; X64-LABEL: test_ctselect_fcmp_oeq:
 ; X64:       # %bb.0:
-; X64-NEXT:    movd %xmm3, %eax
-; X64-NEXT:    cmpeqss %xmm1, %xmm0
-; X64-NEXT:    movd %xmm0, %ecx
-; X64-NEXT:    pand %xmm2, %xmm0
-; X64-NEXT:    movd %xmm0, %edx
-; X64-NEXT:    notl %ecx
-; X64-NEXT:    andl %eax, %ecx
-; X64-NEXT:    orl %edx, %ecx
+; X64-NEXT:    movd %xmm2, %eax
+; X64-NEXT:    movd %xmm3, %ecx
+; X64-NEXT:    ucomiss %xmm1, %xmm0
+; X64-NEXT:    setnp %dl
+; X64-NEXT:    sete %sil
+; X64-NEXT:    testb %dl, %sil
+; X64-NEXT:    cmovnel %eax, %ecx
 ; X64-NEXT:    movd %ecx, %xmm0
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: test_ctselect_fcmp_oeq:
 ; X32:       # %bb.0:
-; X32-NEXT:    pushl %eax
+; X32-NEXT:    pushl %edi
 ; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    .cfi_def_cfa_offset 16
+; X32-NEXT:    .cfi_offset %esi, -12
+; X32-NEXT:    .cfi_offset %edi, -8
 ; X32-NEXT:    flds {{[0-9]+}}(%esp)
 ; X32-NEXT:    flds {{[0-9]+}}(%esp)
 ; X32-NEXT:    fucompi %st(1), %st
 ; X32-NEXT:    fstp %st(0)
 ; X32-NEXT:    setnp %al
 ; X32-NEXT:    sete %cl
-; X32-NEXT:    andb %al, %cl
-; X32-NEXT:    movzbl %cl, %eax
-; X32-NEXT:    movl %eax, %ecx
-; X32-NEXT:    negl %ecx
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    decl %eax
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    orl %ecx, %eax
-; X32-NEXT:    movl %eax, (%esp)
+; X32-NEXT:    testb %al, %cl
+; X32-NEXT:    sete %al
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movb %al, %ah
+; X32-NEXT:    movzbl %ah, %edi
+; X32-NEXT:    negl %edi
+; X32-NEXT:    movl %edx, %esi
+; X32-NEXT:    andl %edi, %esi
+; X32-NEXT:    notl %edi
+; X32-NEXT:    andl %ecx, %edi
+; X32-NEXT:    orl %edi, %esi
+; X32-NEXT:    movl %esi, (%esp)
 ; X32-NEXT:    flds (%esp)
-; X32-NEXT:    popl %eax
+; X32-NEXT:    addl $4, %esp
+; X32-NEXT:    .cfi_def_cfa_offset 12
+; X32-NEXT:    popl %esi
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    popl %edi
 ; X32-NEXT:    .cfi_def_cfa_offset 4
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_fcmp_oeq:
 ; X32-NOCMOV:       # %bb.0:
-; X32-NOCMOV-NEXT:    pushl %eax
+; X32-NOCMOV-NEXT:    pushl %edi
 ; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    pushl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT:    pushl %eax
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT:    .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT:    .cfi_offset %edi, -8
 ; X32-NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X32-NOCMOV-NEXT:    flds {{[0-9]+}}(%esp)
 ; X32-NOCMOV-NEXT:    fucompp
@@ -628,17 +795,25 @@ define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) {
 ; X32-NOCMOV-NEXT:    sahf
 ; X32-NOCMOV-NEXT:    setnp %al
 ; X32-NOCMOV-NEXT:    sete %cl
-; X32-NOCMOV-NEXT:    andb %al, %cl
-; X32-NOCMOV-NEXT:    movzbl %cl, %eax
-; X32-NOCMOV-NEXT:    movl %eax, %ecx
-; X32-NOCMOV-NEXT:    negl %ecx
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    decl %eax
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    orl %ecx, %eax
-; X32-NOCMOV-NEXT:    movl %eax, (%esp)
+; X32-NOCMOV-NEXT:    testb %al, %cl
+; X32-NOCMOV-NEXT:    sete %al
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    movb %al, %ah
+; X32-NOCMOV-NEXT:    movzbl %ah, %edi
+; X32-NOCMOV-NEXT:    negl %edi
+; X32-NOCMOV-NEXT:    movl %edx, %esi
+; X32-NOCMOV-NEXT:    andl %edi, %esi
+; X32-NOCMOV-NEXT:    notl %edi
+; X32-NOCMOV-NEXT:    andl %ecx, %edi
+; X32-NOCMOV-NEXT:    orl %edi, %esi
+; X32-NOCMOV-NEXT:    movl %esi, (%esp)
 ; X32-NOCMOV-NEXT:    flds (%esp)
-; X32-NOCMOV-NEXT:    popl %eax
+; X32-NOCMOV-NEXT:    addl $4, %esp
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT:    popl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    popl %edi
 ; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
 ; X32-NOCMOV-NEXT:    retl
   %cond = fcmp oeq float %x, %y
@@ -650,51 +825,45 @@ define float @test_ctselect_fcmp_oeq(float %x, float %y, float %a, float %b) {
 define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) {
 ; X64-LABEL: test_ctselect_load:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    andl $1, %edi
-; X64-NEXT:    leal -1(%rdi), %eax
-; X64-NEXT:    movl %edi, %ecx
-; X64-NEXT:    negl %ecx
-; X64-NEXT:    andl (%rsi), %ecx
-; X64-NEXT:    andl (%rdx), %eax
-; X64-NEXT:    orl %ecx, %eax
+; X64-NEXT:    movl (%rdx), %eax
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    cmovnel (%rsi), %eax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: test_ctselect_load:
 ; X32:       # %bb.0:
-; X32-NEXT:    pushl %esi
-; X32-NEXT:    .cfi_def_cfa_offset 8
-; X32-NEXT:    .cfi_offset %esi, -8
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    andl $1, %eax
-; X32-NEXT:    movl %eax, %esi
-; X32-NEXT:    negl %esi
-; X32-NEXT:    andl (%edx), %esi
-; X32-NEXT:    decl %eax
-; X32-NEXT:    andl (%ecx), %eax
-; X32-NEXT:    orl %esi, %eax
-; X32-NEXT:    popl %esi
-; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl (%eax), %eax
+; X32-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT:    cmovnel (%ecx), %eax
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_load:
 ; X32-NOCMOV:       # %bb.0:
-; X32-NOCMOV-NEXT:    pushl %esi
+; X32-NOCMOV-NEXT:    pushl %ebx
 ; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
-; X32-NOCMOV-NEXT:    .cfi_offset %esi, -8
+; X32-NOCMOV-NEXT:    pushl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT:    .cfi_offset %esi, -12
+; X32-NOCMOV-NEXT:    .cfi_offset %ebx, -8
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    andl $1, %eax
-; X32-NOCMOV-NEXT:    movl %eax, %esi
+; X32-NOCMOV-NEXT:    movl (%ecx), %ecx
+; X32-NOCMOV-NEXT:    movl (%eax), %edx
+; X32-NOCMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT:    sete %bl
+; X32-NOCMOV-NEXT:    movb %bl, %bh
+; X32-NOCMOV-NEXT:    movzbl %bh, %esi
 ; X32-NOCMOV-NEXT:    negl %esi
-; X32-NOCMOV-NEXT:    andl (%edx), %esi
-; X32-NOCMOV-NEXT:    decl %eax
-; X32-NOCMOV-NEXT:    andl (%ecx), %eax
+; X32-NOCMOV-NEXT:    movl %edx, %eax
+; X32-NOCMOV-NEXT:    andl %esi, %eax
+; X32-NOCMOV-NEXT:    notl %esi
+; X32-NOCMOV-NEXT:    andl %ecx, %esi
 ; X32-NOCMOV-NEXT:    orl %esi, %eax
 ; X32-NOCMOV-NEXT:    popl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    popl %ebx
 ; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
 ; X32-NOCMOV-NEXT:    retl
   %a = load i32, ptr %p1
@@ -707,62 +876,63 @@ define i32 @test_ctselect_load(i1 %cond, ptr %p1, ptr %p2) {
 define i32 @test_ctselect_nested(i1 %cond1, i1 %cond2, i32 %a, i32 %b, i32 %c) {
 ; X64-LABEL: test_ctselect_nested:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $esi killed $esi def $rsi
-; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    andl $1, %esi
-; X64-NEXT:    leal -1(%rsi), %r9d
-; X64-NEXT:    movl %esi, %eax
-; X64-NEXT:    negl %eax
-; X64-NEXT:    andl %edx, %eax
-; X64-NEXT:    andl %ecx, %r9d
-; X64-NEXT:    orl %eax, %r9d
-; X64-NEXT:    andl $1, %edi
-; X64-NEXT:    leal -1(%rdi), %eax
-; X64-NEXT:    movl %edi, %ecx
-; X64-NEXT:    negl %ecx
-; X64-NEXT:    andl %r9d, %ecx
-; X64-NEXT:    andl %r8d, %eax
-; X64-NEXT:    orl %ecx, %eax
+; X64-NEXT:    movl %r8d, %eax
+; X64-NEXT:    testb $1, %sil
+; X64-NEXT:    cmovnel %edx, %ecx
+; X64-NEXT:    testb $1, %dil
+; X64-NEXT:    cmovnel %ecx, %eax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: test_ctselect_nested:
 ; X32:       # %bb.0:
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    andl $1, %ecx
-; X32-NEXT:    movl %ecx, %edx
-; X32-NEXT:    negl %edx
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    decl %ecx
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    orl %edx, %ecx
-; X32-NEXT:    andl $1, %eax
-; X32-NEXT:    movl %eax, %edx
-; X32-NEXT:    negl %edx
-; X32-NEXT:    andl %ecx, %edx
-; X32-NEXT:    decl %eax
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    orl %edx, %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT:    cmovnel {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X32-NEXT:    cmovnel %ecx, %eax
 ; X32-NEXT:    retl
 ;
 ; X32-NOCMOV-LABEL: test_ctselect_nested:
 ; X32-NOCMOV:       # %bb.0:
-; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    andl $1, %ecx
-; X32-NOCMOV-NEXT:    movl %ecx, %edx
-; X32-NOCMOV-NEXT:    negl %edx
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; X32-NOCMOV-NEXT:    decl %ecx
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X32-NOCMOV-NEXT:    orl %edx, %ecx
-; X32-NOCMOV-NEXT:    andl $1, %eax
-; X32-NOCMOV-NEXT:    movl %eax, %edx
-; X32-NOCMOV-NEXT:    negl %edx
-; X32-NOCMOV-NEXT:    andl %ecx, %edx
-; X32-NOCMOV-NEXT:    decl %eax
-; X32-NOCMOV-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NOCMOV-NEXT:    orl %edx, %eax
+; X32-NOCMOV-NEXT:    pushl %ebx
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    pushl %edi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT:    pushl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 16
+; X32-NOCMOV-NEXT:    .cfi_offset %esi, -16
+; X32-NOCMOV-NEXT:    .cfi_offset %edi, -12
+; X32-NOCMOV-NEXT:    .cfi_offset %ebx, -8
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NOCMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT:    sete %bl
+; X32-NOCMOV-NEXT:    movb %bl, %bh
+; X32-NOCMOV-NEXT:    movzbl %bh, %edi
+; X32-NOCMOV-NEXT:    negl %edi
+; X32-NOCMOV-NEXT:    movl %edx, %esi
+; X32-NOCMOV-NEXT:    andl %edi, %esi
+; X32-NOCMOV-NEXT:    notl %edi
+; X32-NOCMOV-NEXT:    andl %eax, %edi
+; X32-NOCMOV-NEXT:    orl %edi, %esi
+; X32-NOCMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X32-NOCMOV-NEXT:    sete %dl
+; X32-NOCMOV-NEXT:    movb %dl, %dh
+; X32-NOCMOV-NEXT:    movzbl %dh, %edi
+; X32-NOCMOV-NEXT:    negl %edi
+; X32-NOCMOV-NEXT:    movl %ecx, %eax
+; X32-NOCMOV-NEXT:    andl %edi, %eax
+; X32-NOCMOV-NEXT:    notl %edi
+; X32-NOCMOV-NEXT:    andl %esi, %edi
+; X32-NOCMOV-NEXT:    orl %edi, %eax
+; X32-NOCMOV-NEXT:    popl %esi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 12
+; X32-NOCMOV-NEXT:    popl %edi
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 8
+; X32-NOCMOV-NEXT:    popl %ebx
+; X32-NOCMOV-NEXT:    .cfi_def_cfa_offset 4
 ; X32-NOCMOV-NEXT:    retl
   %inner = call i32 @llvm.ct.select.i32(i1 %cond2, i32 %a, i32 %b)
   %result = call i32 @llvm.ct.select.i32(i1 %cond1, i32 %inner, i32 %c)

>From 4f620622c0aafa01754dfebf076391c103810c2d Mon Sep 17 00:00:00 2001
From: wizardengineer <juliuswoosebert at gmail.com>
Date: Wed, 5 Nov 2025 23:56:12 -0500
Subject: [PATCH 2/2] [LLVM][X86] Add f80 support for ct.select

Add special handling for x86_fp80 types in CTSELECT lowering by splitting
each value into three 32-bit chunks, performing constant-time selection on
each chunk, and reassembling the result. This fixes crashes when compiling
tests with f80 types.

Also updated ctselect.ll to match the current generic fallback implementation.
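
For context, the chunk-wise scheme described above reduces to the same
mask-and-merge idiom visible in the X32-NOCMOV checks earlier in this patch
(the sete/neg/and/not/or sequences). Below is a minimal, hypothetical C++
sketch of that idiom; the names ct_select_u32 and F80Chunks are illustrative
only and are not part of this patch.

#include <cstdint>

// Branch-free select of one 32-bit chunk: Mask is all-ones when Cond is true,
// zero otherwise, so exactly one operand survives the AND/OR merge.
static inline uint32_t ct_select_u32(bool Cond, uint32_t A, uint32_t B) {
  uint32_t Mask = -static_cast<uint32_t>(Cond);
  return (A & Mask) | (B & ~Mask);
}

// An x86_fp80 value is modeled here as three 32-bit chunks (assumed layout:
// two mantissa words plus one word holding the 16-bit sign/exponent and
// padding). Selecting each chunk independently keeps the whole select
// branch-free.
struct F80Chunks {
  uint32_t Chunk[3];
};

static inline F80Chunks ct_select_f80(bool Cond, const F80Chunks &A,
                                      const F80Chunks &B) {
  F80Chunks R;
  for (int I = 0; I < 3; ++I)
    R.Chunk[I] = ct_select_u32(Cond, A.Chunk[I], B.Chunk[I]);
  return R;
}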
---
 llvm/lib/Target/X86/X86ISelLowering.cpp   | 5300 +++++++++++----------
 llvm/lib/Target/X86/X86ISelLowering.h     | 3781 ++++++++-------
 llvm/lib/Target/X86/X86InstrInfo.cpp      |  919 ++--
 llvm/lib/Target/X86/X86InstrInfo.h        |   21 +-
 llvm/lib/Target/X86/X86TargetMachine.cpp  |   15 +-
 llvm/test/CodeGen/X86/ctselect-i386-fp.ll |  272 +-
 6 files changed, 5209 insertions(+), 5099 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 833afa717c32c..7c5de8a834d79 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29,9 +29,9 @@
 #include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/VectorUtils.h"
-#include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/IntrinsicLowering.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -193,10 +193,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   // We don't accept any truncstore of integer registers.
   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
-  setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
+  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
-  setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
-  setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
+  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
+  setTruncStoreAction(MVT::i16, MVT::i8, Expand);
 
   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 
@@ -208,106 +208,106 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 
   // Integer absolute.
   if (Subtarget.canUseCMOV()) {
-    setOperationAction(ISD::ABS            , MVT::i16  , Custom);
-    setOperationAction(ISD::ABS            , MVT::i32  , Custom);
+    setOperationAction(ISD::ABS, MVT::i16, Custom);
+    setOperationAction(ISD::ABS, MVT::i32, Custom);
     if (Subtarget.is64Bit())
-      setOperationAction(ISD::ABS          , MVT::i64  , Custom);
+      setOperationAction(ISD::ABS, MVT::i64, Custom);
   }
 
   // Absolute difference.
   for (auto Op : {ISD::ABDS, ISD::ABDU}) {
-    setOperationAction(Op                  , MVT::i8   , Custom);
-    setOperationAction(Op                  , MVT::i16  , Custom);
-    setOperationAction(Op                  , MVT::i32  , Custom);
+    setOperationAction(Op, MVT::i8, Custom);
+    setOperationAction(Op, MVT::i16, Custom);
+    setOperationAction(Op, MVT::i32, Custom);
     if (Subtarget.is64Bit())
-     setOperationAction(Op                 , MVT::i64  , Custom);
+      setOperationAction(Op, MVT::i64, Custom);
   }
 
   // Signed saturation subtraction.
-  setOperationAction(ISD::SSUBSAT          , MVT::i8   , Custom);
-  setOperationAction(ISD::SSUBSAT          , MVT::i16  , Custom);
-  setOperationAction(ISD::SSUBSAT          , MVT::i32  , Custom);
+  setOperationAction(ISD::SSUBSAT, MVT::i8, Custom);
+  setOperationAction(ISD::SSUBSAT, MVT::i16, Custom);
+  setOperationAction(ISD::SSUBSAT, MVT::i32, Custom);
   if (Subtarget.is64Bit())
-    setOperationAction(ISD::SSUBSAT        , MVT::i64  , Custom);
+    setOperationAction(ISD::SSUBSAT, MVT::i64, Custom);
 
   // Funnel shifts.
   for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
     // For slow shld targets we only lower for code size.
     LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
 
-    setOperationAction(ShiftOp             , MVT::i8   , Custom);
-    setOperationAction(ShiftOp             , MVT::i16  , Custom);
-    setOperationAction(ShiftOp             , MVT::i32  , ShiftDoubleAction);
+    setOperationAction(ShiftOp, MVT::i8, Custom);
+    setOperationAction(ShiftOp, MVT::i16, Custom);
+    setOperationAction(ShiftOp, MVT::i32, ShiftDoubleAction);
     if (Subtarget.is64Bit())
-      setOperationAction(ShiftOp           , MVT::i64  , ShiftDoubleAction);
+      setOperationAction(ShiftOp, MVT::i64, ShiftDoubleAction);
   }
 
   if (!Subtarget.useSoftFloat()) {
     // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
     // operation.
-    setOperationAction(ISD::UINT_TO_FP,        MVT::i8, Promote);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
-    setOperationAction(ISD::UINT_TO_FP,        MVT::i16, Promote);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
     // We have an algorithm for SSE2, and we turn this into a 64-bit
     // FILD or VCVTUSI2SS/SD for other targets.
-    setOperationAction(ISD::UINT_TO_FP,        MVT::i32, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
     // We have an algorithm for SSE2->double, and we turn this into a
     // 64-bit FILD followed by conditional FADD for other targets.
-    setOperationAction(ISD::UINT_TO_FP,        MVT::i64, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
 
     // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
     // this operation.
-    setOperationAction(ISD::SINT_TO_FP,        MVT::i8, Promote);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
     // SSE has no i16 to fp conversion, only i32. We promote in the handler
     // to allow f80 to use i16 and f64 to use i16 with sse1 only
-    setOperationAction(ISD::SINT_TO_FP,        MVT::i16, Custom);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
     // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
-    setOperationAction(ISD::SINT_TO_FP,        MVT::i32, Custom);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
     // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
     // are Legal, f80 is custom lowered.
-    setOperationAction(ISD::SINT_TO_FP,        MVT::i64, Custom);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
 
     // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
     // this operation.
-    setOperationAction(ISD::FP_TO_SINT,        MVT::i8,  Promote);
+    setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
     // FIXME: This doesn't generate invalid exception when it should. PR44019.
-    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8,  Promote);
-    setOperationAction(ISD::FP_TO_SINT,        MVT::i16, Custom);
+    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote);
+    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
-    setOperationAction(ISD::FP_TO_SINT,        MVT::i32, Custom);
+    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
     // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
     // are Legal, f80 is custom lowered.
-    setOperationAction(ISD::FP_TO_SINT,        MVT::i64, Custom);
+    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
 
     // Handle FP_TO_UINT by promoting the destination to a larger signed
     // conversion.
-    setOperationAction(ISD::FP_TO_UINT,        MVT::i8,  Promote);
+    setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
     // FIXME: This doesn't generate invalid exception when it should. PR44019.
-    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8,  Promote);
-    setOperationAction(ISD::FP_TO_UINT,        MVT::i16, Promote);
+    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote);
+    setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
     // FIXME: This doesn't generate invalid exception when it should. PR44019.
     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
-    setOperationAction(ISD::FP_TO_UINT,        MVT::i32, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
-    setOperationAction(ISD::FP_TO_UINT,        MVT::i64, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
 
-    setOperationAction(ISD::LRINT,             MVT::f32, Custom);
-    setOperationAction(ISD::LRINT,             MVT::f64, Custom);
-    setOperationAction(ISD::LLRINT,            MVT::f32, Custom);
-    setOperationAction(ISD::LLRINT,            MVT::f64, Custom);
+    setOperationAction(ISD::LRINT, MVT::f32, Custom);
+    setOperationAction(ISD::LRINT, MVT::f64, Custom);
+    setOperationAction(ISD::LLRINT, MVT::f32, Custom);
+    setOperationAction(ISD::LLRINT, MVT::f64, Custom);
 
     if (!Subtarget.is64Bit()) {
-      setOperationAction(ISD::LRINT,  MVT::i64, Custom);
+      setOperationAction(ISD::LRINT, MVT::i64, Custom);
       setOperationAction(ISD::LLRINT, MVT::i64, Custom);
     }
   }
@@ -315,7 +315,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   if (Subtarget.hasSSE2()) {
     // Custom lowering for saturating float to int conversions.
     // We handle promotion to larger result types manually.
-    for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
+    for (MVT VT : {MVT::i8, MVT::i16, MVT::i32}) {
       setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
       setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
     }
@@ -348,17 +348,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 
   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
   if (!Subtarget.hasSSE2()) {
-    setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
-    setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
+    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
+    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
     setOperationAction(ISD::FCANONICALIZE, MVT::f32, Custom);
     setOperationAction(ISD::FCANONICALIZE, MVT::f64, Custom);
     if (Subtarget.is64Bit()) {
-      setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
+      setOperationAction(ISD::BITCAST, MVT::f64, Expand);
       // Without SSE, i64->f64 goes through memory.
-      setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
+      setOperationAction(ISD::BITCAST, MVT::i64, Expand);
     }
   } else if (!Subtarget.is64Bit())
-    setOperationAction(ISD::BITCAST      , MVT::i64  , Custom);
+    setOperationAction(ISD::BITCAST, MVT::i64, Custom);
 
   // Scalar integer divide and remainder are lowered to use operations that
   // produce two results, to match the available instructions. This exposes
@@ -370,7 +370,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   // (low) operations are left as Legal, as there are single-result
   // instructions for this in x86. Using the two-result multiply instructions
   // when both high and low results are needed must be arranged by dagcombine.
-  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+  for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
     setOperationAction(ISD::MULHS, VT, Expand);
     setOperationAction(ISD::MULHU, VT, Expand);
     setOperationAction(ISD::SDIV, VT, Expand);
@@ -379,47 +379,47 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::UREM, VT, Expand);
   }
 
-  setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
-  setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
-  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
-                   MVT::i8,  MVT::i16, MVT::i32, MVT::i64 }) {
-    setOperationAction(ISD::BR_CC,     VT, Expand);
+  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
+  for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128, MVT::i8, MVT::i16,
+                  MVT::i32, MVT::i64}) {
+    setOperationAction(ISD::BR_CC, VT, Expand);
     setOperationAction(ISD::SELECT_CC, VT, Expand);
   }
   if (Subtarget.is64Bit())
     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
 
-  setOperationAction(ISD::FREM             , MVT::f32  , Expand);
-  setOperationAction(ISD::FREM             , MVT::f64  , Expand);
-  setOperationAction(ISD::FREM             , MVT::f80  , Expand);
-  setOperationAction(ISD::FREM             , MVT::f128 , Expand);
+  setOperationAction(ISD::FREM, MVT::f32, Expand);
+  setOperationAction(ISD::FREM, MVT::f64, Expand);
+  setOperationAction(ISD::FREM, MVT::f80, Expand);
+  setOperationAction(ISD::FREM, MVT::f128, Expand);
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
-    setOperationAction(ISD::GET_ROUNDING   , MVT::i32  , Custom);
-    setOperationAction(ISD::SET_ROUNDING   , MVT::Other, Custom);
-    setOperationAction(ISD::GET_FPENV_MEM  , MVT::Other, Custom);
-    setOperationAction(ISD::SET_FPENV_MEM  , MVT::Other, Custom);
-    setOperationAction(ISD::RESET_FPENV    , MVT::Other, Custom);
+    setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
+    setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
+    setOperationAction(ISD::GET_FPENV_MEM, MVT::Other, Custom);
+    setOperationAction(ISD::SET_FPENV_MEM, MVT::Other, Custom);
+    setOperationAction(ISD::RESET_FPENV, MVT::Other, Custom);
   }
 
   // Promote the i8 variants and force them on up to i32 which has a shorter
   // encoding.
-  setOperationPromotedToType(ISD::CTTZ           , MVT::i8   , MVT::i32);
-  setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
+  setOperationPromotedToType(ISD::CTTZ, MVT::i8, MVT::i32);
+  setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8, MVT::i32);
   // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
   // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
   // promote that too.
-  setOperationPromotedToType(ISD::CTTZ           , MVT::i16  , MVT::i32);
-  setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , MVT::i32);
+  setOperationPromotedToType(ISD::CTTZ, MVT::i16, MVT::i32);
+  setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16, MVT::i32);
 
   if (!Subtarget.hasBMI()) {
-    setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
-    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Legal);
+    setOperationAction(ISD::CTTZ, MVT::i32, Custom);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal);
     if (Subtarget.is64Bit()) {
-      setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
+      setOperationAction(ISD::CTTZ, MVT::i64, Custom);
       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
     }
   }
@@ -427,13 +427,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   if (Subtarget.hasLZCNT()) {
     // When promoting the i8 variants, force them to i32 for a shorter
     // encoding.
-    setOperationPromotedToType(ISD::CTLZ           , MVT::i8   , MVT::i32);
-    setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
+    setOperationPromotedToType(ISD::CTLZ, MVT::i8, MVT::i32);
+    setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8, MVT::i32);
   } else {
     for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
       if (VT == MVT::i64 && !Subtarget.is64Bit())
         continue;
-      setOperationAction(ISD::CTLZ           , VT, Custom);
+      setOperationAction(ISD::CTLZ, VT, Custom);
       setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
     }
   }
@@ -478,31 +478,31 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     // on the dest that popcntl hasn't had since Cannon Lake.
     setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
   } else {
-    setOperationAction(ISD::CTPOP          , MVT::i8   , Custom);
-    setOperationAction(ISD::CTPOP          , MVT::i16  , Custom);
-    setOperationAction(ISD::CTPOP          , MVT::i32  , Custom);
-    setOperationAction(ISD::CTPOP          , MVT::i64  , Custom);
+    setOperationAction(ISD::CTPOP, MVT::i8, Custom);
+    setOperationAction(ISD::CTPOP, MVT::i16, Custom);
+    setOperationAction(ISD::CTPOP, MVT::i32, Custom);
+    setOperationAction(ISD::CTPOP, MVT::i64, Custom);
   }
 
-  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
+  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
 
   if (!Subtarget.hasMOVBE())
-    setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
+    setOperationAction(ISD::BSWAP, MVT::i16, Expand);
 
   // X86 wants to expand cmov itself.
-  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
+  for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
     setOperationAction(ISD::SELECT, VT, Custom);
     setOperationAction(ISD::CTSELECT, VT, Custom);
     setOperationAction(ISD::SETCC, VT, Custom);
     setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
     setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
   }
-  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+  for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
     if (VT == MVT::i64 && !Subtarget.is64Bit())
       continue;
     setOperationAction(ISD::SELECT, VT, Custom);
     setOperationAction(ISD::CTSELECT, VT, Custom);
-    setOperationAction(ISD::SETCC,  VT, Custom);
+    setOperationAction(ISD::SETCC, VT, Custom);
   }
 
   // Custom action for SELECT MMX and expand action for SELECT_CC MMX
@@ -510,7 +510,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   setOperationAction(ISD::CTSELECT, MVT::x86mmx, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
 
-  setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
+  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
   // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
   // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
@@ -518,19 +518,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
 
   // Darwin ABI issue.
-  for (auto VT : { MVT::i32, MVT::i64 }) {
+  for (auto VT : {MVT::i32, MVT::i64}) {
     if (VT == MVT::i64 && !Subtarget.is64Bit())
       continue;
-    setOperationAction(ISD::ConstantPool    , VT, Custom);
-    setOperationAction(ISD::JumpTable       , VT, Custom);
-    setOperationAction(ISD::GlobalAddress   , VT, Custom);
+    setOperationAction(ISD::ConstantPool, VT, Custom);
+    setOperationAction(ISD::JumpTable, VT, Custom);
+    setOperationAction(ISD::GlobalAddress, VT, Custom);
     setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
-    setOperationAction(ISD::ExternalSymbol  , VT, Custom);
-    setOperationAction(ISD::BlockAddress    , VT, Custom);
+    setOperationAction(ISD::ExternalSymbol, VT, Custom);
+    setOperationAction(ISD::BlockAddress, VT, Custom);
   }
 
   // 64-bit shl, sra, srl (iff 32-bit x86)
-  for (auto VT : { MVT::i32, MVT::i64 }) {
+  for (auto VT : {MVT::i32, MVT::i64}) {
     if (VT == MVT::i64 && !Subtarget.is64Bit())
       continue;
     setOperationAction(ISD::SHL_PARTS, VT, Custom);
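
For the SHL_PARTS action just above (and the SRA/SRL counterparts the comment mentions): on 32-bit x86 a 64-bit shift is stitched together from the two 32-bit halves of the value. A rough standalone illustration of the left-shift case (assuming the shift amount is already reduced below 64; illustrative only, not this patch's lowering):

  #include <cstdint>

  // 64-bit left shift built from 32-bit halves, the idea behind SHL_PARTS.
  static uint64_t shl64_via_parts(uint32_t Lo, uint32_t Hi, unsigned Amt) {
    uint32_t OutLo, OutHi;
    if (Amt == 0) {
      OutLo = Lo;
      OutHi = Hi;
    } else if (Amt < 32) {
      OutLo = Lo << Amt;
      OutHi = (Hi << Amt) | (Lo >> (32 - Amt));
    } else { // 32 <= Amt < 64
      OutLo = 0;
      OutHi = Lo << (Amt - 32);
    }
    return ((uint64_t)OutHi << 32) | OutLo;
  }
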
@@ -539,12 +539,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   }
 
   if (Subtarget.hasSSEPrefetch())
-    setOperationAction(ISD::PREFETCH      , MVT::Other, Custom);
+    setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
 
-  setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
+  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
 
   // Expand certain atomics
-  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+  for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
     setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
@@ -588,14 +588,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
 
   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
-  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
-  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
+  setOperationAction(ISD::VASTART, MVT::Other, Custom);
+  setOperationAction(ISD::VAEND, MVT::Other, Expand);
   bool Is64Bit = Subtarget.is64Bit();
-  setOperationAction(ISD::VAARG,  MVT::Other, Is64Bit ? Custom : Expand);
+  setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
   setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
 
-  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
-  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
+  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
 
   setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
 
@@ -605,7 +605,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 
   setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
 
-  auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
+  auto setF16Action = [&](MVT VT, LegalizeAction Action) {
     setOperationAction(ISD::FABS, VT, Action);
     setOperationAction(ISD::FNEG, VT, Action);
     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
@@ -661,7 +661,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     // non-optsize case.
     setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
 
-    for (auto VT : { MVT::f32, MVT::f64 }) {
+    for (auto VT : {MVT::f32, MVT::f64}) {
       // Use ANDPD to simulate FABS.
       setOperationAction(ISD::FABS, VT, Custom);
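
The "simulate FABS with ANDPD" trick is just clearing the IEEE-754 sign bit with a constant mask. A scalar sketch of the same bit operation (illustrative only):

  #include <cstdint>
  #include <cstring>

  // fabs for double as a bitwise AND that drops the sign bit, the scalar
  // analogue of ANDPD against a 0x7FFFFFFFFFFFFFFF mask.
  static double fabs_via_and(double X) {
    uint64_t Bits;
    std::memcpy(&Bits, &X, sizeof(Bits));
    Bits &= 0x7FFFFFFFFFFFFFFFULL; // keep exponent and mantissa, clear sign
    std::memcpy(&X, &Bits, sizeof(X));
    return X;
  }
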
 
@@ -676,8 +676,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::FSUB, VT, Custom);
 
       // We don't support sin/cos/fmod
-      setOperationAction(ISD::FSIN   , VT, Expand);
-      setOperationAction(ISD::FCOS   , VT, Expand);
+      setOperationAction(ISD::FSIN, VT, Expand);
+      setOperationAction(ISD::FCOS, VT, Expand);
       setOperationAction(ISD::FSINCOS, VT, Expand);
     }
 
@@ -740,10 +740,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       addRegisterClass(MVT::f64, &X86::RFP64RegClass);
 
     // Use ANDPS to simulate FABS.
-    setOperationAction(ISD::FABS , MVT::f32, Custom);
+    setOperationAction(ISD::FABS, MVT::f32, Custom);
 
     // Use XORP to simulate FNEG.
-    setOperationAction(ISD::FNEG , MVT::f32, Custom);
+    setOperationAction(ISD::FNEG, MVT::f32, Custom);
 
     if (UseX87)
       setOperationAction(ISD::UNDEF, MVT::f64, Expand);
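
Likewise, the "XORP to simulate FNEG" comment above refers to flipping only the sign bit; for f32 the scalar equivalent is an XOR with 0x80000000 (again just an illustration of the constant involved, not code from this patch):

  #include <cstdint>
  #include <cstring>

  // fneg for float as an XOR with the sign-bit mask, mirroring what
  // XORPS against 0x80000000 does per lane.
  static float fneg_via_xor(float X) {
    uint32_t Bits;
    std::memcpy(&Bits, &X, sizeof(Bits));
    Bits ^= 0x80000000u; // flip the sign bit only
    std::memcpy(&X, &Bits, sizeof(X));
    return X;
  }
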
@@ -754,8 +754,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 
     // We don't support sin/cos/fmod
-    setOperationAction(ISD::FSIN   , MVT::f32, Expand);
-    setOperationAction(ISD::FCOS   , MVT::f32, Expand);
+    setOperationAction(ISD::FSIN, MVT::f32, Expand);
+    setOperationAction(ISD::FCOS, MVT::f32, Expand);
     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 
     if (UseX87) {
@@ -770,13 +770,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
 
-    for (auto VT : { MVT::f32, MVT::f64 }) {
-      setOperationAction(ISD::UNDEF,     VT, Expand);
+    for (auto VT : {MVT::f32, MVT::f64}) {
+      setOperationAction(ISD::UNDEF, VT, Expand);
       setOperationAction(ISD::FCOPYSIGN, VT, Expand);
 
       // Always expand sin/cos functions even though x87 has an instruction.
-      setOperationAction(ISD::FSIN   , VT, Expand);
-      setOperationAction(ISD::FCOS   , VT, Expand);
+      setOperationAction(ISD::FSIN, VT, Expand);
+      setOperationAction(ISD::FCOS, VT, Expand);
       setOperationAction(ISD::FSINCOS, VT, Expand);
     }
   }
@@ -788,7 +788,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       addLegalFPImmediate(APFloat(+1.0f)); // FLD1
       addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
       addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
-    } else // SSE immediates.
+    } else                                 // SSE immediates.
       addLegalFPImmediate(APFloat(+0.0f)); // xorps
   }
   // Expand FP64 immediates into loads from the stack, save special cases.
@@ -798,7 +798,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       addLegalFPImmediate(APFloat(+1.0)); // FLD1
       addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
       addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
-    } else // SSE immediates.
+    } else                                // SSE immediates.
       addLegalFPImmediate(APFloat(+0.0)); // xorpd
   }
   // Support fp16 0 immediate.
@@ -806,18 +806,18 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
 
   // Handle constrained floating-point operations of scalar.
-  setOperationAction(ISD::STRICT_FADD,      MVT::f32, Legal);
-  setOperationAction(ISD::STRICT_FADD,      MVT::f64, Legal);
-  setOperationAction(ISD::STRICT_FSUB,      MVT::f32, Legal);
-  setOperationAction(ISD::STRICT_FSUB,      MVT::f64, Legal);
-  setOperationAction(ISD::STRICT_FMUL,      MVT::f32, Legal);
-  setOperationAction(ISD::STRICT_FMUL,      MVT::f64, Legal);
-  setOperationAction(ISD::STRICT_FDIV,      MVT::f32, Legal);
-  setOperationAction(ISD::STRICT_FDIV,      MVT::f64, Legal);
-  setOperationAction(ISD::STRICT_FP_ROUND,  MVT::f32, Legal);
-  setOperationAction(ISD::STRICT_FP_ROUND,  MVT::f64, Legal);
-  setOperationAction(ISD::STRICT_FSQRT,     MVT::f32, Legal);
-  setOperationAction(ISD::STRICT_FSQRT,     MVT::f64, Legal);
+  setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
+  setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
+  setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
+  setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
+  setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
+  setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
+  setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
+  setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
+  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
+  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
+  setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
+  setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
 
   // We don't support FMA.
   setOperationAction(ISD::FMA, MVT::f64, Expand);
@@ -826,21 +826,21 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   // f80 always uses X87.
   if (UseX87) {
     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
-    setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
+    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
     {
       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
-      addLegalFPImmediate(TmpFlt);  // FLD0
+      addLegalFPImmediate(TmpFlt); // FLD0
       TmpFlt.changeSign();
-      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
+      addLegalFPImmediate(TmpFlt); // FLD0/FCHS
 
       bool ignored;
       APFloat TmpFlt2(+1.0);
-      TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
-                      &ignored);
-      addLegalFPImmediate(TmpFlt2);  // FLD1
+      TmpFlt2.convert(APFloat::x87DoubleExtended(),
+                      APFloat::rmNearestTiesToEven, &ignored);
+      addLegalFPImmediate(TmpFlt2); // FLD1
       TmpFlt2.changeSign();
-      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
+      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
     }
 
     // Always expand sin/cos functions even though x87 has an instruction.
@@ -859,9 +859,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     // clang-format on
 
     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
-    setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
+    setOperationAction(ISD::FCEIL, MVT::f80, Expand);
     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
-    setOperationAction(ISD::FRINT,  MVT::f80, Expand);
+    setOperationAction(ISD::FRINT, MVT::f80, Expand);
     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
     setOperationAction(ISD::FROUNDEVEN, MVT::f80, Expand);
     setOperationAction(ISD::FMA, MVT::f80, Expand);
@@ -871,12 +871,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::LLRINT, MVT::f80, Custom);
 
     // Handle constrained floating-point operations of scalar.
-    setOperationAction(ISD::STRICT_FADD     , MVT::f80, Legal);
-    setOperationAction(ISD::STRICT_FSUB     , MVT::f80, Legal);
-    setOperationAction(ISD::STRICT_FMUL     , MVT::f80, Legal);
-    setOperationAction(ISD::STRICT_FDIV     , MVT::f80, Legal);
-    setOperationAction(ISD::STRICT_FSQRT    , MVT::f80, Legal);
-    setOperationAction(ISD::FCANONICALIZE   , MVT::f80, Custom);
+    setOperationAction(ISD::STRICT_FADD, MVT::f80, Legal);
+    setOperationAction(ISD::STRICT_FSUB, MVT::f80, Legal);
+    setOperationAction(ISD::STRICT_FMUL, MVT::f80, Legal);
+    setOperationAction(ISD::STRICT_FDIV, MVT::f80, Legal);
+    setOperationAction(ISD::STRICT_FSQRT, MVT::f80, Legal);
+    setOperationAction(ISD::FCANONICALIZE, MVT::f80, Custom);
     if (isTypeLegal(MVT::f16)) {
       setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
       setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom);
@@ -895,16 +895,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 
     addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
 
-    setOperationAction(ISD::FADD,        MVT::f128, LibCall);
+    setOperationAction(ISD::FADD, MVT::f128, LibCall);
     setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
-    setOperationAction(ISD::FSUB,        MVT::f128, LibCall);
+    setOperationAction(ISD::FSUB, MVT::f128, LibCall);
     setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
-    setOperationAction(ISD::FDIV,        MVT::f128, LibCall);
+    setOperationAction(ISD::FDIV, MVT::f128, LibCall);
     setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
-    setOperationAction(ISD::FMUL,        MVT::f128, LibCall);
+    setOperationAction(ISD::FMUL, MVT::f128, LibCall);
     setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
-    setOperationAction(ISD::FMA,         MVT::f128, LibCall);
-    setOperationAction(ISD::STRICT_FMA,  MVT::f128, LibCall);
+    setOperationAction(ISD::FMA, MVT::f128, LibCall);
+    setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall);
 
     setOperationAction(ISD::FABS, MVT::f128, Custom);
     setOperationAction(ISD::FNEG, MVT::f128, Custom);
@@ -920,10 +920,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::STRICT_FTAN,  MVT::f128, LibCall);
     // clang-format on
     // No STRICT_FSINCOS
-    setOperationAction(ISD::FSQRT,        MVT::f128, LibCall);
+    setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
     setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
 
-    setOperationAction(ISD::FP_EXTEND,        MVT::f128, Custom);
+    setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
     setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
     // We need to custom handle any FP_ROUND with an f128 input, but
     // LegalizeDAG uses the result type to know when to run a custom handler.
@@ -953,10 +953,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   }
 
   // Always use a library call for pow.
-  setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
-  setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
-  setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
-  setOperationAction(ISD::FPOW             , MVT::f128 , Expand);
+  setOperationAction(ISD::FPOW, MVT::f32, Expand);
+  setOperationAction(ISD::FPOW, MVT::f64, Expand);
+  setOperationAction(ISD::FPOW, MVT::f80, Expand);
+  setOperationAction(ISD::FPOW, MVT::f128, Expand);
 
   setOperationAction(ISD::FLOG, MVT::f80, Expand);
   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
@@ -968,9 +968,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
 
   // Some FP actions are always expanded for vector types.
-  for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
-                   MVT::v4f32, MVT::v8f32,  MVT::v16f32,
-                   MVT::v2f64, MVT::v4f64,  MVT::v8f64 }) {
+  for (auto VT : {MVT::v8f16, MVT::v16f16, MVT::v32f16, MVT::v4f32, MVT::v8f32,
+                  MVT::v16f32, MVT::v2f64, MVT::v4f64, MVT::v8f64}) {
     // clang-format off
     setOperationAction(ISD::FSIN,      VT, Expand);
     setOperationAction(ISD::FSINCOS,   VT, Expand);
@@ -996,11 +995,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::UDIV, VT, Expand);
     setOperationAction(ISD::SREM, VT, Expand);
     setOperationAction(ISD::UREM, VT, Expand);
-    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
-    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
-    setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
-    setOperationAction(ISD::FMA,  VT, Expand);
+    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand);
+    setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand);
+    setOperationAction(ISD::FMA, VT, Expand);
     setOperationAction(ISD::FFLOOR, VT, Expand);
     setOperationAction(ISD::FCEIL, VT, Expand);
     setOperationAction(ISD::FTRUNC, VT, Expand);
@@ -1024,7 +1023,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
-    setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
+    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
     setOperationAction(ISD::TRUNCATE, VT, Expand);
     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
@@ -1062,31 +1061,31 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                     : &X86::VR128RegClass);
 
-    setOperationAction(ISD::FMAXIMUM,           MVT::f32, Custom);
-    setOperationAction(ISD::FMINIMUM,           MVT::f32, Custom);
-    setOperationAction(ISD::FMAXIMUMNUM,        MVT::f32, Custom);
-    setOperationAction(ISD::FMINIMUMNUM,        MVT::f32, Custom);
-
-    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
-    setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
-    setOperationAction(ISD::FCOPYSIGN,          MVT::v4f32, Custom);
-    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
-    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
-    setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
+    setOperationAction(ISD::FMAXIMUM, MVT::f32, Custom);
+    setOperationAction(ISD::FMINIMUM, MVT::f32, Custom);
+    setOperationAction(ISD::FMAXIMUMNUM, MVT::f32, Custom);
+    setOperationAction(ISD::FMINIMUMNUM, MVT::f32, Custom);
+
+    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
+    setOperationAction(ISD::FABS, MVT::v4f32, Custom);
+    setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
+    setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
-    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
+    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
     setOperationAction(ISD::CTSELECT, MVT::v4f32, Custom);
     setOperationAction(ISD::FCANONICALIZE, MVT::v4f32, Custom);
 
-    setOperationAction(ISD::LOAD,               MVT::v2f32, Custom);
-    setOperationAction(ISD::STORE,              MVT::v2f32, Custom);
+    setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
+    setOperationAction(ISD::STORE, MVT::v2f32, Custom);
     setOperationAction(ISD::FCANONICALIZE, MVT::v2f32, Custom);
 
-    setOperationAction(ISD::STRICT_FADD,        MVT::v4f32, Legal);
-    setOperationAction(ISD::STRICT_FSUB,        MVT::v4f32, Legal);
-    setOperationAction(ISD::STRICT_FMUL,        MVT::v4f32, Legal);
-    setOperationAction(ISD::STRICT_FDIV,        MVT::v4f32, Legal);
-    setOperationAction(ISD::STRICT_FSQRT,       MVT::v4f32, Legal);
+    setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
+    setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
+    setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
+    setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
+    setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
   }
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
@@ -1106,74 +1105,74 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                     : &X86::VR128RegClass);
 
-    for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
+    for (auto VT : {MVT::f64, MVT::v4f32, MVT::v2f64}) {
       setOperationAction(ISD::FMAXIMUM, VT, Custom);
       setOperationAction(ISD::FMINIMUM, VT, Custom);
       setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
       setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
     }
 
-    for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
-                     MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
+    for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16,
+                    MVT::v2i32}) {
       setOperationAction(ISD::SDIV, VT, Custom);
       setOperationAction(ISD::SREM, VT, Custom);
       setOperationAction(ISD::UDIV, VT, Custom);
       setOperationAction(ISD::UREM, VT, Custom);
     }
 
-    setOperationAction(ISD::MUL,                MVT::v2i8,  Custom);
-    setOperationAction(ISD::MUL,                MVT::v4i8,  Custom);
-    setOperationAction(ISD::MUL,                MVT::v8i8,  Custom);
-
-    setOperationAction(ISD::MUL,                MVT::v16i8, Custom);
-    setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
-    setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
-    setOperationAction(ISD::MULHU,              MVT::v4i32, Custom);
-    setOperationAction(ISD::MULHS,              MVT::v4i32, Custom);
-    setOperationAction(ISD::MULHU,              MVT::v16i8, Custom);
-    setOperationAction(ISD::MULHS,              MVT::v16i8, Custom);
-    setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
-    setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
-    setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
-    setOperationAction(ISD::AVGCEILU,           MVT::v16i8, Legal);
-    setOperationAction(ISD::AVGCEILU,           MVT::v8i16, Legal);
-
-    setOperationAction(ISD::SMULO,              MVT::v16i8, Custom);
-    setOperationAction(ISD::UMULO,              MVT::v16i8, Custom);
-    setOperationAction(ISD::UMULO,              MVT::v2i32, Custom);
-
-    setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
+    setOperationAction(ISD::MUL, MVT::v2i8, Custom);
+    setOperationAction(ISD::MUL, MVT::v4i8, Custom);
+    setOperationAction(ISD::MUL, MVT::v8i8, Custom);
+
+    setOperationAction(ISD::MUL, MVT::v16i8, Custom);
+    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
+    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+    setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
+    setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
+    setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
+    setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
+    setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
+    setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
+    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
+    setOperationAction(ISD::AVGCEILU, MVT::v16i8, Legal);
+    setOperationAction(ISD::AVGCEILU, MVT::v8i16, Legal);
+
+    setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
+    setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
+    setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
+
+    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
     setOperationAction(ISD::FCANONICALIZE, MVT::v2f64, Custom);
-    setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
-    setOperationAction(ISD::FCOPYSIGN,          MVT::v2f64, Custom);
+    setOperationAction(ISD::FABS, MVT::v2f64, Custom);
+    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
 
     setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
     setOperationAction(ISD::LRINT, MVT::v2i32, Custom);
 
-    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
+    for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
       setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
       setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
       setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
       setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
     }
 
-    setOperationAction(ISD::UADDSAT,            MVT::v16i8, Legal);
-    setOperationAction(ISD::SADDSAT,            MVT::v16i8, Legal);
-    setOperationAction(ISD::USUBSAT,            MVT::v16i8, Legal);
-    setOperationAction(ISD::SSUBSAT,            MVT::v16i8, Legal);
-    setOperationAction(ISD::UADDSAT,            MVT::v8i16, Legal);
-    setOperationAction(ISD::SADDSAT,            MVT::v8i16, Legal);
-    setOperationAction(ISD::USUBSAT,            MVT::v8i16, Legal);
-    setOperationAction(ISD::SSUBSAT,            MVT::v8i16, Legal);
-    setOperationAction(ISD::USUBSAT,            MVT::v4i32, Custom);
-    setOperationAction(ISD::USUBSAT,            MVT::v2i64, Custom);
+    setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
+    setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
+    setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
+    setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
+    setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
+    setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
+    setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
+    setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
+    setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
+    setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
 
-    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
-    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
-    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
-    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
 
-    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
+    for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
       setOperationAction(ISD::SETCC, VT, Custom);
       setOperationAction(ISD::CTPOP, VT, Custom);
       setOperationAction(ISD::ABS, VT, Custom);
@@ -1186,30 +1185,30 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setCondCodeAction(ISD::SETLE, VT, Custom);
     }
 
-    setOperationAction(ISD::SETCC,          MVT::v2f64, Custom);
-    setOperationAction(ISD::SETCC,          MVT::v4f32, Custom);
-    setOperationAction(ISD::STRICT_FSETCC,  MVT::v2f64, Custom);
-    setOperationAction(ISD::STRICT_FSETCC,  MVT::v4f32, Custom);
+    setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
+    setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
+    setOperationAction(ISD::STRICT_FSETCC, MVT::v2f64, Custom);
+    setOperationAction(ISD::STRICT_FSETCC, MVT::v4f32, Custom);
     setOperationAction(ISD::STRICT_FSETCCS, MVT::v2f64, Custom);
     setOperationAction(ISD::STRICT_FSETCCS, MVT::v4f32, Custom);
 
-    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
-      setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
-      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
-      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
-      setOperationAction(ISD::VSELECT,            VT, Custom);
+    for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32}) {
+      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
+      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+      setOperationAction(ISD::VSELECT, VT, Custom);
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
     }
 
-    for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
-      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
-      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
-      setOperationAction(ISD::VSELECT,            VT, Custom);
+    for (auto VT : {MVT::v8f16, MVT::v2f64, MVT::v2i64}) {
+      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+      setOperationAction(ISD::VSELECT, VT, Custom);
 
       if (VT == MVT::v2i64 && !Subtarget.is64Bit())
         continue;
 
-      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
+      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
     }
     setF16Action(MVT::v8f16, Expand);
@@ -1222,12 +1221,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Custom);
 
     // Custom lower v2i64 and v2f64 selects.
-    setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
-    setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
-    setOperationAction(ISD::SELECT,             MVT::v4i32, Custom);
-    setOperationAction(ISD::SELECT,             MVT::v8i16, Custom);
-    setOperationAction(ISD::SELECT,             MVT::v8f16, Custom);
-    setOperationAction(ISD::SELECT,             MVT::v16i8, Custom);
+    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
+    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
+    setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
+    setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
+    setOperationAction(ISD::SELECT, MVT::v8f16, Custom);
+    setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
 
     setOperationAction(ISD::CTSELECT, MVT::v2f64, Custom);
     setOperationAction(ISD::CTSELECT, MVT::v2i64, Custom);
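
The vector CTSELECT cases above likewise have to blend whole lanes without branching. As a reference point, a standalone SSE2 sketch of a mask-based blend using the standard <emmintrin.h> intrinsics (illustrative only, not the DAG lowering itself):

  #include <emmintrin.h>
  #include <cstdint>

  // Blend two v4i32 vectors under a scalar condition: broadcast an
  // all-ones/all-zeros mask, then combine with AND/ANDN/OR. No branch,
  // no data-dependent control flow.
  static __m128i ct_select_v4i32(uint32_t Cond, __m128i A, __m128i B) {
    __m128i Mask = _mm_set1_epi32(-(int)(Cond & 1u)); // all-ones or zero
    return _mm_or_si128(_mm_and_si128(Mask, A),
                        _mm_andnot_si128(Mask, B)); // (~Mask) & B
  }
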
@@ -1236,60 +1235,60 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::CTSELECT, MVT::v8f16, Custom);
     setOperationAction(ISD::CTSELECT, MVT::v16i8, Custom);
 
-    setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Custom);
-    setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Custom);
-    setOperationAction(ISD::FP_TO_SINT,         MVT::v2i32, Custom);
-    setOperationAction(ISD::FP_TO_UINT,         MVT::v2i32, Custom);
-    setOperationAction(ISD::STRICT_FP_TO_SINT,  MVT::v4i32, Custom);
-    setOperationAction(ISD::STRICT_FP_TO_SINT,  MVT::v2i32, Custom);
+    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom);
+    setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
+    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Custom);
+    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
 
     // Custom legalize these to avoid over promotion or custom promotion.
     for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
-      setOperationAction(ISD::FP_TO_SINT,        VT, Custom);
-      setOperationAction(ISD::FP_TO_UINT,        VT, Custom);
+      setOperationAction(ISD::FP_TO_SINT, VT, Custom);
+      setOperationAction(ISD::FP_TO_UINT, VT, Custom);
       setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
       setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
     }
 
-    setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Custom);
-    setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v4i32, Custom);
-    setOperationAction(ISD::SINT_TO_FP,         MVT::v2i32, Custom);
-    setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v2i32, Custom);
+    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
+    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Custom);
+    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
+    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);
 
-    setOperationAction(ISD::UINT_TO_FP,         MVT::v2i32, Custom);
-    setOperationAction(ISD::STRICT_UINT_TO_FP,  MVT::v2i32, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
+    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom);
 
-    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
-    setOperationAction(ISD::STRICT_UINT_TO_FP,  MVT::v4i32, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
+    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom);
 
     // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
-    setOperationAction(ISD::SINT_TO_FP,         MVT::v2f32, Custom);
-    setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v2f32, Custom);
-    setOperationAction(ISD::UINT_TO_FP,         MVT::v2f32, Custom);
-    setOperationAction(ISD::STRICT_UINT_TO_FP,  MVT::v2f32, Custom);
+    setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
+    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
+    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom);
 
-    setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
-    setOperationAction(ISD::STRICT_FP_EXTEND,   MVT::v2f32, Custom);
-    setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
-    setOperationAction(ISD::STRICT_FP_ROUND,    MVT::v2f32, Custom);
+    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
+    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom);
+    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
+    setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom);
 
     // We want to legalize this to an f64 load rather than an i64 load on
     // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
     // store.
-    setOperationAction(ISD::LOAD,               MVT::v2i32, Custom);
-    setOperationAction(ISD::LOAD,               MVT::v4i16, Custom);
-    setOperationAction(ISD::LOAD,               MVT::v8i8,  Custom);
-    setOperationAction(ISD::STORE,              MVT::v2i32, Custom);
-    setOperationAction(ISD::STORE,              MVT::v4i16, Custom);
-    setOperationAction(ISD::STORE,              MVT::v8i8,  Custom);
+    setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
+    setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
+    setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
+    setOperationAction(ISD::STORE, MVT::v2i32, Custom);
+    setOperationAction(ISD::STORE, MVT::v4i16, Custom);
+    setOperationAction(ISD::STORE, MVT::v8i8, Custom);
 
     // Add 32-bit vector stores to help vectorization opportunities.
-    setOperationAction(ISD::STORE,              MVT::v2i16, Custom);
-    setOperationAction(ISD::STORE,              MVT::v4i8,  Custom);
+    setOperationAction(ISD::STORE, MVT::v2i16, Custom);
+    setOperationAction(ISD::STORE, MVT::v4i8, Custom);
 
-    setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
-    setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
-    setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
+    setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
+    setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
+    setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
     if (!Subtarget.hasAVX512())
       setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
 
@@ -1299,41 +1298,42 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 
     setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
 
-    setOperationAction(ISD::TRUNCATE,    MVT::v2i8,  Custom);
-    setOperationAction(ISD::TRUNCATE,    MVT::v2i16, Custom);
-    setOperationAction(ISD::TRUNCATE,    MVT::v2i32, Custom);
-    setOperationAction(ISD::TRUNCATE,    MVT::v2i64, Custom);
-    setOperationAction(ISD::TRUNCATE,    MVT::v4i8,  Custom);
-    setOperationAction(ISD::TRUNCATE,    MVT::v4i16, Custom);
-    setOperationAction(ISD::TRUNCATE,    MVT::v4i32, Custom);
-    setOperationAction(ISD::TRUNCATE,    MVT::v4i64, Custom);
-    setOperationAction(ISD::TRUNCATE,    MVT::v8i8,  Custom);
-    setOperationAction(ISD::TRUNCATE,    MVT::v8i16, Custom);
-    setOperationAction(ISD::TRUNCATE,    MVT::v8i32, Custom);
-    setOperationAction(ISD::TRUNCATE,    MVT::v8i64, Custom);
-    setOperationAction(ISD::TRUNCATE,    MVT::v16i8, Custom);
-    setOperationAction(ISD::TRUNCATE,    MVT::v16i16, Custom);
-    setOperationAction(ISD::TRUNCATE,    MVT::v16i32, Custom);
-    setOperationAction(ISD::TRUNCATE,    MVT::v16i64, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v2i64, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v4i64, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
 
     // In the customized shift lowering, the legal v4i32/v2i64 cases
     // in AVX2 will be recognized.
-    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
-      setOperationAction(ISD::SRL,              VT, Custom);
-      setOperationAction(ISD::SHL,              VT, Custom);
-      setOperationAction(ISD::SRA,              VT, Custom);
-      if (VT == MVT::v2i64) continue;
-      setOperationAction(ISD::ROTL,             VT, Custom);
-      setOperationAction(ISD::ROTR,             VT, Custom);
-      setOperationAction(ISD::FSHL,             VT, Custom);
-      setOperationAction(ISD::FSHR,             VT, Custom);
+    for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
+      setOperationAction(ISD::SRL, VT, Custom);
+      setOperationAction(ISD::SHL, VT, Custom);
+      setOperationAction(ISD::SRA, VT, Custom);
+      if (VT == MVT::v2i64)
+        continue;
+      setOperationAction(ISD::ROTL, VT, Custom);
+      setOperationAction(ISD::ROTR, VT, Custom);
+      setOperationAction(ISD::FSHL, VT, Custom);
+      setOperationAction(ISD::FSHR, VT, Custom);
     }
 
-    setOperationAction(ISD::STRICT_FSQRT,       MVT::v2f64, Legal);
-    setOperationAction(ISD::STRICT_FADD,        MVT::v2f64, Legal);
-    setOperationAction(ISD::STRICT_FSUB,        MVT::v2f64, Legal);
-    setOperationAction(ISD::STRICT_FMUL,        MVT::v2f64, Legal);
-    setOperationAction(ISD::STRICT_FDIV,        MVT::v2f64, Legal);
+    setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
+    setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
+    setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
+    setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
+    setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
   }
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasGFNI()) {
@@ -1348,73 +1348,73 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   }
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
-    setOperationAction(ISD::ABS,                MVT::v16i8, Legal);
-    setOperationAction(ISD::ABS,                MVT::v8i16, Legal);
-    setOperationAction(ISD::ABS,                MVT::v4i32, Legal);
+    setOperationAction(ISD::ABS, MVT::v16i8, Legal);
+    setOperationAction(ISD::ABS, MVT::v8i16, Legal);
+    setOperationAction(ISD::ABS, MVT::v4i32, Legal);
 
     for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
-      setOperationAction(ISD::BITREVERSE,       VT, Custom);
-      setOperationAction(ISD::CTLZ,             VT, Custom);
+      setOperationAction(ISD::BITREVERSE, VT, Custom);
+      setOperationAction(ISD::CTLZ, VT, Custom);
     }
 
     // These might be better off as horizontal vector ops.
-    setOperationAction(ISD::ADD,                MVT::i16, Custom);
-    setOperationAction(ISD::ADD,                MVT::i32, Custom);
-    setOperationAction(ISD::SUB,                MVT::i16, Custom);
-    setOperationAction(ISD::SUB,                MVT::i32, Custom);
+    setOperationAction(ISD::ADD, MVT::i16, Custom);
+    setOperationAction(ISD::ADD, MVT::i32, Custom);
+    setOperationAction(ISD::SUB, MVT::i16, Custom);
+    setOperationAction(ISD::SUB, MVT::i32, Custom);
   }
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
     for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
-      setOperationAction(ISD::FFLOOR,            RoundedTy,  Legal);
-      setOperationAction(ISD::STRICT_FFLOOR,     RoundedTy,  Legal);
-      setOperationAction(ISD::FCEIL,             RoundedTy,  Legal);
-      setOperationAction(ISD::STRICT_FCEIL,      RoundedTy,  Legal);
-      setOperationAction(ISD::FTRUNC,            RoundedTy,  Legal);
-      setOperationAction(ISD::STRICT_FTRUNC,     RoundedTy,  Legal);
-      setOperationAction(ISD::FRINT,             RoundedTy,  Legal);
-      setOperationAction(ISD::STRICT_FRINT,      RoundedTy,  Legal);
-      setOperationAction(ISD::FNEARBYINT,        RoundedTy,  Legal);
-      setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy,  Legal);
-      setOperationAction(ISD::FROUNDEVEN,        RoundedTy,  Legal);
-      setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy,  Legal);
-
-      setOperationAction(ISD::FROUND,            RoundedTy,  Custom);
-    }
-
-    setOperationAction(ISD::SMAX,               MVT::v16i8, Legal);
-    setOperationAction(ISD::SMAX,               MVT::v4i32, Legal);
-    setOperationAction(ISD::UMAX,               MVT::v8i16, Legal);
-    setOperationAction(ISD::UMAX,               MVT::v4i32, Legal);
-    setOperationAction(ISD::SMIN,               MVT::v16i8, Legal);
-    setOperationAction(ISD::SMIN,               MVT::v4i32, Legal);
-    setOperationAction(ISD::UMIN,               MVT::v8i16, Legal);
-    setOperationAction(ISD::UMIN,               MVT::v4i32, Legal);
-
-    setOperationAction(ISD::UADDSAT,            MVT::v4i32, Custom);
-    setOperationAction(ISD::SADDSAT,            MVT::v2i64, Custom);
-    setOperationAction(ISD::SSUBSAT,            MVT::v2i64, Custom);
+      setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
+      setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal);
+      setOperationAction(ISD::FCEIL, RoundedTy, Legal);
+      setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);
+      setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
+      setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);
+      setOperationAction(ISD::FRINT, RoundedTy, Legal);
+      setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
+      setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
+      setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
+      setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
+      setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal);
+
+      setOperationAction(ISD::FROUND, RoundedTy, Custom);
+    }
+
+    setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
+    setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
+    setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
+    setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
+    setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
+    setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
+    setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
+    setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
+
+    setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
+    setOperationAction(ISD::SADDSAT, MVT::v2i64, Custom);
+    setOperationAction(ISD::SSUBSAT, MVT::v2i64, Custom);
 
     // FIXME: Do we need to handle scalar-to-vector here?
-    setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
-    setOperationAction(ISD::SMULO,              MVT::v2i32, Custom);
+    setOperationAction(ISD::MUL, MVT::v4i32, Legal);
+    setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
 
     // We directly match byte blends in the backend as they match the VSELECT
     // condition form.
-    setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
+    setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
 
     // SSE41 brings specific instructions for doing vector sign extend even in
     // cases where we don't have SRA.
-    for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
+    for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
       setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
     }
 
     // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
-    for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
-      setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8,  Legal);
-      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8,  Legal);
-      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8,  Legal);
+    for (auto LoadExtOp : {ISD::SEXTLOAD, ISD::ZEXTLOAD}) {
+      setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
+      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
+      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
       setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
       setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
       setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
@@ -1423,73 +1423,73 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
       // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
       // do the pre and post work in the vector domain.
-      setOperationAction(ISD::UINT_TO_FP,        MVT::v4i64, Custom);
+      setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);
       setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
       // We need to mark SINT_TO_FP as Custom even though we want to expand it
       // so that DAG combine doesn't try to turn it into uint_to_fp.
-      setOperationAction(ISD::SINT_TO_FP,        MVT::v4i64, Custom);
+      setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);
       setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
     }
   }
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
-    setOperationAction(ISD::UADDSAT,            MVT::v2i64, Custom);
+    setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
   }
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
-    for (auto VT : { MVT::v16i8, MVT::v8i16,  MVT::v4i32, MVT::v2i64,
-                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
+    for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v32i8,
+                    MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
       setOperationAction(ISD::ROTL, VT, Custom);
       setOperationAction(ISD::ROTR, VT, Custom);
     }
 
     // XOP can efficiently perform BITREVERSE with VPPERM.
-    for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
+    for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64})
       setOperationAction(ISD::BITREVERSE, VT, Custom);
   }
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
     bool HasInt256 = Subtarget.hasInt256();
 
-    addRegisterClass(MVT::v32i8,  Subtarget.hasVLX() ? &X86::VR256XRegClass
-                                                     : &X86::VR256RegClass);
+    addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
+                                                    : &X86::VR256RegClass);
     addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                      : &X86::VR256RegClass);
     addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                      : &X86::VR256RegClass);
-    addRegisterClass(MVT::v8i32,  Subtarget.hasVLX() ? &X86::VR256XRegClass
-                                                     : &X86::VR256RegClass);
-    addRegisterClass(MVT::v8f32,  Subtarget.hasVLX() ? &X86::VR256XRegClass
-                                                     : &X86::VR256RegClass);
-    addRegisterClass(MVT::v4i64,  Subtarget.hasVLX() ? &X86::VR256XRegClass
-                                                     : &X86::VR256RegClass);
-    addRegisterClass(MVT::v4f64,  Subtarget.hasVLX() ? &X86::VR256XRegClass
-                                                     : &X86::VR256RegClass);
-
-    for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
-      setOperationAction(ISD::FFLOOR,            VT, Legal);
-      setOperationAction(ISD::STRICT_FFLOOR,     VT, Legal);
-      setOperationAction(ISD::FCEIL,             VT, Legal);
-      setOperationAction(ISD::STRICT_FCEIL,      VT, Legal);
-      setOperationAction(ISD::FTRUNC,            VT, Legal);
-      setOperationAction(ISD::STRICT_FTRUNC,     VT, Legal);
-      setOperationAction(ISD::FRINT,             VT, Legal);
-      setOperationAction(ISD::STRICT_FRINT,      VT, Legal);
-      setOperationAction(ISD::FNEARBYINT,        VT, Legal);
+    addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
+                                                    : &X86::VR256RegClass);
+    addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
+                                                    : &X86::VR256RegClass);
+    addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
+                                                    : &X86::VR256RegClass);
+    addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
+                                                    : &X86::VR256RegClass);
+
+    for (auto VT : {MVT::v8f32, MVT::v4f64}) {
+      setOperationAction(ISD::FFLOOR, VT, Legal);
+      setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
+      setOperationAction(ISD::FCEIL, VT, Legal);
+      setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
+      setOperationAction(ISD::FTRUNC, VT, Legal);
+      setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
+      setOperationAction(ISD::FRINT, VT, Legal);
+      setOperationAction(ISD::STRICT_FRINT, VT, Legal);
+      setOperationAction(ISD::FNEARBYINT, VT, Legal);
       setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
-      setOperationAction(ISD::FROUNDEVEN,        VT, Legal);
+      setOperationAction(ISD::FROUNDEVEN, VT, Legal);
       setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
 
-      setOperationAction(ISD::FROUND,            VT, Custom);
+      setOperationAction(ISD::FROUND, VT, Custom);
 
-      setOperationAction(ISD::FNEG,              VT, Custom);
-      setOperationAction(ISD::FABS,              VT, Custom);
-      setOperationAction(ISD::FCOPYSIGN,         VT, Custom);
+      setOperationAction(ISD::FNEG, VT, Custom);
+      setOperationAction(ISD::FABS, VT, Custom);
+      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
 
-      setOperationAction(ISD::FMAXIMUM,          VT, Custom);
-      setOperationAction(ISD::FMINIMUM,          VT, Custom);
-      setOperationAction(ISD::FMAXIMUMNUM,       VT, Custom);
-      setOperationAction(ISD::FMINIMUMNUM,       VT, Custom);
+      setOperationAction(ISD::FMAXIMUM, VT, Custom);
+      setOperationAction(ISD::FMINIMUM, VT, Custom);
+      setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
+      setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
       setOperationAction(ISD::FCANONICALIZE, VT, Custom);
     }
 
@@ -1498,64 +1498,65 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 
     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
     // even though v8i16 is a legal type.
-    setOperationPromotedToType(ISD::FP_TO_SINT,        MVT::v8i16, MVT::v8i32);
-    setOperationPromotedToType(ISD::FP_TO_UINT,        MVT::v8i16, MVT::v8i32);
+    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
+    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
     setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
     setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
-    setOperationAction(ISD::FP_TO_SINT,                MVT::v8i32, Custom);
-    setOperationAction(ISD::FP_TO_UINT,                MVT::v8i32, Custom);
-    setOperationAction(ISD::STRICT_FP_TO_SINT,         MVT::v8i32, Custom);
-
-    setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Custom);
-    setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v8i32, Custom);
-    setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Expand);
-    setOperationAction(ISD::FP_ROUND,           MVT::v8f16, Expand);
-    setOperationAction(ISD::FP_EXTEND,          MVT::v4f64, Custom);
-    setOperationAction(ISD::STRICT_FP_EXTEND,   MVT::v4f64, Custom);
-
-    setOperationAction(ISD::STRICT_FP_ROUND,    MVT::v4f32, Legal);
-    setOperationAction(ISD::STRICT_FADD,        MVT::v8f32, Legal);
-    setOperationAction(ISD::STRICT_FADD,        MVT::v4f64, Legal);
-    setOperationAction(ISD::STRICT_FSUB,        MVT::v8f32, Legal);
-    setOperationAction(ISD::STRICT_FSUB,        MVT::v4f64, Legal);
-    setOperationAction(ISD::STRICT_FMUL,        MVT::v8f32, Legal);
-    setOperationAction(ISD::STRICT_FMUL,        MVT::v4f64, Legal);
-    setOperationAction(ISD::STRICT_FDIV,        MVT::v8f32, Legal);
-    setOperationAction(ISD::STRICT_FDIV,        MVT::v4f64, Legal);
-    setOperationAction(ISD::STRICT_FSQRT,       MVT::v8f32, Legal);
-    setOperationAction(ISD::STRICT_FSQRT,       MVT::v4f64, Legal);
+    setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom);
+    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Custom);
+
+    setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Custom);
+    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Custom);
+    setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Expand);
+    setOperationAction(ISD::FP_ROUND, MVT::v8f16, Expand);
+    setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
+    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Custom);
+
+    setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
+    setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
+    setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);
+    setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal);
+    setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal);
+    setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal);
+    setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
+    setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
+    setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
+    setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
+    setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
 
     if (!Subtarget.hasAVX512())
       setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
 
     // In the customized shift lowering, the legal v8i32/v4i64 cases
     // in AVX2 will be recognized.
-    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
-      setOperationAction(ISD::SRL,             VT, Custom);
-      setOperationAction(ISD::SHL,             VT, Custom);
-      setOperationAction(ISD::SRA,             VT, Custom);
-      setOperationAction(ISD::ABDS,            VT, Custom);
-      setOperationAction(ISD::ABDU,            VT, Custom);
-      if (VT == MVT::v4i64) continue;
-      setOperationAction(ISD::ROTL,            VT, Custom);
-      setOperationAction(ISD::ROTR,            VT, Custom);
-      setOperationAction(ISD::FSHL,            VT, Custom);
-      setOperationAction(ISD::FSHR,            VT, Custom);
+    for (auto VT : {MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
+      setOperationAction(ISD::SRL, VT, Custom);
+      setOperationAction(ISD::SHL, VT, Custom);
+      setOperationAction(ISD::SRA, VT, Custom);
+      setOperationAction(ISD::ABDS, VT, Custom);
+      setOperationAction(ISD::ABDU, VT, Custom);
+      if (VT == MVT::v4i64)
+        continue;
+      setOperationAction(ISD::ROTL, VT, Custom);
+      setOperationAction(ISD::ROTR, VT, Custom);
+      setOperationAction(ISD::FSHL, VT, Custom);
+      setOperationAction(ISD::FSHR, VT, Custom);
     }
 
     // These types need custom splitting if their input is a 128-bit vector.
-    setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i64,  Custom);
-    setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i32, Custom);
-    setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i64,  Custom);
-    setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i32, Custom);
-
-    setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
-    setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
-    setOperationAction(ISD::SELECT,            MVT::v8i32, Custom);
-    setOperationAction(ISD::SELECT,            MVT::v16i16, Custom);
-    setOperationAction(ISD::SELECT,            MVT::v16f16, Custom);
-    setOperationAction(ISD::SELECT,            MVT::v32i8, Custom);
-    setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
+    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
+    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
+    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
+    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
+
+    setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
+    setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
+    setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
+    setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
+    setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
+    setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
+    setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
 
     setOperationAction(ISD::CTSELECT, MVT::v4f64, Custom);
     setOperationAction(ISD::CTSELECT, MVT::v4i64, Custom);
@@ -1565,22 +1566,22 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::CTSELECT, MVT::v32i8, Custom);
     setOperationAction(ISD::CTSELECT, MVT::v8f32, Custom);
 
-    for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
-      setOperationAction(ISD::SIGN_EXTEND,     VT, Custom);
-      setOperationAction(ISD::ZERO_EXTEND,     VT, Custom);
-      setOperationAction(ISD::ANY_EXTEND,      VT, Custom);
+    for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
+      setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
+      setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
+      setOperationAction(ISD::ANY_EXTEND, VT, Custom);
     }
 
-    setOperationAction(ISD::TRUNCATE,          MVT::v32i8, Custom);
-    setOperationAction(ISD::TRUNCATE,          MVT::v32i16, Custom);
-    setOperationAction(ISD::TRUNCATE,          MVT::v32i32, Custom);
-    setOperationAction(ISD::TRUNCATE,          MVT::v32i64, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v32i16, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v32i32, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v32i64, Custom);
 
-    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
-      setOperationAction(ISD::SETCC,           VT, Custom);
-      setOperationAction(ISD::CTPOP,           VT, Custom);
-      setOperationAction(ISD::CTLZ,            VT, Custom);
-      setOperationAction(ISD::BITREVERSE,      VT, Custom);
+    for (auto VT : {MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
+      setOperationAction(ISD::SETCC, VT, Custom);
+      setOperationAction(ISD::CTPOP, VT, Custom);
+      setOperationAction(ISD::CTLZ, VT, Custom);
+      setOperationAction(ISD::BITREVERSE, VT, Custom);
 
       // The condition codes aren't legal in SSE/AVX and under AVX512 we use
       // setcc all the way to isel and prefer SETGT in some isel patterns.
@@ -1588,64 +1589,64 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setCondCodeAction(ISD::SETLE, VT, Custom);
     }
 
-    setOperationAction(ISD::SETCC,          MVT::v4f64, Custom);
-    setOperationAction(ISD::SETCC,          MVT::v8f32, Custom);
-    setOperationAction(ISD::STRICT_FSETCC,  MVT::v4f64, Custom);
-    setOperationAction(ISD::STRICT_FSETCC,  MVT::v8f32, Custom);
+    setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
+    setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
+    setOperationAction(ISD::STRICT_FSETCC, MVT::v4f64, Custom);
+    setOperationAction(ISD::STRICT_FSETCC, MVT::v8f32, Custom);
     setOperationAction(ISD::STRICT_FSETCCS, MVT::v4f64, Custom);
     setOperationAction(ISD::STRICT_FSETCCS, MVT::v8f32, Custom);
 
     if (Subtarget.hasAnyFMA()) {
-      for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
-                       MVT::v2f64, MVT::v4f64 }) {
+      for (auto VT : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32, MVT::v2f64,
+                      MVT::v4f64}) {
         setOperationAction(ISD::FMA, VT, Legal);
         setOperationAction(ISD::STRICT_FMA, VT, Legal);
       }
     }
 
-    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
+    for (auto VT : {MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
       setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
       setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
     }
 
-    setOperationAction(ISD::MUL,       MVT::v4i64,  Custom);
-    setOperationAction(ISD::MUL,       MVT::v8i32,  HasInt256 ? Legal : Custom);
-    setOperationAction(ISD::MUL,       MVT::v16i16, HasInt256 ? Legal : Custom);
-    setOperationAction(ISD::MUL,       MVT::v32i8,  Custom);
-
-    setOperationAction(ISD::MULHU,     MVT::v8i32,  Custom);
-    setOperationAction(ISD::MULHS,     MVT::v8i32,  Custom);
-    setOperationAction(ISD::MULHU,     MVT::v16i16, HasInt256 ? Legal : Custom);
-    setOperationAction(ISD::MULHS,     MVT::v16i16, HasInt256 ? Legal : Custom);
-    setOperationAction(ISD::MULHU,     MVT::v32i8,  Custom);
-    setOperationAction(ISD::MULHS,     MVT::v32i8,  Custom);
-    setOperationAction(ISD::AVGCEILU,  MVT::v16i16, HasInt256 ? Legal : Custom);
-    setOperationAction(ISD::AVGCEILU,  MVT::v32i8,  HasInt256 ? Legal : Custom);
-
-    setOperationAction(ISD::SMULO,     MVT::v32i8, Custom);
-    setOperationAction(ISD::UMULO,     MVT::v32i8, Custom);
-
-    setOperationAction(ISD::ABS,       MVT::v4i64,  Custom);
-    setOperationAction(ISD::SMAX,      MVT::v4i64,  Custom);
-    setOperationAction(ISD::UMAX,      MVT::v4i64,  Custom);
-    setOperationAction(ISD::SMIN,      MVT::v4i64,  Custom);
-    setOperationAction(ISD::UMIN,      MVT::v4i64,  Custom);
-
-    setOperationAction(ISD::UADDSAT,   MVT::v32i8,  HasInt256 ? Legal : Custom);
-    setOperationAction(ISD::SADDSAT,   MVT::v32i8,  HasInt256 ? Legal : Custom);
-    setOperationAction(ISD::USUBSAT,   MVT::v32i8,  HasInt256 ? Legal : Custom);
-    setOperationAction(ISD::SSUBSAT,   MVT::v32i8,  HasInt256 ? Legal : Custom);
-    setOperationAction(ISD::UADDSAT,   MVT::v16i16, HasInt256 ? Legal : Custom);
-    setOperationAction(ISD::SADDSAT,   MVT::v16i16, HasInt256 ? Legal : Custom);
-    setOperationAction(ISD::USUBSAT,   MVT::v16i16, HasInt256 ? Legal : Custom);
-    setOperationAction(ISD::SSUBSAT,   MVT::v16i16, HasInt256 ? Legal : Custom);
-    setOperationAction(ISD::UADDSAT,   MVT::v8i32, Custom);
-    setOperationAction(ISD::USUBSAT,   MVT::v8i32, Custom);
-    setOperationAction(ISD::UADDSAT,   MVT::v4i64, Custom);
-    setOperationAction(ISD::USUBSAT,   MVT::v4i64, Custom);
-
-    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
-      setOperationAction(ISD::ABS,  VT, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::MUL, MVT::v4i64, Custom);
+    setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::MUL, MVT::v32i8, Custom);
+
+    setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
+    setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
+    setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
+    setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
+    setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
+
+    setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
+    setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
+
+    setOperationAction(ISD::ABS, MVT::v4i64, Custom);
+    setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
+    setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
+    setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
+    setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
+
+    setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::UADDSAT, MVT::v8i32, Custom);
+    setOperationAction(ISD::USUBSAT, MVT::v8i32, Custom);
+    setOperationAction(ISD::UADDSAT, MVT::v4i64, Custom);
+    setOperationAction(ISD::USUBSAT, MVT::v4i64, Custom);
+
+    for (auto VT : {MVT::v32i8, MVT::v16i16, MVT::v8i32}) {
+      setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
       setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
       setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
       setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
@@ -1664,41 +1665,41 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
 
       // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
-      for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
+      for (auto LoadExtOp : {ISD::SEXTLOAD, ISD::ZEXTLOAD}) {
         setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
-        setLoadExtAction(LoadExtOp, MVT::v8i32,  MVT::v8i8,  Legal);
-        setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i8,  Legal);
-        setLoadExtAction(LoadExtOp, MVT::v8i32,  MVT::v8i16, Legal);
-        setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i16, Legal);
-        setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i32, Legal);
+        setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
+        setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
+        setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
+        setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
+        setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
       }
     }
 
-    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
-                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
-      setOperationAction(ISD::MLOAD,  VT, Subtarget.hasVLX() ? Legal : Custom);
+    for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32,
+                    MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
+      setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
       setOperationAction(ISD::MSTORE, VT, Legal);
     }
 
     // Extract subvector is special because the value type
     // (result) is 128-bit but the source is 256-bit wide.
-    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
-                     MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
+    for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v8f16,
+                    MVT::v4f32, MVT::v2f64}) {
       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
     }
 
     // Custom lower several nodes for 256-bit types.
-    for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
-                    MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
-      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
-      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
-      setOperationAction(ISD::VSELECT,            VT, Custom);
-      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
+    for (MVT VT : {MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64, MVT::v16f16,
+                   MVT::v8f32, MVT::v4f64}) {
+      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+      setOperationAction(ISD::VSELECT, VT, Custom);
+      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
-      setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
-      setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Legal);
-      setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
-      setOperationAction(ISD::STORE,              VT, Custom);
+      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
+      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
+      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+      setOperationAction(ISD::STORE, VT, Custom);
     }
     setF16Action(MVT::v16f16, Expand);
     setOperationAction(ISD::FNEG, MVT::v16f16, Custom);
@@ -1716,21 +1717,21 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
       setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
 
-      for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
-                       MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
-        setOperationAction(ISD::MGATHER,  VT, Custom);
+      for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
+                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64})
+        setOperationAction(ISD::MGATHER, VT, Custom);
     }
   }
 
   if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
       Subtarget.hasF16C()) {
-    for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
-      setOperationAction(ISD::FP_ROUND,           VT, Custom);
-      setOperationAction(ISD::STRICT_FP_ROUND,    VT, Custom);
+    for (MVT VT : {MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16}) {
+      setOperationAction(ISD::FP_ROUND, VT, Custom);
+      setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom);
     }
-    for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
-      setOperationAction(ISD::FP_EXTEND,          VT, Custom);
-      setOperationAction(ISD::STRICT_FP_EXTEND,   VT, Custom);
+    for (MVT VT : {MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32}) {
+      setOperationAction(ISD::FP_EXTEND, VT, Custom);
+      setOperationAction(ISD::STRICT_FP_EXTEND, VT, Custom);
     }
     for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
       setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
@@ -1744,29 +1745,29 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   // available with AVX512. 512-bit vectors are in a separate block controlled
   // by useAVX512Regs.
   if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
-    addRegisterClass(MVT::v1i1,   &X86::VK1RegClass);
-    addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
-    addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
-    addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
-    addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
+    addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
+    addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
+    addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
+    addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
+    addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
 
-    setOperationAction(ISD::SELECT,             MVT::v1i1, Custom);
+    setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
     setOperationAction(ISD::CTSELECT, MVT::v1i1, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
-    setOperationAction(ISD::BUILD_VECTOR,       MVT::v1i1, Custom);
-
-    setOperationPromotedToType(ISD::FP_TO_SINT,        MVT::v8i1,  MVT::v8i32);
-    setOperationPromotedToType(ISD::FP_TO_UINT,        MVT::v8i1,  MVT::v8i32);
-    setOperationPromotedToType(ISD::FP_TO_SINT,        MVT::v4i1,  MVT::v4i32);
-    setOperationPromotedToType(ISD::FP_TO_UINT,        MVT::v4i1,  MVT::v4i32);
-    setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1,  MVT::v8i32);
-    setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1,  MVT::v8i32);
-    setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1,  MVT::v4i32);
-    setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1,  MVT::v4i32);
-    setOperationAction(ISD::FP_TO_SINT,                MVT::v2i1,  Custom);
-    setOperationAction(ISD::FP_TO_UINT,                MVT::v2i1,  Custom);
-    setOperationAction(ISD::STRICT_FP_TO_SINT,         MVT::v2i1,  Custom);
-    setOperationAction(ISD::STRICT_FP_TO_UINT,         MVT::v2i1,  Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
+
+    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
+    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
+    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
+    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
+    setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
+    setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
+    setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
+    setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
+    setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
+    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
+    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);
     setOperationAction(ISD::FCANONICALIZE, MVT::v8f16, Custom);
     setOperationAction(ISD::FCANONICALIZE, MVT::v16f16, Custom);
     setOperationAction(ISD::FCANONICALIZE, MVT::v32f16, Custom);
@@ -1785,30 +1786,30 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     }
 
     // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
-    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
+    for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
       setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
       setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
-      setOperationAction(ISD::ANY_EXTEND,  VT, Custom);
+      setOperationAction(ISD::ANY_EXTEND, VT, Custom);
     }
 
-    for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
-      setOperationAction(ISD::VSELECT,          VT, Expand);
+    for (auto VT : {MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1})
+      setOperationAction(ISD::VSELECT, VT, Expand);
 
-    for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
-      setOperationAction(ISD::SETCC,            VT, Custom);
-      setOperationAction(ISD::SELECT,           VT, Custom);
+    for (auto VT : {MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1}) {
+      setOperationAction(ISD::SETCC, VT, Custom);
+      setOperationAction(ISD::SELECT, VT, Custom);
       setOperationAction(ISD::CTSELECT, VT, Custom);
-      setOperationAction(ISD::TRUNCATE,         VT, Custom);
+      setOperationAction(ISD::TRUNCATE, VT, Custom);
 
-      setOperationAction(ISD::BUILD_VECTOR,     VT, Custom);
-      setOperationAction(ISD::CONCAT_VECTORS,   VT, Custom);
+      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
       setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
       setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
-      setOperationAction(ISD::VECTOR_SHUFFLE,   VT,  Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
     }
 
-    for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
+    for (auto VT : {MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1})
       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
   }
   if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
@@ -1826,30 +1827,30 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 
     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
-    addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
-    addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
+    addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
+    addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
     addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
     addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
-    addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
+    addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
 
     for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
-      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8,  Legal);
+      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
       setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
-      setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i8,   Legal);
-      setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i16,  Legal);
-      setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i32,  Legal);
+      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
+      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
+      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
       if (HasBWI)
         setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
     }
 
-    for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
+    for (MVT VT : {MVT::v16f32, MVT::v8f64}) {
       setOperationAction(ISD::FMAXIMUM, VT, Custom);
       setOperationAction(ISD::FMINIMUM, VT, Custom);
       setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
       setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
-      setOperationAction(ISD::FNEG,  VT, Custom);
-      setOperationAction(ISD::FABS,  VT, Custom);
-      setOperationAction(ISD::FMA,   VT, Legal);
+      setOperationAction(ISD::FNEG, VT, Custom);
+      setOperationAction(ISD::FABS, VT, Custom);
+      setOperationAction(ISD::FMA, VT, Legal);
       setOperationAction(ISD::STRICT_FMA, VT, Legal);
       setOperationAction(ISD::FCOPYSIGN, VT, Custom);
       setOperationAction(ISD::FCANONICALIZE, VT, Custom);
@@ -1861,93 +1862,93 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     if (Subtarget.hasDQI())
       setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
 
-    for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
-      setOperationPromotedToType(ISD::FP_TO_SINT       , VT, MVT::v16i32);
-      setOperationPromotedToType(ISD::FP_TO_UINT       , VT, MVT::v16i32);
+    for (MVT VT : {MVT::v16i1, MVT::v16i8}) {
+      setOperationPromotedToType(ISD::FP_TO_SINT, VT, MVT::v16i32);
+      setOperationPromotedToType(ISD::FP_TO_UINT, VT, MVT::v16i32);
       setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
       setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
     }
 
-    for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
-      setOperationAction(ISD::FP_TO_SINT,        VT, Custom);
-      setOperationAction(ISD::FP_TO_UINT,        VT, Custom);
+    for (MVT VT : {MVT::v16i16, MVT::v16i32}) {
+      setOperationAction(ISD::FP_TO_SINT, VT, Custom);
+      setOperationAction(ISD::FP_TO_UINT, VT, Custom);
       setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
       setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
     }
 
-    setOperationAction(ISD::SINT_TO_FP,        MVT::v16i32, Custom);
-    setOperationAction(ISD::UINT_TO_FP,        MVT::v16i32, Custom);
+    setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Custom);
     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Custom);
     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Custom);
-    setOperationAction(ISD::FP_EXTEND,         MVT::v8f64,  Custom);
-    setOperationAction(ISD::STRICT_FP_EXTEND,  MVT::v8f64,  Custom);
-
-    setOperationAction(ISD::STRICT_FADD,      MVT::v16f32, Legal);
-    setOperationAction(ISD::STRICT_FADD,      MVT::v8f64,  Legal);
-    setOperationAction(ISD::STRICT_FSUB,      MVT::v16f32, Legal);
-    setOperationAction(ISD::STRICT_FSUB,      MVT::v8f64,  Legal);
-    setOperationAction(ISD::STRICT_FMUL,      MVT::v16f32, Legal);
-    setOperationAction(ISD::STRICT_FMUL,      MVT::v8f64,  Legal);
-    setOperationAction(ISD::STRICT_FDIV,      MVT::v16f32, Legal);
-    setOperationAction(ISD::STRICT_FDIV,      MVT::v8f64,  Legal);
-    setOperationAction(ISD::STRICT_FSQRT,     MVT::v16f32, Legal);
-    setOperationAction(ISD::STRICT_FSQRT,     MVT::v8f64,  Legal);
-    setOperationAction(ISD::STRICT_FP_ROUND,  MVT::v8f32,  Legal);
-
-    setTruncStoreAction(MVT::v8i64,   MVT::v8i8,   Legal);
-    setTruncStoreAction(MVT::v8i64,   MVT::v8i16,  Legal);
-    setTruncStoreAction(MVT::v8i64,   MVT::v8i32,  Legal);
-    setTruncStoreAction(MVT::v16i32,  MVT::v16i8,  Legal);
-    setTruncStoreAction(MVT::v16i32,  MVT::v16i16, Legal);
+    setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
+    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Custom);
+
+    setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
+    setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
+    setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal);
+    setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal);
+    setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal);
+    setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal);
+    setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal);
+    setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
+    setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
+    setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
+    setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
+
+    setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
+    setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
+    setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
+    setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
+    setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
     if (HasBWI)
-      setTruncStoreAction(MVT::v32i16,  MVT::v32i8, Legal);
+      setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
 
     // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
     // to 512-bit rather than use the AVX2 instructions so that we can use
     // k-masks.
     if (!Subtarget.hasVLX()) {
       for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
-           MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
-        setOperationAction(ISD::MLOAD,  VT, Custom);
+                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
+        setOperationAction(ISD::MLOAD, VT, Custom);
         setOperationAction(ISD::MSTORE, VT, Custom);
       }
     }
 
-    setOperationAction(ISD::TRUNCATE,    MVT::v8i32,  Legal);
-    setOperationAction(ISD::TRUNCATE,    MVT::v16i16, Legal);
-    setOperationAction(ISD::TRUNCATE,    MVT::v32i8,  HasBWI ? Legal : Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal);
+    setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
+    setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
     setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
     setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
-    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64,  Custom);
-    setOperationAction(ISD::ANY_EXTEND,  MVT::v32i16, Custom);
-    setOperationAction(ISD::ANY_EXTEND,  MVT::v16i32, Custom);
-    setOperationAction(ISD::ANY_EXTEND,  MVT::v8i64,  Custom);
+    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
+    setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
+    setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
+    setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
     setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
     setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
-    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64,  Custom);
+    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
 
     if (HasBWI) {
       // Extends from v64i1 masks to 512-bit vectors.
-      setOperationAction(ISD::SIGN_EXTEND,        MVT::v64i8, Custom);
-      setOperationAction(ISD::ZERO_EXTEND,        MVT::v64i8, Custom);
-      setOperationAction(ISD::ANY_EXTEND,         MVT::v64i8, Custom);
-    }
-
-    for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
-      setOperationAction(ISD::FFLOOR,            VT, Legal);
-      setOperationAction(ISD::STRICT_FFLOOR,     VT, Legal);
-      setOperationAction(ISD::FCEIL,             VT, Legal);
-      setOperationAction(ISD::STRICT_FCEIL,      VT, Legal);
-      setOperationAction(ISD::FTRUNC,            VT, Legal);
-      setOperationAction(ISD::STRICT_FTRUNC,     VT, Legal);
-      setOperationAction(ISD::FRINT,             VT, Legal);
-      setOperationAction(ISD::STRICT_FRINT,      VT, Legal);
-      setOperationAction(ISD::FNEARBYINT,        VT, Legal);
+      setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
+      setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
+      setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
+    }
+
+    for (auto VT : {MVT::v16f32, MVT::v8f64}) {
+      setOperationAction(ISD::FFLOOR, VT, Legal);
+      setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
+      setOperationAction(ISD::FCEIL, VT, Legal);
+      setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
+      setOperationAction(ISD::FTRUNC, VT, Legal);
+      setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
+      setOperationAction(ISD::FRINT, VT, Legal);
+      setOperationAction(ISD::STRICT_FRINT, VT, Legal);
+      setOperationAction(ISD::FNEARBYINT, VT, Legal);
       setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
-      setOperationAction(ISD::FROUNDEVEN,        VT, Legal);
+      setOperationAction(ISD::FROUNDEVEN, VT, Legal);
       setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
 
-      setOperationAction(ISD::FROUND,            VT, Custom);
+      setOperationAction(ISD::FROUND, VT, Custom);
     }
 
     for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
@@ -1957,36 +1958,36 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 
     setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
     setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
-    setOperationAction(ISD::ADD, MVT::v64i8,  HasBWI ? Legal : Custom);
-    setOperationAction(ISD::SUB, MVT::v64i8,  HasBWI ? Legal : Custom);
+    setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
+    setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
 
-    setOperationAction(ISD::MUL, MVT::v8i64,  Custom);
+    setOperationAction(ISD::MUL, MVT::v8i64, Custom);
     setOperationAction(ISD::MUL, MVT::v16i32, Legal);
     setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
-    setOperationAction(ISD::MUL, MVT::v64i8,  Custom);
+    setOperationAction(ISD::MUL, MVT::v64i8, Custom);
 
     setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
     setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
     setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
     setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
-    setOperationAction(ISD::MULHS, MVT::v64i8,  Custom);
-    setOperationAction(ISD::MULHU, MVT::v64i8,  Custom);
+    setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
+    setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
     setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
-    setOperationAction(ISD::AVGCEILU, MVT::v64i8,  HasBWI ? Legal : Custom);
+    setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
 
     setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
     setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
 
-    for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
-      setOperationAction(ISD::SRL,              VT, Custom);
-      setOperationAction(ISD::SHL,              VT, Custom);
-      setOperationAction(ISD::SRA,              VT, Custom);
-      setOperationAction(ISD::ROTL,             VT, Custom);
-      setOperationAction(ISD::ROTR,             VT, Custom);
-      setOperationAction(ISD::SETCC,            VT, Custom);
-      setOperationAction(ISD::ABDS,             VT, Custom);
-      setOperationAction(ISD::ABDU,             VT, Custom);
-      setOperationAction(ISD::BITREVERSE,       VT, Custom);
+    for (auto VT : {MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
+      setOperationAction(ISD::SRL, VT, Custom);
+      setOperationAction(ISD::SHL, VT, Custom);
+      setOperationAction(ISD::SRA, VT, Custom);
+      setOperationAction(ISD::ROTL, VT, Custom);
+      setOperationAction(ISD::ROTR, VT, Custom);
+      setOperationAction(ISD::SETCC, VT, Custom);
+      setOperationAction(ISD::ABDS, VT, Custom);
+      setOperationAction(ISD::ABDU, VT, Custom);
+      setOperationAction(ISD::BITREVERSE, VT, Custom);
 
       // The condition codes aren't legal in SSE/AVX and under AVX512 we use
       // setcc all the way to isel and prefer SETGT in some isel patterns.
@@ -1994,83 +1995,84 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setCondCodeAction(ISD::SETLE, VT, Custom);
     }
 
-    setOperationAction(ISD::SETCC,          MVT::v8f64, Custom);
-    setOperationAction(ISD::SETCC,          MVT::v16f32, Custom);
-    setOperationAction(ISD::STRICT_FSETCC,  MVT::v8f64, Custom);
-    setOperationAction(ISD::STRICT_FSETCC,  MVT::v16f32, Custom);
+    setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
+    setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
+    setOperationAction(ISD::STRICT_FSETCC, MVT::v8f64, Custom);
+    setOperationAction(ISD::STRICT_FSETCC, MVT::v16f32, Custom);
     setOperationAction(ISD::STRICT_FSETCCS, MVT::v8f64, Custom);
     setOperationAction(ISD::STRICT_FSETCCS, MVT::v16f32, Custom);
 
-    for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
-      setOperationAction(ISD::SMAX,             VT, Legal);
-      setOperationAction(ISD::UMAX,             VT, Legal);
-      setOperationAction(ISD::SMIN,             VT, Legal);
-      setOperationAction(ISD::UMIN,             VT, Legal);
-      setOperationAction(ISD::ABS,              VT, Legal);
-      setOperationAction(ISD::CTPOP,            VT, Custom);
-    }
-
-    for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
-      setOperationAction(ISD::ABS,     VT, HasBWI ? Legal : Custom);
-      setOperationAction(ISD::CTPOP,   VT, Subtarget.hasBITALG() ? Legal : Custom);
-      setOperationAction(ISD::CTLZ,    VT, Custom);
-      setOperationAction(ISD::SMAX,    VT, HasBWI ? Legal : Custom);
-      setOperationAction(ISD::UMAX,    VT, HasBWI ? Legal : Custom);
-      setOperationAction(ISD::SMIN,    VT, HasBWI ? Legal : Custom);
-      setOperationAction(ISD::UMIN,    VT, HasBWI ? Legal : Custom);
+    for (auto VT : {MVT::v16i32, MVT::v8i64}) {
+      setOperationAction(ISD::SMAX, VT, Legal);
+      setOperationAction(ISD::UMAX, VT, Legal);
+      setOperationAction(ISD::SMIN, VT, Legal);
+      setOperationAction(ISD::UMIN, VT, Legal);
+      setOperationAction(ISD::ABS, VT, Legal);
+      setOperationAction(ISD::CTPOP, VT, Custom);
+    }
+
+    for (auto VT : {MVT::v64i8, MVT::v32i16}) {
+      setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
+      setOperationAction(ISD::CTPOP, VT,
+                         Subtarget.hasBITALG() ? Legal : Custom);
+      setOperationAction(ISD::CTLZ, VT, Custom);
+      setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
+      setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
+      setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
+      setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
       setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
       setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
       setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
       setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
     }
 
-    setOperationAction(ISD::FSHL,       MVT::v64i8, Custom);
-    setOperationAction(ISD::FSHR,       MVT::v64i8, Custom);
-    setOperationAction(ISD::FSHL,      MVT::v32i16, Custom);
-    setOperationAction(ISD::FSHR,      MVT::v32i16, Custom);
-    setOperationAction(ISD::FSHL,      MVT::v16i32, Custom);
-    setOperationAction(ISD::FSHR,      MVT::v16i32, Custom);
+    setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
+    setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
+    setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
+    setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
+    setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
+    setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
 
     if (Subtarget.hasDQI() || Subtarget.hasFP16())
       for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
                        ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
                        ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT})
-        setOperationAction(Opc,           MVT::v8i64, Custom);
+        setOperationAction(Opc, MVT::v8i64, Custom);
 
     if (Subtarget.hasDQI())
-      setOperationAction(ISD::MUL,        MVT::v8i64, Legal);
+      setOperationAction(ISD::MUL, MVT::v8i64, Legal);
 
     if (Subtarget.hasCDI()) {
       // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
-      for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
-        setOperationAction(ISD::CTLZ,            VT, Legal);
+      for (auto VT : {MVT::v16i32, MVT::v8i64}) {
+        setOperationAction(ISD::CTLZ, VT, Legal);
       }
     } // Subtarget.hasCDI()
 
     if (Subtarget.hasVPOPCNTDQ()) {
-      for (auto VT : { MVT::v16i32, MVT::v8i64 })
+      for (auto VT : {MVT::v16i32, MVT::v8i64})
         setOperationAction(ISD::CTPOP, VT, Legal);
     }
 
     // Extract subvector is special because the value type
     // (result) is 256-bit but the source is 512-bit wide.
     // 128-bit was made Legal under AVX1.
-    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
-                     MVT::v16f16, MVT::v8f32, MVT::v4f64 })
+    for (auto VT : {MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
+                    MVT::v16f16, MVT::v8f32, MVT::v4f64})
       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
 
-    for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
-                     MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
-      setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
-      setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Legal);
-      setOperationAction(ISD::SELECT,             VT, Custom);
+    for (auto VT : {MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
+                    MVT::v32f16, MVT::v16f32, MVT::v8f64}) {
+      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
+      setOperationAction(ISD::SELECT, VT, Custom);
       setOperationAction(ISD::CTSELECT, VT, Custom);
-      setOperationAction(ISD::VSELECT,            VT, Custom);
-      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
+      setOperationAction(ISD::VSELECT, VT, Custom);
+      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
-      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
-      setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
-      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
+      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
     }
     setF16Action(MVT::v32f16, Expand);
     setOperationAction(ISD::FP_ROUND, MVT::v16f16, Custom);
@@ -2081,20 +2083,20 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
     setOperationAction(ISD::SETCC, MVT::v32f16, Custom);
 
-    for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
-      setOperationAction(ISD::MLOAD,               VT, Legal);
-      setOperationAction(ISD::MSTORE,              VT, Legal);
-      setOperationAction(ISD::MGATHER,             VT, Custom);
-      setOperationAction(ISD::MSCATTER,            VT, Custom);
+    for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64}) {
+      setOperationAction(ISD::MLOAD, VT, Legal);
+      setOperationAction(ISD::MSTORE, VT, Legal);
+      setOperationAction(ISD::MGATHER, VT, Custom);
+      setOperationAction(ISD::MSCATTER, VT, Custom);
     }
     if (HasBWI) {
-      for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
-        setOperationAction(ISD::MLOAD,        VT, Legal);
-        setOperationAction(ISD::MSTORE,       VT, Legal);
+      for (auto VT : {MVT::v64i8, MVT::v32i16}) {
+        setOperationAction(ISD::MLOAD, VT, Legal);
+        setOperationAction(ISD::MSTORE, VT, Legal);
       }
     } else {
       setOperationAction(ISD::STORE, MVT::v32i16, Custom);
-      setOperationAction(ISD::STORE, MVT::v64i8,  Custom);
+      setOperationAction(ISD::STORE, MVT::v64i8, Custom);
     }
 
     if (Subtarget.hasVBMI2()) {
@@ -2110,7 +2112,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
     setOperationAction(ISD::FABS, MVT::v32f16, Custom);
     setOperationAction(ISD::FCOPYSIGN, MVT::v32f16, Custom);
-  }// useAVX512Regs
+  } // useAVX512Regs
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
     for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
@@ -2127,9 +2129,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     // These operations are handled on non-VLX by artificially widening in
     // isel patterns.
 
-    setOperationAction(ISD::STRICT_FP_TO_UINT,  MVT::v8i32, Custom);
-    setOperationAction(ISD::STRICT_FP_TO_UINT,  MVT::v4i32, Custom);
-    setOperationAction(ISD::STRICT_FP_TO_UINT,  MVT::v2i32, Custom);
+    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, Custom);
+    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Custom);
+    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
 
     if (Subtarget.hasDQI()) {
       // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
@@ -2138,31 +2140,31 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
              isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
              "Unexpected operation action!");
       // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
-      setOperationAction(ISD::FP_TO_SINT,        MVT::v2f32, Custom);
-      setOperationAction(ISD::FP_TO_UINT,        MVT::v2f32, Custom);
+      setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
+      setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
       setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
       setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
     }
 
-    for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
+    for (auto VT : {MVT::v2i64, MVT::v4i64}) {
       setOperationAction(ISD::SMAX, VT, Legal);
       setOperationAction(ISD::UMAX, VT, Legal);
       setOperationAction(ISD::SMIN, VT, Legal);
       setOperationAction(ISD::UMIN, VT, Legal);
-      setOperationAction(ISD::ABS,  VT, Legal);
+      setOperationAction(ISD::ABS, VT, Legal);
     }
 
-    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
-      setOperationAction(ISD::ROTL,     VT, Custom);
-      setOperationAction(ISD::ROTR,     VT, Custom);
+    for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64}) {
+      setOperationAction(ISD::ROTL, VT, Custom);
+      setOperationAction(ISD::ROTR, VT, Custom);
     }
 
     // Custom legalize 2x32 to get a little better code.
     setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
     setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
 
-    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
-                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
+    for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32,
+                    MVT::v8f32, MVT::v2f64, MVT::v4f64})
       setOperationAction(ISD::MSCATTER, VT, Custom);
 
     if (Subtarget.hasDQI()) {
@@ -2177,13 +2179,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     }
 
     if (Subtarget.hasCDI()) {
-      for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
-        setOperationAction(ISD::CTLZ,            VT, Legal);
+      for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64}) {
+        setOperationAction(ISD::CTLZ, VT, Legal);
       }
     } // Subtarget.hasCDI()
 
     if (Subtarget.hasVPOPCNTDQ()) {
-      for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
+      for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64})
         setOperationAction(ISD::CTPOP, VT, Legal);
     }
 
@@ -2220,34 +2222,34 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
  // This block controls legalization of v32i1/v64i1, which are available with
  // AVX512BW.
   if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
-    addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
-    addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
+    addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
+    addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
 
-    for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
-      setOperationAction(ISD::VSELECT,            VT, Expand);
-      setOperationAction(ISD::TRUNCATE,           VT, Custom);
-      setOperationAction(ISD::SETCC,              VT, Custom);
+    for (auto VT : {MVT::v32i1, MVT::v64i1}) {
+      setOperationAction(ISD::VSELECT, VT, Expand);
+      setOperationAction(ISD::TRUNCATE, VT, Custom);
+      setOperationAction(ISD::SETCC, VT, Custom);
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
-      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
-      setOperationAction(ISD::SELECT,             VT, Custom);
+      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+      setOperationAction(ISD::SELECT, VT, Custom);
       setOperationAction(ISD::CTSELECT, VT, Custom);
-      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
-      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
-      setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
-      setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
+      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
     }
 
-    for (auto VT : { MVT::v16i1, MVT::v32i1 })
+    for (auto VT : {MVT::v16i1, MVT::v32i1})
       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
 
     // Extends from v32i1 masks to 256-bit vectors.
-    setOperationAction(ISD::SIGN_EXTEND,        MVT::v32i8, Custom);
-    setOperationAction(ISD::ZERO_EXTEND,        MVT::v32i8, Custom);
-    setOperationAction(ISD::ANY_EXTEND,         MVT::v32i8, Custom);
+    setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
+    setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
+    setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
 
     for (auto VT : {MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16,
                     MVT::v16f16, MVT::v8f16}) {
-      setOperationAction(ISD::MLOAD,  VT, Subtarget.hasVLX() ? Legal : Custom);
+      setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
       setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
     }
 
@@ -2256,120 +2258,120 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
 
     if (Subtarget.hasBITALG()) {
-      for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
+      for (auto VT : {MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16})
         setOperationAction(ISD::CTPOP, VT, Legal);
     }
   }
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
-    auto setGroup = [&] (MVT VT) {
-      setOperationAction(ISD::FADD,               VT, Legal);
-      setOperationAction(ISD::STRICT_FADD,        VT, Legal);
-      setOperationAction(ISD::FSUB,               VT, Legal);
-      setOperationAction(ISD::STRICT_FSUB,        VT, Legal);
-      setOperationAction(ISD::FMUL,               VT, Legal);
-      setOperationAction(ISD::STRICT_FMUL,        VT, Legal);
-      setOperationAction(ISD::FDIV,               VT, Legal);
-      setOperationAction(ISD::STRICT_FDIV,        VT, Legal);
-      setOperationAction(ISD::FSQRT,              VT, Legal);
-      setOperationAction(ISD::STRICT_FSQRT,       VT, Legal);
-
-      setOperationAction(ISD::FFLOOR,             VT, Legal);
-      setOperationAction(ISD::STRICT_FFLOOR,      VT, Legal);
-      setOperationAction(ISD::FCEIL,              VT, Legal);
-      setOperationAction(ISD::STRICT_FCEIL,       VT, Legal);
-      setOperationAction(ISD::FTRUNC,             VT, Legal);
-      setOperationAction(ISD::STRICT_FTRUNC,      VT, Legal);
-      setOperationAction(ISD::FRINT,              VT, Legal);
-      setOperationAction(ISD::STRICT_FRINT,       VT, Legal);
-      setOperationAction(ISD::FNEARBYINT,         VT, Legal);
-      setOperationAction(ISD::STRICT_FNEARBYINT,  VT, Legal);
+    auto setGroup = [&](MVT VT) {
+      setOperationAction(ISD::FADD, VT, Legal);
+      setOperationAction(ISD::STRICT_FADD, VT, Legal);
+      setOperationAction(ISD::FSUB, VT, Legal);
+      setOperationAction(ISD::STRICT_FSUB, VT, Legal);
+      setOperationAction(ISD::FMUL, VT, Legal);
+      setOperationAction(ISD::STRICT_FMUL, VT, Legal);
+      setOperationAction(ISD::FDIV, VT, Legal);
+      setOperationAction(ISD::STRICT_FDIV, VT, Legal);
+      setOperationAction(ISD::FSQRT, VT, Legal);
+      setOperationAction(ISD::STRICT_FSQRT, VT, Legal);
+
+      setOperationAction(ISD::FFLOOR, VT, Legal);
+      setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
+      setOperationAction(ISD::FCEIL, VT, Legal);
+      setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
+      setOperationAction(ISD::FTRUNC, VT, Legal);
+      setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
+      setOperationAction(ISD::FRINT, VT, Legal);
+      setOperationAction(ISD::STRICT_FRINT, VT, Legal);
+      setOperationAction(ISD::FNEARBYINT, VT, Legal);
+      setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
       setOperationAction(ISD::FROUNDEVEN, VT, Legal);
       setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
 
-      setOperationAction(ISD::FROUND,             VT, Custom);
+      setOperationAction(ISD::FROUND, VT, Custom);
 
-      setOperationAction(ISD::LOAD,               VT, Legal);
-      setOperationAction(ISD::STORE,              VT, Legal);
+      setOperationAction(ISD::LOAD, VT, Legal);
+      setOperationAction(ISD::STORE, VT, Legal);
 
-      setOperationAction(ISD::FMA,                VT, Legal);
-      setOperationAction(ISD::STRICT_FMA,         VT, Legal);
-      setOperationAction(ISD::VSELECT,            VT, Legal);
-      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
-      setOperationAction(ISD::SELECT,             VT, Custom);
+      setOperationAction(ISD::FMA, VT, Legal);
+      setOperationAction(ISD::STRICT_FMA, VT, Legal);
+      setOperationAction(ISD::VSELECT, VT, Legal);
+      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+      setOperationAction(ISD::SELECT, VT, Custom);
       setOperationAction(ISD::CTSELECT, VT, Custom);
 
-      setOperationAction(ISD::FNEG,               VT, Custom);
-      setOperationAction(ISD::FABS,               VT, Custom);
-      setOperationAction(ISD::FCOPYSIGN,          VT, Custom);
+      setOperationAction(ISD::FNEG, VT, Custom);
+      setOperationAction(ISD::FABS, VT, Custom);
+      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
-      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
 
-      setOperationAction(ISD::SETCC,              VT, Custom);
-      setOperationAction(ISD::STRICT_FSETCC,      VT, Custom);
-      setOperationAction(ISD::STRICT_FSETCCS,     VT, Custom);
+      setOperationAction(ISD::SETCC, VT, Custom);
+      setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
+      setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
     };
 
     // AVX512_FP16 scalar operations
     setGroup(MVT::f16);
-    setOperationAction(ISD::FREM,                 MVT::f16, Promote);
-    setOperationAction(ISD::STRICT_FREM,          MVT::f16, Promote);
-    setOperationAction(ISD::SELECT_CC,            MVT::f16, Expand);
-    setOperationAction(ISD::BR_CC,                MVT::f16, Expand);
-    setOperationAction(ISD::STRICT_FROUND,        MVT::f16, Promote);
-    setOperationAction(ISD::FROUNDEVEN,           MVT::f16, Legal);
-    setOperationAction(ISD::STRICT_FROUNDEVEN,    MVT::f16, Legal);
-    setOperationAction(ISD::FP_ROUND,             MVT::f16, Custom);
-    setOperationAction(ISD::STRICT_FP_ROUND,      MVT::f16, Custom);
-    setOperationAction(ISD::FMAXIMUM,             MVT::f16, Custom);
-    setOperationAction(ISD::FMINIMUM,             MVT::f16, Custom);
-    setOperationAction(ISD::FMAXIMUMNUM,          MVT::f16, Custom);
-    setOperationAction(ISD::FMINIMUMNUM,          MVT::f16, Custom);
-    setOperationAction(ISD::FP_EXTEND,            MVT::f32, Legal);
-    setOperationAction(ISD::STRICT_FP_EXTEND,     MVT::f32, Legal);
-    setOperationAction(ISD::LRINT,                MVT::f16, Legal);
-    setOperationAction(ISD::LLRINT,               MVT::f16, Legal);
+    setOperationAction(ISD::FREM, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FREM, MVT::f16, Promote);
+    setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
+    setOperationAction(ISD::BR_CC, MVT::f16, Expand);
+    setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote);
+    setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
+    setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Legal);
+    setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
+    setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
+    setOperationAction(ISD::FMAXIMUM, MVT::f16, Custom);
+    setOperationAction(ISD::FMINIMUM, MVT::f16, Custom);
+    setOperationAction(ISD::FMAXIMUMNUM, MVT::f16, Custom);
+    setOperationAction(ISD::FMINIMUMNUM, MVT::f16, Custom);
+    setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
+    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
+    setOperationAction(ISD::LRINT, MVT::f16, Legal);
+    setOperationAction(ISD::LLRINT, MVT::f16, Legal);
 
     setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand);
     setCondCodeAction(ISD::SETUNE, MVT::f16, Expand);
 
     if (Subtarget.useAVX512Regs()) {
       setGroup(MVT::v32f16);
-      setOperationAction(ISD::SCALAR_TO_VECTOR,       MVT::v32f16, Custom);
-      setOperationAction(ISD::SINT_TO_FP,             MVT::v32i16, Legal);
-      setOperationAction(ISD::STRICT_SINT_TO_FP,      MVT::v32i16, Legal);
-      setOperationAction(ISD::UINT_TO_FP,             MVT::v32i16, Legal);
-      setOperationAction(ISD::STRICT_UINT_TO_FP,      MVT::v32i16, Legal);
-      setOperationAction(ISD::FP_ROUND,               MVT::v16f16, Legal);
-      setOperationAction(ISD::STRICT_FP_ROUND,        MVT::v16f16, Legal);
-      setOperationAction(ISD::FP_EXTEND,              MVT::v16f32, Custom);
-      setOperationAction(ISD::STRICT_FP_EXTEND,       MVT::v16f32, Legal);
-      setOperationAction(ISD::FP_EXTEND,              MVT::v8f64,  Custom);
-      setOperationAction(ISD::STRICT_FP_EXTEND,       MVT::v8f64,  Legal);
-      setOperationAction(ISD::INSERT_VECTOR_ELT,      MVT::v32f16, Custom);
-
-      setOperationAction(ISD::FP_TO_SINT,             MVT::v32i16, Custom);
-      setOperationAction(ISD::STRICT_FP_TO_SINT,      MVT::v32i16, Custom);
-      setOperationAction(ISD::FP_TO_UINT,             MVT::v32i16, Custom);
-      setOperationAction(ISD::STRICT_FP_TO_UINT,      MVT::v32i16, Custom);
-      setOperationPromotedToType(ISD::FP_TO_SINT,     MVT::v32i8,  MVT::v32i16);
+      setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32f16, Custom);
+      setOperationAction(ISD::SINT_TO_FP, MVT::v32i16, Legal);
+      setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v32i16, Legal);
+      setOperationAction(ISD::UINT_TO_FP, MVT::v32i16, Legal);
+      setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v32i16, Legal);
+      setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
+      setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Legal);
+      setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
+      setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
+      setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
+      setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
+      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom);
+
+      setOperationAction(ISD::FP_TO_SINT, MVT::v32i16, Custom);
+      setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v32i16, Custom);
+      setOperationAction(ISD::FP_TO_UINT, MVT::v32i16, Custom);
+      setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v32i16, Custom);
+      setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
       setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i8,
                                  MVT::v32i16);
-      setOperationPromotedToType(ISD::FP_TO_UINT,     MVT::v32i8,  MVT::v32i16);
+      setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
       setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i8,
                                  MVT::v32i16);
-      setOperationPromotedToType(ISD::FP_TO_SINT,     MVT::v32i1,  MVT::v32i16);
+      setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
       setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i1,
                                  MVT::v32i16);
-      setOperationPromotedToType(ISD::FP_TO_UINT,     MVT::v32i1,  MVT::v32i16);
+      setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
       setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i1,
                                  MVT::v32i16);
 
-      setOperationAction(ISD::EXTRACT_SUBVECTOR,      MVT::v16f16, Legal);
-      setOperationAction(ISD::INSERT_SUBVECTOR,       MVT::v32f16, Legal);
-      setOperationAction(ISD::CONCAT_VECTORS,         MVT::v32f16, Custom);
+      setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f16, Legal);
+      setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32f16, Legal);
+      setOperationAction(ISD::CONCAT_VECTORS, MVT::v32f16, Custom);
 
-      setLoadExtAction(ISD::EXTLOAD, MVT::v8f64,  MVT::v8f16,  Legal);
+      setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
       setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
 
       setOperationAction(ISD::FMINIMUM, MVT::v32f16, Custom);
@@ -2380,40 +2382,40 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::LLRINT, MVT::v8f16, Legal);
     }
 
-    setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Custom);
-    setOperationAction(ISD::STRICT_FP_TO_SINT,  MVT::v8i16, Custom);
-    setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Custom);
-    setOperationAction(ISD::STRICT_FP_TO_UINT,  MVT::v8i16, Custom);
+    setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
+    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
+    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom);
 
     if (Subtarget.hasVLX()) {
       setGroup(MVT::v8f16);
       setGroup(MVT::v16f16);
 
-      setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8f16,  Legal);
-      setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16f16, Custom);
-      setOperationAction(ISD::SINT_TO_FP,         MVT::v16i16, Legal);
-      setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v16i16, Legal);
-      setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16,  Legal);
-      setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v8i16,  Legal);
-      setOperationAction(ISD::UINT_TO_FP,         MVT::v16i16, Legal);
-      setOperationAction(ISD::STRICT_UINT_TO_FP,  MVT::v16i16, Legal);
-      setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16,  Legal);
-      setOperationAction(ISD::STRICT_UINT_TO_FP,  MVT::v8i16,  Legal);
-
-      setOperationAction(ISD::FP_ROUND,           MVT::v8f16, Legal);
-      setOperationAction(ISD::STRICT_FP_ROUND,    MVT::v8f16, Legal);
-      setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Custom);
-      setOperationAction(ISD::STRICT_FP_EXTEND,   MVT::v8f32, Legal);
-      setOperationAction(ISD::FP_EXTEND,          MVT::v4f64, Custom);
-      setOperationAction(ISD::STRICT_FP_EXTEND,   MVT::v4f64, Legal);
+      setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8f16, Legal);
+      setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16f16, Custom);
+      setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Legal);
+      setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i16, Legal);
+      setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Legal);
+      setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i16, Legal);
+      setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Legal);
+      setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i16, Legal);
+      setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Legal);
+      setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i16, Legal);
+
+      setOperationAction(ISD::FP_ROUND, MVT::v8f16, Legal);
+      setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal);
+      setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Custom);
+      setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
+      setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
+      setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
 
       // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
-      setOperationAction(ISD::INSERT_VECTOR_ELT,    MVT::v8f16,  Custom);
-      setOperationAction(ISD::INSERT_VECTOR_ELT,    MVT::v16f16, Custom);
+      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f16, Custom);
+      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16f16, Custom);
 
-      setOperationAction(ISD::EXTRACT_SUBVECTOR,    MVT::v8f16, Legal);
-      setOperationAction(ISD::INSERT_SUBVECTOR,     MVT::v16f16, Legal);
-      setOperationAction(ISD::CONCAT_VECTORS,       MVT::v16f16, Custom);
+      setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f16, Legal);
+      setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16f16, Legal);
+      setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f16, Custom);
 
       setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
       setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
@@ -2421,7 +2423,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
 
       // Need to custom widen these to prevent scalarization.
-      setOperationAction(ISD::LOAD,  MVT::v4f16, Custom);
+      setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
       setOperationAction(ISD::STORE, MVT::v4f16, Custom);
 
       setOperationAction(ISD::FMINIMUM, MVT::v8f16, Custom);
@@ -2514,52 +2516,52 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   }
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
-    setTruncStoreAction(MVT::v4i64, MVT::v4i8,  Legal);
+    setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
     setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
     setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
-    setTruncStoreAction(MVT::v8i32, MVT::v8i8,  Legal);
+    setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
     setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
 
-    setTruncStoreAction(MVT::v2i64, MVT::v2i8,  Legal);
+    setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
     setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
     setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
-    setTruncStoreAction(MVT::v4i32, MVT::v4i8,  Legal);
+    setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
     setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
 
     if (Subtarget.hasBWI()) {
-      setTruncStoreAction(MVT::v16i16,  MVT::v16i8, Legal);
-      setTruncStoreAction(MVT::v8i16,   MVT::v8i8,  Legal);
+      setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
+      setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
     }
 
     if (Subtarget.hasFP16()) {
       // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
-      setOperationAction(ISD::FP_TO_SINT,        MVT::v2f16, Custom);
+      setOperationAction(ISD::FP_TO_SINT, MVT::v2f16, Custom);
       setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f16, Custom);
-      setOperationAction(ISD::FP_TO_UINT,        MVT::v2f16, Custom);
+      setOperationAction(ISD::FP_TO_UINT, MVT::v2f16, Custom);
       setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f16, Custom);
-      setOperationAction(ISD::FP_TO_SINT,        MVT::v4f16, Custom);
+      setOperationAction(ISD::FP_TO_SINT, MVT::v4f16, Custom);
       setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4f16, Custom);
-      setOperationAction(ISD::FP_TO_UINT,        MVT::v4f16, Custom);
+      setOperationAction(ISD::FP_TO_UINT, MVT::v4f16, Custom);
       setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4f16, Custom);
       // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
-      setOperationAction(ISD::SINT_TO_FP,        MVT::v2f16, Custom);
+      setOperationAction(ISD::SINT_TO_FP, MVT::v2f16, Custom);
       setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f16, Custom);
-      setOperationAction(ISD::UINT_TO_FP,        MVT::v2f16, Custom);
+      setOperationAction(ISD::UINT_TO_FP, MVT::v2f16, Custom);
       setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f16, Custom);
-      setOperationAction(ISD::SINT_TO_FP,        MVT::v4f16, Custom);
+      setOperationAction(ISD::SINT_TO_FP, MVT::v4f16, Custom);
       setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4f16, Custom);
-      setOperationAction(ISD::UINT_TO_FP,        MVT::v4f16, Custom);
+      setOperationAction(ISD::UINT_TO_FP, MVT::v4f16, Custom);
       setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4f16, Custom);
       // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
-      setOperationAction(ISD::FP_ROUND,          MVT::v2f16, Custom);
-      setOperationAction(ISD::STRICT_FP_ROUND,   MVT::v2f16, Custom);
-      setOperationAction(ISD::FP_ROUND,          MVT::v4f16, Custom);
-      setOperationAction(ISD::STRICT_FP_ROUND,   MVT::v4f16, Custom);
+      setOperationAction(ISD::FP_ROUND, MVT::v2f16, Custom);
+      setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f16, Custom);
+      setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
+      setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f16, Custom);
       // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
-      setOperationAction(ISD::FP_EXTEND,         MVT::v2f16, Custom);
-      setOperationAction(ISD::STRICT_FP_EXTEND,  MVT::v2f16, Custom);
-      setOperationAction(ISD::FP_EXTEND,         MVT::v4f16, Custom);
-      setOperationAction(ISD::STRICT_FP_EXTEND,  MVT::v4f16, Custom);
+      setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom);
+      setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f16, Custom);
+      setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom);
+      setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f16, Custom);
     }
   }
 
@@ -2597,7 +2599,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   // FIXME: We really should do custom legalization for addition and
   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
   // than generic legalization for 64-bit multiplication-with-overflow, though.
-  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+  for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
     if (VT == MVT::i64 && !Subtarget.is64Bit())
       continue;
     // Add/Sub/Mul with overflow operations are custom lowered.
@@ -2881,8 +2883,9 @@ static bool isLogicOp(unsigned Opcode) {
 }
 
 static bool isTargetShuffle(unsigned Opcode) {
-  switch(Opcode) {
-  default: return false;
+  switch (Opcode) {
+  default:
+    return false;
   case X86ISD::BLENDI:
   case X86ISD::PSHUFB:
   case X86ISD::PSHUFD:
@@ -2923,7 +2926,8 @@ static bool isTargetShuffle(unsigned Opcode) {
 
 static bool isTargetShuffleVariableMask(unsigned Opcode) {
   switch (Opcode) {
-  default: return false;
+  default:
+    return false;
   // Target Shuffles.
   case X86ISD::PSHUFB:
   case X86ISD::VPERMILPV:
@@ -2949,9 +2953,8 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
   if (ReturnAddrIndex == 0) {
     // Set up a frame object for the return address.
     unsigned SlotSize = RegInfo->getSlotSize();
-    ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
-                                                          -(int64_t)SlotSize,
-                                                          false);
+    ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(
+        SlotSize, -(int64_t)SlotSize, false);
     FuncInfo->setRAIndex(ReturnAddrIndex);
   }
 
@@ -3009,7 +3012,7 @@ static bool isX86CCSigned(X86::CondCode X86CC) {
 
 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
   switch (SetCCOpcode) {
-  // clang-format off
+    // clang-format off
   default: llvm_unreachable("Invalid integer condition!");
   case ISD::SETEQ:  return X86::COND_E;
   case ISD::SETGT:  return X86::COND_G;
@@ -3021,7 +3024,7 @@ static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
   case ISD::SETUGT: return X86::COND_A;
   case ISD::SETULE: return X86::COND_BE;
   case ISD::SETUGE: return X86::COND_AE;
-  // clang-format on
+    // clang-format on
   }
 }
 
@@ -3059,14 +3062,14 @@ static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
   // First determine if it is required or is profitable to flip the operands.
 
   // If LHS is a foldable load, but RHS is not, flip the condition.
-  if (ISD::isNON_EXTLoad(LHS.getNode()) &&
-      !ISD::isNON_EXTLoad(RHS.getNode())) {
+  if (ISD::isNON_EXTLoad(LHS.getNode()) && !ISD::isNON_EXTLoad(RHS.getNode())) {
     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
     std::swap(LHS, RHS);
   }
 
   switch (SetCCOpcode) {
-  default: break;
+  default:
+    break;
   case ISD::SETOLT:
   case ISD::SETOLE:
   case ISD::SETUGT:
@@ -3082,7 +3085,7 @@ static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
   //  1 | 0 | 0 | X == Y
   //  1 | 1 | 1 | unordered
   switch (SetCCOpcode) {
-  // clang-format off
+    // clang-format off
   default: llvm_unreachable("Condcode should be pre-legalized away");
   case ISD::SETUEQ:
   case ISD::SETEQ:   return X86::COND_E;
@@ -3104,7 +3107,7 @@ static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
   case ISD::SETO:    return X86::COND_NP;
   case ISD::SETOEQ:
   case ISD::SETUNE:  return X86::COND_INVALID;
-  // clang-format on
+    // clang-format on
   }
 }
 
@@ -3139,7 +3142,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   Info.flags = MachineMemOperand::MONone;
   Info.offset = 0;
 
-  const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
+  const IntrinsicData *IntrData = getIntrinsicWithChain(Intrinsic);
   if (!IntrData) {
     switch (Intrinsic) {
     case Intrinsic::x86_aesenc128kl:
@@ -3232,7 +3235,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   case TRUNCATE_TO_MEM_VI32: {
     Info.opc = ISD::INTRINSIC_VOID;
     Info.ptrVal = I.getArgOperand(0);
-    MVT VT  = MVT::getVT(I.getArgOperand(1)->getType());
+    MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
     MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
     if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
       ScalarVT = MVT::i8;
@@ -3252,8 +3255,8 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.ptrVal = nullptr;
     MVT DataVT = MVT::getVT(I.getType());
     MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
-    unsigned NumElts = std::min(DataVT.getVectorNumElements(),
-                                IndexVT.getVectorNumElements());
+    unsigned NumElts =
+        std::min(DataVT.getVectorNumElements(), IndexVT.getVectorNumElements());
     Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
     Info.align = Align(1);
     Info.flags |= MachineMemOperand::MOLoad;
@@ -3264,8 +3267,8 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.ptrVal = nullptr;
     MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
     MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
-    unsigned NumElts = std::min(DataVT.getVectorNumElements(),
-                                IndexVT.getVectorNumElements());
+    unsigned NumElts =
+        std::min(DataVT.getVectorNumElements(), IndexVT.getVectorNumElements());
     Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
     Info.align = Align(1);
     Info.flags |= MachineMemOperand::MOStore;
@@ -3424,8 +3427,9 @@ bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
   // Mask vectors support all subregister combinations and operations that
   // extract half of vector.
   if (ResVT.getVectorElementType() == MVT::i1)
-    return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
-                          (Index == ResVT.getVectorNumElements()));
+    return Index == 0 ||
+           ((ResVT.getSizeInBits() == SrcVT.getSizeInBits() * 2) &&
+            (Index == ResVT.getVectorNumElements()));
 
   return (Index % ResVT.getVectorNumElements()) == 0;
 }
@@ -3485,9 +3489,9 @@ bool X86TargetLowering::isScalarFPTypeInSSEReg(EVT VT) const {
          (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
 }
 
-bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
-                                                const SelectionDAG &DAG,
-                                                const MachineMemOperand &MMO) const {
+bool X86TargetLowering::isLoadBitCastBeneficial(
+    EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG,
+    const MachineMemOperand &MMO) const {
   if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
       BitcastVT.getVectorElementType() == MVT::i1)
     return false;
@@ -3496,8 +3500,8 @@ bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
     return false;
 
   // If both types are legal vectors, it's always ok to convert them.
-  if (LoadVT.isVector() && BitcastVT.isVector() &&
-      isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
+  if (LoadVT.isVector() && BitcastVT.isVector() && isTypeLegal(LoadVT) &&
+      isTypeLegal(BitcastVT))
     return true;
 
   return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
@@ -3521,9 +3525,7 @@ bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
   return true;
 }
 
-bool X86TargetLowering::isCtlzFast() const {
-  return Subtarget.hasFastLZCNT();
-}
+bool X86TargetLowering::isCtlzFast() const { return Subtarget.hasFastLZCNT(); }
 
 bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
     const Instruction &AndI) const {
@@ -3952,8 +3954,7 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask,
   return true;
 }
 
-static bool canWidenShuffleElements(ArrayRef<int> Mask,
-                                    const APInt &Zeroable,
+static bool canWidenShuffleElements(ArrayRef<int> Mask, const APInt &Zeroable,
                                     bool V2IsZero,
                                     SmallVectorImpl<int> &WidenedMask) {
   // Create an alternative mask with info about zeroable elements.
@@ -4037,7 +4038,7 @@ bool X86::isZeroNode(SDValue Elt) {
 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
                               const SDLoc &dl, bool IsMask = false) {
 
-  SmallVector<SDValue, 32>  Ops;
+  SmallVector<SDValue, 32> Ops;
   bool Split = false;
 
   MVT ConstVecVT = VT;
@@ -4051,12 +4052,12 @@ static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
   MVT EltVT = ConstVecVT.getVectorElementType();
   for (unsigned i = 0; i < NumElts; ++i) {
     bool IsUndef = Values[i] < 0 && IsMask;
-    SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
-      DAG.getConstant(Values[i], dl, EltVT);
+    SDValue OpNode =
+        IsUndef ? DAG.getUNDEF(EltVT) : DAG.getConstant(Values[i], dl, EltVT);
     Ops.push_back(OpNode);
     if (Split)
-      Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
-                    DAG.getConstant(0, dl, EltVT));
+      Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT)
+                            : DAG.getConstant(0, dl, EltVT));
   }
   SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
   if (Split)
@@ -4064,8 +4065,8 @@ static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
   return ConstsNode;
 }
 
-static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
-                              MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
+static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs, MVT VT,
+                              SelectionDAG &DAG, const SDLoc &dl) {
   assert(Bits.size() == Undefs.getBitWidth() &&
          "Unequal constant and undef arrays");
   SmallVector<SDValue, 32> Ops;
@@ -4100,8 +4101,8 @@ static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
   return DAG.getBitcast(VT, ConstsNode);
 }
 
-static SDValue getConstVector(ArrayRef<APInt> Bits, MVT VT,
-                              SelectionDAG &DAG, const SDLoc &dl) {
+static SDValue getConstVector(ArrayRef<APInt> Bits, MVT VT, SelectionDAG &DAG,
+                              const SDLoc &dl) {
   APInt Undefs = APInt::getZero(Bits.size());
   return getConstVector(Bits, Undefs, VT, DAG, dl);
 }
@@ -4638,8 +4639,7 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
   if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
     // May need to promote to a legal type.
     Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
-                     DAG.getConstant(0, dl, WideOpVT),
-                     SubVec, Idx);
+                     DAG.getConstant(0, dl, WideOpVT), SubVec, Idx);
     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
   }
 
@@ -4654,20 +4654,18 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
   if (IdxVal == 0) {
     // Zero lower bits of the Vec
     SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
-    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
-                      ZeroIdx);
+    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
     Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
     Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
     // Merge them together, SubVec should be zero extended.
     SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
-                         DAG.getConstant(0, dl, WideOpVT),
-                         SubVec, ZeroIdx);
+                         DAG.getConstant(0, dl, WideOpVT), SubVec, ZeroIdx);
     Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
   }
 
-  SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
-                       Undef, SubVec, ZeroIdx);
+  SubVec =
+      DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, SubVec, ZeroIdx);
 
   if (Vec.isUndef()) {
     assert(IdxVal != 0 && "Unexpected index");
@@ -4705,12 +4703,11 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
       // isel to optimize when bits are known zero.
       Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
       Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
-                        DAG.getConstant(0, dl, WideOpVT),
-                        Vec, ZeroIdx);
+                        DAG.getConstant(0, dl, WideOpVT), Vec, ZeroIdx);
     } else {
       // Otherwise use explicit shifts to zero the bits.
-      Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
-                        Undef, Vec, ZeroIdx);
+      Vec =
+          DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
       NumElems = WideOpVT.getVectorNumElements();
       SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
       Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
@@ -4763,9 +4760,9 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
   // Isolate the bits after the last inserted bit.
   unsigned HighShift = IdxVal + SubVecNumElems;
   SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
-                            DAG.getTargetConstant(HighShift, dl, MVT::i8));
+                             DAG.getTargetConstant(HighShift, dl, MVT::i8));
   High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
-                    DAG.getTargetConstant(HighShift, dl, MVT::i8));
+                     DAG.getTargetConstant(HighShift, dl, MVT::i8));
 
   // Now OR all 3 pieces together.
   Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
@@ -4846,8 +4843,8 @@ static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS,
   return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
 }
 
-void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
-                                   bool Lo, bool Unary) {
+void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask, bool Lo,
+                                   bool Unary) {
   assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
          "Illegal vector type to unpack");
   assert(Mask.empty() && "Expected an empty shuffle mask vector");
@@ -4984,13 +4981,12 @@ static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
 /// This produces a shuffle where the low element of V2 is swizzled into the
 /// zero/undef vector, landing at element Idx.
 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
-static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
-                                           bool IsZero,
+static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, bool IsZero,
                                            const X86Subtarget &Subtarget,
                                            SelectionDAG &DAG) {
   MVT VT = V2.getSimpleValueType();
-  SDValue V1 = IsZero
-    ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
+  SDValue V1 =
+      IsZero ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
   int NumElems = VT.getVectorNumElements();
   SmallVector<int, 16> MaskVec(NumElems);
   for (int i = 0; i != NumElems; ++i)
@@ -8568,7 +8564,7 @@ static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
         case ISD::FADD: HOpcode = X86ISD::FHADD; break;
         case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
         default: return false;
-        // clang-format on
+          // clang-format on
         }
       }
 
@@ -8598,8 +8594,7 @@ static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
       // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
       unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
       unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
-      unsigned ExpectedIndex = i * NumEltsIn128Bits +
-                               (j % NumEltsIn64Bits) * 2;
+      unsigned ExpectedIndex = i * NumEltsIn128Bits + (j % NumEltsIn64Bits) * 2;
       if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
         continue;
 
@@ -9249,8 +9244,8 @@ LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL,
   return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
 }
 
-SDValue
-X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op,
+                                             SelectionDAG &DAG) const {
   SDLoc dl(Op);
 
   MVT VT = Op.getSimpleValueType();
@@ -9474,14 +9469,13 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
     }
 
     // Is it a vector logical left shift?
-    if (NumElems == 2 && Idx == 1 &&
-        X86::isZeroNode(Op.getOperand(0)) &&
+    if (NumElems == 2 && Idx == 1 && X86::isZeroNode(Op.getOperand(0)) &&
         !X86::isZeroNode(Op.getOperand(1))) {
       unsigned NumBits = VT.getSizeInBits();
-      return getVShift(true, VT,
-                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
-                                   VT, Op.getOperand(1)),
-                       NumBits/2, DAG, *this, dl);
+      return getVShift(
+          true, VT,
+          DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(1)),
+          NumBits / 2, DAG, *this, dl);
     }
 
     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
@@ -9494,7 +9488,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
     // place.
     if (EVTBits == 32) {
       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
-      return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
+      return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget,
+                                         DAG);
     }
   }
 
@@ -9533,8 +9528,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   // build_vector and broadcast it.
   // TODO: We could probably generalize this more.
   if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
-    SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
-                       DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
+    SDValue Ops[4] = {Op.getOperand(0), Op.getOperand(1), DAG.getUNDEF(EltVT),
+                      DAG.getUNDEF(EltVT)};
     auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
       // Make sure all the even/odd operands match.
       for (unsigned i = 2; i != NumElems; ++i)
@@ -9550,8 +9545,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
                                      DAG.getBuildVector(NarrowVT, dl, Ops));
       // Broadcast from v2i64/v2f64 and cast to final VT.
       MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
-      return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
-                                            NewBV));
+      return DAG.getBitcast(
+          VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT, NewBV));
     }
   }
 
@@ -9564,7 +9559,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
     SDValue Lower =
         DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
     SDValue Upper = DAG.getBuildVector(
-        HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
+        HVT, dl, Op->ops().slice(NumElems / 2, NumElems / 2));
 
     // Recreate the wider vector with the lower and upper part.
     return concatSubVectors(Lower, Upper, DAG, dl);
@@ -9575,8 +9570,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
     if (NumNonZero == 1) {
       // One half is zero or undef.
       unsigned Idx = NonZeroMask.countr_zero();
-      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
-                               Op.getOperand(Idx));
+      SDValue V2 =
+          DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(Idx));
       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
     }
     return SDValue();
@@ -9611,30 +9606,28 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
 
     for (unsigned i = 0; i < 2; ++i) {
       switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
-        default: llvm_unreachable("Unexpected NonZero count");
-        case 0:
-          Ops[i] = Ops[i*2];  // Must be a zero vector.
-          break;
-        case 1:
-          Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
-          break;
-        case 2:
-          Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
-          break;
-        case 3:
-          Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
-          break;
+      default:
+        llvm_unreachable("Unexpected NonZero count");
+      case 0:
+        Ops[i] = Ops[i * 2]; // Must be a zero vector.
+        break;
+      case 1:
+        Ops[i] = getMOVL(DAG, dl, VT, Ops[i * 2 + 1], Ops[i * 2]);
+        break;
+      case 2:
+        Ops[i] = getMOVL(DAG, dl, VT, Ops[i * 2], Ops[i * 2 + 1]);
+        break;
+      case 3:
+        Ops[i] = getUnpackl(DAG, dl, VT, Ops[i * 2], Ops[i * 2 + 1]);
+        break;
       }
     }
 
     bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
     bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
-    int MaskVec[] = {
-      Reverse1 ? 1 : 0,
-      Reverse1 ? 0 : 1,
-      static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
-      static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
-    };
+    int MaskVec[] = {Reverse1 ? 1 : 0, Reverse1 ? 0 : 1,
+                     static_cast<int>(Reverse2 ? NumElems + 1 : NumElems),
+                     static_cast<int>(Reverse2 ? NumElems : NumElems + 1)};
     return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
   }
 
@@ -9653,7 +9646,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
       Result = DAG.getUNDEF(VT);
 
     for (unsigned i = 1; i < NumElems; ++i) {
-      if (Op.getOperand(i).isUndef()) continue;
+      if (Op.getOperand(i).isUndef())
+        continue;
       Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
                            Op.getOperand(i), DAG.getVectorIdxConstant(i, dl));
     }
@@ -9678,14 +9672,14 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
     // Generate scaled UNPCKL shuffle mask.
     SmallVector<int, 16> Mask;
-    for(unsigned i = 0; i != Scale; ++i)
+    for (unsigned i = 0; i != Scale; ++i)
       Mask.push_back(i);
     for (unsigned i = 0; i != Scale; ++i)
-      Mask.push_back(NumElems+i);
+      Mask.push_back(NumElems + i);
     Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
 
     for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
-      Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
+      Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2 * i], Ops[(2 * i) + 1], Mask);
   }
   return Ops[0];
 }
@@ -9711,15 +9705,14 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl,
     if (SubVec.isUndef())
       continue;
     if (ISD::isFreezeUndef(SubVec.getNode())) {
-        // If the freeze(undef) has multiple uses then we must fold to zero.
-        if (SubVec.hasOneUse()) {
-          ++NumFreezeUndef;
-        } else {
-          ++NumZero;
-          Undefs.insert(SubVec);
-        }
-    }
-    else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
+      // If the freeze(undef) has multiple uses then we must fold to zero.
+      if (SubVec.hasOneUse()) {
+        ++NumFreezeUndef;
+      } else {
+        ++NumZero;
+        Undefs.insert(SubVec);
+      }
+    } else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
       ++NumZero;
     else {
       assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
@@ -9733,9 +9726,9 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl,
     MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
     ArrayRef<SDUse> Ops = Op->ops();
     SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
-                             Ops.slice(0, NumOperands/2));
+                             Ops.slice(0, NumOperands / 2));
     SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
-                             Ops.slice(NumOperands/2));
+                             Ops.slice(NumOperands / 2));
     return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
   }
 
@@ -9768,7 +9761,7 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl,
 // TODO: Merge this with LowerAVXCONCAT_VECTORS?
 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl,
                                        const X86Subtarget &Subtarget,
-                                       SelectionDAG & DAG) {
+                                       SelectionDAG &DAG) {
   MVT ResVT = Op.getSimpleValueType();
   unsigned NumOperands = Op.getNumOperands();
   assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
@@ -9839,8 +9832,7 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl,
                      DAG.getVectorIdxConstant(NumElems / 2, dl));
 }
 
-static SDValue LowerCONCAT_VECTORS(SDValue Op,
-                                   const X86Subtarget &Subtarget,
+static SDValue LowerCONCAT_VECTORS(SDValue Op, const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG) {
   SDLoc DL(Op);
   MVT VT = Op.getSimpleValueType();
@@ -10062,8 +10054,8 @@ static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
 
     // Ok, handle the in-lane shuffles by detecting if and when they repeat.
     // Adjust second vector indices to start at LaneSize instead of Size.
-    int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
-                                : Mask[i] % LaneSize + LaneSize;
+    int LocalM =
+        Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
     if (RepeatedMask[i % LaneSize] < 0)
       // This is the first non-undef entry in this slot of a 128-bit lane.
       RepeatedMask[i % LaneSize] = LocalM;
@@ -10081,8 +10073,7 @@ is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
   return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
 }
 
-static bool
-is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
+static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
   SmallVector<int, 32> RepeatedMask;
   return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
 }
@@ -10381,8 +10372,8 @@ static SDValue getSHUFPDImmForMask(ArrayRef<int> Mask, const SDLoc &DL,
 //
 // The function looks for a sub-mask in which the nonzero elements are in
 // increasing order. If such a sub-mask exists, the function returns true.
-static bool isNonZeroElementsInOrder(const APInt &Zeroable,
-                                     ArrayRef<int> Mask, const EVT &VectorType,
+static bool isNonZeroElementsInOrder(const APInt &Zeroable, ArrayRef<int> Mask,
+                                     const EVT &VectorType,
                                      bool &IsZeroSideLeft) {
   int NextElement = -1;
   // Check if the Mask's nonzero elements are in increasing order.
@@ -11162,7 +11153,7 @@ static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
       if (M == SM_SentinelUndef)
         continue;
       if (M == Elt || (0 <= M && M < NumElts &&
-                     IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
+                       IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
         Mask[Elt] = Elt;
         LaneV1InUse = true;
         continue;
@@ -11295,8 +11286,7 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
 
     // If we have VPTERNLOG, we can use that as a bit blend.
     if (Subtarget.hasVLX())
-      if (SDValue BitBlend =
-              lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
+      if (SDValue BitBlend = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
         return BitBlend;
 
     // Scale the blend by the number of bytes per element.
@@ -11604,9 +11594,11 @@ static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
 
 /// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
 /// permuting the elements of the result in place.
-static SDValue lowerShuffleAsByteRotateAndPermute(
-    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
-    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+static SDValue lowerShuffleAsByteRotateAndPermute(const SDLoc &DL, MVT VT,
+                                                  SDValue V1, SDValue V2,
+                                                  ArrayRef<int> Mask,
+                                                  const X86Subtarget &Subtarget,
+                                                  SelectionDAG &DAG) {
   if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
       (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
       (VT.is512BitVector() && !Subtarget.hasBWI()))
@@ -11804,9 +11796,9 @@ static SDValue lowerShuffleAsDecomposedShuffleMerge(
     // If either input vector provides only a single element which is repeated
     // multiple times, unpacking from both input vectors would generate worse
     // code. e.g. for
-    // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
-    // it is better to process t4 first to create a vector of t4[0], then unpack
-    // that vector with t2.
+    // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7>
+    // t2, t4, it is better to process t4 first to create a vector of t4[0],
+    // then unpack that vector with t2.
     if (!V1Zero && !V2Zero && !isSingleElementRepeatedMask(V1Mask) &&
         !isSingleElementRepeatedMask(V2Mask))
       if (SDValue UnpackPerm =
@@ -11818,8 +11810,8 @@ static SDValue lowerShuffleAsDecomposedShuffleMerge(
       return RotatePerm;
 
     // Unpack/rotate failed - try again with variable blends.
-    if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
-                                                          DAG))
+    if (SDValue BlendPerm =
+            lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
       return BlendPerm;
 
     if (VT.getScalarSizeInBits() >= 32)
@@ -11933,7 +11925,7 @@ static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
   SDValue Lo, Hi;
   for (int i = 0; i < NumElts; ++i) {
     int M = Mask[i];
-    assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
+    assert((M == SM_SentinelUndef || (0 <= M && M < (2 * NumElts))) &&
            "Unexpected mask index.");
     if (M < 0)
       continue;
@@ -12055,8 +12047,7 @@ static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
          "Rotate-based lowering only supports 128-bit lowering!");
   assert(Mask.size() <= 16 &&
          "Can shuffle at most 16 bytes in a 128-bit vector!");
-  assert(ByteVT == MVT::v16i8 &&
-         "SSE2 rotate lowering only needed for v16i8!");
+  assert(ByteVT == MVT::v16i8 && "SSE2 rotate lowering only needed for v16i8!");
 
   // Default SSE2 implementation
   int LoByteShift = 16 - ByteRotation;
@@ -12091,8 +12082,9 @@ static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
          "Only 32-bit and 64-bit elements are supported!");
 
   // 128/256-bit vectors are only supported with VLX.
-  assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
-         && "VLX required for 128/256-bit vectors");
+  assert(
+      (Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector())) &&
+      "VLX required for 128/256-bit vectors");
 
   SDValue Lo = V1, Hi = V2;
   int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
@@ -12644,8 +12636,7 @@ static SDValue lowerShuffleAsSpecificExtension(const SDLoc &DL, MVT VT,
 /// are both incredibly common and often quite performance sensitive.
 static SDValue lowerShuffleAsZeroOrAnyExtend(
     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
-    const APInt &Zeroable, const X86Subtarget &Subtarget,
-    SelectionDAG &DAG) {
+    const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
   int Bits = VT.getSizeInBits();
   int NumLanes = Bits / 128;
   int NumElements = VT.getVectorNumElements();
@@ -12771,7 +12762,8 @@ static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
   // If the bitcasts shift the element size, we can't extract an equivalent
   // element from it.
   MVT NewVT = V.getSimpleValueType();
-  if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
+  if (!NewVT.isVector() ||
+      NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
     return SDValue();
 
   if (V.getOpcode() == ISD::BUILD_VECTOR ||
@@ -12795,7 +12787,7 @@ static bool isShuffleFoldableLoad(SDValue V) {
          ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
 }
 
-template<typename T>
+template <typename T>
 static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
   T EltVT = VT.getScalarType();
   return (EltVT == MVT::bf16 && !Subtarget.hasAVX10_2()) ||
@@ -12808,8 +12800,7 @@ static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
 /// across all subtarget feature sets.
 static SDValue lowerShuffleAsElementInsertion(
     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
-    const APInt &Zeroable, const X86Subtarget &Subtarget,
-    SelectionDAG &DAG) {
+    const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
   MVT ExtVT = VT;
   MVT EltVT = VT.getVectorElementType();
   unsigned NumElts = VT.getVectorNumElements();
@@ -12842,8 +12833,8 @@ static SDValue lowerShuffleAsElementInsertion(
   // all the smarts here sunk into that routine. However, the current
   // lowering of BUILD_VECTOR makes that nearly impossible until the old
   // vector shuffle lowering is dead.
-  SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
-                                               DAG);
+  SDValue V2S =
+      getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(), DAG);
   if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
     // We need to zext the scalar if it is smaller than an i32.
     V2S = DAG.getBitcast(EltVT, V2S);
@@ -13046,8 +13037,8 @@ static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
   // Check that both sources are extracts of the same source vector.
   if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
       N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
-      N0.getOperand(0) != N1.getOperand(0) ||
-      !N0.hasOneUse() || !N1.hasOneUse())
+      N0.getOperand(0) != N1.getOperand(0) || !N0.hasOneUse() ||
+      !N1.hasOneUse())
     return SDValue();
 
   SDValue WideVec = N0.getOperand(0);
@@ -13077,8 +13068,8 @@ static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
   NewMask.append(NumElts, -1);
 
   // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
-  SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
-                                      NewMask);
+  SDValue Shuf =
+      DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT), NewMask);
   // This is free: ymm -> xmm.
   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
                      DAG.getVectorIdxConstant(0, DL));
@@ -13277,8 +13268,8 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
   if (!V.getValueType().isVector()) {
     assert(V.getScalarValueSizeInBits() == NumEltBits &&
            "Unexpected scalar size");
-    MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
-                                       VT.getVectorNumElements());
+    MVT BroadcastVT =
+        MVT::getVectorVT(V.getSimpleValueType(), VT.getVectorNumElements());
     return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
   }
 
@@ -13303,8 +13294,8 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
 // elements are zeroable.
 static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
                                    unsigned &InsertPSMask,
-                                   const APInt &Zeroable,
-                                   ArrayRef<int> Mask, SelectionDAG &DAG) {
+                                   const APInt &Zeroable, ArrayRef<int> Mask,
+                                   SelectionDAG &DAG) {
   assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
   assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
@@ -13756,8 +13747,8 @@ static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
   // when the V2 input is targeting element 0 of the mask -- that is the fast
   // case here.
   if (NumV2Elements == 1 && Mask[0] >= 4)
-    if (SDValue V = lowerShuffleAsElementInsertion(
-            DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
+    if (SDValue V = lowerShuffleAsElementInsertion(DL, MVT::v4f32, V1, V2, Mask,
+                                                   Zeroable, Subtarget, DAG))
       return V;
 
   if (Subtarget.hasSSE41()) {
@@ -13766,8 +13757,8 @@ static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
       return V;
 
     if (!isSingleSHUFPSMask(Mask))
-      if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
-                                                            V2, Mask, DAG))
+      if (SDValue BlendPerm =
+              lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1, V2, Mask, DAG))
         return BlendPerm;
   }
 
@@ -13859,8 +13850,8 @@ static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
   // There are special ways we can lower some single-element blends.
   if (NumV2Elements == 1)
-    if (SDValue V = lowerShuffleAsElementInsertion(
-            DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
+    if (SDValue V = lowerShuffleAsElementInsertion(DL, MVT::v4i32, V1, V2, Mask,
+                                                   Zeroable, Subtarget, DAG))
       return V;
 
   // We have different paths for blend lowering, but they all must use the
@@ -13990,7 +13981,7 @@ static SDValue lowerV8I16GeneralSingleInputShuffle(
   };
 
   if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
-    int PSHUFDMask[4] = { -1, -1, -1, -1 };
+    int PSHUFDMask[4] = {-1, -1, -1, -1};
     SmallVector<std::pair<int, int>, 4> DWordPairs;
     int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
 
@@ -14094,7 +14085,8 @@ static SDValue lowerV8I16GeneralSingleInputShuffle(
     int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
     int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
     int TripleNonInputIdx =
-        TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
+        TripleInputSum -
+        std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
     TripleDWord = TripleNonInputIdx / 2;
 
     // We use xor with one to compute the adjacent DWord to whichever one the
@@ -14172,9 +14164,9 @@ static SDValue lowerV8I16GeneralSingleInputShuffle(
 
     // Adjust the mask to match the new locations of A and B.
     for (int &M : Mask)
-      if (M >= 0 && M/2 == ADWord)
+      if (M >= 0 && M / 2 == ADWord)
         M = 2 * BDWord + M % 2;
-      else if (M >= 0 && M/2 == BDWord)
+      else if (M >= 0 && M / 2 == BDWord)
         M = 2 * ADWord + M % 2;
 
     // Recurse back into this routine to re-compute state now that this isn't
@@ -14202,33 +14194,33 @@ static SDValue lowerV8I16GeneralSingleInputShuffle(
       [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
                     MutableArrayRef<int> SourceHalfMask,
                     MutableArrayRef<int> HalfMask, int HalfOffset) {
-    if (InPlaceInputs.empty())
-      return;
-    if (InPlaceInputs.size() == 1) {
-      SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
-          InPlaceInputs[0] - HalfOffset;
-      PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
-      return;
-    }
-    if (IncomingInputs.empty()) {
-      // Just fix all of the in place inputs.
-      for (int Input : InPlaceInputs) {
-        SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
-        PSHUFDMask[Input / 2] = Input / 2;
-      }
-      return;
-    }
+        if (InPlaceInputs.empty())
+          return;
+        if (InPlaceInputs.size() == 1) {
+          SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
+              InPlaceInputs[0] - HalfOffset;
+          PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
+          return;
+        }
+        if (IncomingInputs.empty()) {
+          // Just fix all of the in place inputs.
+          for (int Input : InPlaceInputs) {
+            SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
+            PSHUFDMask[Input / 2] = Input / 2;
+          }
+          return;
+        }
 
-    assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
-    SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
-        InPlaceInputs[0] - HalfOffset;
-    // Put the second input next to the first so that they are packed into
-    // a dword. We find the adjacent index by toggling the low bit.
-    int AdjIndex = InPlaceInputs[0] ^ 1;
-    SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
-    llvm::replace(HalfMask, InPlaceInputs[1], AdjIndex);
-    PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
-  };
+        assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
+        SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
+            InPlaceInputs[0] - HalfOffset;
+        // Put the second input next to the first so that they are packed into
+        // a dword. We find the adjacent index by toggling the low bit.
+        int AdjIndex = InPlaceInputs[0] ^ 1;
+        SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
+        llvm::replace(HalfMask, InPlaceInputs[1], AdjIndex);
+        PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
+      };
   fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
   fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
 
@@ -14237,10 +14229,12 @@ static SDValue lowerV8I16GeneralSingleInputShuffle(
   // FIXME: This operation could almost certainly be simplified dramatically to
   // look more like the 3-1 fixing operation.
   auto moveInputsToRightHalf = [&PSHUFDMask](
-      MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
-      MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
-      MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
-      int DestOffset) {
+                                   MutableArrayRef<int> IncomingInputs,
+                                   ArrayRef<int> ExistingInputs,
+                                   MutableArrayRef<int> SourceHalfMask,
+                                   MutableArrayRef<int> HalfMask,
+                                   MutableArrayRef<int> FinalSourceHalfMask,
+                                   int SourceOffset, int DestOffset) {
     auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
       return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
     };
@@ -14436,9 +14430,11 @@ static SDValue lowerV8I16GeneralSingleInputShuffle(
 
 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
 /// blend if only one input is used.
-static SDValue lowerShuffleAsBlendOfPSHUFBs(
-    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
-    const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
+static SDValue lowerShuffleAsBlendOfPSHUFBs(const SDLoc &DL, MVT VT, SDValue V1,
+                                            SDValue V2, ArrayRef<int> Mask,
+                                            const APInt &Zeroable,
+                                            SelectionDAG &DAG, bool &V1InUse,
+                                            bool &V2InUse) {
   assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
          "Lane crossing shuffle masks not supported");
 
@@ -14533,8 +14529,8 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
       return Broadcast;
 
     // Try to use bit rotation instructions.
-    if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
-                                                 Subtarget, DAG))
+    if (SDValue Rotate =
+            lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask, Subtarget, DAG))
       return Rotate;
 
     // Use dedicated unpack instructions for masks that match their pattern.
@@ -14569,14 +14565,14 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
   // See if we can use SSE4A Extraction / Insertion.
   if (Subtarget.hasSSE4A())
-    if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
-                                          Zeroable, DAG))
+    if (SDValue V =
+            lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, Zeroable, DAG))
       return V;
 
   // There are special ways we can lower some single-element blends.
   if (NumV2Inputs == 1)
-    if (SDValue V = lowerShuffleAsElementInsertion(
-            DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
+    if (SDValue V = lowerShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2, Mask,
+                                                   Zeroable, Subtarget, DAG))
       return V;
 
   // We have different paths for blend lowering, but they all must use the
@@ -14692,8 +14688,8 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
   // can both shuffle and set up the inefficient blend.
   if (!IsBlendSupported && Subtarget.hasSSSE3()) {
     bool V1InUse, V2InUse;
-    return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
-                                        Zeroable, DAG, V1InUse, V2InUse);
+    return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
+                                        DAG, V1InUse, V2InUse);
   }
 
   // We can always bit-blend if we have to so the fallback strategy is to
@@ -14826,8 +14822,8 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
   // See if we can use SSE4A Extraction / Insertion.
   if (Subtarget.hasSSE4A())
-    if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
-                                          Zeroable, DAG))
+    if (SDValue V =
+            lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG))
       return V;
 
   int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
@@ -14840,8 +14836,8 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
       return Broadcast;
 
     // Try to use bit rotation instructions.
-    if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
-                                                 Subtarget, DAG))
+    if (SDValue Rotate =
+            lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask, Subtarget, DAG))
       return Rotate;
 
     if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
@@ -14882,7 +14878,7 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
       int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
       SmallDenseMap<int, int, 8> LaneMap;
       for (int I : InPlaceInputs) {
-        PreDupI16Shuffle[I/2] = I/2;
+        PreDupI16Shuffle[I / 2] = I / 2;
         LaneMap[I] = I;
       }
       int j = TargetLo ? 0 : 4, je = j + 4;
@@ -14896,7 +14892,8 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
             ++j;
 
           if (j == je)
-            // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
+            // We can't place the inputs into a single half with a simple i16
+            // shuffle, so bail.
             return SDValue();
 
           // Map this input with the i16 shuffle.
@@ -15017,8 +15014,8 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
       // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
       // PALIGNR will be cheaper than the second PSHUFB+OR.
-      if (SDValue V = lowerShuffleAsByteRotateAndPermute(
-              DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
+      if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v16i8, V1, V2,
+                                                         Mask, Subtarget, DAG))
         return V;
     }
 
@@ -15027,8 +15024,8 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
   // There are special ways we can lower some single-element blends.
   if (NumV2Elements == 1)
-    if (SDValue V = lowerShuffleAsElementInsertion(
-            DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
+    if (SDValue V = lowerShuffleAsElementInsertion(DL, MVT::v16i8, V1, V2, Mask,
+                                                   Zeroable, Subtarget, DAG))
       return V;
 
   if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
@@ -15120,8 +15117,8 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
       if (M >= 0)
         M /= 2;
   } else {
-    // Otherwise just unpack the low half of V into VLoHalf and the high half into
-    // VHiHalf so that we can blend them as i16s.
+    // Otherwise just unpack the low half of V into VLoHalf and the high half
+    // into VHiHalf so that we can blend them as i16s.
     SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
 
     VLoHalf = DAG.getBitcast(
@@ -15130,8 +15127,10 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
         MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
   }
 
-  SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
-  SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
+  SDValue LoV =
+      DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
+  SDValue HiV =
+      DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
 
   return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
 }
@@ -15140,9 +15139,8 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
 ///
 /// This routine breaks down the specific type of 128-bit shuffle and
 /// dispatches to the lowering routines accordingly.
-static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
-                                  MVT VT, SDValue V1, SDValue V2,
-                                  const APInt &Zeroable,
+static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
+                                  SDValue V1, SDValue V2, const APInt &Zeroable,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
   if (VT == MVT::v8bf16) {
@@ -15324,7 +15322,7 @@ static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
                                           const X86Subtarget &Subtarget,
                                           SelectionDAG &DAG) {
   assert(!V2.isUndef() && "This routine must not be used to lower single-input "
-         "shuffles as it could then recurse on itself.");
+                          "shuffles as it could then recurse on itself.");
   int Size = Mask.size();
 
   // If this can be modeled as a broadcast of two elements followed by a blend,
@@ -15663,8 +15661,8 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
   // instruction bytes needed to explicitly generate the zero vector.
 
   // Blends are faster and handle all the non-lane-crossing cases.
-  if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
-                                          Subtarget, DAG))
+  if (SDValue Blend =
+          lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
     return Blend;
 
   // If either input operand is a zero vector, use VPERM2X128 because its mask
@@ -15690,8 +15688,8 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
     // Try to use SHUF128 if possible.
     if (Subtarget.hasVLX()) {
       if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
-        unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
-                            ((WidenedMask[1] % 2) << 1);
+        unsigned PermMask =
+            ((WidenedMask[0] % 2) << 0) | ((WidenedMask[1] % 2) << 1);
         return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
                            DAG.getTargetConstant(PermMask, DL, MVT::i8));
       }
@@ -15715,7 +15713,7 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
          (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
 
   unsigned PermMask = 0;
-  PermMask |= IsLowZero  ? 0x08 : (WidenedMask[0] << 0);
+  PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
   PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
 
   // Check the immediate mask and replace unused sources with undef.
@@ -15907,9 +15905,9 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
 /// adjusted to access the extracted halves of the original shuffle operands is
 /// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
 /// lower half of each input operand is accessed.
-static bool
-getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
-                   int &HalfIdx1, int &HalfIdx2) {
+static bool getHalfShuffleMask(ArrayRef<int> Mask,
+                               MutableArrayRef<int> HalfMask, int &HalfIdx1,
+                               int &HalfIdx2) {
   assert((Mask.size() == HalfMask.size() * 2) &&
          "Expected input mask to be twice as long as output");
 
@@ -15962,7 +15960,8 @@ getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
 static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
                                      ArrayRef<int> HalfMask, int HalfIdx1,
                                      int HalfIdx2, bool UndefLower,
-                                     SelectionDAG &DAG, bool UseConcat = false) {
+                                     SelectionDAG &DAG,
+                                     bool UseConcat = false) {
   assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
   assert(V1.getValueType().isSimple() && "Expecting only simple types");
 
@@ -16324,7 +16323,7 @@ static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
   assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
          "Illegal shuffle mask");
 
-  bool ZeroLane[2] = { true, true };
+  bool ZeroLane[2] = {true, true};
   for (int i = 0; i < NumElts; ++i)
     ZeroLane[i & 1] &= Zeroable[i];
 
@@ -16409,9 +16408,9 @@ static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
 
   // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
   // the upper bits of the result using an unpckldq.
-  SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
-                                        { 0, 1, 2, 3, 16, 17, 18, 19,
-                                          4, 5, 6, 7, 20, 21, 22, 23 });
+  SDValue Unpack = DAG.getVectorShuffle(
+      MVT::v16i8, DL, V1, V2,
+      {0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23});
   // Insert the unpckldq into a zero vector to widen to v32i8.
   return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
                      DAG.getConstant(0, DL, MVT::v32i8), Unpack,
@@ -16648,8 +16647,8 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
     return Blend;
 
   // Check for being able to broadcast a single element.
-  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
-                                                  Subtarget, DAG))
+  if (SDValue Broadcast =
+          lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
     return Broadcast;
 
   // Try to use shift instructions if fast.
@@ -16756,8 +16755,8 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
     return Blend;
 
   // Check for being able to broadcast a single element.
-  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
-                                                  Subtarget, DAG))
+  if (SDValue Broadcast =
+          lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
     return Broadcast;
 
   if (!Subtarget.hasAVX2()) {
@@ -16904,8 +16903,8 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
     return Blend;
 
   // Check for being able to broadcast a single element.
-  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
-                                                  Subtarget, DAG))
+  if (SDValue Broadcast =
+          lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
     return Broadcast;
 
   // Try to use shift instructions if fast.
@@ -17072,7 +17071,8 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
     // Try to produce a fixed cross-128-bit lane permute followed by unpack
     // because that should be faster than the variable permute alternatives.
-    if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, V1, V2, Mask, DAG))
+    if (SDValue V =
+            lowerShuffleWithUNPCK256(DL, MVT::v16i16, V1, V2, Mask, DAG))
       return V;
 
     // There are no generalized cross-lane shuffle operations available on i16
@@ -17091,8 +17091,8 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
       // As this is a single-input shuffle, the repeated mask should be
       // a strictly valid v8i16 mask that we can pass through to the v8i16
       // lowering to handle even the v16 case.
-      return lowerV8I16GeneralSingleInputShuffle(
-          DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
+      return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v16i16, V1,
+                                                 RepeatedMask, Subtarget, DAG);
     }
   }
 
@@ -17111,8 +17111,8 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
     return Result;
 
   // Try to permute the lanes and then use a per-lane permute.
-  if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
-          DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
+  if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v16i16, V1, V2,
+                                                      Mask, DAG, Subtarget))
     return V;
 
   // Try to match an interleave of two v16i16s and lower them as unpck and
@@ -17148,8 +17148,8 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
     return ZExt;
 
   // Check for being able to broadcast a single element.
-  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
-                                                  Subtarget, DAG))
+  if (SDValue Broadcast =
+          lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
     return Broadcast;
 
   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
@@ -17201,8 +17201,8 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
     if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, V1, V2, Mask, DAG))
       return V;
 
-    if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
-            DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
+    if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v32i8, V1, V2,
+                                                        Mask, DAG, Subtarget))
       return V;
 
     return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
@@ -17224,16 +17224,16 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
     return Result;
 
   // Try to permute the lanes and then use a per-lane permute.
-  if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
-          DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
+  if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v32i8, V1, V2,
+                                                      Mask, DAG, Subtarget))
     return V;
 
   // Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
   // by zeroable elements in the remaining 24 elements. Turn this into two
   // vmovqb instructions shuffled together.
   if (Subtarget.hasVLX())
-    if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
-                                                  Mask, Zeroable, DAG))
+    if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2, Mask,
+                                                  Zeroable, DAG))
       return V;
 
   // Try to match an interleave of two v32i8s and lower them as unpck and
@@ -17288,7 +17288,8 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
         return V;
       if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
         return V;
-      return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
+      return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
+                                  /*SimpleOnly*/ false);
     }
 
     MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
@@ -17537,8 +17538,7 @@ static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
   // If we have a single input shuffle with different shuffle patterns in the
   // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
-  if (V2.isUndef() &&
-      !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
+  if (V2.isUndef() && !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
     SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
     return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
   }
@@ -17805,8 +17805,8 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
   // Whenever we can lower this as a zext, that instruction is strictly faster
   // than any alternative. It also allows us to fold memory operands into the
   // shuffle in many cases.
-  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
-          DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
+  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v64i8, V1, V2, Mask,
+                                                   Zeroable, Subtarget, DAG))
     return ZExt;
 
   // Use dedicated unpack instructions for masks that match their pattern.
@@ -17883,7 +17883,8 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
   if (Subtarget.hasVBMI())
     return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
 
-  return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
+  return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG,
+                              /*SimpleOnly*/ false);
 }
 
 /// High-level routine to lower various 512-bit x86 vector shuffles.
@@ -17891,13 +17892,11 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
 /// This routine either breaks down the specific type of a 512-bit x86 vector
 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
 /// together based on the available instructions.
-static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
-                                  MVT VT, SDValue V1, SDValue V2,
-                                  const APInt &Zeroable,
+static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
+                                  SDValue V1, SDValue V2, const APInt &Zeroable,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
-  assert(Subtarget.hasAVX512() &&
-         "Cannot lower 512-bit vectors w/ basic ISA!");
+  assert(Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/ basic ISA!");
 
   // If we have a single input to the zero element, insert that into V1 if we
   // can do so cheaply.
@@ -17915,8 +17914,8 @@ static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
     return V;
 
   // Check for being able to broadcast a single element.
-  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
-                                                  Subtarget, DAG))
+  if (SDValue Broadcast =
+          lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
     return Broadcast;
 
   if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
@@ -17928,7 +17927,8 @@ static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
     if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
       return V;
 
-    return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
+    return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
+                                /*SimpleOnly*/ false);
   }
 
   if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
@@ -18035,14 +18035,12 @@ static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
   return -1;
 }
 
-
 // Lower vXi1 vector shuffles.
 // There is no dedicated instruction on AVX-512 that shuffles the masks.
 // The only way to shuffle bits is to sign-extend the mask vector to SIMD
 // vector, shuffle and then truncate it back.
-static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
-                                MVT VT, SDValue V1, SDValue V2,
-                                const APInt &Zeroable,
+static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
+                                SDValue V1, SDValue V2, const APInt &Zeroable,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
   assert(Subtarget.hasAVX512() &&
@@ -18173,8 +18171,8 @@ static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   int NumElems = VT.getVectorNumElements();
   if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
       (Subtarget.hasDQI() && (NumElems < 32)))
-    return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
-                       Shuffle, ISD::SETGT);
+    return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT), Shuffle,
+                        ISD::SETGT);
 
   return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
 }
@@ -18301,7 +18299,7 @@ static SDValue canonicalizeShuffleMaskWithHorizOp(
     unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
     const X86Subtarget &Subtarget);
 
-    /// Top-level lowering for x86 vector shuffles.
+/// Top-level lowering for x86 vector shuffles.
 ///
 /// This handles decomposition, canonicalization, and lowering of all x86
 /// vector shuffles. Most of the specific lowering strategies are encapsulated
@@ -18377,8 +18375,8 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
     // by obfuscating the operands with bitcasts.
     // TODO: Avoid lowering directly from this top-level function: make this
     // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
-    if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
-                                                    Subtarget, DAG))
+    if (SDValue Broadcast =
+            lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask, Subtarget, DAG))
       return Broadcast;
 
     MVT NewEltVT = VT.isFloatingPoint()
@@ -18601,8 +18599,7 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
     // Build a mask by testing the condition against zero.
     MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
     SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
-                                DAG.getConstant(0, dl, CondVT),
-                                ISD::SETNE);
+                                DAG.getConstant(0, dl, CondVT), ISD::SETNE);
     // Now return a new VSELECT using the mask.
     return DAG.getSelect(dl, VT, Mask, LHS, RHS);
   }
@@ -18709,7 +18706,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
   }
 
   if (VT == MVT::i32 || VT == MVT::i64)
-      return Op;
+    return Op;
 
   return SDValue();
 }
@@ -18722,7 +18719,7 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
   SDLoc dl(Vec);
   MVT VecVT = Vec.getSimpleValueType();
   SDValue Idx = Op.getOperand(1);
-  auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
+  auto *IdxC = dyn_cast<ConstantSDNode>(Idx);
   MVT EltVT = Op.getSimpleValueType();
 
   assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
@@ -18737,7 +18734,8 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
     if (NumElts == 1) {
       Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
       MVT IntVT = MVT::getIntegerVT(Vec.getValueType().getVectorNumElements());
-      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getBitcast(IntVT, Vec));
+      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
+                         DAG.getBitcast(IntVT, Vec));
     }
     MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
     MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
@@ -18795,14 +18793,13 @@ static APInt getExtractedDemandedElts(SDNode *N) {
   return DemandedElts;
 }
 
-SDValue
-X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
-                                           SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
+                                                   SelectionDAG &DAG) const {
   SDLoc dl(Op);
   SDValue Vec = Op.getOperand(0);
   MVT VecVT = Vec.getSimpleValueType();
   SDValue Idx = Op.getOperand(1);
-  auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
+  auto *IdxC = dyn_cast<ConstantSDNode>(Idx);
 
   if (VecVT.getVectorElementType() == MVT::i1)
     return ExtractBitFromMaskVector(Op, DAG, Subtarget);
@@ -18833,10 +18830,10 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
     // |    |  Ports pressure in cycles   |  |
     // |Uops| 1 | 2 - D  |3 -  D  | 4 | 5 |  |
     // ---------------------------------------------------------
-    // |2^  |   | 0.5    | 0.5    |1.0|   |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
-    // |1   |0.5|        |        |   |0.5|  | lea rax, ptr [rsp-0x18]
-    // |1   |   |0.5, 0.5|0.5, 0.5|   |   |CP| mov al, byte ptr [rdi+rax*1]
-    // Total Num Of Uops: 4
+    // |2^  |   | 0.5    | 0.5    |1.0|   |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
+    // |1   |0.5|        |        |   |0.5|  | lea rax, ptr [rsp-0x18]
+    // |1   |   |0.5, 0.5|0.5, 0.5|   |   |CP| mov al, byte ptr [rdi+rax*1]
+    // Total Num Of Uops: 4
 
     return SDValue();
   }
@@ -18941,7 +18938,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
     // UNPCKHPD the element to the lowest double word, then movsd.
     // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
     // to a f64mem, the whole operation is folded into a single MOVHPDmr.
-    int Mask[2] = { 1, -1 };
+    int Mask[2] = {1, -1};
     Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                        DAG.getVectorIdxConstant(0, dl));
@@ -18966,9 +18963,10 @@ static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
     unsigned NumElts = VecVT.getVectorNumElements();
     MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
     MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
-    SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
-      DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
-      DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
+    SDValue ExtOp =
+        DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
+                    DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
+                    DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
     return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
   }
 
@@ -18995,9 +18993,9 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
 
   if (EltVT == MVT::bf16) {
     MVT IVT = VT.changeVectorElementTypeToInteger();
-    SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT,
-                              DAG.getBitcast(IVT, N0),
-                              DAG.getBitcast(MVT::i16, N1), N2);
+    SDValue Res =
+        DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT, DAG.getBitcast(IVT, N0),
+                    DAG.getBitcast(MVT::i16, N1), N2);
     return DAG.getBitcast(VT, Res);
   }
 
@@ -19258,8 +19256,9 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
 }
 
 // Returns the appropriate wrapper opcode for a global reference.
-unsigned X86TargetLowering::getGlobalWrapperKind(
-    const GlobalValue *GV, const unsigned char OpFlags) const {
+unsigned
+X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV,
+                                        const unsigned char OpFlags) const {
   // References to absolute symbols are never PC-relative.
   if (GV && GV->isAbsoluteSymbolRef())
     return X86ISD::Wrapper;
@@ -19283,8 +19282,8 @@ unsigned X86TargetLowering::getGlobalWrapperKind(
 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
 // be used to form addressing mode. These wrapped nodes will be selected
 // into MOV32ri.
-SDValue
-X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerConstantPool(SDValue Op,
+                                             SelectionDAG &DAG) const {
   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
 
   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
@@ -19334,11 +19333,10 @@ SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
   return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
 }
 
-SDValue
-X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerBlockAddress(SDValue Op,
+                                             SelectionDAG &DAG) const {
   // Create the TargetBlockAddressAddress node.
-  unsigned char OpFlags =
-    Subtarget.classifyBlockAddressReference();
+  unsigned char OpFlags = Subtarget.classifyBlockAddressReference();
   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
   int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
   SDLoc dl(Op);
@@ -19443,8 +19441,8 @@ SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
   return Result;
 }
 
-SDValue
-X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerGlobalAddress(SDValue Op,
+                                              SelectionDAG &DAG) const {
   return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
 }
 
@@ -19522,24 +19520,24 @@ static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA,
 }
 
 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
-static SDValue
-LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
-                                const EVT PtrVT) {
+static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA,
+                                               SelectionDAG &DAG,
+                                               const EVT PtrVT) {
   return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD,
                     /*LoadGlobalBaseReg=*/true);
 }
 
 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
-static SDValue
-LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
-                                const EVT PtrVT) {
+static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA,
+                                               SelectionDAG &DAG,
+                                               const EVT PtrVT) {
   return GetTLSADDR(DAG, GA, PtrVT, X86::RAX, X86II::MO_TLSGD);
 }
 
 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
-static SDValue
-LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
-                                 const EVT PtrVT) {
+static SDValue LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA,
+                                                SelectionDAG &DAG,
+                                                const EVT PtrVT) {
   return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD);
 }
 
@@ -19571,9 +19569,8 @@ static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
   // Build x@dtpoff.
   unsigned char OperandFlags = X86II::MO_DTPOFF;
   unsigned WrapperKind = X86ISD::Wrapper;
-  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
-                                           GA->getValueType(0),
-                                           GA->getOffset(), OperandFlags);
+  SDValue TGA = DAG.getTargetGlobalAddress(
+      GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags);
   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
 
   // Add x@dtpoff with the base.
@@ -19614,9 +19611,8 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
   // emit "addl x at ntpoff,%eax" (local exec)
   // or "addl x at indntpoff,%eax" (initial exec)
   // or "addl x at gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
-  SDValue TGA =
-      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
-                                 GA->getOffset(), OperandFlags);
+  SDValue TGA = DAG.getTargetGlobalAddress(
+      GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags);
   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
 
   if (model == TLSModel::InitialExec) {
@@ -19635,8 +19631,8 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
 }
 
-SDValue
-X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerGlobalTLSAddress(SDValue Op,
+                                                 SelectionDAG &DAG) const {
 
   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
 
@@ -19650,20 +19646,20 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
   if (Subtarget.isTargetELF()) {
     TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
     switch (model) {
-      case TLSModel::GeneralDynamic:
-        if (Subtarget.is64Bit()) {
-          if (Subtarget.isTarget64BitLP64())
-            return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
-          return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
-        }
-        return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
-      case TLSModel::LocalDynamic:
-        return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
-                                           Subtarget.isTarget64BitLP64());
-      case TLSModel::InitialExec:
-      case TLSModel::LocalExec:
-        return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
-                                   PositionIndependent);
+    case TLSModel::GeneralDynamic:
+      if (Subtarget.is64Bit()) {
+        if (Subtarget.isTarget64BitLP64())
+          return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
+        return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
+      }
+      return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
+    case TLSModel::LocalDynamic:
+      return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
+                                         Subtarget.isTarget64BitLP64());
+    case TLSModel::InitialExec:
+    case TLSModel::LocalExec:
+      return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
+                                 PositionIndependent);
     }
     llvm_unreachable("Unknown TLS model.");
   }
@@ -19684,9 +19680,8 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
       WrapperKind = X86ISD::WrapperRIP;
     }
     SDLoc DL(Op);
-    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
-                                                GA->getValueType(0),
-                                                GA->getOffset(), OpFlag);
+    SDValue Result = DAG.getTargetGlobalAddress(
+        GA->getGlobal(), DL, GA->getValueType(0), GA->getOffset(), OpFlag);
     SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
 
     // With PIC32, the address is actually $g + Offset.
@@ -19700,7 +19695,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
     SDValue Chain = DAG.getEntryNode();
     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
     Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
-    SDValue Args[] = { Chain, Offset };
+    SDValue Args[] = {Chain, Offset};
     Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
     Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
 
@@ -19768,9 +19763,9 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
     res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
 
     // Get the offset of start of .tls section
-    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
-                                             GA->getValueType(0),
-                                             GA->getOffset(), X86II::MO_SECREL);
+    SDValue TGA =
+        DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
+                                   GA->getOffset(), X86II::MO_SECREL);
     SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
 
     // The address of the thread local variable is the add of the thread
@@ -19830,8 +19825,8 @@ static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl,
   MVT SrcVT = Src.getSimpleValueType();
   MVT VT = Op.getSimpleValueType();
 
-   if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
-       (VT != MVT::f32 && VT != MVT::f64))
+  if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
+      (VT != MVT::f32 && VT != MVT::f64))
     return SDValue();
 
   // Pack the i64 into a vector, do the operation and extract.
@@ -19896,22 +19891,22 @@ static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
 static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
                           const X86Subtarget &Subtarget) {
   switch (Opcode) {
-    case ISD::SINT_TO_FP:
-      // TODO: Handle wider types with AVX/AVX512.
-      if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
-        return false;
-      // CVTDQ2PS or (V)CVTDQ2PD
-      return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
-
-    case ISD::UINT_TO_FP:
-      // TODO: Handle wider types and i64 elements.
-      if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
-        return false;
-      // VCVTUDQ2PS or VCVTUDQ2PD
-      return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
+  case ISD::SINT_TO_FP:
+    // TODO: Handle wider types with AVX/AVX512.
+    if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
+      return false;
+    // CVTDQ2PS or (V)CVTDQ2PD
+    return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
 
-    default:
+  case ISD::UINT_TO_FP:
+    // TODO: Handle wider types and i64 elements.
+    if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
       return false;
+    // VCVTUDQ2PS or VCVTUDQ2PD
+    return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
+
+  default:
+    return false;
   }
 }
 
@@ -20055,7 +20050,7 @@ static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL,
     return SDValue();
 
   SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
-  SDValue One  = DAG.getConstant(1, DL, MVT::v4i64);
+  SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
   SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
                              DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
                              DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
@@ -20275,7 +20270,7 @@ std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
     Chain = Result.getValue(1);
   }
 
-  return { Result, Chain };
+  return {Result, Chain};
 }
 
 /// Horizontal vector math instructions may be slower than normal math with
@@ -20312,18 +20307,18 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl,
   LLVMContext *Context = DAG.getContext();
 
   // Build some magic constants.
-  static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
+  static const uint32_t CV0[] = {0x43300000, 0x45300000, 0, 0};
   Constant *C0 = ConstantDataVector::get(*Context, CV0);
   auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
   SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
 
-  SmallVector<Constant*,2> CV1;
+  SmallVector<Constant *, 2> CV1;
   CV1.push_back(
-    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
-                                      APInt(64, 0x4330000000000000ULL))));
+      ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
+                                        APInt(64, 0x4330000000000000ULL))));
   CV1.push_back(
-    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
-                                      APInt(64, 0x4530000000000000ULL))));
+      ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
+                                        APInt(64, 0x4530000000000000ULL))));
   Constant *C1 = ConstantVector::get(CV1);
   SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
 
@@ -20344,11 +20339,10 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl,
   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
   SDValue Result;
 
-  if (Subtarget.hasSSE3() &&
-      shouldUseHorizontalOp(true, DAG, Subtarget)) {
+  if (Subtarget.hasSSE3() && shouldUseHorizontalOp(true, DAG, Subtarget)) {
     Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
   } else {
-    SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
+    SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1, -1});
     Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
   }
   Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
@@ -20374,8 +20368,7 @@ static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl,
 
   // Or the load with the bias.
   SDValue Or = DAG.getNode(
-      ISD::OR, dl, MVT::v2i64,
-      DAG.getBitcast(MVT::v2i64, Load),
+      ISD::OR, dl, MVT::v2i64, DAG.getBitcast(MVT::v2i64, Load),
       DAG.getBitcast(MVT::v2i64,
                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
   Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
@@ -20579,8 +20572,9 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
     SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
     // Low will be bitcasted right away, so do not bother bitcasting back to its
     // original type.
-    Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
-                      VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
+    Low =
+        DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast, VecCstLowBitcast,
+                    DAG.getTargetConstant(0xaa, DL, MVT::i8));
     //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
     //                                 (uint4) 0x53000000, 0xaa);
     SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
@@ -20588,7 +20582,8 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
     // High will be bitcasted right away, so do not bother bitcasting back to
     // its original type.
     High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
-                       VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
+                       VecCstHighBitcast,
+                       DAG.getTargetConstant(0xaa, DL, MVT::i8));
   } else {
     SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
     //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
@@ -20624,7 +20619,8 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
   return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
 }
 
-static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
+static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl,
+                                   SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
   unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
   SDValue N0 = Op.getOperand(OpNo);
@@ -20835,8 +20831,7 @@ SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
     DstTy = MVT::i64;
   }
 
-  assert(DstTy.getSimpleVT() <= MVT::i64 &&
-         DstTy.getSimpleVT() >= MVT::i16 &&
+  assert(DstTy.getSimpleVT() <= MVT::i64 && DstTy.getSimpleVT() >= MVT::i16 &&
          "Unknown FP_TO_INT to lower!");
 
   // We lower FP->int64 into FISTP64 followed by a load from a temporary
@@ -20874,8 +20869,8 @@ SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
     bool LosesInfo = false;
     if (TheVT == MVT::f64)
       // The rounding mode is irrelevant as the conversion should be exact.
-      Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
-                              &LosesInfo);
+      Status = Thresh.convert(APFloat::IEEEdouble(),
+                              APFloat::rmNearestTiesToEven, &LosesInfo);
     else if (TheVT == MVT::f80)
       Status = Thresh.convert(APFloat::x87DoubleExtended(),
                               APFloat::rmNearestTiesToEven, &LosesInfo);
@@ -20885,8 +20880,8 @@ SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
 
     SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
 
-    EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
-                                   *DAG.getContext(), TheVT);
+    EVT ResVT =
+        getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), TheVT);
     SDValue Cmp;
     if (IsStrict) {
       Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
@@ -20915,8 +20910,8 @@ SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
                                    DAG.getConstantFP(0.0, DL, TheVT));
 
     if (IsStrict) {
-      Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
-                          { Chain, Value, FltOfs });
+      Value = DAG.getNode(ISD::STRICT_FSUB, DL, {TheVT, MVT::Other},
+                          {Chain, Value, FltOfs});
       Chain = Value.getValue(1);
     } else
       Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
@@ -20930,7 +20925,7 @@ SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
     assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
     Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
     SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
-    SDValue Ops[] = { Chain, StackSlot };
+    SDValue Ops[] = {Chain, StackSlot};
 
     unsigned FLDSize = TheVT.getStoreSize();
     assert(FLDSize <= MemSize && "Stack slot not big enough");
@@ -20943,10 +20938,9 @@ SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
   // Build the FP_TO_INT*_IN_MEM
   MachineMemOperand *MMO = MF.getMachineMemOperand(
       MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
-  SDValue Ops[] = { Chain, Value, StackSlot };
-  SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
-                                         DAG.getVTList(MVT::Other),
-                                         Ops, DstTy, MMO);
+  SDValue Ops[] = {Chain, Value, StackSlot};
+  SDValue FIST = DAG.getMemIntrinsicNode(
+      X86ISD::FP_TO_INT_IN_MEM, DL, DAG.getVTList(MVT::Other), Ops, DstTy, MMO);
 
   SDValue Res = DAG.getLoad(Op.getValueType(), DL, FIST, StackSlot, MPI);
   Chain = Res.getValue(1);
@@ -21125,7 +21119,7 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
     return In;
 
   unsigned NumElems = SrcVT.getVectorNumElements();
-  if (NumElems < 2 || !isPowerOf2_32(NumElems) )
+  if (NumElems < 2 || !isPowerOf2_32(NumElems))
     return SDValue();
 
   unsigned DstSizeInBits = DstVT.getSizeInBits();
@@ -21196,7 +21190,7 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
     // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
     SmallVector<int, 64> Mask;
     int Scale = 64 / OutVT.getScalarSizeInBits();
-    narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
+    narrowShuffleMaskElts(Scale, {0, 2, 1, 3}, Mask);
     Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
 
     if (DstVT.is256BitVector())
@@ -21440,14 +21434,12 @@ static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL,
       if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
         // We need to shift to get the lsb into sign position.
         // Shift packed bytes not supported natively, bitcast to word
-        MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
-        In = DAG.getNode(ISD::SHL, DL, ExtVT,
-                         DAG.getBitcast(ExtVT, In),
+        MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits() / 16);
+        In = DAG.getNode(ISD::SHL, DL, ExtVT, DAG.getBitcast(ExtVT, In),
                          DAG.getConstant(ShiftInx, DL, ExtVT));
         In = DAG.getBitcast(InVT, In);
       }
-      return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
-                          In, ISD::SETGT);
+      return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
     }
     // Use TESTD/Q, extended vector to packed dword/qword.
     assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
@@ -21485,7 +21477,8 @@ static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL,
     // We either have 8 elements or we're allowed to use 512-bit vectors.
     // If we have VLX, we want to use the narrowest vector that can get the
     // job done so we use vXi32.
-    MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
+    MVT EltVT =
+        Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512 / NumElts);
     MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
     In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
     InVT = ExtVT;
@@ -21599,10 +21592,9 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
     // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
     if (Subtarget.hasInt256()) {
       // The PSHUFB mask:
-      static const int ShufMask1[] = { 0,  1,  4,  5,  8,  9, 12, 13,
-                                      -1, -1, -1, -1, -1, -1, -1, -1,
-                                      16, 17, 20, 21, 24, 25, 28, 29,
-                                      -1, -1, -1, -1, -1, -1, -1, -1 };
+      static const int ShufMask1[] = {
+          0,  1,  4,  5,  8,  9,  12, 13, -1, -1, -1, -1, -1, -1, -1, -1,
+          16, 17, 20, 21, 24, 25, 28, 29, -1, -1, -1, -1, -1, -1, -1, -1};
       In = DAG.getBitcast(MVT::v32i8, In);
       In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
       In = DAG.getBitcast(MVT::v4i64, In);
@@ -21793,8 +21785,8 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
                           dl, {NVT, MVT::Other}, {Chain, Src});
         Chain = Res.getValue(1);
       } else {
-        Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
-                          NVT, Src);
+        Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl, NVT,
+                          Src);
       }
 
       // TODO: Need to add exception check code for strict FP.
@@ -21896,8 +21888,8 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
       SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
                                 DAG.getUNDEF(MVT::v2f32));
       if (IsStrict) {
-        unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
-                                : X86ISD::STRICT_CVTTP2UI;
+        unsigned Opc =
+            IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
         return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
       }
       unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
@@ -22022,7 +22014,7 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
         makeLibCall(DAG, LC, VT, Src, CallOptions, dl, Chain);
 
     if (IsStrict)
-      return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
+      return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
 
     return Tmp.first;
   }
@@ -22085,7 +22077,7 @@ SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
     assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
     Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
     SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
-    SDValue Ops[] = { Chain, StackPtr };
+    SDValue Ops[] = {Chain, StackPtr};
 
     Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
                                   /*Align*/ std::nullopt,
@@ -22093,7 +22085,7 @@ SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
     Chain = Src.getValue(1);
   }
 
-  SDValue StoreOps[] = { Chain, Src, StackPtr };
+  SDValue StoreOps[] = {Chain, Src, StackPtr};
   Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
                                   StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
                                   MachineMemOperand::MOStore);
@@ -22101,8 +22093,8 @@ SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
   return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
 }
 
-SDValue
-X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
+                                              SelectionDAG &DAG) const {
   // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
   // but making use of X86 specifics to produce better instruction sequences.
   SDNode *Node = Op.getNode();
@@ -22164,12 +22156,12 @@ X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
   APFloat MinFloat(Sem);
   APFloat MaxFloat(Sem);
 
-  APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
-    MinInt, IsSigned, APFloat::rmTowardZero);
-  APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
-    MaxInt, IsSigned, APFloat::rmTowardZero);
-  bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
-                          && !(MaxStatus & APFloat::opStatus::opInexact);
+  APFloat::opStatus MinStatus =
+      MinFloat.convertFromAPInt(MinInt, IsSigned, APFloat::rmTowardZero);
+  APFloat::opStatus MaxStatus =
+      MaxFloat.convertFromAPInt(MaxInt, IsSigned, APFloat::rmTowardZero);
+  bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact) &&
+                             !(MaxStatus & APFloat::opStatus::opInexact);
 
   SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
   SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
@@ -22179,11 +22171,11 @@ X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
   if (AreExactFloatBounds) {
     if (DstVT != TmpVT) {
       // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
-      SDValue MinClamped = DAG.getNode(
-        X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
+      SDValue MinClamped =
+          DAG.getNode(X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
       // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
-      SDValue BothClamped = DAG.getNode(
-        X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
+      SDValue BothClamped =
+          DAG.getNode(X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
       // Convert clamped value to integer.
       SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
 
@@ -22193,11 +22185,11 @@ X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
     }
 
     // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
-    SDValue MinClamped = DAG.getNode(
-      X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
+    SDValue MinClamped =
+        DAG.getNode(X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
     // Clamp by MaxFloat from above. NaN cannot occur.
-    SDValue BothClamped = DAG.getNode(
-      X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
+    SDValue BothClamped =
+        DAG.getNode(X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
     // Convert clamped value to integer.
     SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
 
@@ -22209,8 +22201,8 @@ X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
 
     // Otherwise, select zero if Src is NaN.
     SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
-    return DAG.getSelectCC(
-      dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
+    return DAG.getSelectCC(dl, Src, Src, ZeroInt, FpToInt,
+                           ISD::CondCode::SETUO);
   }
 
   SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
@@ -22232,13 +22224,13 @@ X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
   if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
     // If Src ULT MinFloat, select MinInt. In particular, this also selects
     // MinInt if Src is NaN.
-    Select = DAG.getSelectCC(
-      dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
+    Select = DAG.getSelectCC(dl, Src, MinFloatNode, MinIntNode, Select,
+                             ISD::CondCode::SETULT);
   }
 
   // If Src OGT MaxFloat, select MaxInt.
-  Select = DAG.getSelectCC(
-    dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
+  Select = DAG.getSelectCC(dl, Src, MaxFloatNode, MaxIntNode, Select,
+                           ISD::CondCode::SETOGT);
 
   // In the unsigned case we are done, because we mapped NaN to MinInt, which
   // is already zero. The promoted case was already handled above.
@@ -22248,8 +22240,7 @@ X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
 
   // Otherwise, select 0 if Src is NaN.
   SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
-  return DAG.getSelectCC(
-    dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
+  return DAG.getSelectCC(dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
 }
 
 SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
@@ -22304,15 +22295,15 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
       Entry.IsZExt = true;
       Args.push_back(Entry);
 
-      SDValue Callee = DAG.getExternalSymbol(
-          getLibcallName(RTLIB::FPEXT_F16_F32),
-          getPointerTy(DAG.getDataLayout()));
+      SDValue Callee =
+          DAG.getExternalSymbol(getLibcallName(RTLIB::FPEXT_F16_F32),
+                                getPointerTy(DAG.getDataLayout()));
       CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
           CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
           std::move(Args));
 
       SDValue Res;
-      std::tie(Res,Chain) = LowerCallTo(CLI);
+      std::tie(Res, Chain) = LowerCallTo(CLI);
       if (IsStrict)
         Res = DAG.getMergeValues({Res, Chain}, DL);
 
@@ -22579,14 +22570,14 @@ static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL,
   // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
   unsigned HOpcode;
   switch (Op.getOpcode()) {
-  // clang-format off
+    // clang-format off
   case ISD::ADD: HOpcode = X86ISD::HADD; break;
   case ISD::SUB: HOpcode = X86ISD::HSUB; break;
   case ISD::FADD: HOpcode = X86ISD::FHADD; break;
   case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
   default:
     llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
-  // clang-format on
+    // clang-format on
   }
   unsigned LExtIndex = LHS.getConstantOperandVal(1);
   unsigned RExtIndex = RHS.getConstantOperandVal(1);
@@ -22644,7 +22635,7 @@ static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
   bool Ignored;
   APFloat Point5Pred = APFloat(0.5f);
   Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
-  Point5Pred.next(/*nextDown*/true);
+  Point5Pred.next(/*nextDown*/ true);
 
   SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
                               DAG.getConstantFP(Point5Pred, dl, VT), N0);
@@ -22694,16 +22685,16 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
 
   unsigned EltBits = VT.getScalarSizeInBits();
   // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
-  APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
-                           APInt::getSignMask(EltBits);
+  APInt MaskElt =
+      IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
   const fltSemantics &Sem = VT.getFltSemantics();
   SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
 
   SDValue Op0 = Op.getOperand(0);
   bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
-  unsigned LogicOp = IsFABS  ? X86ISD::FAND :
-                     IsFNABS ? X86ISD::FOR  :
-                               X86ISD::FXOR;
+  unsigned LogicOp = IsFABS    ? X86ISD::FAND
+                     : IsFNABS ? X86ISD::FOR
+                               : X86ISD::FXOR;
   SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
 
   if (VT.isVector() || IsF128)
@@ -22806,7 +22797,8 @@ static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
 }
 
 /// Helper for attempting to create a X86ISD::BT node.
-static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
+static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL,
+                     SelectionDAG &DAG) {
   // If Src is i8, promote it to i32 with any_extend.  There is no i8 BT
   // instruction.  Since the shift amount is in-range-or-undefined, we know
   // that doing a bittest on the i32 value is ok.  We extend to i32 because
@@ -23422,8 +23414,7 @@ static bool hasNonFlagsUse(SDValue Op) {
 // the node alone and emit a 'cmp' or 'test' instruction.
 static bool isProfitableToUseFlagOp(SDValue Op) {
   for (SDNode *U : Op->users())
-    if (U->getOpcode() != ISD::CopyToReg &&
-        U->getOpcode() != ISD::SETCC &&
+    if (U->getOpcode() != ISD::CopyToReg && U->getOpcode() != ISD::SETCC &&
         U->getOpcode() != ISD::STORE)
       return false;
 
@@ -23439,14 +23430,20 @@ static SDValue EmitTest(SDValue Op, X86::CondCode X86CC, const SDLoc &dl,
   bool NeedCF = false;
   bool NeedOF = false;
   switch (X86CC) {
-  default: break;
-  case X86::COND_A: case X86::COND_AE:
-  case X86::COND_B: case X86::COND_BE:
+  default:
+    break;
+  case X86::COND_A:
+  case X86::COND_AE:
+  case X86::COND_B:
+  case X86::COND_BE:
     NeedCF = true;
     break;
-  case X86::COND_G: case X86::COND_GE:
-  case X86::COND_L: case X86::COND_LE:
-  case X86::COND_O: case X86::COND_NO: {
+  case X86::COND_G:
+  case X86::COND_GE:
+  case X86::COND_L:
+  case X86::COND_LE:
+  case X86::COND_O:
+  case X86::COND_NO: {
     // Check if we really need to set the
     // Overflow flag. If NoSignedWrap is present
     // that is not actually needed.
@@ -23498,14 +23495,14 @@ static SDValue EmitTest(SDValue Op, X86::CondCode X86CC, const SDLoc &dl,
 
     // Otherwise use a regular EFLAGS-setting instruction.
     switch (ArithOp.getOpcode()) {
-    // clang-format off
+      // clang-format off
     default: llvm_unreachable("unexpected operator!");
     case ISD::ADD: Opcode = X86ISD::ADD; break;
     case ISD::SUB: Opcode = X86ISD::SUB; break;
     case ISD::XOR: Opcode = X86ISD::XOR; break;
     case ISD::AND: Opcode = X86ISD::AND; break;
     case ISD::OR:  Opcode = X86ISD::OR;  break;
-    // clang-format on
+      // clang-format on
     }
 
     NumOperands = 2;
@@ -23520,8 +23517,9 @@ static SDValue EmitTest(SDValue Op, X86::CondCode X86CC, const SDLoc &dl,
   case ISD::USUBO: {
     // /USUBO/SSUBO will become a X86ISD::SUB and we can use its Z flag.
     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
-    return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
-                       Op->getOperand(1)).getValue(1);
+    return DAG
+        .getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0), Op->getOperand(1))
+        .getValue(1);
   }
   default:
     break;
@@ -23550,8 +23548,9 @@ static SDValue EmitCmp(SDValue Op0, SDValue Op1, X86::CondCode X86CC,
 
   EVT CmpVT = Op0.getValueType();
 
-  assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
-          CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
+  assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 || CmpVT == MVT::i32 ||
+          CmpVT == MVT::i64) &&
+         "Unexpected VT!");
 
   // Only promote the compare up to I32 if it is a 16 bit operation
   // with an immediate. 16 bit immediates are to be avoided unless the target
@@ -23678,9 +23677,8 @@ bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
 
 /// The minimum architected relative accuracy is 2^-12. We need one
 /// Newton-Raphson step to have a good float result (24 bits of precision).
-SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
-                                           SelectionDAG &DAG, int Enabled,
-                                           int &RefinementSteps,
+SDValue X86TargetLowering::getSqrtEstimate(SDValue Op, SelectionDAG &DAG,
+                                           int Enabled, int &RefinementSteps,
                                            bool &UseOneConstNR,
                                            bool Reciprocal) const {
   SDLoc DL(Op);
@@ -23787,9 +23785,7 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
 /// This is because we still need one division to calculate the reciprocal and
 /// then we need two multiplies by that reciprocal as replacements for the
 /// original divisions.
-unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
-  return 2;
-}
+unsigned X86TargetLowering::combineRepeatedFPDivisors() const { return 2; }
 
 SDValue
 X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
@@ -23797,7 +23793,7 @@ X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
                                  SmallVectorImpl<SDNode *> &Created) const {
   AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
   if (isIntDivCheap(N->getValueType(0), Attr))
-    return SDValue(N,0); // Lower SDIV as SDIV
+    return SDValue(N, 0); // Lower SDIV as SDIV
 
   assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
          "Unexpected divisor!");
@@ -23866,8 +23862,8 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
       if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
           isPowerOf2_64(AndRHSVal)) {
         Src = AndLHS;
-        BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
-                                Src.getValueType());
+        BitNo =
+            DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, Src.getValueType());
       }
     }
   }
@@ -23913,7 +23909,7 @@ static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
   //  6 - NLE
   //  7 - ORD
   switch (SetCCOpcode) {
-  // clang-format off
+    // clang-format off
   default: llvm_unreachable("Unexpected SETCC condition");
   case ISD::SETOEQ:
   case ISD::SETEQ:  SSECC = 0; break;
@@ -23935,7 +23931,7 @@ static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
   case ISD::SETO:   SSECC = 7; break;
   case ISD::SETUEQ: SSECC = 8; break;
   case ISD::SETONE: SSECC = 12; break;
-  // clang-format on
+    // clang-format on
   }
   if (Swap)
     std::swap(Op0, Op1);
@@ -24220,13 +24216,13 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
           Cmp1 = DAG.getNode(
               Opc, dl, {VT, MVT::Other},
               {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
-          Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
-                              Cmp1.getValue(1));
+          Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                              Cmp0.getValue(1), Cmp1.getValue(1));
         } else {
-          Cmp0 = DAG.getNode(
-              Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
-          Cmp1 = DAG.getNode(
-              Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
+          Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
+                             DAG.getTargetConstant(CC0, dl, MVT::i8));
+          Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
+                             DAG.getTargetConstant(CC1, dl, MVT::i8));
         }
         Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
       } else {
@@ -24236,8 +24232,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
               {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
           Chain = Cmp.getValue(1);
         } else
-          Cmp = DAG.getNode(
-              Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
+          Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
+                            DAG.getTargetConstant(SSECC, dl, MVT::i8));
       }
     } else {
       // Handle all other FP comparisons here.
@@ -24249,8 +24245,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
             {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
         Chain = Cmp.getValue(1);
       } else
-        Cmp = DAG.getNode(
-            Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
+        Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
+                          DAG.getTargetConstant(SSECC, dl, MVT::i8));
     }
 
     if (VT.getFixedSizeInBits() >
@@ -24301,7 +24297,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
     // Translate compare code to XOP PCOM compare mode.
     unsigned CmpMode = 0;
     switch (Cond) {
-    // clang-format off
+      // clang-format off
     default: llvm_unreachable("Unexpected SETCC condition");
     case ISD::SETULT:
     case ISD::SETLT: CmpMode = 0x00; break;
@@ -24313,7 +24309,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
     case ISD::SETGE: CmpMode = 0x03; break;
     case ISD::SETEQ: CmpMode = 0x04; break;
     case ISD::SETNE: CmpMode = 0x05; break;
-    // clang-format on
+      // clang-format on
     }
 
     // Are we comparing unsigned or signed integers?
@@ -24411,13 +24407,13 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
     bool Invert = false;
     unsigned Opc;
     switch (Cond) {
-    // clang-format off
+      // clang-format off
     default: llvm_unreachable("Unexpected condition code");
     case ISD::SETUGT: Invert = true; [[fallthrough]];
     case ISD::SETULE: Opc = ISD::UMIN; break;
     case ISD::SETULT: Invert = true; [[fallthrough]];
     case ISD::SETUGE: Opc = ISD::UMAX; break;
-    // clang-format on
+      // clang-format on
     }
 
     SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
@@ -24441,10 +24437,10 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
   // operations may be required for some comparisons.
   unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
                                                             : X86ISD::PCMPGT;
-  bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
-              Cond == ISD::SETGE || Cond == ISD::SETUGE;
-  bool Invert = Cond == ISD::SETNE ||
-                (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
+  bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT || Cond == ISD::SETGE ||
+              Cond == ISD::SETUGE;
+  bool Invert =
+      Cond == ISD::SETNE || (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
 
   if (Swap)
     std::swap(Op0, Op1);
@@ -24462,7 +24458,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
         Op1 = DAG.getBitcast(MVT::v4i32, Op1);
 
         SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
-        static const int MaskHi[] = { 1, 1, 3, 3 };
+        static const int MaskHi[] = {1, 1, 3, 3};
         SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
 
         return DAG.getBitcast(VT, Result);
@@ -24473,7 +24469,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
         Op1 = DAG.getAllOnesConstant(dl, MVT::v4i32);
 
         SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
-        static const int MaskHi[] = { 1, 1, 3, 3 };
+        static const int MaskHi[] = {1, 1, 3, 3};
         SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
 
         return DAG.getBitcast(VT, Result);
@@ -24512,8 +24508,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
       SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
 
       // Create masks for only the low parts/high parts of the 64 bit integers.
-      static const int MaskHi[] = { 1, 1, 3, 3 };
-      static const int MaskLo[] = { 0, 0, 2, 2 };
+      static const int MaskHi[] = {1, 1, 3, 3};
+      static const int MaskLo[] = {0, 0, 2, 2};
       SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
       SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
       SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
@@ -24540,7 +24536,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
       SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
 
       // Make sure the lower and upper halves are both all-ones.
-      static const int Mask[] = { 1, 0, 3, 2 };
+      static const int Mask[] = {1, 0, 3, 2};
       SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
       Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
 
@@ -24555,8 +24551,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
   // bits of the inputs before performing those operations.
   if (FlipSigns) {
     MVT EltVT = VT.getVectorElementType();
-    SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
-                                 VT);
+    SDValue SM =
+        DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl, VT);
     Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
     Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
   }
@@ -24573,8 +24569,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
 // Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
 static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
                               const SDLoc &dl, SelectionDAG &DAG,
-                              const X86Subtarget &Subtarget,
-                              SDValue &X86CC) {
+                              const X86Subtarget &Subtarget, SDValue &X86CC) {
   assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
 
   // Must be a bitcast from vXi1.
@@ -24721,7 +24716,8 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
                   Op.getOpcode() == ISD::STRICT_FSETCCS;
   MVT VT = Op->getSimpleValueType(0);
 
-  if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
+  if (VT.isVector())
+    return LowerVSETCC(Op, Subtarget, DAG);
 
   assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
   SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
@@ -24816,7 +24812,8 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
   return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
 }
 
-SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op,
+                                           SelectionDAG &DAG) const {
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
   SDValue Carry = Op.getOperand(2);
@@ -24828,8 +24825,8 @@ SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const
 
   // Recreate the carry if needed.
   EVT CarryVT = Carry.getValueType();
-  Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
-                      Carry, DAG.getAllOnesConstant(DL, CarryVT));
+  Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), Carry,
+                      DAG.getAllOnesConstant(DL, CarryVT));
 
   SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
   SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
@@ -24849,7 +24846,8 @@ getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
   unsigned BaseOp = 0;
   SDLoc DL(Op);
   switch (Op.getOpcode()) {
-  default: llvm_unreachable("Unknown ovf instruction!");
+  default:
+    llvm_unreachable("Unknown ovf instruction!");
   case ISD::SADDO:
     BaseOp = X86ISD::ADD;
     Cond = X86::COND_O;
@@ -24923,7 +24921,8 @@ static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
   SDValue VOp0 = V.getOperand(0);
   unsigned InBits = VOp0.getValueSizeInBits();
   unsigned Bits = V.getValueSizeInBits();
-  return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
+  return DAG.MaskedValueIsZero(VOp0,
+                               APInt::getHighBitsSet(InBits, InBits - Bits));
 }
 
 // Lower various (select (icmp CmpVal, 0), LHS, RHS) custom patterns.
@@ -25061,7 +25060,7 @@ static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS,
 
 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   bool AddTest = true;
-  SDValue Cond  = Op.getOperand(0);
+  SDValue Cond = Op.getOperand(0);
   SDValue Op1 = Op.getOperand(1);
   SDValue Op2 = Op.getOperand(2);
   SDLoc DL(Op);
@@ -25212,14 +25211,13 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   // If condition flag is set by a X86ISD::CMP, then use it as the condition
   // setting operand in place of the X86ISD::SETCC.
   unsigned CondOpcode = Cond.getOpcode();
-  if (CondOpcode == X86ISD::SETCC ||
-      CondOpcode == X86ISD::SETCC_CARRY) {
+  if (CondOpcode == X86ISD::SETCC || CondOpcode == X86ISD::SETCC_CARRY) {
     CC = Cond.getOperand(0);
 
     SDValue Cmp = Cond.getOperand(1);
     bool IllegalFPCMov = false;
-    if (VT.isFloatingPoint() && !VT.isVector() &&
-        !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV())  // FPStack?
+    if (VT.isFloatingPoint() && !VT.isVector() && !isScalarFPTypeInSSEReg(VT) &&
+        Subtarget.canUseCMOV()) // FPStack?
       IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
 
     if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
@@ -25282,14 +25280,15 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
   // widen the cmov and push the truncate through. This avoids introducing a new
   // branch during isel and doesn't add any extensions.
-  if (Op.getValueType() == MVT::i8 &&
-      Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
+  if (Op.getValueType() == MVT::i8 && Op1.getOpcode() == ISD::TRUNCATE &&
+      Op2.getOpcode() == ISD::TRUNCATE) {
     SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
     if (T1.getValueType() == T2.getValueType() &&
         // Exclude CopyFromReg to avoid partial register stalls.
-        T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
-      SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
-                                 CC, Cond);
+        T1.getOpcode() != ISD::CopyFromReg &&
+        T2.getOpcode() != ISD::CopyFromReg) {
+      SDValue Cmov =
+          DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1, CC, Cond);
       return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
     }
   }
@@ -25305,14 +25304,14 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
        !X86::mayFoldLoad(Op2, Subtarget))) {
     Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
     Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
-    SDValue Ops[] = { Op2, Op1, CC, Cond };
+    SDValue Ops[] = {Op2, Op1, CC, Cond};
     SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
     return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
   }
 
   // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
   // condition is true.
-  SDValue Ops[] = { Op2, Op1, CC, Cond };
+  SDValue Ops[] = {Op2, Op1, CC, Cond};
   return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops, Op->getFlags());
 }
 
@@ -25372,7 +25371,7 @@ static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl,
 }
 
 SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const {
-  SDValue Cond = Op.getOperand(0); // condition
+  SDValue Cond = Op.getOperand(0);    // condition
   SDValue TrueOp = Op.getOperand(1);  // true_value
   SDValue FalseOp = Op.getOperand(2); // false_value
   SDLoc DL(Op);
@@ -25533,6 +25532,69 @@ SDValue X86TargetLowering::LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const {
     return DAG.getBitcast(VT, CtSelect);
   }
 
+  // Handle f80 types by splitting into three 32-bit chunks
+  if (VT == MVT::f80) {
+    SDValue Chain = DAG.getEntryNode();
+
+    // Create temporary stack slots for input f80 values
+    SDValue TrueSlot = DAG.CreateStackTemporary(MVT::f80);
+    SDValue FalseSlot = DAG.CreateStackTemporary(MVT::f80);
+
+    // Store f80 values to memory
+    SDValue StoreTrueF80 =
+        DAG.getStore(Chain, DL, TrueOp, TrueSlot, MachinePointerInfo());
+    SDValue StoreFalseF80 =
+        DAG.getStore(Chain, DL, FalseOp, FalseSlot, MachinePointerInfo());
+
+    // Load i32 parts from memory (3 chunks for 96-bit f80 storage)
+    SDValue TruePart0 =
+        DAG.getLoad(MVT::i32, DL, StoreTrueF80, TrueSlot, MachinePointerInfo());
+    SDValue TruePart1Ptr =
+        DAG.getMemBasePlusOffset(TrueSlot, TypeSize::getFixed(4), DL);
+    SDValue TruePart1 = DAG.getLoad(MVT::i32, DL, StoreTrueF80, TruePart1Ptr,
+                                    MachinePointerInfo());
+    SDValue TruePart2Ptr =
+        DAG.getMemBasePlusOffset(TrueSlot, TypeSize::getFixed(8), DL);
+    SDValue TruePart2 = DAG.getLoad(MVT::i32, DL, StoreTrueF80, TruePart2Ptr,
+                                    MachinePointerInfo());
+
+    SDValue FalsePart0 = DAG.getLoad(MVT::i32, DL, StoreFalseF80, FalseSlot,
+                                     MachinePointerInfo());
+    SDValue FalsePart1Ptr =
+        DAG.getMemBasePlusOffset(FalseSlot, TypeSize::getFixed(4), DL);
+    SDValue FalsePart1 = DAG.getLoad(MVT::i32, DL, StoreFalseF80, FalsePart1Ptr,
+                                     MachinePointerInfo());
+    SDValue FalsePart2Ptr =
+        DAG.getMemBasePlusOffset(FalseSlot, TypeSize::getFixed(8), DL);
+    SDValue FalsePart2 = DAG.getLoad(MVT::i32, DL, StoreFalseF80, FalsePart2Ptr,
+                                     MachinePointerInfo());
+
+    // Perform CTSELECT on each 32-bit chunk
+    SDValue Part0Ops[] = {FalsePart0, TruePart0, CC, ProcessedCond};
+    SDValue Part0Select = DAG.getNode(X86ISD::CTSELECT, DL, MVT::i32, Part0Ops);
+    SDValue Part1Ops[] = {FalsePart1, TruePart1, CC, ProcessedCond};
+    SDValue Part1Select = DAG.getNode(X86ISD::CTSELECT, DL, MVT::i32, Part1Ops);
+    SDValue Part2Ops[] = {FalsePart2, TruePart2, CC, ProcessedCond};
+    SDValue Part2Select = DAG.getNode(X86ISD::CTSELECT, DL, MVT::i32, Part2Ops);
+
+    // Create result stack slot and store the selected parts
+    SDValue ResultSlot = DAG.CreateStackTemporary(MVT::f80);
+    SDValue StorePart0 =
+        DAG.getStore(Chain, DL, Part0Select, ResultSlot, MachinePointerInfo());
+    SDValue ResPart1Ptr =
+        DAG.getMemBasePlusOffset(ResultSlot, TypeSize::getFixed(4), DL);
+    SDValue StorePart1 = DAG.getStore(StorePart0, DL, Part1Select, ResPart1Ptr,
+                                      MachinePointerInfo());
+    SDValue ResPart2Ptr =
+        DAG.getMemBasePlusOffset(ResultSlot, TypeSize::getFixed(8), DL);
+    SDValue StorePart2 = DAG.getStore(StorePart1, DL, Part2Select, ResPart2Ptr,
+                                      MachinePointerInfo());
+
+    // Load complete f80 result from memory
+    return DAG.getLoad(MVT::f80, DL, StorePart2, ResultSlot,
+                       MachinePointerInfo());
+  }
+
   // Create final CTSELECT node
   SDValue Ops[] = {FalseOp, TrueOp, CC, ProcessedCond};
   return DAG.getNode(X86ISD::CTSELECT, DL, Op.getValueType(), Ops,
@@ -25590,9 +25652,9 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
     InVT = In.getSimpleValueType();
   }
 
-  // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
-  // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
-  // need to be handled here for 256/512-bit results.
+  // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit
+  // results, so are legal and shouldn't occur here. AVX2/AVX512 pmovsx*
+  // instructions still need to be handled here for 256/512-bit results.
   if (Subtarget.hasInt256()) {
     assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
 
@@ -25601,9 +25663,8 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
 
     // FIXME: Apparently we create inreg operations that could be regular
     // extends.
-    unsigned ExtOpc =
-        Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
-                                             : ISD::ZERO_EXTEND;
+    unsigned ExtOpc = Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
+                                                           : ISD::ZERO_EXTEND;
     return DAG.getNode(ExtOpc, dl, VT, In);
   }
 
@@ -25721,9 +25782,9 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
   SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
 
   unsigned NumElems = InVT.getVectorNumElements();
-  SmallVector<int,8> ShufMask(NumElems, -1);
-  for (unsigned i = 0; i != NumElems/2; ++i)
-    ShufMask[i] = i + NumElems/2;
+  SmallVector<int, 8> ShufMask(NumElems, -1);
+  for (unsigned i = 0; i != NumElems / 2; ++i)
+    ShufMask[i] = i + NumElems / 2;
 
   SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
   OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
@@ -25885,11 +25946,10 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
 // TODO: It is possible to support ZExt by zeroing the undef values during
 // the shuffle phase or after the shuffle.
 static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
-                                 SelectionDAG &DAG) {
+                         SelectionDAG &DAG) {
   MVT RegVT = Op.getSimpleValueType();
   assert(RegVT.isVector() && "We only custom lower vector loads.");
-  assert(RegVT.isInteger() &&
-         "We only custom lower integer vector loads.");
+  assert(RegVT.isInteger() && "We only custom lower integer vector loads.");
 
   LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
   SDLoc dl(Ld);
@@ -25932,8 +25992,8 @@ static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
 
 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
   SDValue Chain = Op.getOperand(0);
-  SDValue Cond  = Op.getOperand(1);
-  SDValue Dest  = Op.getOperand(2);
+  SDValue Cond = Op.getOperand(1);
+  SDValue Dest = Op.getOperand(2);
   SDLoc dl(Op);
 
   // Bail out when we don't have native compare instructions.
@@ -25983,7 +26043,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
         if (User->getOpcode() == ISD::BR) {
           SDValue FalseBB = User->getOperand(1);
           SDNode *NewBR =
-            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
+              DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
           assert(NewBR == User);
           (void)NewBR;
           Dest = FalseBB;
@@ -26054,9 +26114,8 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
 // that the guard pages used by the OS virtual memory manager are allocated in
 // correct sequence.
-SDValue
-X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
-                                           SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+                                                   SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
   bool SplitStack = MF.shouldSplitStack();
   bool EmitStackProbeCall = hasStackProbeSymbol(MF);
@@ -26067,7 +26126,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
   // Get the inputs.
   SDNode *Node = Op.getNode();
   SDValue Chain = Op.getOperand(0);
-  SDValue Size  = Op.getOperand(1);
+  SDValue Size = Op.getOperand(1);
   MaybeAlign Alignment(Op.getConstantOperandVal(2));
   EVT VT = Node->getValueType(0);
 
@@ -26190,8 +26249,9 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
   MemOps.push_back(Store);
 
   // Store ptr to reg_save_area.
-  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
-      Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
+  FIN = DAG.getNode(
+      ISD::ADD, DL, PtrVT, FIN,
+      DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
   SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
   Store = DAG.getStore(
       Op.getOperand(0), DL, RSFIN, FIN,
@@ -26201,8 +26261,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
 }
 
 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
-  assert(Subtarget.is64Bit() &&
-         "LowerVAARG only handles 64-bit va_arg!");
+  assert(Subtarget.is64Bit() && "LowerVAARG only handles 64-bit va_arg!");
   assert(Op.getNumOperands() == 4);
 
   MachineFunction &MF = DAG.getMachineFunction();
@@ -26226,11 +26285,11 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
   // selection mechanism works only for the basic types.
   assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
   if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
-    ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
+    ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
   } else {
     assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
            "Unhandled argument type in LowerVAARG");
-    ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
+    ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
   }
 
   if (ArgMode == 2) {
@@ -26264,7 +26323,7 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
   // where a va_list is still an i8*.
   assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
   if (Subtarget.isCallingConvWin64(
-        DAG.getMachineFunction().getFunction().getCallingConv()))
+          DAG.getMachineFunction().getFunction().getCallingConv()))
     // Probably a Win64 va_copy.
     return DAG.expandVACopy(Op.getNode());
 
@@ -26326,15 +26385,17 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
       return DAG.getConstant(0, dl, VT);
   }
 
-  assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
-         && "Unknown target vector shift-by-constant node");
+  assert(
+      (Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI) &&
+      "Unknown target vector shift-by-constant node");
 
   // Fold this packed vector shift into a build vector if SrcOp is a
   // vector of Constants or UNDEFs.
   if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
     unsigned ShiftOpc;
     switch (Opc) {
-    default: llvm_unreachable("Unknown opcode!");
+    default:
+      llvm_unreachable("Unknown opcode!");
     case X86ISD::VSHLI:
       ShiftOpc = ISD::SHL;
       break;
@@ -26474,8 +26535,8 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
     Hi = DAG.getBitcast(MVT::v32i1, Hi);
     return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
   } else {
-    MVT BitcastVT = MVT::getVectorVT(MVT::i1,
-                                     Mask.getSimpleValueType().getSizeInBits());
+    MVT BitcastVT =
+        MVT::getVectorVT(MVT::i1, Mask.getSimpleValueType().getSizeInBits());
     // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
     // are extracted by EXTRACT_SUBVECTOR.
     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
@@ -26556,9 +26617,12 @@ static int getSEHRegistrationNodeSize(const Function *Fn) {
   // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
   // WinEHStatePass for the full struct definition.
   switch (classifyEHPersonality(Fn->getPersonalityFn())) {
-  case EHPersonality::MSVC_X86SEH: return 24;
-  case EHPersonality::MSVC_CXX: return 16;
-  default: break;
+  case EHPersonality::MSVC_X86SEH:
+    return 24;
+  case EHPersonality::MSVC_CXX:
+    return 16;
+  default:
+    break;
   }
   report_fatal_error(
       "can only recover FP for 32-bit MSVC EH personality functions");
@@ -26648,13 +26712,13 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   SDLoc dl(Op);
   unsigned IntNo = Op.getConstantOperandVal(0);
   MVT VT = Op.getSimpleValueType();
-  const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
+  const IntrinsicData *IntrData = getIntrinsicWithoutChain(IntNo);
 
   // Propagate flags from original node to transformed node(s).
   SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
 
   if (IntrData) {
-    switch(IntrData->Type) {
+    switch (IntrData->Type) {
     case INTR_TYPE_1OP: {
       // We specify 2 possible opcodes for intrinsics with rounding modes.
       // First, we check if the intrinsic may have non-default rounding mode,
@@ -26780,9 +26844,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
         if (!isRoundModeCurDirection(Rnd))
           return SDValue();
       }
-      return getVectorMaskingNode(
-          DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
-          Subtarget, DAG);
+      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
+                                  Mask, PassThru, Subtarget, DAG);
     }
     case INTR_TYPE_1OP_MASK_SAE: {
       SDValue Src = Op.getOperand(1);
@@ -26823,9 +26886,9 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
           if (!isRoundModeCurDirection(Rnd))
             return SDValue();
         }
-        return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
-                                                Src2),
-                                    Mask, passThru, Subtarget, DAG);
+        return getScalarMaskingNode(
+            DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2), Mask, passThru,
+            Subtarget, DAG);
       }
 
       assert(Op.getNumOperands() == (6U + HasRounding) &&
@@ -26839,9 +26902,9 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
         else if (!isRoundModeCurDirection(Sae))
           return SDValue();
       }
-      return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
-                                              Src2, RoundingMode),
-                                  Mask, passThru, Subtarget, DAG);
+      return getScalarMaskingNode(
+          DAG.getNode(Opc, dl, VT, Src1, Src2, RoundingMode), Mask, passThru,
+          Subtarget, DAG);
     }
     case INTR_TYPE_SCALAR_MASK_RND: {
       SDValue Src1 = Op.getOperand(1);
@@ -26876,8 +26939,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
       else
         return SDValue();
 
-      return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
-                                  Mask, passThru, Subtarget, DAG);
+      return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2), Mask,
+                                  passThru, Subtarget, DAG);
     }
     case INTR_TYPE_2OP_MASK: {
       SDValue Src1 = Op.getOperand(1);
@@ -26913,8 +26976,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
           return SDValue();
       }
 
-      return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
-                                  Mask, PassThru, Subtarget, DAG);
+      return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2), Mask,
+                                  PassThru, Subtarget, DAG);
     }
     case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
       SDValue Src1 = Op.getOperand(1);
@@ -26963,12 +27026,12 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
       // Reverse the operands to match VSELECT order.
       return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
     }
-    case VPERM_2OP : {
+    case VPERM_2OP: {
       SDValue Src1 = Op.getOperand(1);
       SDValue Src2 = Op.getOperand(2);
 
       // Swap Src1 and Src2 in the node creation
-      return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
+      return DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1);
     }
     case CFMA_OP_MASKZ:
     case CFMA_OP_MASK: {
@@ -27012,8 +27075,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
       SDValue Imm = Op.getOperand(2);
       SDValue Mask = Op.getOperand(3);
       SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
-      SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
-                                                 Subtarget, DAG);
+      SDValue FPclassMask =
+          getScalarMaskingNode(FPclass, Mask, SDValue(), Subtarget, DAG);
       // Need to fill with zeros to ensure the bitcast will produce zeroes
       // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
       SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
@@ -27037,7 +27100,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
         if (!isRoundModeCurDirection(Sae))
           return SDValue();
       }
-      //default rounding mode
+      // default rounding mode
       return DAG.getNode(IntrData->Opc0, dl, MaskVT,
                          {Op.getOperand(1), Op.getOperand(2), CC, Mask});
     }
@@ -27055,12 +27118,12 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
         else if (!isRoundModeCurDirection(Sae))
           return SDValue();
       }
-      //default rounding mode
+      // default rounding mode
       if (!Cmp.getNode())
         Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
 
-      SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
-                                             Subtarget, DAG);
+      SDValue CmpMask =
+          getScalarMaskingNode(Cmp, Mask, SDValue(), Subtarget, DAG);
       // Need to fill with zeros to ensure the bitcast will produce zeroes
       // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
       SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
@@ -27228,8 +27291,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
       assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
 
       uint64_t Imm = Op.getConstantOperandVal(2);
-      SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
-                                              Op.getValueType());
+      SDValue Control =
+          DAG.getTargetConstant(Imm & 0xffff, dl, Op.getValueType());
       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
                          Op.getOperand(1), Control);
     }
@@ -27251,7 +27314,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                           Op.getOperand(3), GenCF.getValue(1));
       }
       SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
-      SDValue Results[] = { SetCC, Res };
+      SDValue Results[] = {SetCC, Res};
       return DAG.getMergeValues(Results, dl);
     }
     case CVTPD2PS_MASK:
@@ -27334,7 +27397,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   }
 
   switch (IntNo) {
-  default: return SDValue();    // Don't custom lower most intrinsics.
+  default:
+    return SDValue(); // Don't custom lower most intrinsics.
 
   // ptest and testp intrinsics. The intrinsic these come from are designed to
   // return an integer value, not just an instruction so lower it to the ptest
@@ -27368,7 +27432,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     unsigned TestOpc = X86ISD::PTEST;
     X86::CondCode X86CC;
     switch (IntNo) {
-    default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
+    default:
+      llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
     case Intrinsic::x86_avx512_ktestc_b:
     case Intrinsic::x86_avx512_ktestc_w:
     case Intrinsic::x86_avx512_ktestc_d:
@@ -27439,7 +27504,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     unsigned Opcode;
     X86::CondCode X86CC;
     switch (IntNo) {
-    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    default:
+      llvm_unreachable("Impossible intrinsic"); // Can't reach here.
     case Intrinsic::x86_sse42_pcmpistria128:
       Opcode = X86ISD::PCMPISTR;
       X86CC = X86::COND_A;
@@ -27609,7 +27675,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
 
     unsigned NewIntrinsic;
     switch (IntNo) {
-    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    default:
+      llvm_unreachable("Impossible intrinsic"); // Can't reach here.
     case Intrinsic::x86_mmx_pslli_w:
       NewIntrinsic = Intrinsic::x86_mmx_psll_w;
       break;
@@ -27686,16 +27753,16 @@ static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
 
   MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
 
-  SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
+  SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
   SDValue Res =
       DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
                               MemIntr->getMemoryVT(), MemIntr->getMemOperand());
   return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
 }
 
-static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
-                             SDValue Src, SDValue Mask, SDValue Base,
-                             SDValue Index, SDValue ScaleOp, SDValue Chain,
+static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, SDValue Src,
+                             SDValue Mask, SDValue Base, SDValue Index,
+                             SDValue ScaleOp, SDValue Chain,
                              const X86Subtarget &Subtarget) {
   MVT VT = Op.getSimpleValueType();
   SDLoc dl(Op);
@@ -27724,7 +27791,7 @@ static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
 
   MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
 
-  SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
+  SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
   SDValue Res =
       DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
                               MemIntr->getMemoryVT(), MemIntr->getMemOperand());
@@ -27732,9 +27799,9 @@ static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
 }
 
 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
-                               SDValue Src, SDValue Mask, SDValue Base,
-                               SDValue Index, SDValue ScaleOp, SDValue Chain,
-                               const X86Subtarget &Subtarget) {
+                              SDValue Src, SDValue Mask, SDValue Base,
+                              SDValue Index, SDValue ScaleOp, SDValue Chain,
+                              const X86Subtarget &Subtarget) {
   SDLoc dl(Op);
   auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
   // Scale must be constant.
@@ -27776,8 +27843,8 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                                         TLI.getPointerTy(DAG.getDataLayout()));
   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
   SDValue Segment = DAG.getRegister(0, MVT::i32);
-  MVT MaskVT =
-    MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
+  MVT MaskVT = MVT::getVectorVT(
+      MVT::i1, Index.getSimpleValueType().getVectorNumElements());
   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
   SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
   SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
@@ -27793,11 +27860,11 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
 /// expanded intrinsics implicitly defines extra registers (i.e. not just
 /// EDX:EAX).
 static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
-                                        SelectionDAG &DAG,
-                                        unsigned TargetOpcode,
-                                        unsigned SrcReg,
-                                        const X86Subtarget &Subtarget,
-                                        SmallVectorImpl<SDValue> &Results) {
+                                           SelectionDAG &DAG,
+                                           unsigned TargetOpcode,
+                                           unsigned SrcReg,
+                                           const X86Subtarget &Subtarget,
+                                           SmallVectorImpl<SDValue> &Results) {
   SDValue Chain = N->getOperand(0);
   SDValue Glue;
 
@@ -27837,7 +27904,7 @@ static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
   }
 
   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
-  SDValue Ops[] = { LO, HI };
+  SDValue Ops[] = {LO, HI};
   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
   Results.push_back(Pair);
   Results.push_back(Chain);
@@ -27854,9 +27921,9 @@ static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
   // The processor's time-stamp counter (a 64-bit MSR) is stored into the
   // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
   // and the EAX register is loaded with the low-order 32 bits.
-  SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
-                                             /* NoRegister */0, Subtarget,
-                                             Results);
+  SDValue Glue =
+      expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
+                                  /* NoRegister */ 0, Subtarget, Results);
   if (Opcode != X86::RDTSCP)
     return;
 
@@ -27914,24 +27981,24 @@ static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
 }
 
 /// Emit Truncating Store with signed or unsigned saturation.
-static SDValue
-EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val,
-                SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
-                SelectionDAG &DAG) {
+static SDValue EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL,
+                               SDValue Val, SDValue Ptr, EVT MemVT,
+                               MachineMemOperand *MMO, SelectionDAG &DAG) {
   SDVTList VTs = DAG.getVTList(MVT::Other);
   SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
-  SDValue Ops[] = { Chain, Val, Ptr, Undef };
+  SDValue Ops[] = {Chain, Val, Ptr, Undef};
   unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
   return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
 }
 
 /// Emit Masked Truncating Store with signed or unsigned saturation.
 static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain,
-                                     const SDLoc &DL,
-                      SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
-                      MachineMemOperand *MMO, SelectionDAG &DAG) {
+                                     const SDLoc &DL, SDValue Val, SDValue Ptr,
+                                     SDValue Mask, EVT MemVT,
+                                     MachineMemOperand *MMO,
+                                     SelectionDAG &DAG) {
   SDVTList VTs = DAG.getVTList(MVT::Other);
-  SDValue Ops[] = { Chain, Val, Ptr, Mask };
+  SDValue Ops[] = {Chain, Val, Ptr, Mask};
   unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
   return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
 }
@@ -27999,9 +28066,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
       SDLoc dl(Op);
       // Create a WRPKRU node, pass the input to the EAX parameter,  and pass 0
       // to the EDX and ECX parameters.
-      return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
-                         Op.getOperand(0), Op.getOperand(2),
-                         DAG.getConstant(0, dl, MVT::i32),
+      return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other, Op.getOperand(0),
+                         Op.getOperand(2), DAG.getConstant(0, dl, MVT::i32),
                          DAG.getConstant(0, dl, MVT::i32));
     }
     case llvm::Intrinsic::asan_check_memaccess: {
@@ -28032,7 +28098,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
       unsigned Opcode;
 
       switch (IntNo) {
-      default: llvm_unreachable("Impossible intrinsic");
+      default:
+        llvm_unreachable("Impossible intrinsic");
       case Intrinsic::x86_umwait:
         Opcode = X86ISD::UMWAIT;
         break;
@@ -28045,9 +28112,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
         break;
       }
 
-      SDValue Operation =
-          DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
-                      Op->getOperand(3), Op->getOperand(4));
+      SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
+                                      Op->getOperand(3), Op->getOperand(4));
       SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
       return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
                          Operation.getValue(1));
@@ -28059,7 +28125,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
       SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
       unsigned Opcode;
       switch (IntNo) {
-      default: llvm_unreachable("Impossible intrinsic!");
+      default:
+        llvm_unreachable("Impossible intrinsic!");
       case Intrinsic::x86_enqcmd:
         Opcode = X86ISD::ENQCMD;
         break;
@@ -28083,7 +28150,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
       unsigned Opcode;
 
       switch (IntNo) {
-      default: llvm_unreachable("Impossible intrinsic");
+      default:
+        llvm_unreachable("Impossible intrinsic");
       case Intrinsic::x86_aesenc128kl:
         Opcode = X86ISD::AESENC128KL;
         break;
@@ -28121,7 +28189,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
       unsigned Opcode;
 
       switch (IntNo) {
-      default: llvm_unreachable("Impossible intrinsic");
+      default:
+        llvm_unreachable("Impossible intrinsic");
       case Intrinsic::x86_aesencwide128kl:
         Opcode = X86ISD::AESENCWIDE128KL;
         break;
@@ -28215,9 +28284,9 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
       SDValue Src2 = Op.getOperand(4);
       SDValue CC = Op.getOperand(5);
       MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
-      SDValue Operation = DAG.getMemIntrinsicNode(
-          X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
-          MVT::i32, MMO);
+      SDValue Operation =
+          DAG.getMemIntrinsicNode(X86ISD::CMPCCXADD, DL, Op->getVTList(),
+                                  {Chain, Addr, Src1, Src2, CC}, MVT::i32, MMO);
       return Operation;
     }
     case Intrinsic::x86_aadd32:
@@ -28301,8 +28370,9 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
   }
 
   SDLoc dl(Op);
-  switch(IntrData->Type) {
-  default: llvm_unreachable("Unknown Intrinsic Type");
+  switch (IntrData->Type) {
+  default:
+    llvm_unreachable("Unknown Intrinsic Type");
   case RDSEED:
   case RDRAND: {
     // Emit the node with the right value type.
@@ -28323,32 +28393,32 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
   }
   case GATHER_AVX2: {
     SDValue Chain = Op.getOperand(0);
-    SDValue Src   = Op.getOperand(2);
-    SDValue Base  = Op.getOperand(3);
+    SDValue Src = Op.getOperand(2);
+    SDValue Base = Op.getOperand(3);
     SDValue Index = Op.getOperand(4);
-    SDValue Mask  = Op.getOperand(5);
+    SDValue Mask = Op.getOperand(5);
     SDValue Scale = Op.getOperand(6);
     return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
                              Scale, Chain, Subtarget);
   }
   case GATHER: {
-  //gather(v1, mask, index, base, scale);
+    // gather(v1, mask, index, base, scale);
     SDValue Chain = Op.getOperand(0);
-    SDValue Src   = Op.getOperand(2);
-    SDValue Base  = Op.getOperand(3);
+    SDValue Src = Op.getOperand(2);
+    SDValue Base = Op.getOperand(3);
     SDValue Index = Op.getOperand(4);
-    SDValue Mask  = Op.getOperand(5);
+    SDValue Mask = Op.getOperand(5);
     SDValue Scale = Op.getOperand(6);
-    return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
-                         Chain, Subtarget);
+    return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale, Chain,
+                         Subtarget);
   }
   case SCATTER: {
-  //scatter(base, mask, index, v1, scale);
+    // scatter(base, mask, index, v1, scale);
     SDValue Chain = Op.getOperand(0);
-    SDValue Base  = Op.getOperand(2);
-    SDValue Mask  = Op.getOperand(3);
+    SDValue Base = Op.getOperand(2);
+    SDValue Mask = Op.getOperand(3);
     SDValue Index = Op.getOperand(4);
-    SDValue Src   = Op.getOperand(5);
+    SDValue Src = Op.getOperand(5);
     SDValue Scale = Op.getOperand(6);
     return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
                           Scale, Chain, Subtarget);
@@ -28359,9 +28429,9 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
            "Wrong prefetch hint in intrinsic: should be 2 or 3");
     unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
     SDValue Chain = Op.getOperand(0);
-    SDValue Mask  = Op.getOperand(2);
+    SDValue Mask = Op.getOperand(2);
     SDValue Index = Op.getOperand(3);
-    SDValue Base  = Op.getOperand(4);
+    SDValue Base = Op.getOperand(4);
     SDValue Scale = Op.getOperand(5);
     return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
                            Subtarget);
@@ -28396,8 +28466,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
 
     SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
     SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
-    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
-                       Ret, SDValue(InTrans.getNode(), 1));
+    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Ret,
+                       SDValue(InTrans.getNode(), 1));
   }
   case TRUNCATE_TO_MEM_VI8:
   case TRUNCATE_TO_MEM_VI16:
@@ -28410,7 +28480,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
     MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
     assert(MemIntr && "Expected MemIntrinsicSDNode!");
 
-    EVT MemVT  = MemIntr->getMemoryVT();
+    EVT MemVT = MemIntr->getMemoryVT();
 
     uint16_t TruncationOp = IntrData->Opc0;
     switch (TruncationOp) {
@@ -28505,7 +28575,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
 
   Register FrameReg =
       RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
-  SDLoc dl(Op);  // FIXME probably not meaningful
+  SDLoc dl(Op); // FIXME probably not meaningful
   unsigned Depth = Op.getConstantOperandVal(0);
   assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
           (FrameReg == X86::EBP && VT == MVT::i32)) &&
@@ -28519,7 +28589,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
 
 // FIXME? Maybe this could be a TableGen attribute on some registers and
 // this table could be generated automatically from RegInfo.
-Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
+Register X86TargetLowering::getRegisterByName(const char *RegName, LLT VT,
                                               const MachineFunction &MF) const {
   const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
 
@@ -28576,10 +28646,10 @@ bool X86TargetLowering::needsFixedCatchObjects() const {
 }
 
 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
-  SDValue Chain     = Op.getOperand(0);
-  SDValue Offset    = Op.getOperand(1);
-  SDValue Handler   = Op.getOperand(2);
-  SDLoc dl      (Op);
+  SDValue Chain = Op.getOperand(0);
+  SDValue Offset = Op.getOperand(1);
+  SDValue Handler = Op.getOperand(2);
+  SDLoc dl(Op);
 
   EVT PtrVT = getPointerTy(DAG.getDataLayout());
   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
@@ -28590,9 +28660,9 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
   SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
   Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
 
-  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
-                                 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
-                                                       dl));
+  SDValue StoreAddr =
+      DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
+                  DAG.getIntPtrConstant(RegInfo->getSlotSize(), dl));
   StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
   Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
   Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
@@ -28615,19 +28685,20 @@ SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
     (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
   }
   return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
-                     DAG.getVTList(MVT::i32, MVT::Other),
-                     Op.getOperand(0), Op.getOperand(1));
+                     DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
+                     Op.getOperand(1));
 }
 
 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
                                                 SelectionDAG &DAG) const {
   SDLoc DL(Op);
-  return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
-                     Op.getOperand(0), Op.getOperand(1));
+  return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other, Op.getOperand(0),
+                     Op.getOperand(1));
 }
 
-SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
-                                                       SelectionDAG &DAG) const {
+SDValue
+X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
+                                               SelectionDAG &DAG) const {
   SDLoc DL(Op);
   return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
                      Op.getOperand(0));
@@ -28643,7 +28714,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
   SDValue Trmp = Op.getOperand(1); // trampoline
   SDValue FPtr = Op.getOperand(2); // nested function
   SDValue Nest = Op.getOperand(3); // 'nest' parameter value
-  SDLoc dl (Op);
+  SDLoc dl(Op);
 
   const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
@@ -28652,7 +28723,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
     SDValue OutChains[6];
 
     // Large code-model.
-    const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
+    const unsigned char JMP64r = 0xFF;  // 64-bit jmp through register opcode.
     const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
 
     const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
@@ -28700,7 +28771,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
   } else {
     const Function *Func =
-      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
+        cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
     CallingConv::ID CC = Func->getCallingConv();
     unsigned NestReg;
 
@@ -28722,7 +28793,8 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
         unsigned Idx = 0;
 
         for (FunctionType::param_iterator I = FTy->param_begin(),
-             E = FTy->param_end(); I != E; ++I, ++Idx)
+                                          E = FTy->param_end();
+             I != E; ++I, ++Idx)
           if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
             const DataLayout &DL = DAG.getDataLayout();
             // FIXME: should only count parameters that are lowered to integers.
@@ -28828,18 +28900,16 @@ SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
   Chain = CWD.getValue(1);
 
   // Mask and turn the control bits into a shift for the lookup table.
-  SDValue Shift =
-    DAG.getNode(ISD::SRL, DL, MVT::i16,
-                DAG.getNode(ISD::AND, DL, MVT::i16,
-                            CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
-                DAG.getConstant(9, DL, MVT::i8));
+  SDValue Shift = DAG.getNode(ISD::SRL, DL, MVT::i16,
+                              DAG.getNode(ISD::AND, DL, MVT::i16, CWD,
+                                          DAG.getConstant(0xc00, DL, MVT::i16)),
+                              DAG.getConstant(9, DL, MVT::i8));
   Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
 
   SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
-  SDValue RetVal =
-    DAG.getNode(ISD::AND, DL, MVT::i32,
-                DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
-                DAG.getConstant(3, DL, MVT::i32));
+  SDValue RetVal = DAG.getNode(ISD::AND, DL, MVT::i32,
+                               DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
+                               DAG.getConstant(3, DL, MVT::i32));
 
   RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
 
@@ -29125,17 +29195,15 @@ static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
   MVT EltVT = VT.getVectorElementType();
   unsigned NumElems = VT.getVectorNumElements();
 
-  assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
-          "Unsupported element type");
+  assert((EltVT == MVT::i8 || EltVT == MVT::i16) && "Unsupported element type");
 
   // Split vector, it's Lo and Hi parts will be handled in next iteration.
-  if (NumElems > 16 ||
-      (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
+  if (NumElems > 16 || (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
     return splitVectorIntUnary(Op, DAG, dl);
 
   MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
   assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
-          "Unsupported value type for operation");
+         "Unsupported value type for operation");
 
   // Use native supported vector instruction vplzcntd.
   Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
@@ -29807,10 +29875,10 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
       SmallVector<SDValue, 16> LoOps, HiOps;
       for (unsigned i = 0; i != NumElts; i += 16) {
         for (unsigned j = 0; j != 8; ++j) {
-          LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
-                                               MVT::i16));
-          HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
-                                               MVT::i16));
+          LoOps.push_back(
+              DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl, MVT::i16));
+          HiOps.push_back(
+              DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl, MVT::i16));
         }
       }
 
@@ -29851,7 +29919,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
 
     // Merge the two vectors back together with a shuffle. This expands into 2
     // shuffles.
-    static const int ShufMask[] = { 0, 4, 2, 6 };
+    static const int ShufMask[] = {0, 4, 2, 6};
     return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
   }
 
@@ -30016,7 +30084,7 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
     //
     // Place the odd value at an even position (basically, shift all values 1
     // step to the left):
-    const int Mask[] = {1, -1,  3, -1,  5, -1,  7, -1,
+    const int Mask[] = {1, -1, 3,  -1, 5,  -1, 7,  -1,
                         9, -1, 11, -1, 13, -1, 15, -1};
     // <a|b|c|d> => <b|undef|d|undef>
     SDValue Odd0 =
@@ -30066,7 +30134,7 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
 
   // Only i8 vectors should need custom lowering after this.
   assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
-         (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
+          (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
          "Unsupported vector type");
 
   // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
@@ -30221,7 +30289,8 @@ static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
   return DAG.getMergeValues({Low, Ovf}, dl);
 }
 
-SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op,
+                                             SelectionDAG &DAG) const {
   assert(Subtarget.isTargetWin64() && "Unexpected target");
   EVT VT = Op.getValueType();
   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
@@ -30236,13 +30305,13 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
   RTLIB::Libcall LC;
   bool isSigned;
   switch (Op->getOpcode()) {
-  // clang-format off
+    // clang-format off
   default: llvm_unreachable("Unexpected request for libcall!");
   case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
   case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
   case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
   case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
-  // clang-format on
+    // clang-format on
   }
 
   SDLoc dl(Op);
@@ -30381,9 +30450,9 @@ static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
 
 // The shift amount is a variable, but it is the same for all vector lanes.
 // These instructions are defined together with shift-immediate.
-static
-bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget,
-                                      unsigned Opcode) {
+static bool supportedVectorShiftWithBaseAmnt(EVT VT,
+                                             const X86Subtarget &Subtarget,
+                                             unsigned Opcode) {
   return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
 }
 
@@ -30412,7 +30481,7 @@ static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget,
     return true;
 
   bool LShift = VT.is128BitVector() || VT.is256BitVector();
-  bool AShift = LShift &&  VT != MVT::v2i64 && VT != MVT::v4i64;
+  bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
   return (Opcode == ISD::SRA) ? AShift : LShift;
 }
 
@@ -32350,7 +32419,8 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
   auto SSID = AI->getSyncScopeID();
   // We must restrict the ordering to avoid generating loads with Release or
   // ReleaseAcquire orderings.
-  auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
+  auto Order =
+      AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
 
   // Before the load we need a fence. Here is an example lifted from
   // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
@@ -32419,31 +32489,28 @@ static SDValue emitLockedStackOp(SelectionDAG &DAG,
 
   if (Subtarget.is64Bit()) {
     SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
-    SDValue Ops[] = {
-      DAG.getRegister(X86::RSP, MVT::i64),                  // Base
-      DAG.getTargetConstant(1, DL, MVT::i8),                // Scale
-      DAG.getRegister(0, MVT::i64),                         // Index
-      DAG.getTargetConstant(SPOffset, DL, MVT::i32),        // Disp
-      DAG.getRegister(0, MVT::i16),                         // Segment.
-      Zero,
-      Chain};
-    SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
-                                     MVT::Other, Ops);
+    SDValue Ops[] = {DAG.getRegister(X86::RSP, MVT::i64),           // Base
+                     DAG.getTargetConstant(1, DL, MVT::i8),         // Scale
+                     DAG.getRegister(0, MVT::i64),                  // Index
+                     DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
+                     DAG.getRegister(0, MVT::i16),                  // Segment.
+                     Zero,
+                     Chain};
+    SDNode *Res =
+        DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32, MVT::Other, Ops);
     return SDValue(Res, 1);
   }
 
   SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
-  SDValue Ops[] = {
-    DAG.getRegister(X86::ESP, MVT::i32),            // Base
-    DAG.getTargetConstant(1, DL, MVT::i8),          // Scale
-    DAG.getRegister(0, MVT::i32),                   // Index
-    DAG.getTargetConstant(SPOffset, DL, MVT::i32),  // Disp
-    DAG.getRegister(0, MVT::i16),                   // Segment.
-    Zero,
-    Chain
-  };
-  SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
-                                   MVT::Other, Ops);
+  SDValue Ops[] = {DAG.getRegister(X86::ESP, MVT::i32),           // Base
+                   DAG.getTargetConstant(1, DL, MVT::i8),         // Scale
+                   DAG.getRegister(0, MVT::i32),                  // Index
+                   DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
+                   DAG.getRegister(0, MVT::i16),                  // Segment.
+                   Zero,
+                   Chain};
+  SDNode *Res =
+      DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32, MVT::Other, Ops);
   return SDValue(Res, 1);
 }
 
@@ -32476,36 +32543,44 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
   SDLoc DL(Op);
   unsigned Reg = 0;
   unsigned size = 0;
-  switch(T.SimpleTy) {
-  default: llvm_unreachable("Invalid value type!");
-  case MVT::i8:  Reg = X86::AL;  size = 1; break;
-  case MVT::i16: Reg = X86::AX;  size = 2; break;
-  case MVT::i32: Reg = X86::EAX; size = 4; break;
+  switch (T.SimpleTy) {
+  default:
+    llvm_unreachable("Invalid value type!");
+  case MVT::i8:
+    Reg = X86::AL;
+    size = 1;
+    break;
+  case MVT::i16:
+    Reg = X86::AX;
+    size = 2;
+    break;
+  case MVT::i32:
+    Reg = X86::EAX;
+    size = 4;
+    break;
   case MVT::i64:
     assert(Subtarget.is64Bit() && "Node not type legal!");
-    Reg = X86::RAX; size = 8;
+    Reg = X86::RAX;
+    size = 8;
     break;
   }
-  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
-                                  Op.getOperand(2), SDValue());
-  SDValue Ops[] = { cpIn.getValue(0),
-                    Op.getOperand(1),
-                    Op.getOperand(3),
-                    DAG.getTargetConstant(size, DL, MVT::i8),
-                    cpIn.getValue(1) };
+  SDValue cpIn =
+      DAG.getCopyToReg(Op.getOperand(0), DL, Reg, Op.getOperand(2), SDValue());
+  SDValue Ops[] = {cpIn.getValue(0), Op.getOperand(1), Op.getOperand(3),
+                   DAG.getTargetConstant(size, DL, MVT::i8), cpIn.getValue(1)};
   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
   MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
-  SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
-                                           Ops, T, MMO);
+  SDValue Result =
+      DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, Ops, T, MMO);
 
   SDValue cpOut =
-    DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
+      DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
   SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
                                       MVT::i32, cpOut.getValue(2));
   SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
 
-  return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
-                     cpOut, Success, EFLAGS.getValue(1));
+  return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), cpOut, Success,
+                     EFLAGS.getValue(1));
 }
 
 // Create MOVMSKB, taking into account whether we need to split for AVX1.
@@ -32567,7 +32642,8 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
   }
 
   assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
-          SrcVT == MVT::i64) && "Unexpected VT!");
+          SrcVT == MVT::i64) &&
+         "Unexpected VT!");
 
   assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
   if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
@@ -32581,8 +32657,7 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
     // Example: from MVT::v2i32 to MVT::v4i32.
     MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
                                  SrcVT.getVectorNumElements() * 2);
-    Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
-                      DAG.getUNDEF(SrcVT));
+    Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src, DAG.getUNDEF(SrcVT));
   } else {
     assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
            "Unexpected source type in LowerBITCAST");
@@ -32728,7 +32803,8 @@ static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL,
   if (Subtarget.hasVPOPCNTDQ()) {
     unsigned NumElems = VT.getVectorNumElements();
     assert((VT.getVectorElementType() == MVT::i8 ||
-            VT.getVectorElementType() == MVT::i16) && "Unexpected type");
+            VT.getVectorElementType() == MVT::i16) &&
+           "Unexpected type");
     if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
       MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
       Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
@@ -33127,16 +33203,16 @@ static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
     SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
     assert(!N->hasAnyUseOfValue(0));
     // NOTE: The getUNDEF is needed to give something for the unused result 0.
-    return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
-                       DAG.getUNDEF(VT), NewChain);
+    return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), DAG.getUNDEF(VT),
+                       NewChain);
   }
 
   SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
   // RAUW the chain, but don't worry about the result, as it's unused.
   assert(!N->hasAnyUseOfValue(0));
   // NOTE: The getUNDEF is needed to give something for the unused result 0.
-  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
-                     DAG.getUNDEF(VT), LockOp.getValue(1));
+  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), DAG.getUNDEF(VT),
+                     LockOp.getValue(1));
 }
 
 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
@@ -33236,17 +33312,17 @@ static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
   // Set the carry flag.
   SDValue Carry = Op.getOperand(2);
   EVT CarryVT = Carry.getValueType();
-  Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
-                      Carry, DAG.getAllOnesConstant(DL, CarryVT));
+  Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), Carry,
+                      DAG.getAllOnesConstant(DL, CarryVT));
 
   bool IsAdd = Opc == ISD::UADDO_CARRY || Opc == ISD::SADDO_CARRY;
-  SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
-                            Op.getOperand(0), Op.getOperand(1),
-                            Carry.getValue(1));
+  SDValue Sum =
+      DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs, Op.getOperand(0),
+                  Op.getOperand(1), Carry.getValue(1));
 
   bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
-  SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
-                           Sum.getValue(1), DL, DAG);
+  SDValue SetCC =
+      getSETCC(IsSigned ? X86::COND_O : X86::COND_B, Sum.getValue(1), DL, DAG);
   if (N->getValueType(1) == MVT::i1)
     SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
 
@@ -33397,8 +33473,8 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
   if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
       !Index.getSimpleValueType().is512BitVector()) {
     // Determine how much we need to widen by to get a 512-bit type.
-    unsigned Factor = std::min(512/VT.getSizeInBits(),
-                               512/IndexVT.getSizeInBits());
+    unsigned Factor =
+        std::min(512 / VT.getSizeInBits(), 512 / IndexVT.getSizeInBits());
     unsigned NumElts = VT.getVectorNumElements() * Factor;
 
     VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
@@ -33440,7 +33516,7 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
         N->isExpandingLoad());
     // Emit a blend.
     SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
-    return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
+    return DAG.getMergeValues({Select, NewLoad.getValue(1)}, dl);
   }
 
   assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
@@ -33507,7 +33583,7 @@ static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
 
   // This operation is legal for targets with VLX, but without
   // VLX the vector should be widened to 512 bit
-  unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
+  unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
   MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
 
   // Mask element has to be i1.
@@ -33549,8 +33625,8 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
   if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
       !IndexVT.is512BitVector()) {
     // Determine how much we need to widen by to get a 512-bit type.
-    unsigned Factor = std::min(512/VT.getSizeInBits(),
-                               512/IndexVT.getSizeInBits());
+    unsigned Factor =
+        std::min(512 / VT.getSizeInBits(), 512 / IndexVT.getSizeInBits());
 
     unsigned NumElts = VT.getVectorNumElements() * Factor;
 
@@ -33567,8 +33643,8 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
   if (PassThru.isUndef())
     PassThru = getZeroVector(VT, Subtarget, DAG, dl);
 
-  SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
-                    N->getScale() };
+  SDValue Ops[] = {N->getChain(),   PassThru, Mask,
+                   N->getBasePtr(), Index,    N->getScale()};
   SDValue NewGather = DAG.getMemIntrinsicNode(
       X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
       N->getMemOperand());
@@ -33766,7 +33842,7 @@ SDValue X86TargetLowering::visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL,
 /// Provide custom lowering hooks for some operations.
 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
-  // clang-format off
+    // clang-format off
   default: llvm_unreachable("Should not custom lower this!");
   case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
@@ -33923,7 +33999,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::ADDRSPACECAST:      return LowerADDRSPACECAST(Op, DAG);
   case X86ISD::CVTPS2PH:        return LowerCVTPS2PH(Op, DAG);
   case ISD::PREFETCH:           return LowerPREFETCH(Op, Subtarget, DAG);
-  // clang-format on
+    // clang-format on
   }
 }
 
@@ -33936,7 +34012,7 @@ bool X86TargetLowering::isSelectSupported(SelectSupportKind Kind) const {
 /// Replace a node with an illegal result type with a new node built out of
 /// custom code.
 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
-                                           SmallVectorImpl<SDValue>&Results,
+                                           SmallVectorImpl<SDValue> &Results,
                                            SelectionDAG &DAG) const {
   SDLoc dl(N);
   unsigned Opc = N->getOpcode();
@@ -34062,8 +34138,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
 
     // Widen the result with by padding with undef.
-    Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
-                      DAG.getUNDEF(VT));
+    Res =
+        DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res, DAG.getUNDEF(VT));
     Results.push_back(Res);
     Results.push_back(Ovf);
     return;
@@ -34080,11 +34156,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
            "Unexpected type action!");
     unsigned NumConcat = 128 / InVT.getSizeInBits();
 
-    EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
-                                    InVT.getVectorElementType(),
-                                    NumConcat * InVT.getVectorNumElements());
-    EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
-                                  VT.getVectorElementType(),
+    EVT InWideVT =
+        EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(),
+                         NumConcat * InVT.getVectorNumElements());
+    EVT WideVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
                                   NumConcat * VT.getVectorNumElements());
 
     SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
@@ -34148,7 +34223,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
       return;
     }
 
-    SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
+    SDValue V = LowerWin64_i128OP(SDValue(N, 0), DAG);
     Results.push_back(V);
     return;
   }
@@ -34226,9 +34301,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
 
       Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
       Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
-      SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
-                                         { 0,  1,  2,  3, 16, 17, 18, 19,
-                                          -1, -1, -1, -1, -1, -1, -1, -1 });
+      SDValue Res = DAG.getVectorShuffle(
+          MVT::v16i8, dl, Lo, Hi,
+          {0, 1, 2, 3, 16, 17, 18, 19, -1, -1, -1, -1, -1, -1, -1, -1});
       Results.push_back(Res);
       return;
     }
@@ -34260,7 +34335,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     SDValue In = N->getOperand(0);
     EVT InVT = In.getValueType();
     if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
-        (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
+        (InVT == MVT::v4i16 || InVT == MVT::v4i8)) {
       assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
              "Unexpected type action!");
       assert(Opc == ISD::SIGN_EXTEND && "Unexpected opcode");
@@ -34276,11 +34351,11 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
 
       // Create an unpackl and unpackh to interleave the sign bits then bitcast
       // to v2i64.
-      SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
-                                        {0, 4, 1, 5});
+      SDValue Lo =
+          DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits, {0, 4, 1, 5});
       Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
-      SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
-                                        {2, 6, 3, 7});
+      SDValue Hi =
+          DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits, {2, 6, 3, 7});
       Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
 
       SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
@@ -34467,7 +34542,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
       return;
     }
 
-
     if (VT == MVT::v2i32) {
       assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
              "Strict unsigned conversion requires AVX512");
@@ -34552,9 +34626,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
       }
 
       SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
-      SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
-                                DAG.getConstantFP(0.0, dl, VecInVT), Src,
-                                ZeroIdx);
+      SDValue Res =
+          DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
+                      DAG.getConstantFP(0.0, dl, VecInVT), Src, ZeroIdx);
       SDValue Chain;
       if (IsStrict) {
         SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
@@ -34641,8 +34715,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     EVT SrcVT = Src.getValueType();
     if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
       if (IsStrict) {
-        unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
-                                : X86ISD::STRICT_CVTUI2P;
+        unsigned Opc =
+            IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
         SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
                                   {N->getOperand(0), Src});
         Results.push_back(Res);
@@ -34656,7 +34730,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
         Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
       SDValue Zero = DAG.getConstant(0, dl, SrcVT);
-      SDValue One  = DAG.getConstant(1, dl, SrcVT);
+      SDValue One = DAG.getConstant(1, dl, SrcVT);
       SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
                                  DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
                                  DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
@@ -34722,9 +34796,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     if (IsStrict) {
       SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
                                 {N->getOperand(0), Or, VBias});
-      SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
-                                {MVT::v4f32, MVT::Other},
-                                {Sub.getValue(1), Sub});
+      SDValue Res =
+          DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {MVT::v4f32, MVT::Other},
+                      {Sub.getValue(1), Sub});
       Results.push_back(Res);
       Results.push_back(Res.getValue(1));
     } else {
@@ -34805,8 +34879,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
   case ISD::INTRINSIC_W_CHAIN: {
     unsigned IntNo = N->getConstantOperandVal(1);
     switch (IntNo) {
-    default : llvm_unreachable("Do not know how to custom type "
-                               "legalize this intrinsic operation!");
+    default:
+      llvm_unreachable("Do not know how to custom type "
+                       "legalize this intrinsic operation!");
     case Intrinsic::x86_rdtsc:
       return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
                                      Results);
@@ -34819,7 +34894,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
       return;
     case Intrinsic::x86_rdpru:
       expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
-        Results);
+                                  Results);
       return;
     case Intrinsic::x86_xgetbv:
       expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
@@ -34876,12 +34951,12 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     }
 
     SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
-                                        Regs64bit ? X86::RAX : X86::EAX,
-                                        HalfT, Result.getValue(1));
+                                        Regs64bit ? X86::RAX : X86::EAX, HalfT,
+                                        Result.getValue(1));
     SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
-                                        Regs64bit ? X86::RDX : X86::EDX,
-                                        HalfT, cpOutL.getValue(2));
-    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
+                                        Regs64bit ? X86::RDX : X86::EDX, HalfT,
+                                        cpOutL.getValue(2));
+    SDValue OpsF[] = {cpOutL.getValue(0), cpOutH.getValue(0)};
 
     SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
                                         MVT::i32, cpOutH.getValue(2));
@@ -34923,7 +34998,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
         // Then extract the lower 64-bits.
         MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
         SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
-        SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
+        SDValue Ops[] = {Node->getChain(), Node->getBasePtr()};
         SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
                                              MVT::i64, Node->getMemOperand());
         if (Subtarget.hasSSE2()) {
@@ -34947,10 +35022,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
         // First load this into an 80-bit X87 register. This will put the whole
         // integer into the significand.
         SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
-        SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
-        SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
-                                                 dl, Tys, Ops, MVT::i64,
-                                                 Node->getMemOperand());
+        SDValue Ops[] = {Node->getChain(), Node->getBasePtr()};
+        SDValue Result = DAG.getMemIntrinsicNode(
+            X86ISD::FILD, dl, Tys, Ops, MVT::i64, Node->getMemOperand());
         SDValue Chain = Result.getValue(1);
 
         // Now store the X87 register to a stack temporary and convert to i64.
@@ -34961,7 +35035,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
         int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
         MachinePointerInfo MPI =
             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
-        SDValue StoreOps[] = { Chain, Result, StackPtr };
+        SDValue StoreOps[] = {Chain, Result, StackPtr};
         Chain = DAG.getMemIntrinsicNode(
             X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
             MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
@@ -35019,8 +35093,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
       assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
              "Unexpected type action!");
       EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
-      SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
-                                N->getOperand(0));
+      SDValue Res =
+          DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, N->getOperand(0));
       Res = DAG.getBitcast(WideVT, Res);
       Results.push_back(Res);
       return;
@@ -35042,8 +35116,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
       SDValue Mask = Gather->getMask();
       assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
       SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
-                                     Gather->getPassThru(),
-                                     DAG.getUNDEF(VT));
+                                     Gather->getPassThru(), DAG.getUNDEF(VT));
       if (!Subtarget.hasVLX()) {
         // We need to widen the mask, but the instruction will only use 2
         // of its elements. So we can use undef.
@@ -35051,8 +35124,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
                            DAG.getUNDEF(MVT::v2i1));
         Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
       }
-      SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
-                        Gather->getBasePtr(), Index, Gather->getScale() };
+      SDValue Ops[] = {Gather->getChain(),   PassThru, Mask,
+                       Gather->getBasePtr(), Index,    Gather->getScale()};
       SDValue Res = DAG.getMemIntrinsicNode(
           X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
           Gather->getMemoryVT(), Gather->getMemOperand());
@@ -35097,7 +35170,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     return;
   }
   case ISD::ADDRSPACECAST: {
-    SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
+    SDValue V = LowerADDRSPACECAST(SDValue(N, 0), DAG);
     Results.push_back(V);
     return;
   }
@@ -35128,470 +35201,473 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
 
 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch ((X86ISD::NodeType)Opcode) {
-  case X86ISD::FIRST_NUMBER:       break;
-#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
-  NODE_NAME_CASE(BSF)
-  NODE_NAME_CASE(BSR)
-  NODE_NAME_CASE(FSHL)
-  NODE_NAME_CASE(FSHR)
-  NODE_NAME_CASE(FAND)
-  NODE_NAME_CASE(FANDN)
-  NODE_NAME_CASE(FOR)
-  NODE_NAME_CASE(FXOR)
-  NODE_NAME_CASE(FILD)
-  NODE_NAME_CASE(FIST)
-  NODE_NAME_CASE(FP_TO_INT_IN_MEM)
-  NODE_NAME_CASE(FLD)
-  NODE_NAME_CASE(FST)
-  NODE_NAME_CASE(CALL)
-  NODE_NAME_CASE(CALL_RVMARKER)
-  NODE_NAME_CASE(IMP_CALL)
-  NODE_NAME_CASE(BT)
-  NODE_NAME_CASE(CMP)
-  NODE_NAME_CASE(FCMP)
-  NODE_NAME_CASE(STRICT_FCMP)
-  NODE_NAME_CASE(STRICT_FCMPS)
-  NODE_NAME_CASE(COMI)
-  NODE_NAME_CASE(UCOMI)
-  NODE_NAME_CASE(COMX)
-  NODE_NAME_CASE(UCOMX)
-  NODE_NAME_CASE(CMPM)
-  NODE_NAME_CASE(CMPMM)
-  NODE_NAME_CASE(STRICT_CMPM)
-  NODE_NAME_CASE(CMPMM_SAE)
-  NODE_NAME_CASE(SETCC)
-  NODE_NAME_CASE(CTSELECT)
-  NODE_NAME_CASE(SETCC_CARRY)
-  NODE_NAME_CASE(FSETCC)
-  NODE_NAME_CASE(FSETCCM)
-  NODE_NAME_CASE(FSETCCM_SAE)
-  NODE_NAME_CASE(CMOV)
-  NODE_NAME_CASE(BRCOND)
-  NODE_NAME_CASE(RET_GLUE)
-  NODE_NAME_CASE(IRET)
-  NODE_NAME_CASE(REP_STOS)
-  NODE_NAME_CASE(REP_MOVS)
-  NODE_NAME_CASE(GlobalBaseReg)
-  NODE_NAME_CASE(Wrapper)
-  NODE_NAME_CASE(WrapperRIP)
-  NODE_NAME_CASE(MOVQ2DQ)
-  NODE_NAME_CASE(MOVDQ2Q)
-  NODE_NAME_CASE(MMX_MOVD2W)
-  NODE_NAME_CASE(MMX_MOVW2D)
-  NODE_NAME_CASE(PEXTRB)
-  NODE_NAME_CASE(PEXTRW)
-  NODE_NAME_CASE(INSERTPS)
-  NODE_NAME_CASE(PINSRB)
-  NODE_NAME_CASE(PINSRW)
-  NODE_NAME_CASE(PSHUFB)
-  NODE_NAME_CASE(ANDNP)
-  NODE_NAME_CASE(BLENDI)
-  NODE_NAME_CASE(BLENDV)
-  NODE_NAME_CASE(HADD)
-  NODE_NAME_CASE(HSUB)
-  NODE_NAME_CASE(FHADD)
-  NODE_NAME_CASE(FHSUB)
-  NODE_NAME_CASE(CONFLICT)
-  NODE_NAME_CASE(FMAX)
-  NODE_NAME_CASE(FMAXS)
-  NODE_NAME_CASE(FMAX_SAE)
-  NODE_NAME_CASE(FMAXS_SAE)
-  NODE_NAME_CASE(STRICT_FMAX)
-  NODE_NAME_CASE(FMIN)
-  NODE_NAME_CASE(FMINS)
-  NODE_NAME_CASE(FMIN_SAE)
-  NODE_NAME_CASE(FMINS_SAE)
-  NODE_NAME_CASE(STRICT_FMIN)
-  NODE_NAME_CASE(FMAXC)
-  NODE_NAME_CASE(FMINC)
-  NODE_NAME_CASE(FRSQRT)
-  NODE_NAME_CASE(FRCP)
-  NODE_NAME_CASE(EXTRQI)
-  NODE_NAME_CASE(INSERTQI)
-  NODE_NAME_CASE(TLSADDR)
-  NODE_NAME_CASE(TLSBASEADDR)
-  NODE_NAME_CASE(TLSCALL)
-  NODE_NAME_CASE(TLSDESC)
-  NODE_NAME_CASE(EH_SJLJ_SETJMP)
-  NODE_NAME_CASE(EH_SJLJ_LONGJMP)
-  NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
-  NODE_NAME_CASE(EH_RETURN)
-  NODE_NAME_CASE(TC_RETURN)
-  NODE_NAME_CASE(FNSTCW16m)
-  NODE_NAME_CASE(FLDCW16m)
-  NODE_NAME_CASE(FNSTENVm)
-  NODE_NAME_CASE(FLDENVm)
-  NODE_NAME_CASE(LCMPXCHG_DAG)
-  NODE_NAME_CASE(LCMPXCHG8_DAG)
-  NODE_NAME_CASE(LCMPXCHG16_DAG)
-  NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
-  NODE_NAME_CASE(LADD)
-  NODE_NAME_CASE(LSUB)
-  NODE_NAME_CASE(LOR)
-  NODE_NAME_CASE(LXOR)
-  NODE_NAME_CASE(LAND)
-  NODE_NAME_CASE(LBTS)
-  NODE_NAME_CASE(LBTC)
-  NODE_NAME_CASE(LBTR)
-  NODE_NAME_CASE(LBTS_RM)
-  NODE_NAME_CASE(LBTC_RM)
-  NODE_NAME_CASE(LBTR_RM)
-  NODE_NAME_CASE(AADD)
-  NODE_NAME_CASE(AOR)
-  NODE_NAME_CASE(AXOR)
-  NODE_NAME_CASE(AAND)
-  NODE_NAME_CASE(VZEXT_MOVL)
-  NODE_NAME_CASE(VZEXT_LOAD)
-  NODE_NAME_CASE(VEXTRACT_STORE)
-  NODE_NAME_CASE(VTRUNC)
-  NODE_NAME_CASE(VTRUNCS)
-  NODE_NAME_CASE(VTRUNCUS)
-  NODE_NAME_CASE(VMTRUNC)
-  NODE_NAME_CASE(VMTRUNCS)
-  NODE_NAME_CASE(VMTRUNCUS)
-  NODE_NAME_CASE(VTRUNCSTORES)
-  NODE_NAME_CASE(VTRUNCSTOREUS)
-  NODE_NAME_CASE(VMTRUNCSTORES)
-  NODE_NAME_CASE(VMTRUNCSTOREUS)
-  NODE_NAME_CASE(VFPEXT)
-  NODE_NAME_CASE(STRICT_VFPEXT)
-  NODE_NAME_CASE(VFPEXT_SAE)
-  NODE_NAME_CASE(VFPEXTS)
-  NODE_NAME_CASE(VFPEXTS_SAE)
-  NODE_NAME_CASE(VFPROUND)
-  NODE_NAME_CASE(VFPROUND2)
-  NODE_NAME_CASE(VFPROUND2_RND)
-  NODE_NAME_CASE(STRICT_VFPROUND)
-  NODE_NAME_CASE(VMFPROUND)
-  NODE_NAME_CASE(VFPROUND_RND)
-  NODE_NAME_CASE(VFPROUNDS)
-  NODE_NAME_CASE(VFPROUNDS_RND)
-  NODE_NAME_CASE(VSHLDQ)
-  NODE_NAME_CASE(VSRLDQ)
-  NODE_NAME_CASE(VSHL)
-  NODE_NAME_CASE(VSRL)
-  NODE_NAME_CASE(VSRA)
-  NODE_NAME_CASE(VSHLI)
-  NODE_NAME_CASE(VSRLI)
-  NODE_NAME_CASE(VSRAI)
-  NODE_NAME_CASE(VSHLV)
-  NODE_NAME_CASE(VSRLV)
-  NODE_NAME_CASE(VSRAV)
-  NODE_NAME_CASE(VROTLI)
-  NODE_NAME_CASE(VROTRI)
-  NODE_NAME_CASE(VPPERM)
-  NODE_NAME_CASE(CMPP)
-  NODE_NAME_CASE(STRICT_CMPP)
-  NODE_NAME_CASE(PCMPEQ)
-  NODE_NAME_CASE(PCMPGT)
-  NODE_NAME_CASE(PHMINPOS)
-  NODE_NAME_CASE(ADD)
-  NODE_NAME_CASE(SUB)
-  NODE_NAME_CASE(ADC)
-  NODE_NAME_CASE(SBB)
-  NODE_NAME_CASE(SMUL)
-  NODE_NAME_CASE(UMUL)
-  NODE_NAME_CASE(OR)
-  NODE_NAME_CASE(XOR)
-  NODE_NAME_CASE(AND)
-  NODE_NAME_CASE(BEXTR)
-  NODE_NAME_CASE(BEXTRI)
-  NODE_NAME_CASE(BZHI)
-  NODE_NAME_CASE(PDEP)
-  NODE_NAME_CASE(PEXT)
-  NODE_NAME_CASE(MUL_IMM)
-  NODE_NAME_CASE(MOVMSK)
-  NODE_NAME_CASE(PTEST)
-  NODE_NAME_CASE(TESTP)
-  NODE_NAME_CASE(KORTEST)
-  NODE_NAME_CASE(KTEST)
-  NODE_NAME_CASE(KADD)
-  NODE_NAME_CASE(KSHIFTL)
-  NODE_NAME_CASE(KSHIFTR)
-  NODE_NAME_CASE(PACKSS)
-  NODE_NAME_CASE(PACKUS)
-  NODE_NAME_CASE(PALIGNR)
-  NODE_NAME_CASE(VALIGN)
-  NODE_NAME_CASE(VSHLD)
-  NODE_NAME_CASE(VSHRD)
-  NODE_NAME_CASE(PSHUFD)
-  NODE_NAME_CASE(PSHUFHW)
-  NODE_NAME_CASE(PSHUFLW)
-  NODE_NAME_CASE(SHUFP)
-  NODE_NAME_CASE(SHUF128)
-  NODE_NAME_CASE(MOVLHPS)
-  NODE_NAME_CASE(MOVHLPS)
-  NODE_NAME_CASE(MOVDDUP)
-  NODE_NAME_CASE(MOVSHDUP)
-  NODE_NAME_CASE(MOVSLDUP)
-  NODE_NAME_CASE(MOVSD)
-  NODE_NAME_CASE(MOVSS)
-  NODE_NAME_CASE(MOVSH)
-  NODE_NAME_CASE(UNPCKL)
-  NODE_NAME_CASE(UNPCKH)
-  NODE_NAME_CASE(VBROADCAST)
-  NODE_NAME_CASE(VBROADCAST_LOAD)
-  NODE_NAME_CASE(VBROADCASTM)
-  NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
-  NODE_NAME_CASE(VPERMILPV)
-  NODE_NAME_CASE(VPERMILPI)
-  NODE_NAME_CASE(VPERM2X128)
-  NODE_NAME_CASE(VPERMV)
-  NODE_NAME_CASE(VPERMV3)
-  NODE_NAME_CASE(VPERMI)
-  NODE_NAME_CASE(VPTERNLOG)
-  NODE_NAME_CASE(FP_TO_SINT_SAT)
-  NODE_NAME_CASE(FP_TO_UINT_SAT)
-  NODE_NAME_CASE(VFIXUPIMM)
-  NODE_NAME_CASE(VFIXUPIMM_SAE)
-  NODE_NAME_CASE(VFIXUPIMMS)
-  NODE_NAME_CASE(VFIXUPIMMS_SAE)
-  NODE_NAME_CASE(VRANGE)
-  NODE_NAME_CASE(VRANGE_SAE)
-  NODE_NAME_CASE(VRANGES)
-  NODE_NAME_CASE(VRANGES_SAE)
-  NODE_NAME_CASE(PMULUDQ)
-  NODE_NAME_CASE(PMULDQ)
-  NODE_NAME_CASE(PSADBW)
-  NODE_NAME_CASE(DBPSADBW)
-  NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
-  NODE_NAME_CASE(VAARG_64)
-  NODE_NAME_CASE(VAARG_X32)
-  NODE_NAME_CASE(DYN_ALLOCA)
-  NODE_NAME_CASE(MFENCE)
-  NODE_NAME_CASE(SEG_ALLOCA)
-  NODE_NAME_CASE(PROBED_ALLOCA)
-  NODE_NAME_CASE(RDRAND)
-  NODE_NAME_CASE(RDSEED)
-  NODE_NAME_CASE(RDPKRU)
-  NODE_NAME_CASE(WRPKRU)
-  NODE_NAME_CASE(VPMADDUBSW)
-  NODE_NAME_CASE(VPMADDWD)
-  NODE_NAME_CASE(VPSHA)
-  NODE_NAME_CASE(VPSHL)
-  NODE_NAME_CASE(VPCOM)
-  NODE_NAME_CASE(VPCOMU)
-  NODE_NAME_CASE(VPERMIL2)
-  NODE_NAME_CASE(FMSUB)
-  NODE_NAME_CASE(STRICT_FMSUB)
-  NODE_NAME_CASE(FNMADD)
-  NODE_NAME_CASE(STRICT_FNMADD)
-  NODE_NAME_CASE(FNMSUB)
-  NODE_NAME_CASE(STRICT_FNMSUB)
-  NODE_NAME_CASE(FMADDSUB)
-  NODE_NAME_CASE(FMSUBADD)
-  NODE_NAME_CASE(FMADD_RND)
-  NODE_NAME_CASE(FNMADD_RND)
-  NODE_NAME_CASE(FMSUB_RND)
-  NODE_NAME_CASE(FNMSUB_RND)
-  NODE_NAME_CASE(FMADDSUB_RND)
-  NODE_NAME_CASE(FMSUBADD_RND)
-  NODE_NAME_CASE(VFMADDC)
-  NODE_NAME_CASE(VFMADDC_RND)
-  NODE_NAME_CASE(VFCMADDC)
-  NODE_NAME_CASE(VFCMADDC_RND)
-  NODE_NAME_CASE(VFMULC)
-  NODE_NAME_CASE(VFMULC_RND)
-  NODE_NAME_CASE(VFCMULC)
-  NODE_NAME_CASE(VFCMULC_RND)
-  NODE_NAME_CASE(VFMULCSH)
-  NODE_NAME_CASE(VFMULCSH_RND)
-  NODE_NAME_CASE(VFCMULCSH)
-  NODE_NAME_CASE(VFCMULCSH_RND)
-  NODE_NAME_CASE(VFMADDCSH)
-  NODE_NAME_CASE(VFMADDCSH_RND)
-  NODE_NAME_CASE(VFCMADDCSH)
-  NODE_NAME_CASE(VFCMADDCSH_RND)
-  NODE_NAME_CASE(VPMADD52H)
-  NODE_NAME_CASE(VPMADD52L)
-  NODE_NAME_CASE(VRNDSCALE)
-  NODE_NAME_CASE(STRICT_VRNDSCALE)
-  NODE_NAME_CASE(VRNDSCALE_SAE)
-  NODE_NAME_CASE(VRNDSCALES)
-  NODE_NAME_CASE(VRNDSCALES_SAE)
-  NODE_NAME_CASE(VREDUCE)
-  NODE_NAME_CASE(VREDUCE_SAE)
-  NODE_NAME_CASE(VREDUCES)
-  NODE_NAME_CASE(VREDUCES_SAE)
-  NODE_NAME_CASE(VGETMANT)
-  NODE_NAME_CASE(VGETMANT_SAE)
-  NODE_NAME_CASE(VGETMANTS)
-  NODE_NAME_CASE(VGETMANTS_SAE)
-  NODE_NAME_CASE(PCMPESTR)
-  NODE_NAME_CASE(PCMPISTR)
-  NODE_NAME_CASE(XTEST)
-  NODE_NAME_CASE(COMPRESS)
-  NODE_NAME_CASE(EXPAND)
-  NODE_NAME_CASE(SELECTS)
-  NODE_NAME_CASE(ADDSUB)
-  NODE_NAME_CASE(RCP14)
-  NODE_NAME_CASE(RCP14S)
-  NODE_NAME_CASE(RSQRT14)
-  NODE_NAME_CASE(RSQRT14S)
-  NODE_NAME_CASE(FADD_RND)
-  NODE_NAME_CASE(FADDS)
-  NODE_NAME_CASE(FADDS_RND)
-  NODE_NAME_CASE(FSUB_RND)
-  NODE_NAME_CASE(FSUBS)
-  NODE_NAME_CASE(FSUBS_RND)
-  NODE_NAME_CASE(FMUL_RND)
-  NODE_NAME_CASE(FMULS)
-  NODE_NAME_CASE(FMULS_RND)
-  NODE_NAME_CASE(FDIV_RND)
-  NODE_NAME_CASE(FDIVS)
-  NODE_NAME_CASE(FDIVS_RND)
-  NODE_NAME_CASE(FSQRT_RND)
-  NODE_NAME_CASE(FSQRTS)
-  NODE_NAME_CASE(FSQRTS_RND)
-  NODE_NAME_CASE(FGETEXP)
-  NODE_NAME_CASE(FGETEXP_SAE)
-  NODE_NAME_CASE(FGETEXPS)
-  NODE_NAME_CASE(FGETEXPS_SAE)
-  NODE_NAME_CASE(SCALEF)
-  NODE_NAME_CASE(SCALEF_RND)
-  NODE_NAME_CASE(SCALEFS)
-  NODE_NAME_CASE(SCALEFS_RND)
-  NODE_NAME_CASE(MULHRS)
-  NODE_NAME_CASE(SINT_TO_FP_RND)
-  NODE_NAME_CASE(UINT_TO_FP_RND)
-  NODE_NAME_CASE(CVTTP2SI)
-  NODE_NAME_CASE(CVTTP2UI)
-  NODE_NAME_CASE(STRICT_CVTTP2SI)
-  NODE_NAME_CASE(STRICT_CVTTP2UI)
-  NODE_NAME_CASE(MCVTTP2SI)
-  NODE_NAME_CASE(MCVTTP2UI)
-  NODE_NAME_CASE(CVTTP2SI_SAE)
-  NODE_NAME_CASE(CVTTP2UI_SAE)
-  NODE_NAME_CASE(CVTTS2SI)
-  NODE_NAME_CASE(CVTTS2UI)
-  NODE_NAME_CASE(CVTTS2SI_SAE)
-  NODE_NAME_CASE(CVTTS2UI_SAE)
-  NODE_NAME_CASE(CVTSI2P)
-  NODE_NAME_CASE(CVTUI2P)
-  NODE_NAME_CASE(STRICT_CVTSI2P)
-  NODE_NAME_CASE(STRICT_CVTUI2P)
-  NODE_NAME_CASE(MCVTSI2P)
-  NODE_NAME_CASE(MCVTUI2P)
-  NODE_NAME_CASE(VFPCLASS)
-  NODE_NAME_CASE(VFPCLASSS)
-  NODE_NAME_CASE(MULTISHIFT)
-  NODE_NAME_CASE(SCALAR_SINT_TO_FP)
-  NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
-  NODE_NAME_CASE(SCALAR_UINT_TO_FP)
-  NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
-  NODE_NAME_CASE(CVTPS2PH)
-  NODE_NAME_CASE(STRICT_CVTPS2PH)
-  NODE_NAME_CASE(CVTPS2PH_SAE)
-  NODE_NAME_CASE(MCVTPS2PH)
-  NODE_NAME_CASE(MCVTPS2PH_SAE)
-  NODE_NAME_CASE(CVTPH2PS)
-  NODE_NAME_CASE(STRICT_CVTPH2PS)
-  NODE_NAME_CASE(CVTPH2PS_SAE)
-  NODE_NAME_CASE(CVTP2SI)
-  NODE_NAME_CASE(CVTP2UI)
-  NODE_NAME_CASE(MCVTP2SI)
-  NODE_NAME_CASE(MCVTP2UI)
-  NODE_NAME_CASE(CVTP2SI_RND)
-  NODE_NAME_CASE(CVTP2UI_RND)
-  NODE_NAME_CASE(CVTS2SI)
-  NODE_NAME_CASE(CVTS2UI)
-  NODE_NAME_CASE(CVTS2SI_RND)
-  NODE_NAME_CASE(CVTS2UI_RND)
-  NODE_NAME_CASE(CVTNEPS2BF16)
-  NODE_NAME_CASE(MCVTNEPS2BF16)
-  NODE_NAME_CASE(DPBF16PS)
-  NODE_NAME_CASE(DPFP16PS)
-  NODE_NAME_CASE(MPSADBW)
-  NODE_NAME_CASE(LWPINS)
-  NODE_NAME_CASE(MGATHER)
-  NODE_NAME_CASE(MSCATTER)
-  NODE_NAME_CASE(VPDPBUSD)
-  NODE_NAME_CASE(VPDPBUSDS)
-  NODE_NAME_CASE(VPDPWSSD)
-  NODE_NAME_CASE(VPDPWSSDS)
-  NODE_NAME_CASE(VPSHUFBITQMB)
-  NODE_NAME_CASE(GF2P8MULB)
-  NODE_NAME_CASE(GF2P8AFFINEQB)
-  NODE_NAME_CASE(GF2P8AFFINEINVQB)
-  NODE_NAME_CASE(NT_CALL)
-  NODE_NAME_CASE(NT_BRIND)
-  NODE_NAME_CASE(UMWAIT)
-  NODE_NAME_CASE(TPAUSE)
-  NODE_NAME_CASE(ENQCMD)
-  NODE_NAME_CASE(ENQCMDS)
-  NODE_NAME_CASE(VP2INTERSECT)
-  NODE_NAME_CASE(VPDPBSUD)
-  NODE_NAME_CASE(VPDPBSUDS)
-  NODE_NAME_CASE(VPDPBUUD)
-  NODE_NAME_CASE(VPDPBUUDS)
-  NODE_NAME_CASE(VPDPBSSD)
-  NODE_NAME_CASE(VPDPBSSDS)
-  NODE_NAME_CASE(VPDPWSUD)
-  NODE_NAME_CASE(VPDPWSUDS)
-  NODE_NAME_CASE(VPDPWUSD)
-  NODE_NAME_CASE(VPDPWUSDS)
-  NODE_NAME_CASE(VPDPWUUD)
-  NODE_NAME_CASE(VPDPWUUDS)
-  NODE_NAME_CASE(VMINMAX)
-  NODE_NAME_CASE(VMINMAX_SAE)
-  NODE_NAME_CASE(VMINMAXS)
-  NODE_NAME_CASE(VMINMAXS_SAE)
-  NODE_NAME_CASE(CVTP2IBS)
-  NODE_NAME_CASE(CVTP2IUBS)
-  NODE_NAME_CASE(CVTP2IBS_RND)
-  NODE_NAME_CASE(CVTP2IUBS_RND)
-  NODE_NAME_CASE(CVTTP2IBS)
-  NODE_NAME_CASE(CVTTP2IUBS)
-  NODE_NAME_CASE(CVTTP2IBS_SAE)
-  NODE_NAME_CASE(CVTTP2IUBS_SAE)
-  NODE_NAME_CASE(VCVT2PH2BF8)
-  NODE_NAME_CASE(VCVT2PH2BF8S)
-  NODE_NAME_CASE(VCVT2PH2HF8)
-  NODE_NAME_CASE(VCVT2PH2HF8S)
-  NODE_NAME_CASE(VCVTBIASPH2BF8)
-  NODE_NAME_CASE(VCVTBIASPH2BF8S)
-  NODE_NAME_CASE(VCVTBIASPH2HF8)
-  NODE_NAME_CASE(VCVTBIASPH2HF8S)
-  NODE_NAME_CASE(VCVTPH2BF8)
-  NODE_NAME_CASE(VCVTPH2BF8S)
-  NODE_NAME_CASE(VCVTPH2HF8)
-  NODE_NAME_CASE(VCVTPH2HF8S)
-  NODE_NAME_CASE(VMCVTBIASPH2BF8)
-  NODE_NAME_CASE(VMCVTBIASPH2BF8S)
-  NODE_NAME_CASE(VMCVTBIASPH2HF8)
-  NODE_NAME_CASE(VMCVTBIASPH2HF8S)
-  NODE_NAME_CASE(VMCVTPH2BF8)
-  NODE_NAME_CASE(VMCVTPH2BF8S)
-  NODE_NAME_CASE(VMCVTPH2HF8)
-  NODE_NAME_CASE(VMCVTPH2HF8S)
-  NODE_NAME_CASE(VCVTHF82PH)
-  NODE_NAME_CASE(AESENC128KL)
-  NODE_NAME_CASE(AESDEC128KL)
-  NODE_NAME_CASE(AESENC256KL)
-  NODE_NAME_CASE(AESDEC256KL)
-  NODE_NAME_CASE(AESENCWIDE128KL)
-  NODE_NAME_CASE(AESDECWIDE128KL)
-  NODE_NAME_CASE(AESENCWIDE256KL)
-  NODE_NAME_CASE(AESDECWIDE256KL)
-  NODE_NAME_CASE(CMPCCXADD)
-  NODE_NAME_CASE(TESTUI)
-  NODE_NAME_CASE(FP80_ADD)
-  NODE_NAME_CASE(STRICT_FP80_ADD)
-  NODE_NAME_CASE(CCMP)
-  NODE_NAME_CASE(CTEST)
-  NODE_NAME_CASE(CLOAD)
-  NODE_NAME_CASE(CSTORE)
-  NODE_NAME_CASE(CVTTS2SIS)
-  NODE_NAME_CASE(CVTTS2UIS)
-  NODE_NAME_CASE(CVTTS2SIS_SAE)
-  NODE_NAME_CASE(CVTTS2UIS_SAE)
-  NODE_NAME_CASE(CVTTP2SIS)
-  NODE_NAME_CASE(MCVTTP2SIS)
-  NODE_NAME_CASE(CVTTP2UIS_SAE)
-  NODE_NAME_CASE(CVTTP2SIS_SAE)
-  NODE_NAME_CASE(CVTTP2UIS)
-  NODE_NAME_CASE(MCVTTP2UIS)
-  NODE_NAME_CASE(POP_FROM_X87_REG)
+  case X86ISD::FIRST_NUMBER:
+    break;
+#define NODE_NAME_CASE(NODE)                                                   \
+  case X86ISD::NODE:                                                           \
+    return "X86ISD::" #NODE;
+    NODE_NAME_CASE(BSF)
+    NODE_NAME_CASE(BSR)
+    NODE_NAME_CASE(FSHL)
+    NODE_NAME_CASE(FSHR)
+    NODE_NAME_CASE(FAND)
+    NODE_NAME_CASE(FANDN)
+    NODE_NAME_CASE(FOR)
+    NODE_NAME_CASE(FXOR)
+    NODE_NAME_CASE(FILD)
+    NODE_NAME_CASE(FIST)
+    NODE_NAME_CASE(FP_TO_INT_IN_MEM)
+    NODE_NAME_CASE(FLD)
+    NODE_NAME_CASE(FST)
+    NODE_NAME_CASE(CALL)
+    NODE_NAME_CASE(CALL_RVMARKER)
+    NODE_NAME_CASE(IMP_CALL)
+    NODE_NAME_CASE(BT)
+    NODE_NAME_CASE(CMP)
+    NODE_NAME_CASE(FCMP)
+    NODE_NAME_CASE(STRICT_FCMP)
+    NODE_NAME_CASE(STRICT_FCMPS)
+    NODE_NAME_CASE(COMI)
+    NODE_NAME_CASE(UCOMI)
+    NODE_NAME_CASE(COMX)
+    NODE_NAME_CASE(UCOMX)
+    NODE_NAME_CASE(CMPM)
+    NODE_NAME_CASE(CMPMM)
+    NODE_NAME_CASE(STRICT_CMPM)
+    NODE_NAME_CASE(CMPMM_SAE)
+    NODE_NAME_CASE(SETCC)
+    NODE_NAME_CASE(CTSELECT)
+    NODE_NAME_CASE(SETCC_CARRY)
+    NODE_NAME_CASE(FSETCC)
+    NODE_NAME_CASE(FSETCCM)
+    NODE_NAME_CASE(FSETCCM_SAE)
+    NODE_NAME_CASE(CMOV)
+    NODE_NAME_CASE(BRCOND)
+    NODE_NAME_CASE(RET_GLUE)
+    NODE_NAME_CASE(IRET)
+    NODE_NAME_CASE(REP_STOS)
+    NODE_NAME_CASE(REP_MOVS)
+    NODE_NAME_CASE(GlobalBaseReg)
+    NODE_NAME_CASE(Wrapper)
+    NODE_NAME_CASE(WrapperRIP)
+    NODE_NAME_CASE(MOVQ2DQ)
+    NODE_NAME_CASE(MOVDQ2Q)
+    NODE_NAME_CASE(MMX_MOVD2W)
+    NODE_NAME_CASE(MMX_MOVW2D)
+    NODE_NAME_CASE(PEXTRB)
+    NODE_NAME_CASE(PEXTRW)
+    NODE_NAME_CASE(INSERTPS)
+    NODE_NAME_CASE(PINSRB)
+    NODE_NAME_CASE(PINSRW)
+    NODE_NAME_CASE(PSHUFB)
+    NODE_NAME_CASE(ANDNP)
+    NODE_NAME_CASE(BLENDI)
+    NODE_NAME_CASE(BLENDV)
+    NODE_NAME_CASE(HADD)
+    NODE_NAME_CASE(HSUB)
+    NODE_NAME_CASE(FHADD)
+    NODE_NAME_CASE(FHSUB)
+    NODE_NAME_CASE(CONFLICT)
+    NODE_NAME_CASE(FMAX)
+    NODE_NAME_CASE(FMAXS)
+    NODE_NAME_CASE(FMAX_SAE)
+    NODE_NAME_CASE(FMAXS_SAE)
+    NODE_NAME_CASE(STRICT_FMAX)
+    NODE_NAME_CASE(FMIN)
+    NODE_NAME_CASE(FMINS)
+    NODE_NAME_CASE(FMIN_SAE)
+    NODE_NAME_CASE(FMINS_SAE)
+    NODE_NAME_CASE(STRICT_FMIN)
+    NODE_NAME_CASE(FMAXC)
+    NODE_NAME_CASE(FMINC)
+    NODE_NAME_CASE(FRSQRT)
+    NODE_NAME_CASE(FRCP)
+    NODE_NAME_CASE(EXTRQI)
+    NODE_NAME_CASE(INSERTQI)
+    NODE_NAME_CASE(TLSADDR)
+    NODE_NAME_CASE(TLSBASEADDR)
+    NODE_NAME_CASE(TLSCALL)
+    NODE_NAME_CASE(TLSDESC)
+    NODE_NAME_CASE(EH_SJLJ_SETJMP)
+    NODE_NAME_CASE(EH_SJLJ_LONGJMP)
+    NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
+    NODE_NAME_CASE(EH_RETURN)
+    NODE_NAME_CASE(TC_RETURN)
+    NODE_NAME_CASE(FNSTCW16m)
+    NODE_NAME_CASE(FLDCW16m)
+    NODE_NAME_CASE(FNSTENVm)
+    NODE_NAME_CASE(FLDENVm)
+    NODE_NAME_CASE(LCMPXCHG_DAG)
+    NODE_NAME_CASE(LCMPXCHG8_DAG)
+    NODE_NAME_CASE(LCMPXCHG16_DAG)
+    NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
+    NODE_NAME_CASE(LADD)
+    NODE_NAME_CASE(LSUB)
+    NODE_NAME_CASE(LOR)
+    NODE_NAME_CASE(LXOR)
+    NODE_NAME_CASE(LAND)
+    NODE_NAME_CASE(LBTS)
+    NODE_NAME_CASE(LBTC)
+    NODE_NAME_CASE(LBTR)
+    NODE_NAME_CASE(LBTS_RM)
+    NODE_NAME_CASE(LBTC_RM)
+    NODE_NAME_CASE(LBTR_RM)
+    NODE_NAME_CASE(AADD)
+    NODE_NAME_CASE(AOR)
+    NODE_NAME_CASE(AXOR)
+    NODE_NAME_CASE(AAND)
+    NODE_NAME_CASE(VZEXT_MOVL)
+    NODE_NAME_CASE(VZEXT_LOAD)
+    NODE_NAME_CASE(VEXTRACT_STORE)
+    NODE_NAME_CASE(VTRUNC)
+    NODE_NAME_CASE(VTRUNCS)
+    NODE_NAME_CASE(VTRUNCUS)
+    NODE_NAME_CASE(VMTRUNC)
+    NODE_NAME_CASE(VMTRUNCS)
+    NODE_NAME_CASE(VMTRUNCUS)
+    NODE_NAME_CASE(VTRUNCSTORES)
+    NODE_NAME_CASE(VTRUNCSTOREUS)
+    NODE_NAME_CASE(VMTRUNCSTORES)
+    NODE_NAME_CASE(VMTRUNCSTOREUS)
+    NODE_NAME_CASE(VFPEXT)
+    NODE_NAME_CASE(STRICT_VFPEXT)
+    NODE_NAME_CASE(VFPEXT_SAE)
+    NODE_NAME_CASE(VFPEXTS)
+    NODE_NAME_CASE(VFPEXTS_SAE)
+    NODE_NAME_CASE(VFPROUND)
+    NODE_NAME_CASE(VFPROUND2)
+    NODE_NAME_CASE(VFPROUND2_RND)
+    NODE_NAME_CASE(STRICT_VFPROUND)
+    NODE_NAME_CASE(VMFPROUND)
+    NODE_NAME_CASE(VFPROUND_RND)
+    NODE_NAME_CASE(VFPROUNDS)
+    NODE_NAME_CASE(VFPROUNDS_RND)
+    NODE_NAME_CASE(VSHLDQ)
+    NODE_NAME_CASE(VSRLDQ)
+    NODE_NAME_CASE(VSHL)
+    NODE_NAME_CASE(VSRL)
+    NODE_NAME_CASE(VSRA)
+    NODE_NAME_CASE(VSHLI)
+    NODE_NAME_CASE(VSRLI)
+    NODE_NAME_CASE(VSRAI)
+    NODE_NAME_CASE(VSHLV)
+    NODE_NAME_CASE(VSRLV)
+    NODE_NAME_CASE(VSRAV)
+    NODE_NAME_CASE(VROTLI)
+    NODE_NAME_CASE(VROTRI)
+    NODE_NAME_CASE(VPPERM)
+    NODE_NAME_CASE(CMPP)
+    NODE_NAME_CASE(STRICT_CMPP)
+    NODE_NAME_CASE(PCMPEQ)
+    NODE_NAME_CASE(PCMPGT)
+    NODE_NAME_CASE(PHMINPOS)
+    NODE_NAME_CASE(ADD)
+    NODE_NAME_CASE(SUB)
+    NODE_NAME_CASE(ADC)
+    NODE_NAME_CASE(SBB)
+    NODE_NAME_CASE(SMUL)
+    NODE_NAME_CASE(UMUL)
+    NODE_NAME_CASE(OR)
+    NODE_NAME_CASE(XOR)
+    NODE_NAME_CASE(AND)
+    NODE_NAME_CASE(BEXTR)
+    NODE_NAME_CASE(BEXTRI)
+    NODE_NAME_CASE(BZHI)
+    NODE_NAME_CASE(PDEP)
+    NODE_NAME_CASE(PEXT)
+    NODE_NAME_CASE(MUL_IMM)
+    NODE_NAME_CASE(MOVMSK)
+    NODE_NAME_CASE(PTEST)
+    NODE_NAME_CASE(TESTP)
+    NODE_NAME_CASE(KORTEST)
+    NODE_NAME_CASE(KTEST)
+    NODE_NAME_CASE(KADD)
+    NODE_NAME_CASE(KSHIFTL)
+    NODE_NAME_CASE(KSHIFTR)
+    NODE_NAME_CASE(PACKSS)
+    NODE_NAME_CASE(PACKUS)
+    NODE_NAME_CASE(PALIGNR)
+    NODE_NAME_CASE(VALIGN)
+    NODE_NAME_CASE(VSHLD)
+    NODE_NAME_CASE(VSHRD)
+    NODE_NAME_CASE(PSHUFD)
+    NODE_NAME_CASE(PSHUFHW)
+    NODE_NAME_CASE(PSHUFLW)
+    NODE_NAME_CASE(SHUFP)
+    NODE_NAME_CASE(SHUF128)
+    NODE_NAME_CASE(MOVLHPS)
+    NODE_NAME_CASE(MOVHLPS)
+    NODE_NAME_CASE(MOVDDUP)
+    NODE_NAME_CASE(MOVSHDUP)
+    NODE_NAME_CASE(MOVSLDUP)
+    NODE_NAME_CASE(MOVSD)
+    NODE_NAME_CASE(MOVSS)
+    NODE_NAME_CASE(MOVSH)
+    NODE_NAME_CASE(UNPCKL)
+    NODE_NAME_CASE(UNPCKH)
+    NODE_NAME_CASE(VBROADCAST)
+    NODE_NAME_CASE(VBROADCAST_LOAD)
+    NODE_NAME_CASE(VBROADCASTM)
+    NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
+    NODE_NAME_CASE(VPERMILPV)
+    NODE_NAME_CASE(VPERMILPI)
+    NODE_NAME_CASE(VPERM2X128)
+    NODE_NAME_CASE(VPERMV)
+    NODE_NAME_CASE(VPERMV3)
+    NODE_NAME_CASE(VPERMI)
+    NODE_NAME_CASE(VPTERNLOG)
+    NODE_NAME_CASE(FP_TO_SINT_SAT)
+    NODE_NAME_CASE(FP_TO_UINT_SAT)
+    NODE_NAME_CASE(VFIXUPIMM)
+    NODE_NAME_CASE(VFIXUPIMM_SAE)
+    NODE_NAME_CASE(VFIXUPIMMS)
+    NODE_NAME_CASE(VFIXUPIMMS_SAE)
+    NODE_NAME_CASE(VRANGE)
+    NODE_NAME_CASE(VRANGE_SAE)
+    NODE_NAME_CASE(VRANGES)
+    NODE_NAME_CASE(VRANGES_SAE)
+    NODE_NAME_CASE(PMULUDQ)
+    NODE_NAME_CASE(PMULDQ)
+    NODE_NAME_CASE(PSADBW)
+    NODE_NAME_CASE(DBPSADBW)
+    NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
+    NODE_NAME_CASE(VAARG_64)
+    NODE_NAME_CASE(VAARG_X32)
+    NODE_NAME_CASE(DYN_ALLOCA)
+    NODE_NAME_CASE(MFENCE)
+    NODE_NAME_CASE(SEG_ALLOCA)
+    NODE_NAME_CASE(PROBED_ALLOCA)
+    NODE_NAME_CASE(RDRAND)
+    NODE_NAME_CASE(RDSEED)
+    NODE_NAME_CASE(RDPKRU)
+    NODE_NAME_CASE(WRPKRU)
+    NODE_NAME_CASE(VPMADDUBSW)
+    NODE_NAME_CASE(VPMADDWD)
+    NODE_NAME_CASE(VPSHA)
+    NODE_NAME_CASE(VPSHL)
+    NODE_NAME_CASE(VPCOM)
+    NODE_NAME_CASE(VPCOMU)
+    NODE_NAME_CASE(VPERMIL2)
+    NODE_NAME_CASE(FMSUB)
+    NODE_NAME_CASE(STRICT_FMSUB)
+    NODE_NAME_CASE(FNMADD)
+    NODE_NAME_CASE(STRICT_FNMADD)
+    NODE_NAME_CASE(FNMSUB)
+    NODE_NAME_CASE(STRICT_FNMSUB)
+    NODE_NAME_CASE(FMADDSUB)
+    NODE_NAME_CASE(FMSUBADD)
+    NODE_NAME_CASE(FMADD_RND)
+    NODE_NAME_CASE(FNMADD_RND)
+    NODE_NAME_CASE(FMSUB_RND)
+    NODE_NAME_CASE(FNMSUB_RND)
+    NODE_NAME_CASE(FMADDSUB_RND)
+    NODE_NAME_CASE(FMSUBADD_RND)
+    NODE_NAME_CASE(VFMADDC)
+    NODE_NAME_CASE(VFMADDC_RND)
+    NODE_NAME_CASE(VFCMADDC)
+    NODE_NAME_CASE(VFCMADDC_RND)
+    NODE_NAME_CASE(VFMULC)
+    NODE_NAME_CASE(VFMULC_RND)
+    NODE_NAME_CASE(VFCMULC)
+    NODE_NAME_CASE(VFCMULC_RND)
+    NODE_NAME_CASE(VFMULCSH)
+    NODE_NAME_CASE(VFMULCSH_RND)
+    NODE_NAME_CASE(VFCMULCSH)
+    NODE_NAME_CASE(VFCMULCSH_RND)
+    NODE_NAME_CASE(VFMADDCSH)
+    NODE_NAME_CASE(VFMADDCSH_RND)
+    NODE_NAME_CASE(VFCMADDCSH)
+    NODE_NAME_CASE(VFCMADDCSH_RND)
+    NODE_NAME_CASE(VPMADD52H)
+    NODE_NAME_CASE(VPMADD52L)
+    NODE_NAME_CASE(VRNDSCALE)
+    NODE_NAME_CASE(STRICT_VRNDSCALE)
+    NODE_NAME_CASE(VRNDSCALE_SAE)
+    NODE_NAME_CASE(VRNDSCALES)
+    NODE_NAME_CASE(VRNDSCALES_SAE)
+    NODE_NAME_CASE(VREDUCE)
+    NODE_NAME_CASE(VREDUCE_SAE)
+    NODE_NAME_CASE(VREDUCES)
+    NODE_NAME_CASE(VREDUCES_SAE)
+    NODE_NAME_CASE(VGETMANT)
+    NODE_NAME_CASE(VGETMANT_SAE)
+    NODE_NAME_CASE(VGETMANTS)
+    NODE_NAME_CASE(VGETMANTS_SAE)
+    NODE_NAME_CASE(PCMPESTR)
+    NODE_NAME_CASE(PCMPISTR)
+    NODE_NAME_CASE(XTEST)
+    NODE_NAME_CASE(COMPRESS)
+    NODE_NAME_CASE(EXPAND)
+    NODE_NAME_CASE(SELECTS)
+    NODE_NAME_CASE(ADDSUB)
+    NODE_NAME_CASE(RCP14)
+    NODE_NAME_CASE(RCP14S)
+    NODE_NAME_CASE(RSQRT14)
+    NODE_NAME_CASE(RSQRT14S)
+    NODE_NAME_CASE(FADD_RND)
+    NODE_NAME_CASE(FADDS)
+    NODE_NAME_CASE(FADDS_RND)
+    NODE_NAME_CASE(FSUB_RND)
+    NODE_NAME_CASE(FSUBS)
+    NODE_NAME_CASE(FSUBS_RND)
+    NODE_NAME_CASE(FMUL_RND)
+    NODE_NAME_CASE(FMULS)
+    NODE_NAME_CASE(FMULS_RND)
+    NODE_NAME_CASE(FDIV_RND)
+    NODE_NAME_CASE(FDIVS)
+    NODE_NAME_CASE(FDIVS_RND)
+    NODE_NAME_CASE(FSQRT_RND)
+    NODE_NAME_CASE(FSQRTS)
+    NODE_NAME_CASE(FSQRTS_RND)
+    NODE_NAME_CASE(FGETEXP)
+    NODE_NAME_CASE(FGETEXP_SAE)
+    NODE_NAME_CASE(FGETEXPS)
+    NODE_NAME_CASE(FGETEXPS_SAE)
+    NODE_NAME_CASE(SCALEF)
+    NODE_NAME_CASE(SCALEF_RND)
+    NODE_NAME_CASE(SCALEFS)
+    NODE_NAME_CASE(SCALEFS_RND)
+    NODE_NAME_CASE(MULHRS)
+    NODE_NAME_CASE(SINT_TO_FP_RND)
+    NODE_NAME_CASE(UINT_TO_FP_RND)
+    NODE_NAME_CASE(CVTTP2SI)
+    NODE_NAME_CASE(CVTTP2UI)
+    NODE_NAME_CASE(STRICT_CVTTP2SI)
+    NODE_NAME_CASE(STRICT_CVTTP2UI)
+    NODE_NAME_CASE(MCVTTP2SI)
+    NODE_NAME_CASE(MCVTTP2UI)
+    NODE_NAME_CASE(CVTTP2SI_SAE)
+    NODE_NAME_CASE(CVTTP2UI_SAE)
+    NODE_NAME_CASE(CVTTS2SI)
+    NODE_NAME_CASE(CVTTS2UI)
+    NODE_NAME_CASE(CVTTS2SI_SAE)
+    NODE_NAME_CASE(CVTTS2UI_SAE)
+    NODE_NAME_CASE(CVTSI2P)
+    NODE_NAME_CASE(CVTUI2P)
+    NODE_NAME_CASE(STRICT_CVTSI2P)
+    NODE_NAME_CASE(STRICT_CVTUI2P)
+    NODE_NAME_CASE(MCVTSI2P)
+    NODE_NAME_CASE(MCVTUI2P)
+    NODE_NAME_CASE(VFPCLASS)
+    NODE_NAME_CASE(VFPCLASSS)
+    NODE_NAME_CASE(MULTISHIFT)
+    NODE_NAME_CASE(SCALAR_SINT_TO_FP)
+    NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
+    NODE_NAME_CASE(SCALAR_UINT_TO_FP)
+    NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
+    NODE_NAME_CASE(CVTPS2PH)
+    NODE_NAME_CASE(STRICT_CVTPS2PH)
+    NODE_NAME_CASE(CVTPS2PH_SAE)
+    NODE_NAME_CASE(MCVTPS2PH)
+    NODE_NAME_CASE(MCVTPS2PH_SAE)
+    NODE_NAME_CASE(CVTPH2PS)
+    NODE_NAME_CASE(STRICT_CVTPH2PS)
+    NODE_NAME_CASE(CVTPH2PS_SAE)
+    NODE_NAME_CASE(CVTP2SI)
+    NODE_NAME_CASE(CVTP2UI)
+    NODE_NAME_CASE(MCVTP2SI)
+    NODE_NAME_CASE(MCVTP2UI)
+    NODE_NAME_CASE(CVTP2SI_RND)
+    NODE_NAME_CASE(CVTP2UI_RND)
+    NODE_NAME_CASE(CVTS2SI)
+    NODE_NAME_CASE(CVTS2UI)
+    NODE_NAME_CASE(CVTS2SI_RND)
+    NODE_NAME_CASE(CVTS2UI_RND)
+    NODE_NAME_CASE(CVTNEPS2BF16)
+    NODE_NAME_CASE(MCVTNEPS2BF16)
+    NODE_NAME_CASE(DPBF16PS)
+    NODE_NAME_CASE(DPFP16PS)
+    NODE_NAME_CASE(MPSADBW)
+    NODE_NAME_CASE(LWPINS)
+    NODE_NAME_CASE(MGATHER)
+    NODE_NAME_CASE(MSCATTER)
+    NODE_NAME_CASE(VPDPBUSD)
+    NODE_NAME_CASE(VPDPBUSDS)
+    NODE_NAME_CASE(VPDPWSSD)
+    NODE_NAME_CASE(VPDPWSSDS)
+    NODE_NAME_CASE(VPSHUFBITQMB)
+    NODE_NAME_CASE(GF2P8MULB)
+    NODE_NAME_CASE(GF2P8AFFINEQB)
+    NODE_NAME_CASE(GF2P8AFFINEINVQB)
+    NODE_NAME_CASE(NT_CALL)
+    NODE_NAME_CASE(NT_BRIND)
+    NODE_NAME_CASE(UMWAIT)
+    NODE_NAME_CASE(TPAUSE)
+    NODE_NAME_CASE(ENQCMD)
+    NODE_NAME_CASE(ENQCMDS)
+    NODE_NAME_CASE(VP2INTERSECT)
+    NODE_NAME_CASE(VPDPBSUD)
+    NODE_NAME_CASE(VPDPBSUDS)
+    NODE_NAME_CASE(VPDPBUUD)
+    NODE_NAME_CASE(VPDPBUUDS)
+    NODE_NAME_CASE(VPDPBSSD)
+    NODE_NAME_CASE(VPDPBSSDS)
+    NODE_NAME_CASE(VPDPWSUD)
+    NODE_NAME_CASE(VPDPWSUDS)
+    NODE_NAME_CASE(VPDPWUSD)
+    NODE_NAME_CASE(VPDPWUSDS)
+    NODE_NAME_CASE(VPDPWUUD)
+    NODE_NAME_CASE(VPDPWUUDS)
+    NODE_NAME_CASE(VMINMAX)
+    NODE_NAME_CASE(VMINMAX_SAE)
+    NODE_NAME_CASE(VMINMAXS)
+    NODE_NAME_CASE(VMINMAXS_SAE)
+    NODE_NAME_CASE(CVTP2IBS)
+    NODE_NAME_CASE(CVTP2IUBS)
+    NODE_NAME_CASE(CVTP2IBS_RND)
+    NODE_NAME_CASE(CVTP2IUBS_RND)
+    NODE_NAME_CASE(CVTTP2IBS)
+    NODE_NAME_CASE(CVTTP2IUBS)
+    NODE_NAME_CASE(CVTTP2IBS_SAE)
+    NODE_NAME_CASE(CVTTP2IUBS_SAE)
+    NODE_NAME_CASE(VCVT2PH2BF8)
+    NODE_NAME_CASE(VCVT2PH2BF8S)
+    NODE_NAME_CASE(VCVT2PH2HF8)
+    NODE_NAME_CASE(VCVT2PH2HF8S)
+    NODE_NAME_CASE(VCVTBIASPH2BF8)
+    NODE_NAME_CASE(VCVTBIASPH2BF8S)
+    NODE_NAME_CASE(VCVTBIASPH2HF8)
+    NODE_NAME_CASE(VCVTBIASPH2HF8S)
+    NODE_NAME_CASE(VCVTPH2BF8)
+    NODE_NAME_CASE(VCVTPH2BF8S)
+    NODE_NAME_CASE(VCVTPH2HF8)
+    NODE_NAME_CASE(VCVTPH2HF8S)
+    NODE_NAME_CASE(VMCVTBIASPH2BF8)
+    NODE_NAME_CASE(VMCVTBIASPH2BF8S)
+    NODE_NAME_CASE(VMCVTBIASPH2HF8)
+    NODE_NAME_CASE(VMCVTBIASPH2HF8S)
+    NODE_NAME_CASE(VMCVTPH2BF8)
+    NODE_NAME_CASE(VMCVTPH2BF8S)
+    NODE_NAME_CASE(VMCVTPH2HF8)
+    NODE_NAME_CASE(VMCVTPH2HF8S)
+    NODE_NAME_CASE(VCVTHF82PH)
+    NODE_NAME_CASE(AESENC128KL)
+    NODE_NAME_CASE(AESDEC128KL)
+    NODE_NAME_CASE(AESENC256KL)
+    NODE_NAME_CASE(AESDEC256KL)
+    NODE_NAME_CASE(AESENCWIDE128KL)
+    NODE_NAME_CASE(AESDECWIDE128KL)
+    NODE_NAME_CASE(AESENCWIDE256KL)
+    NODE_NAME_CASE(AESDECWIDE256KL)
+    NODE_NAME_CASE(CMPCCXADD)
+    NODE_NAME_CASE(TESTUI)
+    NODE_NAME_CASE(FP80_ADD)
+    NODE_NAME_CASE(STRICT_FP80_ADD)
+    NODE_NAME_CASE(CCMP)
+    NODE_NAME_CASE(CTEST)
+    NODE_NAME_CASE(CLOAD)
+    NODE_NAME_CASE(CSTORE)
+    NODE_NAME_CASE(CVTTS2SIS)
+    NODE_NAME_CASE(CVTTS2UIS)
+    NODE_NAME_CASE(CVTTS2SIS_SAE)
+    NODE_NAME_CASE(CVTTS2UIS_SAE)
+    NODE_NAME_CASE(CVTTP2SIS)
+    NODE_NAME_CASE(MCVTTP2SIS)
+    NODE_NAME_CASE(CVTTP2UIS_SAE)
+    NODE_NAME_CASE(CVTTP2SIS_SAE)
+    NODE_NAME_CASE(CVTTP2UIS)
+    NODE_NAME_CASE(MCVTTP2UIS)
+    NODE_NAME_CASE(POP_FROM_X87_REG)
   }
   return nullptr;
 #undef NODE_NAME_CASE
@@ -35644,7 +35720,7 @@ bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
     if (AM.HasBaseReg)
       return false;
     break;
-  default:  // Other stuff never works.
+  default: // Other stuff never works.
     return false;
   }
 
@@ -35749,12 +35825,13 @@ bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
   if (Val.getOpcode() != ISD::LOAD)
     return false;
 
-  if (!VT1.isSimple() || !VT1.isInteger() ||
-      !VT2.isSimple() || !VT2.isInteger())
+  if (!VT1.isSimple() || !VT1.isInteger() || !VT2.isSimple() ||
+      !VT2.isInteger())
     return false;
 
   switch (VT1.getSimpleVT().SimpleTy) {
-  default: break;
+  default:
+    break;
   case MVT::i8:
   case MVT::i16:
   case MVT::i32:
@@ -35985,8 +36062,10 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
   // sinkMBB:
   //  DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
   BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
-      .addReg(mainDstReg).addMBB(mainMBB)
-      .addReg(fallDstReg).addMBB(fallMBB);
+      .addReg(mainDstReg)
+      .addMBB(mainMBB)
+      .addReg(fallDstReg)
+      .addMBB(fallMBB);
 
   MI.eraseFromParent();
   return sinkMBB;
@@ -36052,8 +36131,8 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
   unsigned TotalNumXMMRegs = 8;
   bool UseGPOffset = (ArgMode == 1);
   bool UseFPOffset = (ArgMode == 2);
-  unsigned MaxOffset = TotalNumIntRegs * 8 +
-                       (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
+  unsigned MaxOffset =
+      TotalNumIntRegs * 8 + (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
 
   /* Align ArgSize to a multiple of 8 */
   unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
@@ -36131,13 +36210,14 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
 
     // Check if there is enough room left to pull this argument.
     BuildMI(thisMBB, MIMD, TII->get(X86::CMP32ri))
-      .addReg(OffsetReg)
-      .addImm(MaxOffset + 8 - ArgSizeA8);
+        .addReg(OffsetReg)
+        .addImm(MaxOffset + 8 - ArgSizeA8);
 
     // Branch to "overflowMBB" if offset >= max
     // Fall through to "offsetMBB" otherwise
     BuildMI(thisMBB, MIMD, TII->get(X86::JCC_1))
-      .addMBB(overflowMBB).addImm(X86::COND_AE);
+        .addMBB(overflowMBB)
+        .addImm(X86::COND_AE);
   }
 
   // In offsetMBB, emit code to use the reg_save_area.
@@ -36179,8 +36259,8 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
     // Compute the offset for the next argument
     Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
     BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32ri), NextOffsetReg)
-      .addReg(OffsetReg)
-      .addImm(UseFPOffset ? 16 : 8);
+        .addReg(OffsetReg)
+        .addImm(UseFPOffset ? 16 : 8);
 
     // Store it back into the va_list.
     BuildMI(offsetMBB, MIMD, TII->get(X86::MOV32mr))
@@ -36193,8 +36273,7 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
         .setMemRefs(StoreOnlyMMO);
 
     // Jump to endMBB
-    BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1))
-      .addMBB(endMBB);
+    BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1)).addMBB(endMBB);
   }
 
   //
@@ -36235,7 +36314,7 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
         .addImm(~(uint64_t)(Alignment.value() - 1));
   } else {
     BuildMI(overflowMBB, MIMD, TII->get(TargetOpcode::COPY), OverflowDestReg)
-      .addReg(OverflowAddrReg);
+        .addReg(OverflowAddrReg);
   }
 
   // Compute the next overflow address after this argument.
@@ -36261,10 +36340,11 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
 
   // If we branched, emit the PHI to the front of endMBB.
   if (offsetMBB) {
-    BuildMI(*endMBB, endMBB->begin(), MIMD,
-            TII->get(X86::PHI), DestReg)
-      .addReg(OffsetDestReg).addMBB(offsetMBB)
-      .addReg(OverflowDestReg).addMBB(overflowMBB);
+    BuildMI(*endMBB, endMBB->begin(), MIMD, TII->get(X86::PHI), DestReg)
+        .addReg(OffsetDestReg)
+        .addMBB(offsetMBB)
+        .addReg(OverflowDestReg)
+        .addMBB(overflowMBB);
   }
 
   // Erase the pseudo instruction
@@ -36279,8 +36359,8 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
 // kill marker, and set it if it should. Returns the correct kill
 // marker value.
 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
-                                     MachineBasicBlock* BB,
-                                     const TargetRegisterInfo* TRI) {
+                                     MachineBasicBlock *BB,
+                                     const TargetRegisterInfo *TRI) {
   if (isPhysRegUsedAfter(X86::EFLAGS, SelectItr))
     return false;
 
@@ -36747,11 +36827,21 @@ X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
   //
   //       + ---- <- ------------ <- ------------- <- ------------ +
   //       |                                                       |
-  // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
-  //                                                               |                                                               |
-  //                                                               + <- ----------- <- ------------ <- ----------- <- ------------ +
+  // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn
+  // probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
+  //                                                               | |
+  //                                                               + <-
+  //                                                               -----------
+  //                                                               <-
+  //                                                               ------------
+  //                                                               <-
+  //                                                               -----------
+  //                                                               <-
+  //                                                               ------------
+  //                                                               +
   //
-  // The property we want to enforce is to never have more than [page alloc] between two probes.
+  // The property we want to enforce is to never have more than [page alloc]
+  // between two probes.
 
   const unsigned XORMIOpc =
       TFI.Uses64BitFramePtr ? X86::XOR64mi32 : X86::XOR32mi;
@@ -36843,56 +36933,61 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
   // Add code to the main basic block to check if the stack limit has been hit,
   // and if so, jump to mallocMBB otherwise to bumpMBB.
   BuildMI(BB, MIMD, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
-  BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
-    .addReg(tmpSPVReg).addReg(sizeVReg);
-  BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
-    .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
-    .addReg(SPLimitVReg);
+  BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr : X86::SUB32rr), SPLimitVReg)
+      .addReg(tmpSPVReg)
+      .addReg(sizeVReg);
+  BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr : X86::CMP32mr))
+      .addReg(0)
+      .addImm(1)
+      .addReg(0)
+      .addImm(TlsOffset)
+      .addReg(TlsReg)
+      .addReg(SPLimitVReg);
   BuildMI(BB, MIMD, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
 
   // bumpMBB simply decreases the stack pointer, since we know the current
   // stacklet has enough space.
   BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), physSPReg)
-    .addReg(SPLimitVReg);
+      .addReg(SPLimitVReg);
   BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
-    .addReg(SPLimitVReg);
+      .addReg(SPLimitVReg);
   BuildMI(bumpMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
 
   // Calls into a routine in libgcc to allocate more space from the heap.
   const uint32_t *RegMask =
       Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
   if (IsLP64) {
-    BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI)
-      .addReg(sizeVReg);
+    BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI).addReg(sizeVReg);
     BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
-      .addExternalSymbol("__morestack_allocate_stack_space")
-      .addRegMask(RegMask)
-      .addReg(X86::RDI, RegState::Implicit)
-      .addReg(X86::RAX, RegState::ImplicitDefine);
+        .addExternalSymbol("__morestack_allocate_stack_space")
+        .addRegMask(RegMask)
+        .addReg(X86::RDI, RegState::Implicit)
+        .addReg(X86::RAX, RegState::ImplicitDefine);
   } else if (Is64Bit) {
-    BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI)
-      .addReg(sizeVReg);
+    BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI).addReg(sizeVReg);
     BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
-      .addExternalSymbol("__morestack_allocate_stack_space")
-      .addRegMask(RegMask)
-      .addReg(X86::EDI, RegState::Implicit)
-      .addReg(X86::EAX, RegState::ImplicitDefine);
+        .addExternalSymbol("__morestack_allocate_stack_space")
+        .addRegMask(RegMask)
+        .addReg(X86::EDI, RegState::Implicit)
+        .addReg(X86::EAX, RegState::ImplicitDefine);
   } else {
-    BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
-      .addImm(12);
+    BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg)
+        .addReg(physSPReg)
+        .addImm(12);
     BuildMI(mallocMBB, MIMD, TII->get(X86::PUSH32r)).addReg(sizeVReg);
     BuildMI(mallocMBB, MIMD, TII->get(X86::CALLpcrel32))
-      .addExternalSymbol("__morestack_allocate_stack_space")
-      .addRegMask(RegMask)
-      .addReg(X86::EAX, RegState::ImplicitDefine);
+        .addExternalSymbol("__morestack_allocate_stack_space")
+        .addRegMask(RegMask)
+        .addReg(X86::EAX, RegState::ImplicitDefine);
   }
 
   if (!Is64Bit)
-    BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
-      .addImm(16);
+    BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg)
+        .addReg(physSPReg)
+        .addImm(16);
 
   BuildMI(mallocMBB, MIMD, TII->get(TargetOpcode::COPY), mallocPtrVReg)
-    .addReg(IsLP64 ? X86::RAX : X86::EAX);
+      .addReg(IsLP64 ? X86::RAX : X86::EAX);
   BuildMI(mallocMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
 
   // Set up the CFG correctly.
@@ -36947,7 +37042,8 @@ X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
   RestoreMBB->setIsEHPad(true);
 
   auto RestoreMBBI = RestoreMBB->begin();
-  BuildMI(*RestoreMBB, RestoreMBBI, MIMD, TII.get(X86::JMP_4)).addMBB(TargetMBB);
+  BuildMI(*RestoreMBB, RestoreMBBI, MIMD, TII.get(X86::JMP_4))
+      .addMBB(TargetMBB);
   return BB;
 }
 
@@ -36969,9 +37065,10 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
   // FIXME: The 32-bit calls have non-standard calling conventions. Use a
   // proper register mask.
   const uint32_t *RegMask =
-      Subtarget.is64Bit() ?
-      Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
-      Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
+      Subtarget.is64Bit()
+          ? Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask()
+          : Subtarget.getRegisterInfo()->getCallPreservedMask(*F,
+                                                              CallingConv::C);
   if (Subtarget.is64Bit()) {
     MachineInstrBuilder MIB =
         BuildMI(*BB, MI, MIMD, TII->get(X86::MOV64rm), X86::RDI)
@@ -37227,8 +37324,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
   MemOpndSlot = CurOp;
 
   MVT PVT = getPointerTy(MF->getDataLayout());
-  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
-         "Invalid Pointer Size!");
+  assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
 
   // For v = setjmp(buf), we generate
   //
@@ -37276,19 +37372,19 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
     LabelReg = MRI.createVirtualRegister(PtrRC);
     if (Subtarget.is64Bit()) {
       MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA64r), LabelReg)
-              .addReg(X86::RIP)
-              .addImm(0)
-              .addReg(0)
-              .addMBB(restoreMBB)
-              .addReg(0);
+                .addReg(X86::RIP)
+                .addImm(0)
+                .addReg(0)
+                .addMBB(restoreMBB)
+                .addReg(0);
     } else {
-      const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
+      const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII);
       MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA32r), LabelReg)
-              .addReg(XII->getGlobalBaseReg(MF))
-              .addImm(0)
-              .addReg(0)
-              .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
-              .addReg(0);
+                .addReg(XII->getGlobalBaseReg(MF))
+                .addImm(0)
+                .addReg(0)
+                .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
+                .addReg(0);
     }
   } else
     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
@@ -37312,7 +37408,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
 
   // Setup
   MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::EH_SjLj_Setup))
-          .addMBB(restoreMBB);
+            .addMBB(restoreMBB);
 
   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   MIB.addRegMask(RegInfo->getNoPreservedMask());
@@ -37339,9 +37435,9 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
     Register FramePtr = RegInfo->getFrameRegister(*MF);
     Register BasePtr = RegInfo->getBaseRegister();
     unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
-    addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr),
-                 FramePtr, true, X86FI->getRestoreBasePointerOffset())
-      .setMIFlag(MachineInstr::FrameSetup);
+    addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr), FramePtr,
+                 true, X86FI->getRestoreBasePointerOffset())
+        .setMIFlag(MachineInstr::FrameSetup);
   }
   BuildMI(restoreMBB, MIMD, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
   BuildMI(restoreMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
@@ -37424,9 +37520,9 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
   if (PVT == MVT::i64) {
     Register TmpZReg = MRI.createVirtualRegister(PtrRC);
     BuildMI(checkSspMBB, MIMD, TII->get(X86::SUBREG_TO_REG), TmpZReg)
-      .addImm(0)
-      .addReg(ZReg)
-      .addImm(X86::sub_32bit);
+        .addImm(0)
+        .addReg(ZReg)
+        .addImm(X86::sub_32bit);
     ZReg = TmpZReg;
   }
 
@@ -37557,11 +37653,10 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
   SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
 
   MVT PVT = getPointerTy(MF->getDataLayout());
-  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
-         "Invalid Pointer Size!");
+  assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
 
   const TargetRegisterClass *RC =
-    (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
+      (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
   Register Tmp = MRI.createVirtualRegister(RC);
   // Since FP is only updated here but NOT referenced, it's treated as GPR.
   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
@@ -37944,10 +38039,8 @@ X86TargetLowering::emitPatchableEventCall(MachineInstr &MI,
 /// This approach ensures that when i64 is type-legalized into two i32
 /// operations, both operations share the same condition byte rather than
 /// each independently reading (and destroying) EFLAGS.
-static MachineBasicBlock *
-emitCTSelectI386WithConditionMaterialization(MachineInstr &MI,
-                                              MachineBasicBlock *BB,
-                                              unsigned InternalPseudoOpcode) {
+static MachineBasicBlock *emitCTSelectI386WithConditionMaterialization(
+    MachineInstr &MI, MachineBasicBlock *BB, unsigned InternalPseudoOpcode) {
   const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
   const MIMetadata MIMD(MI);
   MachineFunction *MF = BB->getParent();
@@ -37991,12 +38084,12 @@ emitCTSelectI386WithConditionMaterialization(MachineInstr &MI,
   }
 
   BuildMI(*BB, MI, MIMD, TII->get(InternalPseudoOpcode))
-      .addDef(DstReg)         // dst (output)
-      .addDef(TmpByteReg)     // tmp_byte (output)
-      .addDef(TmpMaskReg)     // tmp_mask (output)
-      .addReg(Src1Reg)        // src1 (input)
-      .addReg(Src2Reg)        // src2 (input)
-      .addReg(CondByteReg);   // pre-materialized condition byte (input)
+      .addDef(DstReg)       // dst (output)
+      .addDef(TmpByteReg)   // tmp_byte (output)
+      .addDef(TmpMaskReg)   // tmp_mask (output)
+      .addReg(Src1Reg)      // src1 (input)
+      .addReg(Src2Reg)      // src2 (input)
+      .addReg(CondByteReg); // pre-materialized condition byte (input)
 
   MI.eraseFromParent();
   return BB;
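
The comment above spells out the key idea: the condition is materialized once into a byte register, and every piece of the select consumes that byte, so EFLAGS is read a single time and the selection itself stays branch-free. A minimal C++ sketch of the mask-and-blend pattern this corresponds to (illustrative only; the pseudo's actual post-RA expansion is defined elsewhere in the patch):

#include <cstdint>

// Branch-free select of one 32-bit word. 'cond' is the materialized 0/1 byte.
static inline uint32_t ctSelect32(uint8_t cond, uint32_t tval, uint32_t fval) {
  uint32_t mask = 0u - static_cast<uint32_t>(cond); // 0x00000000 or 0xFFFFFFFF
  return (tval & mask) | (fval & ~mask);
}

// After i64 is type-legalized into two i32 operations, both halves reuse the
// same condition byte instead of re-reading (and clobbering) EFLAGS.
static inline uint64_t ctSelect64(uint8_t cond, uint64_t t, uint64_t f) {
  uint32_t lo = ctSelect32(cond, uint32_t(t), uint32_t(f));
  uint32_t hi = ctSelect32(cond, uint32_t(t >> 32), uint32_t(f >> 32));
  return (uint64_t(hi) << 32) | lo;
}
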
@@ -38022,8 +38115,8 @@ struct FPLoadMemOperands {
 // Check if a virtual register is defined by a simple FP load instruction
 // Returns the memory operands if it's a simple load, otherwise returns invalid
 static FPLoadMemOperands getFPLoadMemOperands(Register Reg,
-                                               MachineRegisterInfo &MRI,
-                                               unsigned ExpectedLoadOpcode) {
+                                              MachineRegisterInfo &MRI,
+                                              unsigned ExpectedLoadOpcode) {
   FPLoadMemOperands Result;
 
   if (!Reg.isVirtual())
@@ -38042,9 +38135,9 @@ static FPLoadMemOperands getFPLoadMemOperands(Register Reg,
   if (DefMI->hasOrderedMemoryRef())
     return Result;
 
-  // The load should have a single def (the destination register) and memory operands
-  // Format: %reg = LD_Fpxxm <fi#N>, 1, %noreg, 0, %noreg
-  // or: %reg = LD_Fpxxm %base, scale, %index, disp, %segment
+  // The load should have a single def (the destination register) and memory
+  // operands. Format: %reg = LD_Fpxxm <fi#N>, 1, %noreg, 0, %noreg
+  //         or: %reg = LD_Fpxxm %base, scale, %index, disp, %segment
   if (DefMI->getNumOperands() < 6)
     return Result;
 
@@ -38069,9 +38162,8 @@ static FPLoadMemOperands getFPLoadMemOperands(Register Reg,
 
   // Check if this is a constant pool load
   // Format: %reg = LD_Fpxxm $noreg, 1, $noreg, %const.N, $noreg
-  if (BaseMO.isReg() && BaseMO.getReg() == X86::NoRegister &&
-      ScaleMO.isImm() && IndexMO.isReg() &&
-      IndexMO.getReg() == X86::NoRegister &&
+  if (BaseMO.isReg() && BaseMO.getReg() == X86::NoRegister && ScaleMO.isImm() &&
+      IndexMO.isReg() && IndexMO.getReg() == X86::NoRegister &&
       DispMO.isCPI() && SegMO.isReg()) {
     Result.IsValid = true;
     Result.IsConstantPool = true;
@@ -38085,9 +38177,8 @@ static FPLoadMemOperands getFPLoadMemOperands(Register Reg,
 
   // Check if this is a global variable load
   // Format: %reg = LD_Fpxxm $noreg, 1, $noreg, @global_name, $noreg
-  if (BaseMO.isReg() && BaseMO.getReg() == X86::NoRegister &&
-      ScaleMO.isImm() && IndexMO.isReg() &&
-      IndexMO.getReg() == X86::NoRegister &&
+  if (BaseMO.isReg() && BaseMO.getReg() == X86::NoRegister && ScaleMO.isImm() &&
+      IndexMO.isReg() && IndexMO.getReg() == X86::NoRegister &&
       DispMO.isGlobal() && SegMO.isReg()) {
     Result.IsValid = true;
     Result.IsGlobal = true;
@@ -38101,8 +38192,8 @@ static FPLoadMemOperands getFPLoadMemOperands(Register Reg,
   }
 
   // Regular memory operands (e.g., pointer loads)
-  if (BaseMO.isReg() && ScaleMO.isImm() && IndexMO.isReg() &&
-      DispMO.isImm() && SegMO.isReg()) {
+  if (BaseMO.isReg() && ScaleMO.isImm() && IndexMO.isReg() && DispMO.isImm() &&
+      SegMO.isReg()) {
     Result.IsValid = true;
     Result.IsFrameIndex = false;
     Result.IsConstantPool = false;
@@ -38128,7 +38219,8 @@ static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI,
   unsigned RegSizeInByte = 4;
 
   // Get operands
-  // MI operands: %result:rfp80 = CTSELECT_I386 %false:rfp80, %true:rfp80, %cond:i8imm
+  // MI operands: %result:rfp80 = CTSELECT_I386 %false:rfp80, %true:rfp80,
+  // %cond:i8imm
   unsigned DestReg = MI.getOperand(0).getReg();
   unsigned FalseReg = MI.getOperand(1).getReg();
   unsigned TrueReg = MI.getOperand(2).getReg();
@@ -38146,7 +38238,7 @@ static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI,
 
   // Helper to load integer from memory operands
   auto loadIntFromMemOperands = [&](const FPLoadMemOperands &MemOps,
-                                     unsigned Offset) -> unsigned {
+                                    unsigned Offset) -> unsigned {
     unsigned IntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
     MachineInstrBuilder MIB =
         BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), IntReg);
@@ -38162,18 +38254,21 @@ static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI,
       // Constant pool: base_reg + scale + index + CP_index + segment
       // MOV32rm format: base, scale, index, displacement, segment
       MIB.addReg(X86::NoRegister)  // Base register
-          .addImm(MemOps.ScaleVal)  // Scale
-          .addReg(MemOps.IndexReg)  // Index register
-          .addConstantPoolIndex(MemOps.ConstantPoolIndex, Offset)  // Displacement (CP index)
-          .addReg(MemOps.SegReg);  // Segment
+          .addImm(MemOps.ScaleVal) // Scale
+          .addReg(MemOps.IndexReg) // Index register
+          .addConstantPoolIndex(MemOps.ConstantPoolIndex,
+                                Offset) // Displacement (CP index)
+          .addReg(MemOps.SegReg);       // Segment
     } else if (MemOps.IsGlobal) {
       // Global variable: base_reg + scale + index + global + segment
       // MOV32rm format: base, scale, index, displacement, segment
       MIB.addReg(X86::NoRegister)  // Base register
-          .addImm(MemOps.ScaleVal)  // Scale
-          .addReg(MemOps.IndexReg)  // Index register
-          .addGlobalAddress(MemOps.Global, MemOps.GlobalOffset + Offset)  // Displacement (global address)
-          .addReg(MemOps.SegReg);  // Segment
+          .addImm(MemOps.ScaleVal) // Scale
+          .addReg(MemOps.IndexReg) // Index register
+          .addGlobalAddress(MemOps.Global,
+                            MemOps.GlobalOffset +
+                                Offset) // Displacement (global address)
+          .addReg(MemOps.SegReg);       // Segment
     } else {
       // Regular memory: base_reg + scale + index + disp + segment
       MIB.addReg(MemOps.BaseReg)
@@ -38188,45 +38283,47 @@ static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI,
 
   // Optimized path: load integers directly from memory when both operands are
   // memory loads, avoiding FP register round-trip
-  auto emitCtSelectFromMemory = [&](unsigned NumValues,
-                                     const FPLoadMemOperands &TrueMemOps,
-                                     const FPLoadMemOperands &FalseMemOps,
-                                     int ResultSlot) {
-    for (unsigned Val = 0; Val < NumValues; ++Val) {
-      unsigned Offset = Val * RegSizeInByte;
-
-      // Load true and false values directly from their memory locations as integers
-      unsigned TrueIntReg = loadIntFromMemOperands(TrueMemOps, Offset);
-      unsigned FalseIntReg = loadIntFromMemOperands(FalseMemOps, Offset);
-
-      // Use CTSELECT_I386_INT_GR32 pseudo instruction for constant-time selection
-      unsigned ResultIntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
-      unsigned TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass);
-      unsigned TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass);
-
-      BuildMI(*BB, MI, MIMD, TII->get(X86::CTSELECT_I386_INT_GR32rr))
-          .addDef(ResultIntReg)    // dst (output)
-          .addDef(TmpByteReg)      // tmp_byte (output)
-          .addDef(TmpMaskReg)      // tmp_mask (output)
-          .addReg(FalseIntReg)     // src1 (input) - false value
-          .addReg(TrueIntReg)      // src2 (input) - true value
-          .addReg(CondByteReg);    // pre-materialized condition byte (input)
-
-      // Store result back to result slot
-      BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32mr))
-          .addFrameIndex(ResultSlot)
-          .addImm(1)
-          .addReg(0)
-          .addImm(Offset)
-          .addReg(0)
-          .addReg(ResultIntReg, RegState::Kill);
-    }
-  };
+  auto emitCtSelectFromMemory =
+      [&](unsigned NumValues, const FPLoadMemOperands &TrueMemOps,
+          const FPLoadMemOperands &FalseMemOps, int ResultSlot) {
+        for (unsigned Val = 0; Val < NumValues; ++Val) {
+          unsigned Offset = Val * RegSizeInByte;
+
+          // Load true and false values directly from their memory locations as
+          // integers
+          unsigned TrueIntReg = loadIntFromMemOperands(TrueMemOps, Offset);
+          unsigned FalseIntReg = loadIntFromMemOperands(FalseMemOps, Offset);
+
+          // Use CTSELECT_I386_INT_GR32 pseudo instruction for constant-time
+          // selection
+          unsigned ResultIntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+          unsigned TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass);
+          unsigned TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+
+          BuildMI(*BB, MI, MIMD, TII->get(X86::CTSELECT_I386_INT_GR32rr))
+              .addDef(ResultIntReg) // dst (output)
+              .addDef(TmpByteReg)   // tmp_byte (output)
+              .addDef(TmpMaskReg)   // tmp_mask (output)
+              .addReg(FalseIntReg)  // src1 (input) - false value
+              .addReg(TrueIntReg)   // src2 (input) - true value
+              .addReg(CondByteReg); // pre-materialized condition byte (input)
+
+          // Store result back to result slot
+          BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32mr))
+              .addFrameIndex(ResultSlot)
+              .addImm(1)
+              .addReg(0)
+              .addImm(Offset)
+              .addReg(0)
+              .addReg(ResultIntReg, RegState::Kill);
+        }
+      };
 
-  auto emitCtSelectWithPseudo = [&](unsigned NumValues, int TrueSlot, int FalseSlot, int ResultSlot) {
+  auto emitCtSelectWithPseudo = [&](unsigned NumValues, int TrueSlot,
+                                    int FalseSlot, int ResultSlot) {
     for (unsigned Val = 0; Val < NumValues; ++Val) {
       unsigned Offset = Val * RegSizeInByte;
-      
+
       // Load true and false values from stack as 32-bit integers
       unsigned TrueIntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
       BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), TrueIntReg)
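
The emitCtSelectFromMemory path above avoids the x87 round-trip entirely: when both FP operands are plain loads (frame slot, constant pool, or global), their 32-bit words are re-read directly from the original locations and blended word by word. A pointer-based sketch of that behavior, with illustrative names only (the real code re-emits MOV32rm using the load's own memory operands):

#include <cstdint>

// Word-by-word constant-time blend of two values already sitting in memory.
// numWords would be 1 for f32, 2 for f64, and (with padding) 3 for the
// 80-bit x87 format as stored in memory.
static void ctSelectWordsFromMemory(uint8_t cond, const uint32_t *trueSrc,
                                    const uint32_t *falseSrc, uint32_t *dst,
                                    unsigned numWords) {
  uint32_t mask = 0u - static_cast<uint32_t>(cond);
  for (unsigned w = 0; w != numWords; ++w)
    dst[w] = (trueSrc[w] & mask) | (falseSrc[w] & ~mask);
}
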
@@ -38244,18 +38341,19 @@ static MachineBasicBlock *emitCTSelectI386WithFpType(MachineInstr &MI,
           .addImm(Offset)
           .addReg(0);
 
-      // Use CTSELECT_I386_INT_GR32 pseudo instruction for constant-time selection
+      // Use CTSELECT_I386_INT_GR32 pseudo instruction for constant-time
+      // selection
       unsigned ResultIntReg = MRI.createVirtualRegister(&X86::GR32RegClass);
       unsigned TmpByteReg = MRI.createVirtualRegister(&X86::GR8RegClass);
       unsigned TmpMaskReg = MRI.createVirtualRegister(&X86::GR32RegClass);
-      
+
       BuildMI(*BB, MI, MIMD, TII->get(X86::CTSELECT_I386_INT_GR32rr))
-          .addDef(ResultIntReg)     // dst (output)
-          .addDef(TmpByteReg)       // tmp_byte (output)
-          .addDef(TmpMaskReg)       // tmp_mask (output)
-          .addReg(FalseIntReg)      // src1 (input) - false value
-          .addReg(TrueIntReg)       // src2 (input) - true value
-          .addReg(CondByteReg);     // pre-materialized condition byte (input)
+          .addDef(ResultIntReg) // dst (output)
+          .addDef(TmpByteReg)   // tmp_byte (output)
+          .addDef(TmpMaskReg)   // tmp_mask (output)
+          .addReg(FalseIntReg)  // src1 (input) - false value
+          .addReg(TrueIntReg)   // src2 (input) - true value
+          .addReg(CondByteReg); // pre-materialized condition byte (input)
 
       // Store result back to result slot
       BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32mr))
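
emitCtSelectWithPseudo handles the general case by spilling both FP operands to stack slots, selecting each 32-bit word with the shared condition byte via the integer pseudo, and reloading the blended result. For a rough end-to-end picture, a user-level sketch of what that amounts to for long double (hypothetical helper, assuming an i386 toolchain where long double is the 10-byte x87 extended type; not the lowering's actual output):

#include <cstdint>
#include <cstring>

// Spill both operands, select each 32-bit word with the shared condition
// byte, then reload the blended bytes as long double.
static long double ctSelectF80(uint8_t cond, long double t, long double f) {
  uint32_t slotT[3] = {0, 0, 0}, slotF[3] = {0, 0, 0}, slotR[3];
  std::memcpy(slotT, &t, 10); // x87 extended precision occupies 10 bytes
  std::memcpy(slotF, &f, 10);
  uint32_t mask = 0u - static_cast<uint32_t>(cond);
  for (unsigned w = 0; w != 3; ++w)
    slotR[w] = (slotT[w] & mask) | (slotF[w] & ~mask);
  long double r = 0.0L;
  std::memcpy(&r, slotR, 10);
  return r;
}
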
@@ -38416,7 +38514,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   const MIMetadata MIMD(MI);
 
   auto TMMImmToTMMReg = [](unsigned Imm) {
-    assert (Imm < 8 && "Illegal tmm index");
+    assert(Imm < 8 && "Illegal tmm index");
     return X86::TMM0 + Imm;
   };
   switch (MI.getOpcode()) {
@@ -38483,7 +38581,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     return emitCTSelectI386WithFpType(MI, BB, X86::CTSELECT_I386_FP64rr);
   case X86::CTSELECT_I386_FP80rr:
     return emitCTSelectI386WithFpType(MI, BB, X86::CTSELECT_I386_FP80rr);
-    
+
   case X86::FP80_ADDr:
   case X86::FP80_ADDm32: {
     // Change the floating point control register to use double extended
@@ -38571,29 +38669,30 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero.
     Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
     BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
-      .addReg(OldCW, RegState::Kill).addImm(0xC00);
+        .addReg(OldCW, RegState::Kill)
+        .addImm(0xC00);
 
     // Extract to 16 bits.
     Register NewCW16 =
         MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
     BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
-      .addReg(NewCW, RegState::Kill, X86::sub_16bit);
+        .addReg(NewCW, RegState::Kill, X86::sub_16bit);
 
     // Prepare memory for FLDCW.
     int NewCWFrameIdx =
         MF->getFrameInfo().CreateStackObject(2, Align(2), false);
     addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
                       NewCWFrameIdx)
-      .addReg(NewCW16, RegState::Kill);
+        .addReg(NewCW16, RegState::Kill);
 
     // Reload the modified control word now...
-    addFrameReference(BuildMI(*BB, MI, MIMD,
-                              TII->get(X86::FLDCW16m)), NewCWFrameIdx);
+    addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
+                      NewCWFrameIdx);
 
     // Get the X86 opcode to use.
     unsigned Opc;
     switch (MI.getOpcode()) {
-    // clang-format off
+      // clang-format off
     default: llvm_unreachable("illegal opcode!");
     case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
     case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
@@ -38604,7 +38703,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
     case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
     case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
-    // clang-format on
+      // clang-format on
     }
 
     X86AddressMode AM = getAddressFromInstr(&MI, 0);
@@ -38821,7 +38920,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   case X86::PTMMULTF32PS: {
     unsigned Opc;
     switch (MI.getOpcode()) {
-    default: llvm_unreachable("illegal opcode!");
+    default:
+      llvm_unreachable("illegal opcode!");
       // clang-format off
     case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
     case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
@@ -38868,7 +38968,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   case X86::PTILESTORED: {
     unsigned Opc;
     switch (MI.getOpcode()) {
-    default: llvm_unreachable("illegal opcode!");
+    default:
+      llvm_unreachable("illegal opcode!");
 #define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
     case X86::PTILELOADD:
       Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD);
@@ -38990,11 +39091,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
 //                           X86 Optimization Hooks
 //===----------------------------------------------------------------------===//
 
-bool
-X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
-                                                const APInt &DemandedBits,
-                                                const APInt &DemandedElts,
-                                                TargetLoweringOpt &TLO) const {
+bool X86TargetLowering::targetShrinkDemandedConstant(
+    SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
+    TargetLoweringOpt &TLO) const {
   EVT VT = Op.getValueType();
   unsigned Opcode = Op.getOpcode();
   unsigned EltSize = VT.getScalarSizeInBits();
@@ -39179,16 +39278,15 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
   unsigned NumElts = DemandedElts.getBitWidth();
   unsigned Opc = Op.getOpcode();
   EVT VT = Op.getValueType();
-  assert((Opc >= ISD::BUILTIN_OP_END ||
-          Opc == ISD::INTRINSIC_WO_CHAIN ||
-          Opc == ISD::INTRINSIC_W_CHAIN ||
-          Opc == ISD::INTRINSIC_VOID) &&
+  assert((Opc >= ISD::BUILTIN_OP_END || Opc == ISD::INTRINSIC_WO_CHAIN ||
+          Opc == ISD::INTRINSIC_W_CHAIN || Opc == ISD::INTRINSIC_VOID) &&
          "Should use MaskedValueIsZero if you don't know whether Op"
          " is a target node!");
 
   Known.resetAll();
   switch (Opc) {
-  default: break;
+  default:
+    break;
   case X86ISD::MUL_IMM: {
     KnownBits Known2;
     Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
@@ -39417,7 +39515,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
     SDValue Op0 = Op.getOperand(0);
     SDValue Op1 = Op.getOperand(1);
 
-    if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
+    if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
       unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
       unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
 
@@ -39611,7 +39709,8 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
       unsigned NumElts = VT.getVectorNumElements();
       if (Mask.size() == NumElts) {
         SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
-        Known.Zero.setAllBits(); Known.One.setAllBits();
+        Known.Zero.setAllBits();
+        Known.One.setAllBits();
         for (unsigned i = 0; i != NumElts; ++i) {
           if (!DemandedElts[i])
             continue;
@@ -39756,16 +39855,18 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
   case X86ISD::ANDNP: {
     unsigned Tmp0 =
         DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
-    if (Tmp0 == 1) return 1; // Early out.
+    if (Tmp0 == 1)
+      return 1; // Early out.
     unsigned Tmp1 =
         DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
     return std::min(Tmp0, Tmp1);
   }
 
   case X86ISD::CMOV: {
-    unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
-    if (Tmp0 == 1) return 1;  // Early out.
-    unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
+    unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
+    if (Tmp0 == 1)
+      return 1; // Early out.
+    unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
     return std::min(Tmp0, Tmp1);
   }
   }
@@ -40141,7 +40242,6 @@ static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
         PermuteImm = (unsigned)ShiftAmt;
         return true;
       }
-
     }
   }
 
@@ -40201,7 +40301,8 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
 
   // Attempt to match against either an unary or binary PACKSS/PACKUS shuffle.
   if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
-      ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
+      ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) &&
+       Subtarget.hasInt256()) ||
       ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
     if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
                              Subtarget)) {
@@ -40760,9 +40861,9 @@ static SDValue combineX86ShuffleChain(
         SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
         SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
         return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
-                          CanonicalizeShuffleInput(RootVT, LHS),
-                          CanonicalizeShuffleInput(RootVT, RHS),
-                          DAG.getTargetConstant(PermMask, DL, MVT::i8));
+                           CanonicalizeShuffleInput(RootVT, LHS),
+                           CanonicalizeShuffleInput(RootVT, RHS),
+                           DAG.getTargetConstant(PermMask, DL, MVT::i8));
       }
     }
   }
@@ -40856,8 +40957,8 @@ static SDValue combineX86ShuffleChain(
     }
 
     if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
-                                 AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
-                                 PermuteImm) &&
+                                 AllowIntDomain, DAG, Subtarget, Shuffle,
+                                 ShuffleVT, PermuteImm) &&
         (!IsMaskedShuffle ||
          (NumRootElts == ShuffleVT.getVectorNumElements()))) {
       if (Depth == 0 && RootOpc == Shuffle)
@@ -41736,11 +41837,9 @@ static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef<SDValue> Ops,
 }
 
 namespace llvm {
-  namespace X86 {
-    enum {
-      MaxShuffleCombineDepth = 8
-    };
-  } // namespace X86
+namespace X86 {
+enum { MaxShuffleCombineDepth = 8 };
+} // namespace X86
 } // namespace llvm
 
 /// Fully generic combining of x86 shuffle instructions.
@@ -42144,7 +42243,8 @@ static SDValue combineX86ShufflesRecursively(
 
     // The Op itself may be of different VT, so we need to scale the mask.
     unsigned NumOpElts = Op.getValueType().getVectorNumElements();
-    APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
+    APInt OpScaledDemandedElts =
+        APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
 
     // Can this operand be simplified any further, given it's demanded elements?
     if (SDValue NewOp = TLI.SimplifyMultipleUseDemandedVectorElts(
@@ -42950,7 +43050,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
         ISD::isNormalLoad(Src.getNode())) {
       LoadSDNode *LN = cast<LoadSDNode>(Src);
       SDVTList Tys = DAG.getVTList(VT, MVT::Other);
-      SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+      SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
       SDValue BcastLd =
           DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
                                   LN->getMemoryVT(), LN->getMemOperand());
@@ -42982,7 +43082,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
         // Unless its volatile or atomic.
         if (LN->isSimple()) {
           SDVTList Tys = DAG.getVTList(VT, MVT::Other);
-          SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+          SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
           SDValue BcastLd = DAG.getMemIntrinsicNode(
               X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
               LN->getPointerInfo(), LN->getBaseAlign(),
@@ -43000,7 +43100,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
         LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
         if (LN->getMemoryVT().getSizeInBits() == 16) {
           SDVTList Tys = DAG.getVTList(VT, MVT::Other);
-          SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+          SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
           SDValue BcastLd =
               DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
                                       LN->getMemoryVT(), LN->getMemOperand());
@@ -43027,7 +43127,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
           SDVTList Tys = DAG.getVTList(VT, MVT::Other);
           SDValue Ptr = DAG.getMemBasePlusOffset(
               LN->getBasePtr(), TypeSize::getFixed(Offset), DL);
-          SDValue Ops[] = { LN->getChain(), Ptr };
+          SDValue Ops[] = {LN->getChain(), Ptr};
           SDValue BcastLd = DAG.getMemIntrinsicNode(
               X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
               LN->getPointerInfo().getWithOffset(Offset), LN->getBaseAlign(),
@@ -43045,7 +43145,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
       MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
       if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
         SDVTList Tys = DAG.getVTList(VT, MVT::Other);
-        SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+        SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
         SDValue BcastLd =
             DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
                                     LN->getMemoryVT(), LN->getMemOperand());
@@ -43554,13 +43654,13 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
     if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
       auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
       if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
-        SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
-                                   MemIntr->getBasePtr(),
-                                   MemIntr->getMemOperand());
-        SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
-                           DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
-                                       Load),
-                           DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
+        SDValue Load =
+            DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
+                        MemIntr->getBasePtr(), MemIntr->getMemOperand());
+        SDValue Insert = DAG.getNode(
+            X86ISD::INSERTPS, DL, VT, Op0,
+            DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Load),
+            DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
         DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
         return Insert;
       }
@@ -43714,8 +43814,8 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
     if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
         (V.getOpcode() == X86ISD::PSHUFLW ||
          V.getOpcode() == X86ISD::PSHUFHW) &&
-        V.getOpcode() != N.getOpcode() &&
-        V.hasOneUse() && V.getOperand(0).hasOneUse()) {
+        V.getOpcode() != N.getOpcode() && V.hasOneUse() &&
+        V.getOperand(0).hasOneUse()) {
       SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
       if (D.getOpcode() == X86ISD::PSHUFD) {
         SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
@@ -43789,11 +43889,11 @@ static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
 /// operation. If true is returned then the operands of ADDSUB(SUBADD) operation
 /// are written to the parameters \p Opnd0 and \p Opnd1.
 ///
-/// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes
-/// so it is easier to generically match. We also insert dummy vector shuffle
-/// nodes for the operands which explicitly discard the lanes which are unused
-/// by this operation to try to flow through the rest of the combiner
-/// the fact that they're unused.
+/// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle
+/// nodes so it is easier to generically match. We also insert dummy vector
+/// shuffle nodes for the operands which explicitly discard the lanes which are
+/// unused by this operation to try to flow through the rest of the combiner the
+/// fact that they're unused.
 static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
                              SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
                              bool &IsSubAdd, bool &HasAllowContract) {
@@ -43827,13 +43927,15 @@ static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
   // commute the FADD operands.
   SDValue LHS, RHS;
   if (V1.getOpcode() == ISD::FSUB) {
-    LHS = V1->getOperand(0); RHS = V1->getOperand(1);
+    LHS = V1->getOperand(0);
+    RHS = V1->getOperand(1);
     if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
         (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
       return false;
   } else {
     assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
-    LHS = V2->getOperand(0); RHS = V2->getOperand(1);
+    LHS = V2->getOperand(0);
+    RHS = V2->getOperand(1);
     if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
         (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
       return false;
@@ -43845,8 +43947,8 @@ static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
     return false;
 
   // It's a subadd if the vector in the even parity is an FADD.
-  IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
-                     : V2->getOpcode() == ISD::FADD;
+  IsSubAdd =
+      Op0Even ? V1->getOpcode() == ISD::FADD : V2->getOpcode() == ISD::FADD;
   HasAllowContract =
       V1->getFlags().hasAllowContract() && V2->getFlags().hasAllowContract();
 
@@ -44135,7 +44237,8 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
 
     // TODO: Multiply by zero.
 
-    // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
+    // If RHS/LHS elements are known zero then we don't need the LHS/RHS
+    // equivalent.
     APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
     if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
                                    Depth + 1))
@@ -44909,7 +45012,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
   // For splats, unless we *only* demand the 0'th element,
   // stop attempts at simplification here, we aren't going to improve things,
   // this is better than any potential shuffle.
-  if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
+  if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/ false))
     return false;
 
   // Get target/faux shuffle mask.
@@ -45007,7 +45110,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
   EVT VT = Op.getValueType();
   unsigned BitWidth = OriginalDemandedBits.getBitWidth();
   unsigned Opc = Op.getOpcode();
-  switch(Opc) {
+  switch (Opc) {
   case X86ISD::VTRUNC: {
     KnownBits KnownOp;
     SDValue Src = Op.getOperand(0);
@@ -45015,8 +45118,10 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
 
     // Simplify the input, using demanded bit information.
     APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
-    APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
-    if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
+    APInt DemandedElts =
+        OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
+    if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO,
+                             Depth + 1))
       return true;
     break;
   }
@@ -45120,7 +45225,8 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
       }
     }
 
-    // If we are only demanding sign bits then we can use the shift source directly.
+    // If we are only demanding sign bits then we can use the shift source
+    // directly.
     unsigned NumSignBits =
         TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
     unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
@@ -45311,8 +45417,8 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
         return true;
 
       KnownBits KnownVec;
-      if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
-                               KnownVec, TLO, Depth + 1))
+      if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts, KnownVec,
+                               TLO, Depth + 1))
         return true;
 
       if (SDValue V = SimplifyMultipleUseDemandedBits(
@@ -45948,13 +46054,13 @@ static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
 
 // Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
 static unsigned getAltBitOpcode(unsigned Opcode) {
-  switch(Opcode) {
-  // clang-format off
+  switch (Opcode) {
+    // clang-format off
   case ISD::AND: return X86ISD::FAND;
   case ISD::OR: return X86ISD::FOR;
   case ISD::XOR: return X86ISD::FXOR;
   case X86ISD::ANDNP: return X86ISD::FANDN;
-  // clang-format on
+    // clang-format on
   }
   llvm_unreachable("Unknown bitwise opcode");
 }
@@ -46177,8 +46283,7 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
 // Convert a vXi1 constant build vector to the same width scalar integer.
 static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
   EVT SrcVT = Op.getValueType();
-  assert(SrcVT.getVectorElementType() == MVT::i1 &&
-         "Expected a vXi1 vector");
+  assert(SrcVT.getVectorElementType() == MVT::i1 && "Expected a vXi1 vector");
   assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
          "Expected a constant build vector");
 
@@ -46496,7 +46601,7 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
   // and the vbroadcast_load are both integer or both fp. In some cases this
   // will remove the bitcast entirely.
   if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
-       VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
+      VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
     auto *BCast = cast<MemIntrinsicSDNode>(N0);
     unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
     unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
@@ -46509,7 +46614,7 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
       LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
 
       SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
-      SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
+      SDValue Ops[] = {BCast->getChain(), BCast->getBasePtr()};
       SDValue ResNode =
           DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
                                   MemVT, BCast->getMemOperand());
@@ -46559,7 +46664,7 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
       bool LowUndef = true, AllUndefOrZero = true;
       for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
         SDValue Op = N0.getOperand(i);
-        LowUndef &= Op.isUndef() || (i >= e/2);
+        LowUndef &= Op.isUndef() || (i >= e / 2);
         AllUndefOrZero &= isNullConstantOrUndef(Op);
       }
       if (AllUndefOrZero) {
@@ -46601,8 +46706,8 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
 
   // Try to remove a bitcast of constant vXi1 vector. We have to legalize
   // most of these to scalar anyway.
-  if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
-      SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
+  if (Subtarget.hasAVX512() && VT.isScalarInteger() && SrcVT.isVector() &&
+      SrcVT.getVectorElementType() == MVT::i1 &&
       ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
     return combinevXi1ConstantToInteger(N0, DAG);
   }
@@ -46620,8 +46725,8 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
   // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
   // Turn it into a sign bit compare that produces a k-register. This avoids
   // a trip through a GPR.
-  if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
-      VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
+  if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() && VT.isVector() &&
+      VT.getVectorElementType() == MVT::i1 &&
       isPowerOf2_32(VT.getVectorNumElements())) {
     unsigned NumElts = VT.getVectorNumElements();
     SDValue Src = N0;
@@ -46675,12 +46780,12 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
   // transferring the SSE operand to integer register and back.
   unsigned FPOpcode;
   switch (N0.getOpcode()) {
-  // clang-format off
+    // clang-format off
   case ISD::AND: FPOpcode = X86ISD::FAND; break;
   case ISD::OR:  FPOpcode = X86ISD::FOR;  break;
   case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
   default: return SDValue();
-  // clang-format on
+    // clang-format on
   }
 
   // Check if we have a bitcast from another integer type as well.
@@ -46781,7 +46886,7 @@ static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
   // Actually build the DotProduct, split as 256/512 bits for
   // AVXVNNI/AVX512VNNI.
   auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
-                       ArrayRef<SDValue> Ops) {
+                      ArrayRef<SDValue> Ops) {
     MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
     return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
   };
@@ -46896,7 +47001,8 @@ static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
                      DAG.getVectorIdxConstant(0, DL));
 }
 
-// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
+// Attempt to replace an all_of/any_of/parity style horizontal reduction with a
+// MOVMSK.
 static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
                                          const X86Subtarget &Subtarget) {
   // Bail without SSE2.
@@ -47171,9 +47277,9 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
   if (Stages > 3) {
     unsigned SadElems = SadVT.getVectorNumElements();
 
-    for(unsigned i = Stages - 3; i > 0; --i) {
+    for (unsigned i = Stages - 3; i > 0; --i) {
       SmallVector<int, 16> Mask(SadElems, -1);
-      for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
+      for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
         Mask[j] = MaskEnd + j;
 
       SDValue Shuffle =
@@ -47489,10 +47595,10 @@ static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
     SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
                                Vec.getOperand(0).getValueType().getScalarType(),
                                Vec.getOperand(0), Index);
-    SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
-                               Vec.getOperand(1), Index);
-    SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
-                               Vec.getOperand(2), Index);
+    SDValue Ext1 =
+        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Vec.getOperand(1), Index);
+    SDValue Ext2 =
+        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Vec.getOperand(2), Index);
     return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
   }
 
@@ -47772,8 +47878,9 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
       return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
     }
 
-    // Convert extract_element(bitcast(<X x i1>) -> bitcast(extract_subvector()).
-    // Improves lowering of bool masks on rust which splits them into byte array.
+    // Convert extract_element(bitcast(<X x i1>) ->
+    // bitcast(extract_subvector()). Improves lowering of bool masks on rust
+    // which splits them into byte array.
     if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
       SDValue Src = peekThroughBitcasts(InputVector);
       if (Src.getValueType().getScalarType() == MVT::i1 &&
@@ -48123,8 +48230,7 @@ static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
                                       TargetLowering::DAGCombinerInfo &DCI,
                                       const X86Subtarget &Subtarget) {
   SDValue Cond = N->getOperand(0);
-  if ((N->getOpcode() != ISD::VSELECT &&
-       N->getOpcode() != X86ISD::BLENDV) ||
+  if ((N->getOpcode() != ISD::VSELECT && N->getOpcode() != X86ISD::BLENDV) ||
       ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
     return SDValue();
 
@@ -48397,7 +48503,8 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
     // Check for x CC y ? x : y.
     if (DAG.isEqualTo(LHS, Op0) && DAG.isEqualTo(RHS, Op1)) {
       switch (CC) {
-      default: break;
+      default:
+        break;
       case ISD::SETULT:
         // Converting this to a min would handle NaNs incorrectly, and swapping
         // the operands would cause it to handle comparisons between positive
@@ -48462,10 +48569,11 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
         Opcode = X86ISD::FMAX;
         break;
       }
-    // Check for x CC y ? y : x -- a min/max with reversed arms.
+      // Check for x CC y ? y : x -- a min/max with reversed arms.
     } else if (DAG.isEqualTo(LHS, Op1) && DAG.isEqualTo(RHS, Op0)) {
       switch (CC) {
-      default: break;
+      default:
+        break;
       case ISD::SETOGE:
         // Converting this to a min would handle comparisons between positive
         // and negative zero incorrectly, and swapping the operands would
@@ -48669,13 +48777,13 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
           Cond1 == InnerSetCC.getOperand(1)) {
         ISD::CondCode NewCC;
         switch (CC == ISD::SETEQ ? InnerCC : CC) {
-        // clang-format off
+          // clang-format off
         case ISD::SETGT:  NewCC = ISD::SETGE; break;
         case ISD::SETLT:  NewCC = ISD::SETLE; break;
         case ISD::SETUGT: NewCC = ISD::SETUGE; break;
         case ISD::SETULT: NewCC = ISD::SETULE; break;
         default: NewCC = ISD::SETCC_INVALID; break;
-        // clang-format on
+          // clang-format on
         }
         if (NewCC != ISD::SETCC_INVALID) {
           Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
@@ -48845,9 +48953,9 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
     // 16-bit lacks a proper blendv.
     unsigned EltBitWidth = VT.getScalarSizeInBits();
     bool CanShiftBlend =
-        TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
-                                (Subtarget.hasAVX2() && EltBitWidth == 64) ||
-                                (Subtarget.hasXOP()));
+        TLI.isTypeLegal(VT) &&
+        ((Subtarget.hasAVX() && EltBitWidth == 32) ||
+         (Subtarget.hasAVX2() && EltBitWidth == 64) || (Subtarget.hasXOP()));
     if (CanShiftBlend &&
         ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
           return C->getAPIntValue().isPowerOf2();
@@ -49086,7 +49194,7 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
   SDValue Op2 = Cmp.getOperand(1);
 
   SDValue SetCC;
-  const ConstantSDNode* C = nullptr;
+  const ConstantSDNode *C = nullptr;
   bool needOppositeCond = (CC == X86::COND_E);
   bool checkAgainstTrue = false; // Is it a comparison against 1?
 
@@ -49107,8 +49215,7 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
   bool truncatedToBoolWithAnd = false;
   // Skip (zext $x), (trunc $x), or (and $x, 1) node.
   while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
-         SetCC.getOpcode() == ISD::TRUNCATE ||
-         SetCC.getOpcode() == ISD::AND) {
+         SetCC.getOpcode() == ISD::TRUNCATE || SetCC.getOpcode() == ISD::AND) {
     if (SetCC.getOpcode() == ISD::AND) {
       int OpIdx = -1;
       if (isOneConstant(SetCC.getOperand(0)))
@@ -49151,13 +49258,13 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
     if (!FVal) {
       SDValue Op = SetCC.getOperand(0);
       // Skip 'zext' or 'trunc' node.
-      if (Op.getOpcode() == ISD::ZERO_EXTEND ||
-          Op.getOpcode() == ISD::TRUNCATE)
+      if (Op.getOpcode() == ISD::ZERO_EXTEND || Op.getOpcode() == ISD::TRUNCATE)
         Op = Op.getOperand(0);
       // A special case for rdrand/rdseed, where 0 is set if false cond is
       // found.
       if ((Op.getOpcode() != X86ISD::RDRAND &&
-           Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
+           Op.getOpcode() != X86ISD::RDSEED) ||
+          Op.getResNo() != 0)
         return SDValue();
     }
     // Quit if false value is not the constant 0 or 1.
@@ -49202,7 +49309,8 @@ static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
 
   SDValue SetCC0, SetCC1;
   switch (Cond->getOpcode()) {
-  default: return false;
+  default:
+    return false;
   case ISD::AND:
   case X86ISD::AND:
     isAnd = true;
@@ -49267,8 +49375,7 @@ static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
         }
         // If this is a check of the z flag of an add with 1, switch to the
         // C flag.
-        if (CarryCC == X86::COND_E &&
-            CarryOp1.getOpcode() == X86ISD::ADD &&
+        if (CarryCC == X86::COND_E && CarryOp1.getOpcode() == X86ISD::ADD &&
             isOneConstant(CarryOp1.getOperand(1)))
           return CarryOp1;
       } else if (FoundAndLSB) {
@@ -49801,12 +49908,11 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
 
       // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.  This is efficient
       // for any integer data type, including i8/i16.
-      if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
+      if (FalseC->getAPIntValue() + 1 == TrueC->getAPIntValue()) {
         Cond = getSETCC(CC, Cond, DL, DAG);
 
         // Zero extend the condition if needed.
-        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
-                           FalseC->getValueType(0), Cond);
+        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
         Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                            SDValue(FalseC, 0));
         return Cond;
@@ -49822,24 +49928,25 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
         bool isFastMultiplier = false;
         if (Diff.ult(10)) {
           switch (Diff.getZExtValue()) {
-          default: break;
-          case 1:  // result = add base, cond
-          case 2:  // result = lea base(    , cond*2)
-          case 3:  // result = lea base(cond, cond*2)
-          case 4:  // result = lea base(    , cond*4)
-          case 5:  // result = lea base(cond, cond*4)
-          case 8:  // result = lea base(    , cond*8)
-          case 9:  // result = lea base(cond, cond*8)
+          default:
+            break;
+          case 1: // result = add base, cond
+          case 2: // result = lea base(    , cond*2)
+          case 3: // result = lea base(cond, cond*2)
+          case 4: // result = lea base(    , cond*4)
+          case 5: // result = lea base(cond, cond*4)
+          case 8: // result = lea base(    , cond*8)
+          case 9: // result = lea base(cond, cond*8)
             isFastMultiplier = true;
             break;
           }
         }
 
         if (isFastMultiplier) {
-          Cond = getSETCC(CC, Cond, DL ,DAG);
+          Cond = getSETCC(CC, Cond, DL, DAG);
           // Zero extend the condition if needed.
-          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
-                             Cond);
+          Cond =
+              DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
           // Scale the condition by the difference.
           if (Diff != 1)
             Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
@@ -50630,8 +50737,7 @@ static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG,
 
   // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
   // since the result of setcc_c is all zero's or all ones.
-  if (VT.isInteger() && !VT.isVector() &&
-      N1C && N0.getOpcode() == ISD::AND &&
+  if (VT.isInteger() && !VT.isVector() && N1C && N0.getOpcode() == ISD::AND &&
       N0.getOperand(1).getOpcode() == ISD::Constant) {
     SDValue N00 = N0.getOperand(0);
     APInt Mask = N0.getConstantOperandAPInt(1);
@@ -50715,7 +50821,7 @@ static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
   if (SraConst.isNegative())
     return SDValue();
 
-  for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
+  for (MVT SVT : {MVT::i8, MVT::i16, MVT::i32}) {
     unsigned ShiftSize = SVT.getSizeInBits();
     // Only deal with (Size - ShlConst) being equal to 8, 16 or 32.
     if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
@@ -51049,8 +51155,8 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
 
   // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
   // truncate to create a larger truncate.
-  if (Subtarget.hasAVX512() &&
-      N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
+  if (Subtarget.hasAVX512() && N0.getOpcode() == ISD::TRUNCATE &&
+      N1.isUndef() && VT == MVT::v16i8 &&
       N0.getOperand(0).getValueType() == MVT::v8i32) {
     if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
         (!IsSigned &&
@@ -51397,7 +51503,7 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
 
     SDValue CMP00 = CMP0->getOperand(0);
     SDValue CMP01 = CMP0->getOperand(1);
-    EVT     VT    = CMP00.getValueType();
+    EVT VT = CMP00.getValueType();
 
     if (VT == MVT::f32 || VT == MVT::f64 ||
         (VT == MVT::f16 && Subtarget.hasFP16())) {
@@ -51423,8 +51529,10 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
       }
 
       if (!ExpectingFlags) {
-        enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
-        enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
+        enum X86::CondCode cc0 =
+            (enum X86::CondCode)N0.getConstantOperandVal(0);
+        enum X86::CondCode cc1 =
+            (enum X86::CondCode)N1.getConstantOperandVal(0);
 
         if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
           X86::CondCode tmp = cc0;
@@ -51432,7 +51540,7 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
           cc1 = tmp;
         }
 
-        if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
+        if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
             (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
           // FIXME: need symbolic constants for these magic numbers.
           // See X86ATTInstPrinter.cpp:printSSECC().
@@ -51442,7 +51550,8 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
                 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
                             DAG.getTargetConstant(x86cc, DL, MVT::i8));
             // Need to fill with zeros to ensure the bitcast will produce zeroes
-            // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
+            // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee
+            // that.
             SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
                                       DAG.getConstant(0, DL, MVT::v16i1),
                                       FSetCC, DAG.getVectorIdxConstant(0, DL));
@@ -51474,8 +51583,8 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
           SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
           SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
                                       DAG.getConstant(1, DL, IntVT));
-          SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
-                                              ANDed);
+          SDValue OneBitOfTruth =
+              DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
           return OneBitOfTruth;
         }
       }
@@ -51670,7 +51779,8 @@ static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL,
   assert(VT.isVector() && "Expected vector type");
   assert((N.getOpcode() == ISD::ANY_EXTEND ||
           N.getOpcode() == ISD::ZERO_EXTEND ||
-          N.getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
+          N.getOpcode() == ISD::SIGN_EXTEND) &&
+         "Invalid Node");
 
   SDValue Narrow = N.getOperand(0);
   EVT NarrowVT = Narrow.getValueType();
@@ -51680,26 +51790,27 @@ static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL,
   if (!Op)
     return SDValue();
   switch (N.getOpcode()) {
-  default: llvm_unreachable("Unexpected opcode");
+  default:
+    llvm_unreachable("Unexpected opcode");
   case ISD::ANY_EXTEND:
     return Op;
   case ISD::ZERO_EXTEND:
     return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
   case ISD::SIGN_EXTEND:
-    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
-                       Op, DAG.getValueType(NarrowVT));
+    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Op,
+                       DAG.getValueType(NarrowVT));
   }
 }
 
 static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
   unsigned FPOpcode;
   switch (Opcode) {
-  // clang-format off
+    // clang-format off
   default: llvm_unreachable("Unexpected input node for FP logic conversion");
   case ISD::AND: FPOpcode = X86ISD::FAND; break;
   case ISD::OR:  FPOpcode = X86ISD::FOR;  break;
   case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
-  // clang-format on
+    // clang-format on
   }
   return FPOpcode;
 }
@@ -52142,8 +52253,7 @@ static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
   SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
                               DAG.getConstant(0, dl, SubVecVT));
   Ops[0] = SubVec;
-  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
-                               Ops);
+  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, Ops);
   EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
   return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
 }
@@ -52492,7 +52602,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
   if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
     return R;
 
-  if (SDValue R = combineAndNotIntoANDNP(N, dl ,DAG))
+  if (SDValue R = combineAndNotIntoANDNP(N, dl, DAG))
     return R;
 
   if (SDValue ShiftRight = combineAndMaskToShift(N, dl, DAG, Subtarget))
@@ -53268,7 +53378,8 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
 
         if (NotCond) {
           SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
-          R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
+          R = DAG.getNode(ISD::MUL, dl, VT, R,
+                          DAG.getConstant(Val + 1, dl, VT));
           R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
           return R;
         }
@@ -53405,7 +53516,7 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
     return SDValue();
 
   switch (VT.getSimpleVT().SimpleTy) {
-  // clang-format off
+    // clang-format off
   default: return SDValue();
   case MVT::v16i8:
   case MVT::v8i16:
@@ -53535,8 +53646,8 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
   // split across two registers. We can use a packusdw+perm to clamp to 0-65535
   // and concatenate at the same time. Then we can use a final vpmovuswb to
   // clip to 0-255.
-  if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
-      InVT == MVT::v16i32 && VT == MVT::v16i8) {
+  if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() && InVT == MVT::v16i32 &&
+      VT == MVT::v16i8) {
     if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
       // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
       SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
@@ -53552,11 +53663,12 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
   // FIXME: We could widen truncates to 512 to remove the VLX restriction.
   // If the result type is 256-bits or larger and we have disable 512-bit
   // registers, we should go ahead and use the pack instructions if possible.
-  bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
-                       (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
-                      (InVT.getSizeInBits() > 128) &&
-                      (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
-                      !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
+  bool PreferAVX512 =
+      ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
+       (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
+      (InVT.getSizeInBits() > 128) &&
+      (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
+      !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
 
   if (!PreferAVX512 && VT.getVectorNumElements() > 1 &&
       isPowerOf2_32(VT.getVectorNumElements()) &&
@@ -53569,8 +53681,8 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
         SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
                                              DAG, Subtarget);
         assert(Mid && "Failed to pack!");
-        SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
-                                           Subtarget);
+        SDValue V =
+            truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG, Subtarget);
         assert(V && "Failed to pack!");
         return V;
       } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
@@ -53894,10 +54006,9 @@ reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
     CastVT = VT.changeVectorElementType(EltVT);
   }
 
-  SDValue Load =
-      DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
-                  ML->getPointerInfo().getWithOffset(Offset),
-                  Alignment, ML->getMemOperand()->getFlags());
+  SDValue Load = DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
+                             ML->getPointerInfo().getWithOffset(Offset),
+                             Alignment, ML->getMemOperand()->getFlags());
 
   SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
 
@@ -53928,8 +54039,8 @@ combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
   if (LoadFirstElt && LoadLastElt) {
     SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
                                 ML->getMemOperand());
-    SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
-                                  ML->getPassThru());
+    SDValue Blend =
+        DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getPassThru());
     return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
   }
 
@@ -53951,8 +54062,8 @@ combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
       VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
       DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
       ML->getAddressingMode(), ML->getExtensionType());
-  SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
-                                ML->getPassThru());
+  SDValue Blend =
+      DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getPassThru());
 
   return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
 }
@@ -54032,8 +54143,8 @@ static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
 
   // Store that element at the appropriate offset from the base pointer.
   return DAG.getStore(MS->getChain(), DL, Extract, Addr,
-                      MS->getPointerInfo().getWithOffset(Offset),
-                      Alignment, MS->getMemOperand()->getFlags());
+                      MS->getPointerInfo().getWithOffset(Offset), Alignment,
+                      MS->getMemOperand()->getFlags());
 }
 
 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
@@ -54230,15 +54341,16 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
 
   // Turn vXi1 stores of constants into a scalar store.
   if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
-       VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
+       VT == MVT::v64i1) &&
+      VT == StVT && TLI.isTypeLegal(VT) &&
       ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
     // If its a v64i1 store without 64-bit support, we need two stores.
     if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
-      SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
-                                      StoredVal->ops().slice(0, 32));
+      SDValue Lo =
+          DAG.getBuildVector(MVT::v32i1, dl, StoredVal->ops().slice(0, 32));
       Lo = combinevXi1ConstantToInteger(Lo, DAG);
-      SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
-                                      StoredVal->ops().slice(32, 32));
+      SDValue Hi =
+          DAG.getBuildVector(MVT::v32i1, dl, StoredVal->ops().slice(32, 32));
       Hi = combinevXi1ConstantToInteger(Hi, DAG);
 
       SDValue Ptr0 = St->getBasePtr();
@@ -54338,9 +54450,9 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
       StoredVal.hasOneUse() &&
       TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
     bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
-    return EmitTruncSStore(IsSigned, St->getChain(),
-                           dl, StoredVal.getOperand(0), St->getBasePtr(),
-                           VT, St->getMemOperand(), DAG);
+    return EmitTruncSStore(IsSigned, St->getChain(), dl,
+                           StoredVal.getOperand(0), St->getBasePtr(), VT,
+                           St->getMemOperand(), DAG);
   }
 
   // Try to fold a extract_element(VTRUNC) pattern into a truncating store.
@@ -54379,14 +54491,14 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
   if (St->isTruncatingStore() && VT.isVector()) {
     if (TLI.isTruncStoreLegal(VT, StVT)) {
       if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
-        return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
-                               dl, Val, St->getBasePtr(),
-                               St->getMemoryVT(), St->getMemOperand(), DAG);
-      if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
-                                          DAG, dl))
+        return EmitTruncSStore(true /* Signed saturation */, St->getChain(), dl,
+                               Val, St->getBasePtr(), St->getMemoryVT(),
+                               St->getMemOperand(), DAG);
+      if (SDValue Val =
+              detectUSatPattern(St->getValue(), St->getMemoryVT(), DAG, dl))
         return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
-                               dl, Val, St->getBasePtr(),
-                               St->getMemoryVT(), St->getMemOperand(), DAG);
+                               dl, Val, St->getBasePtr(), St->getMemoryVT(),
+                               St->getMemOperand(), DAG);
     }
 
     return SDValue();
@@ -55194,8 +55306,7 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
 // (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
 //                 (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
 static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
-                               const X86Subtarget &Subtarget,
-                               const SDLoc &DL) {
+                               const X86Subtarget &Subtarget, const SDLoc &DL) {
   using namespace SDPatternMatch;
   if (!VT.isVector() || !Subtarget.hasSSSE3())
     return SDValue();
@@ -55269,8 +55380,8 @@ static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
       std::swap(IdxN01, IdxN11);
     }
     // N0 indices be the even element. N1 indices must be the next odd element.
-    if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
-        IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
+    if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 || IdxN01 != 2 * i ||
+        IdxN11 != 2 * i + 1)
       return SDValue();
     SDValue N00In = N00Elt.getOperand(0);
     SDValue N01In = N01Elt.getOperand(0);
@@ -55281,8 +55392,8 @@ static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
       ZExtIn = N00In;
       SExtIn = N01In;
     }
-    if (ZExtIn != N00In || SExtIn != N01In ||
-        ZExtIn != N10In || SExtIn != N11In)
+    if (ZExtIn != N00In || SExtIn != N01In || ZExtIn != N10In ||
+        SExtIn != N11In)
       return SDValue();
   }
 
@@ -55302,14 +55413,13 @@ static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
     // Shrink by adding truncate nodes and let DAGCombine fold with the
     // sources.
     EVT InVT = Ops[0].getValueType();
-    assert(InVT.getScalarType() == MVT::i8 &&
-           "Unexpected scalar element type");
+    assert(InVT.getScalarType() == MVT::i8 && "Unexpected scalar element type");
     assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
     EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
                                  InVT.getVectorNumElements() / 2);
     return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
   };
-  return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
+  return SplitOpsAndApply(DAG, Subtarget, DL, VT, {ZExtIn, SExtIn},
                           PMADDBuilder);
 }
 
@@ -55494,7 +55604,7 @@ static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
                                 bool NegRes) {
   if (NegMul) {
     switch (Opcode) {
-    // clang-format off
+      // clang-format off
     default: llvm_unreachable("Unexpected opcode");
     case ISD::FMA:              Opcode = X86ISD::FNMADD;        break;
     case ISD::STRICT_FMA:       Opcode = X86ISD::STRICT_FNMADD; break;
@@ -55508,13 +55618,13 @@ static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
     case X86ISD::FNMSUB:        Opcode = X86ISD::FMSUB;         break;
     case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB;  break;
     case X86ISD::FNMSUB_RND:    Opcode = X86ISD::FMSUB_RND;     break;
-    // clang-format on
+      // clang-format on
     }
   }
 
   if (NegAcc) {
     switch (Opcode) {
-    // clang-format off
+      // clang-format off
     default: llvm_unreachable("Unexpected opcode");
     case ISD::FMA:              Opcode = X86ISD::FMSUB;         break;
     case ISD::STRICT_FMA:       Opcode = X86ISD::STRICT_FMSUB;  break;
@@ -55532,7 +55642,7 @@ static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
     case X86ISD::FMADDSUB_RND:  Opcode = X86ISD::FMSUBADD_RND;  break;
     case X86ISD::FMSUBADD:      Opcode = X86ISD::FMADDSUB;      break;
     case X86ISD::FMSUBADD_RND:  Opcode = X86ISD::FMADDSUB_RND;  break;
-    // clang-format on
+      // clang-format on
     }
   }
 
@@ -55549,7 +55659,7 @@ static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
     case X86ISD::FNMADD_RND:   Opcode = X86ISD::FMSUB_RND;    break;
     case X86ISD::FNMSUB:       Opcode = ISD::FMA;             break;
     case X86ISD::FNMSUB_RND:   Opcode = X86ISD::FMADD_RND;    break;
-    // clang-format on
+      // clang-format on
     }
   }
 
@@ -55681,13 +55791,13 @@ static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
   SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
   unsigned IntOpcode;
   switch (N->getOpcode()) {
-  // clang-format off
+    // clang-format off
   default: llvm_unreachable("Unexpected FP logic op");
   case X86ISD::FOR:   IntOpcode = ISD::OR; break;
   case X86ISD::FXOR:  IntOpcode = ISD::XOR; break;
   case X86ISD::FAND:  IntOpcode = ISD::AND; break;
   case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
-  // clang-format on
+    // clang-format on
   }
   SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
   return DAG.getBitcast(VT, IntOp);
@@ -56039,13 +56149,18 @@ static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
   // into FMINC and FMAXC, which are Commutative operations.
   unsigned NewOp = 0;
   switch (N->getOpcode()) {
-    default: llvm_unreachable("unknown opcode");
-    case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
-    case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
+  default:
+    llvm_unreachable("unknown opcode");
+  case X86ISD::FMIN:
+    NewOp = X86ISD::FMINC;
+    break;
+  case X86ISD::FMAX:
+    NewOp = X86ISD::FMAXC;
+    break;
   }
 
-  return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
-                     N->getOperand(0), N->getOperand(1));
+  return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0), N->getOperand(0),
+                     N->getOperand(1));
 }
 
 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
@@ -56091,8 +56206,8 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
   if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
     return SDValue();
 
-  EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
-                                         VT);
+  EVT SetCCType =
+      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
 
   // There are 4 possibilities involving NaN inputs, and these are the required
   // outputs:
@@ -56142,8 +56257,8 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
     MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
     if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
       SDLoc dl(N);
-      SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
-                                    DAG.getBitcast(InVT, VZLoad));
+      SDValue Convert =
+          DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
       DCI.CombineTo(N, Convert);
       DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
       DCI.recursivelyDeleteUnusedNodes(LN);
@@ -56638,8 +56753,8 @@ static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
 
   // Only combine legal element types.
   EVT SVT = VT.getVectorElementType();
-  if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
-      SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
+  if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 && SVT != MVT::i64 &&
+      SVT != MVT::f32 && SVT != MVT::f64)
     return SDValue();
 
   // We don't have CMPP Instruction for vxf16
@@ -56679,16 +56794,15 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
   SDLoc DL(N);
 
   // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
-  if (!DCI.isBeforeLegalizeOps() &&
-      N0.getOpcode() == X86ISD::SETCC_CARRY) {
+  if (!DCI.isBeforeLegalizeOps() && N0.getOpcode() == X86ISD::SETCC_CARRY) {
     SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
-                                 N0->getOperand(1));
+                                N0->getOperand(1));
     bool ReplaceOtherUses = !N0.hasOneUse();
     DCI.CombineTo(N, Setcc);
     // Replace other uses with a truncate of the widened setcc_carry.
     if (ReplaceOtherUses) {
-      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
-                                  N0.getValueType(), Setcc);
+      SDValue Trunc =
+          DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), Setcc);
       DCI.CombineTo(N0.getNode(), Trunc);
     }
 
@@ -56981,13 +57095,13 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
   if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
       N0.getOpcode() == X86ISD::SETCC_CARRY) {
     SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
-                                 N0->getOperand(1));
+                                N0->getOperand(1));
     bool ReplaceOtherUses = !N0.hasOneUse();
     DCI.CombineTo(N, Setcc);
     // Replace other uses with a truncate of the widened setcc_carry.
     if (ReplaceOtherUses) {
-      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
-                                  N0.getValueType(), Setcc);
+      SDValue Trunc =
+          DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), Setcc);
       DCI.CombineTo(N0.getNode(), Trunc);
     }
 
@@ -57263,8 +57377,8 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
           if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
             SDValue BaseOp = LHS.getOperand(0);
             SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
-            SDValue SETCC1 = DAG.getSetCC(
-                DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
+            SDValue SETCC1 = DAG.getSetCC(DL, VT, BaseOp,
+                                          DAG.getConstant(-CInt, DL, OpVT), CC);
             return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
                                SETCC0, SETCC1);
           }
@@ -57624,19 +57738,25 @@ static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
   SDLoc DL(GorS);
 
   if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
-    SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
-                      Gather->getMask(), Base, Index, Scale } ;
-    return DAG.getMaskedGather(Gather->getVTList(),
-                               Gather->getMemoryVT(), DL, Ops,
-                               Gather->getMemOperand(),
+    SDValue Ops[] = {Gather->getChain(),
+                     Gather->getPassThru(),
+                     Gather->getMask(),
+                     Base,
+                     Index,
+                     Scale};
+    return DAG.getMaskedGather(Gather->getVTList(), Gather->getMemoryVT(), DL,
+                               Ops, Gather->getMemOperand(),
                                Gather->getIndexType(),
                                Gather->getExtensionType());
   }
   auto *Scatter = cast<MaskedScatterSDNode>(GorS);
-  SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
-                    Scatter->getMask(), Base, Index, Scale };
-  return DAG.getMaskedScatter(Scatter->getVTList(),
-                              Scatter->getMemoryVT(), DL,
+  SDValue Ops[] = {Scatter->getChain(),
+                   Scatter->getValue(),
+                   Scatter->getMask(),
+                   Base,
+                   Index,
+                   Scale};
+  return DAG.getMaskedScatter(Scatter->getVTList(), Scatter->getMemoryVT(), DL,
                               Ops, Scatter->getMemOperand(),
                               Scatter->getIndexType(),
                               Scatter->isTruncatingStore());
@@ -57867,8 +57987,8 @@ static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
       SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
     // The AND node needs bitcasts to/from an integer vector type around it.
     SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
-    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
-                                 MaskConst);
+    SDValue NewAnd =
+        DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0), MaskConst);
     SDValue Res = DAG.getBitcast(VT, NewAnd);
     if (IsStrict)
       return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
@@ -58054,8 +58174,8 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
       // use CVTSI2P.
       assert(InVT == MVT::v2i64 && "Unexpected VT!");
       SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
-      SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
-                                          { 0, 2, -1, -1 });
+      SDValue Shuf =
+          DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast, {0, 2, -1, -1});
       if (IsStrict)
         return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
                            {N->getOperand(0), Shuf});
@@ -58156,7 +58276,7 @@ static bool needCarryOrOverflowFlag(SDValue Flags) {
     }
 
     switch (CC) {
-    // clang-format off
+      // clang-format off
     default: break;
     case X86::COND_A: case X86::COND_AE:
     case X86::COND_B: case X86::COND_BE:
@@ -58164,7 +58284,7 @@ static bool needCarryOrOverflowFlag(SDValue Flags) {
     case X86::COND_G: case X86::COND_GE:
     case X86::COND_L: case X86::COND_LE:
       return true;
-    // clang-format on
+      // clang-format on
     }
   }
 
@@ -58300,11 +58420,12 @@ static SDValue combineCMP(SDNode *N, SelectionDAG &DAG,
 
   // After this the truncate and arithmetic op must have a single use.
   if (!Trunc.hasOneUse() || !Op.hasOneUse())
-      return SDValue();
+    return SDValue();
 
   unsigned NewOpc;
   switch (Op.getOpcode()) {
-  default: return SDValue();
+  default:
+    return SDValue();
   case ISD::AND:
     // Skip and with constant. We have special handling for and with immediate
     // during isel to generate test instructions.
@@ -58312,8 +58433,12 @@ static SDValue combineCMP(SDNode *N, SelectionDAG &DAG,
       return SDValue();
     NewOpc = X86ISD::AND;
     break;
-  case ISD::OR:  NewOpc = X86ISD::OR;  break;
-  case ISD::XOR: NewOpc = X86ISD::XOR; break;
+  case ISD::OR:
+    NewOpc = X86ISD::OR;
+    break;
+  case ISD::XOR:
+    NewOpc = X86ISD::XOR;
+    break;
   case ISD::ADD:
     // If the carry or overflow flag is used, we can't truncate.
     if (needCarryOrOverflowFlag(SDValue(N, 0)))
@@ -58490,9 +58615,8 @@ static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
-static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N,
-                            const SDLoc &DL, EVT VT,
-                            const X86Subtarget &Subtarget) {
+static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N, const SDLoc &DL,
+                            EVT VT, const X86Subtarget &Subtarget) {
   using namespace SDPatternMatch;
 
   // Example of pattern we try to detect:
@@ -58600,9 +58724,8 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N,
 // Attempt to turn this pattern into PMADDWD.
 // (add (mul (sext (build_vector)), (sext (build_vector))),
 //      (mul (sext (build_vector)), (sext (build_vector)))
-static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N,
-                              const SDLoc &DL, EVT VT,
-                              const X86Subtarget &Subtarget) {
+static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N, const SDLoc &DL,
+                              EVT VT, const X86Subtarget &Subtarget) {
   using namespace SDPatternMatch;
 
   if (!Subtarget.hasSSE2())
@@ -58698,7 +58821,7 @@ static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N,
   // If the output is narrower than an input, extract the low part of the input
   // vector.
   EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
-                               VT.getVectorNumElements() * 2);
+                                 VT.getVectorNumElements() * 2);
   if (OutVT16.bitsLT(In0.getValueType())) {
     In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
                       DAG.getVectorIdxConstant(0, DL));
@@ -58707,8 +58830,7 @@ static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N,
     In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
                       DAG.getVectorIdxConstant(0, DL));
   }
-  return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
-                          PMADDBuilder);
+  return SplitOpsAndApply(DAG, Subtarget, DL, VT, {In0, In1}, PMADDBuilder);
 }
 
 // ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
@@ -59677,8 +59799,8 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
         unsigned Imm1 = Ops[1].getConstantOperandVal(2);
         // TODO: Handle zero'd subvectors.
         if ((Imm0 & 0x88) == 0 && (Imm1 & 0x88) == 0) {
-          int Mask[4] = {(int)(Imm0 & 0x03), (int)((Imm0 >> 4) & 0x3), (int)(Imm1 & 0x03),
-                         (int)((Imm1 >> 4) & 0x3)};
+          int Mask[4] = {(int)(Imm0 & 0x03), (int)((Imm0 >> 4) & 0x3),
+                         (int)(Imm1 & 0x03), (int)((Imm1 >> 4) & 0x3)};
           MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
           SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
                                          Ops[0].getOperand(1), DAG, DL);
@@ -59866,8 +59988,7 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
             break;
         }
 
-        ISD::CondCode ICC =
-            Opcode == X86ISD::PCMPEQ ? ISD::SETEQ : ISD::SETGT;
+        ISD::CondCode ICC = Opcode == X86ISD::PCMPEQ ? ISD::SETEQ : ISD::SETGT;
         ISD::CondCode FCC =
             Opcode == X86ISD::PCMPEQ ? ISD::SETOEQ : ISD::SETOGT;
 
@@ -60217,7 +60338,8 @@ static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
     APInt Constant = APInt::getZero(VT.getSizeInBits());
     for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
       auto *C = dyn_cast<ConstantSDNode>(peekThroughBitcasts(Ops[I]));
-      if (!C) break;
+      if (!C)
+        break;
       Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
       if (I == (E - 1)) {
         EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
@@ -60290,9 +60412,9 @@ static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
           ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
           Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
               SubVecVT.getFixedSizeInBits())
-          return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
-                             getZeroVector(OpVT, Subtarget, DAG, dl),
-                             Ins.getOperand(1), N->getOperand(2));
+        return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
+                           getZeroVector(OpVT, Subtarget, DAG, dl),
+                           Ins.getOperand(1), N->getOperand(2));
     }
   }
 
@@ -60982,7 +61104,7 @@ static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
       LHS.getOperand(0).getValueType() == MVT::v4i32) {
     SDLoc dl(N);
     LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
-                               LHS.getOperand(0), { 0, -1, 1, -1 });
+                               LHS.getOperand(0), {0, -1, 1, -1});
     LHS = DAG.getBitcast(MVT::v2i64, LHS);
     return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
   }
@@ -60992,7 +61114,7 @@ static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
       RHS.getOperand(0).getValueType() == MVT::v4i32) {
     SDLoc dl(N);
     RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
-                               RHS.getOperand(0), { 0, -1, 1, -1 });
+                               RHS.getOperand(0), {0, -1, 1, -1});
     RHS = DAG.getBitcast(MVT::v2i64, RHS);
     return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
   }
@@ -61263,16 +61385,16 @@ static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
   // Widen to at least 8 input elements.
   if (NumElts < 8) {
     unsigned NumConcats = 8 / NumElts;
-    SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
-                                : DAG.getConstant(0, dl, IntVT);
+    SDValue Fill =
+        NumElts == 4 ? DAG.getUNDEF(IntVT) : DAG.getConstant(0, dl, IntVT);
     SmallVector<SDValue, 4> Ops(NumConcats, Fill);
     Ops[0] = Src;
     Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
   }
 
   // Destination is vXf32 with at least 4 elements.
-  EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
-                               std::max(4U, NumElts));
+  EVT CvtVT =
+      EVT::getVectorVT(*DAG.getContext(), MVT::f32, std::max(4U, NumElts));
   SDValue Cvt, Chain;
   if (IsStrict) {
     Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
@@ -61542,7 +61664,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
   switch (N->getOpcode()) {
-  // clang-format off
+    // clang-format off
   default: break;
   case ISD::SCALAR_TO_VECTOR:
     return combineSCALAR_TO_VECTOR(N, DAG, Subtarget);
@@ -61893,7 +62015,8 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
 
   bool Commute = false;
   switch (Op.getOpcode()) {
-  default: return false;
+  default:
+    return false;
   case ISD::SIGN_EXTEND:
   case ISD::ZERO_EXTEND:
   case ISD::ANY_EXTEND:
@@ -61933,8 +62056,7 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
         ((Commute && !isa<ConstantSDNode>(N1)) ||
          (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
       return false;
-    if (IsFoldableAtomicRMW(N0, Op) ||
-        (Commute && IsFoldableAtomicRMW(N1, Op)))
+    if (IsFoldableAtomicRMW(N0, Op) || (Commute && IsFoldableAtomicRMW(N1, Op)))
       return false;
   }
   }
@@ -62021,8 +62143,7 @@ X86TargetLowering::getConstraintType(StringRef Constraint) const {
     default:
       break;
     }
-  }
-  else if (Constraint.size() == 2) {
+  } else if (Constraint.size() == 2) {
     switch (Constraint[0]) {
     default:
       break;
@@ -62211,8 +62332,7 @@ X86TargetLowering::getSingleConstraintMatchWeight(
 /// Try to replace an X constraint, which matches anything, with another that
 /// has more specific requirements based on the type of the corresponding
 /// operand.
-const char *X86TargetLowering::
-LowerXConstraint(EVT ConstraintVT) const {
+const char *X86TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
   // FP X constraints get lowered to SSE1/2 registers if available, otherwise
   // 'f' like normal targets.
   if (ConstraintVT.isFloatingPoint()) {
@@ -62258,7 +62378,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
   SDValue Result;
   char ConstraintLetter = Constraint[0];
   switch (ConstraintLetter) {
-  default: break;
+  default:
+    break;
   case 'I':
     if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
       if (C->getZExtValue() <= 31) {
@@ -62332,8 +62453,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
         Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
         break;
       }
-    // FIXME gcc accepts some relocatable values here too, but only in certain
-    // memory models; it's complicated.
+      // FIXME gcc accepts some relocatable values here too, but only in certain
+      // memory models; it's complicated.
     }
     return;
   }
@@ -62376,8 +62497,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
     if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
       bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
       BooleanContent BCont = getBooleanContents(MVT::i64);
-      ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
-                                    : ISD::SIGN_EXTEND;
+      ISD::NodeType ExtOpc =
+          IsBool ? getExtendForContent(BCont) : ISD::SIGN_EXTEND;
       int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
                                                   : CST->getSExtValue();
       Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
@@ -62456,7 +62577,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
   if (Constraint.size() == 1) {
     // GCC Constraint Letters
     switch (Constraint[0]) {
-    default: break;
+    default:
+      break;
     // 'A' means [ER]AX + [ER]DX.
     case 'A':
       if (Subtarget.is64Bit())
@@ -62484,7 +62606,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
           return std::make_pair(0U, &X86::VK64RegClass);
       }
       break;
-    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
+    case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
       if (Subtarget.is64Bit()) {
         if (VT == MVT::i8 || VT == MVT::i1)
           return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
@@ -62506,7 +62628,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
       }
       [[fallthrough]];
       // 32-bit fallthrough
-    case 'Q':   // Q_REGS
+    case 'Q': // Q_REGS
       if (VT == MVT::i8 || VT == MVT::i1)
         return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
       if (VT == MVT::i16)
@@ -62517,8 +62639,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
       if (VT != MVT::f80 && !VT.isVector())
         return std::make_pair(0U, &X86::GR64_ABCDRegClass);
       break;
-    case 'r':   // GENERAL_REGS
-    case 'l':   // INDEX_REGS
+    case 'r': // GENERAL_REGS
+    case 'l': // INDEX_REGS
       if (VT == MVT::i8 || VT == MVT::i1)
         return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
                                       ? &X86::GR8RegClass
@@ -62537,7 +62659,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                       ? &X86::GR64RegClass
                                       : &X86::GR64_NOREX2RegClass);
       break;
-    case 'R':   // LEGACY_REGS
+    case 'R': // LEGACY_REGS
       if (VT == MVT::i8 || VT == MVT::i1)
         return std::make_pair(0U, &X86::GR8_NOREXRegClass);
       if (VT == MVT::i16)
@@ -62548,7 +62670,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
       if (VT != MVT::f80 && !VT.isVector())
         return std::make_pair(0U, &X86::GR64_NOREXRegClass);
       break;
-    case 'f':  // FP Stack registers.
+    case 'f': // FP Stack registers.
       // If SSE is enabled for this VT, use f80 to ensure the isel moves the
       // value to the correct fpstack register class.
       if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
@@ -62558,16 +62680,19 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
       if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
         return std::make_pair(0U, &X86::RFP80RegClass);
       break;
-    case 'y':   // MMX_REGS if MMX allowed.
-      if (!Subtarget.hasMMX()) break;
+    case 'y': // MMX_REGS if MMX allowed.
+      if (!Subtarget.hasMMX())
+        break;
       return std::make_pair(0U, &X86::VR64RegClass);
     case 'v':
-    case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
-      if (!Subtarget.hasSSE1()) break;
+    case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
+      if (!Subtarget.hasSSE1())
+        break;
       bool VConstraint = (Constraint[0] == 'v');
 
       switch (VT.SimpleTy) {
-      default: break;
+      default:
+        break;
       // Scalar SSE types.
       case MVT::f16:
         if (VConstraint && Subtarget.hasFP16())
@@ -62658,7 +62783,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
       case MVT::v16f32:
       case MVT::v16i32:
       case MVT::v8i64:
-        if (!Subtarget.hasAVX512()) break;
+        if (!Subtarget.hasAVX512())
+          break;
         if (VConstraint)
           return std::make_pair(0U, &X86::VR512RegClass);
         return std::make_pair(0U, &X86::VR512_0_15RegClass);
@@ -62674,12 +62800,15 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
     case '2':
       return getRegForInlineAsmConstraint(TRI, "x", VT);
     case 'm':
-      if (!Subtarget.hasMMX()) break;
+      if (!Subtarget.hasMMX())
+        break;
       return std::make_pair(0U, &X86::VR64RegClass);
     case 'z':
-      if (!Subtarget.hasSSE1()) break;
+      if (!Subtarget.hasSSE1())
+        break;
       switch (VT.SimpleTy) {
-      default: break;
+      default:
+        break;
       // Scalar SSE types.
       case MVT::f16:
         if (!Subtarget.hasFP16())
@@ -62794,14 +62923,15 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
 
   // Use the default implementation in TargetLowering to convert the register
   // constraint into a member of a register class.
-  std::pair<Register, const TargetRegisterClass*> Res;
+  std::pair<Register, const TargetRegisterClass *> Res;
   Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
 
   // Not found as a standard register?
   if (!Res.second) {
     // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
     // to/from f80.
-    if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
+    if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 ||
+        VT == MVT::f80) {
       // Map st(0) -> st(7) -> ST0
       if (Constraint.size() == 7 && Constraint[0] == '{' &&
           tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
@@ -62859,7 +62989,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
   // turn into {ax},{dx}.
   // MVT::Other is used to specify clobber names.
   if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
-    return Res;   // Correct type already, nothing to do.
+    return Res; // Correct type already, nothing to do.
 
   // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should
   // return "eax". This should even work for things like getting 64bit integer
@@ -62871,7 +63001,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
   // Therefore, use a helper method.
   if (isGRClass(*Class)) {
     unsigned Size = VT.getSizeInBits();
-    if (Size == 1) Size = 8;
+    if (Size == 1)
+      Size = 8;
     if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
       return std::make_pair(0, nullptr);
     Register DestReg = getX86SubSuperRegister(Res.first, Size);
@@ -62879,9 +63010,11 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
       bool is64Bit = Subtarget.is64Bit();
       const TargetRegisterClass *RC =
           Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
-        : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
-        : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
-        : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
+          : Size == 16
+              ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
+          : Size == 32
+              ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
+              : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
       if (Size == 64 && !is64Bit) {
         // Model GCC's behavior here and select a fixed pair of 32-bit
         // registers.
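As a reader aid for the constraint letters whose handling is reformatted
above ('q', 'Q', 'r', 'l', 'R', 'f', 'y', 'x', ...): these are GNU inline-asm
register-class constraints, and the switch maps each letter to the matching
X86 register class for the requested type. A minimal sketch of how two of
them look from C++ (GCC/Clang syntax; the function name is made up):

  unsigned char lowByte(unsigned V) {
    unsigned char B;
    // '=q' asks for a byte-addressable register (A/B/C/D on i386, any GPR
    // byte on x86-64); 'r' allows any general-purpose register.
    asm("movb %b1, %0" : "=q"(B) : "r"(V));
    return B;
  }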
@@ -63133,8 +63266,7 @@ X86TargetLowering::getStackProbeSymbolName(const MachineFunction &MF) const {
   return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
 }
 
-unsigned
-X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const {
+unsigned X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const {
   // The default stack probe size is 4096 if the function has no stackprobesize
   // attribute.
   return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index d759895719388..df3838fab4ae9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -18,1975 +18,1964 @@
 #include "llvm/CodeGen/TargetLowering.h"
 
 namespace llvm {
-  class X86Subtarget;
-  class X86TargetMachine;
-
-  namespace X86ISD {
-    // X86 Specific DAG Nodes
-  enum NodeType : unsigned {
-    // Start the numbering where the builtin ops leave off.
-    FIRST_NUMBER = ISD::BUILTIN_OP_END,
-
-    /// Bit scan forward.
-    BSF,
-    /// Bit scan reverse.
-    BSR,
-
-    /// X86 funnel/double shift i16 instructions. These correspond to
-    /// X86::SHLDW and X86::SHRDW instructions which have different amt
-    /// modulo rules to generic funnel shifts.
-    /// NOTE: The operand order matches ISD::FSHL/FSHR not SHLD/SHRD.
-    FSHL,
-    FSHR,
-
-    /// Bitwise logical AND of floating point values. This corresponds
-    /// to X86::ANDPS or X86::ANDPD.
-    FAND,
-
-    /// Bitwise logical OR of floating point values. This corresponds
-    /// to X86::ORPS or X86::ORPD.
-    FOR,
-
-    /// Bitwise logical XOR of floating point values. This corresponds
-    /// to X86::XORPS or X86::XORPD.
-    FXOR,
-
-    ///  Bitwise logical ANDNOT of floating point values. This
-    /// corresponds to X86::ANDNPS or X86::ANDNPD.
-    FANDN,
-
-    /// These operations represent an abstract X86 call
-    /// instruction, which includes a bunch of information.  In particular the
-    /// operands of these node are:
-    ///
-    ///     #0 - The incoming token chain
-    ///     #1 - The callee
-    ///     #2 - The number of arg bytes the caller pushes on the stack.
-    ///     #3 - The number of arg bytes the callee pops off the stack.
-    ///     #4 - The value to pass in AL/AX/EAX (optional)
-    ///     #5 - The value to pass in DL/DX/EDX (optional)
-    ///
-    /// The result values of these nodes are:
-    ///
-    ///     #0 - The outgoing token chain
-    ///     #1 - The first register result value (optional)
-    ///     #2 - The second register result value (optional)
-    ///
-    CALL,
-
-    /// Same as call except it adds the NoTrack prefix.
-    NT_CALL,
-
-    // Pseudo for a OBJC call that gets emitted together with a special
-    // marker instruction.
-    CALL_RVMARKER,
-
-    /// The same as ISD::CopyFromReg except that this node makes it explicit
-    /// that it may lower to an x87 FPU stack pop. Optimizations should be more
-    /// cautious when handling this node than a normal CopyFromReg to avoid
-    /// removing a required FPU stack pop. A key requirement is optimizations
-    /// should not optimize any users of a chain that contains a
-    /// POP_FROM_X87_REG to use a chain from a point earlier than the
-    /// POP_FROM_X87_REG (which may remove a required FPU stack pop).
-    POP_FROM_X87_REG,
-
-    // Pseudo for a call to an imported function to ensure the correct machine
-    // instruction is emitted for Import Call Optimization.
-    IMP_CALL,
-
-    /// X86 compare and logical compare instructions.
-    CMP,
-    FCMP,
-    COMI,
-    UCOMI,
-
-    // X86 compare with Intrinsics similar to COMI.
-    COMX,
-    UCOMX,
-
-    /// X86 bit-test instructions.
-    BT,
-
-    /// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS
-    /// operand, usually produced by a CMP instruction.
-    SETCC,
-
-    /// X86 Select
-    SELECTS,
-
-    /// X86 Constant-time Select, implemented with CMOV instruction. This is
-    /// used to implement constant-time select.
-    CTSELECT,
-
-    // Same as SETCC except it's materialized with a sbb and the value is all
-    // one's or all zero's.
-    SETCC_CARRY, // R = carry_bit ? ~0 : 0
-
-    /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
-    /// Operands are two FP values to compare; result is a mask of
-    /// 0s or 1s.  Generally DTRT for C/C++ with NaNs.
-    FSETCC,
-
-    /// X86 FP SETCC, similar to above, but with output as an i1 mask and
-    /// and a version with SAE.
-    FSETCCM,
-    FSETCCM_SAE,
-
-    /// X86 conditional moves. Operand 0 and operand 1 are the two values
-    /// to select from. Operand 2 is the condition code, and operand 3 is the
-    /// flag operand produced by a CMP or TEST instruction.
-    CMOV,
-
-    /// X86 conditional branches. Operand 0 is the chain operand, operand 1
-    /// is the block to branch if condition is true, operand 2 is the
-    /// condition code, and operand 3 is the flag operand produced by a CMP
-    /// or TEST instruction.
-    BRCOND,
-
-    /// BRIND node with NoTrack prefix. Operand 0 is the chain operand and
-    /// operand 1 is the target address.
-    NT_BRIND,
-
-    /// Return with a glue operand. Operand 0 is the chain operand, operand
-    /// 1 is the number of bytes of stack to pop.
-    RET_GLUE,
-
-    /// Return from interrupt. Operand 0 is the number of bytes to pop.
-    IRET,
-
-    /// Repeat fill, corresponds to X86::REP_STOSx.
-    REP_STOS,
-
-    /// Repeat move, corresponds to X86::REP_MOVSx.
-    REP_MOVS,
-
-    /// On Darwin, this node represents the result of the popl
-    /// at function entry, used for PIC code.
-    GlobalBaseReg,
-
-    /// A wrapper node for TargetConstantPool, TargetJumpTable,
-    /// TargetExternalSymbol, TargetGlobalAddress, TargetGlobalTLSAddress,
-    /// MCSymbol and TargetBlockAddress.
-    Wrapper,
-
-    /// Special wrapper used under X86-64 PIC mode for RIP
-    /// relative displacements.
-    WrapperRIP,
-
-    /// Copies a 64-bit value from an MMX vector to the low word
-    /// of an XMM vector, with the high word zero filled.
-    MOVQ2DQ,
-
-    /// Copies a 64-bit value from the low word of an XMM vector
-    /// to an MMX vector.
-    MOVDQ2Q,
-
-    /// Copies a 32-bit value from the low word of a MMX
-    /// vector to a GPR.
-    MMX_MOVD2W,
-
-    /// Copies a GPR into the low 32-bit word of a MMX vector
-    /// and zero out the high word.
-    MMX_MOVW2D,
-
-    /// Extract an 8-bit value from a vector and zero extend it to
-    /// i32, corresponds to X86::PEXTRB.
-    PEXTRB,
-
-    /// Extract a 16-bit value from a vector and zero extend it to
-    /// i32, corresponds to X86::PEXTRW.
-    PEXTRW,
-
-    /// Insert any element of a 4 x float vector into any element
-    /// of a destination 4 x floatvector.
-    INSERTPS,
-
-    /// Insert the lower 8-bits of a 32-bit value to a vector,
-    /// corresponds to X86::PINSRB.
-    PINSRB,
-
-    /// Insert the lower 16-bits of a 32-bit value to a vector,
-    /// corresponds to X86::PINSRW.
-    PINSRW,
-
-    /// Shuffle 16 8-bit values within a vector.
-    PSHUFB,
-
-    /// Compute Sum of Absolute Differences.
-    PSADBW,
-    /// Compute Double Block Packed Sum-Absolute-Differences
-    DBPSADBW,
-
-    /// Bitwise Logical AND NOT of Packed FP values.
-    ANDNP,
-
-    /// Blend where the selector is an immediate.
-    BLENDI,
-
-    /// Dynamic (non-constant condition) vector blend where only the sign bits
-    /// of the condition elements are used. This is used to enforce that the
-    /// condition mask is not valid for generic VSELECT optimizations. This
-    /// is also used to implement the intrinsics.
-    /// Operands are in VSELECT order: MASK, TRUE, FALSE
-    BLENDV,
-
-    /// Combined add and sub on an FP vector.
-    ADDSUB,
-
-    //  FP vector ops with rounding mode.
-    FADD_RND,
-    FADDS,
-    FADDS_RND,
-    FSUB_RND,
-    FSUBS,
-    FSUBS_RND,
-    FMUL_RND,
-    FMULS,
-    FMULS_RND,
-    FDIV_RND,
-    FDIVS,
-    FDIVS_RND,
-    FMAX_SAE,
-    FMAXS_SAE,
-    FMIN_SAE,
-    FMINS_SAE,
-    FSQRT_RND,
-    FSQRTS,
-    FSQRTS_RND,
-
-    // FP vector get exponent.
-    FGETEXP,
-    FGETEXP_SAE,
-    FGETEXPS,
-    FGETEXPS_SAE,
-    // Extract Normalized Mantissas.
-    VGETMANT,
-    VGETMANT_SAE,
-    VGETMANTS,
-    VGETMANTS_SAE,
-    // FP Scale.
-    SCALEF,
-    SCALEF_RND,
-    SCALEFS,
-    SCALEFS_RND,
-
-    /// Integer horizontal add/sub.
-    HADD,
-    HSUB,
-
-    /// Floating point horizontal add/sub.
-    FHADD,
-    FHSUB,
-
-    // Detect Conflicts Within a Vector
-    CONFLICT,
-
-    /// Floating point max and min.
-    FMAX,
-    FMIN,
-
-    /// Commutative FMIN and FMAX.
-    FMAXC,
-    FMINC,
-
-    /// Scalar intrinsic floating point max and min.
-    FMAXS,
-    FMINS,
-
-    /// Floating point reciprocal-sqrt and reciprocal approximation.
-    /// Note that these typically require refinement
-    /// in order to obtain suitable precision.
-    FRSQRT,
-    FRCP,
-
-    // AVX-512 reciprocal approximations with a little more precision.
-    RSQRT14,
-    RSQRT14S,
-    RCP14,
-    RCP14S,
-
-    // Thread Local Storage.
-    TLSADDR,
-
-    // Thread Local Storage. A call to get the start address
-    // of the TLS block for the current module.
-    TLSBASEADDR,
-
-    // Thread Local Storage.  When calling to an OS provided
-    // thunk at the address from an earlier relocation.
-    TLSCALL,
-
-    // Thread Local Storage. A descriptor containing pointer to
-    // code and to argument to get the TLS offset for the symbol.
-    TLSDESC,
-
-    // Exception Handling helpers.
-    EH_RETURN,
-
-    // SjLj exception handling setjmp.
-    EH_SJLJ_SETJMP,
-
-    // SjLj exception handling longjmp.
-    EH_SJLJ_LONGJMP,
-
-    // SjLj exception handling dispatch.
-    EH_SJLJ_SETUP_DISPATCH,
-
-    /// Tail call return. See X86TargetLowering::LowerCall for
-    /// the list of operands.
-    TC_RETURN,
-
-    // Vector move to low scalar and zero higher vector elements.
-    VZEXT_MOVL,
-
-    // Vector integer truncate.
-    VTRUNC,
-    // Vector integer truncate with unsigned/signed saturation.
-    VTRUNCUS,
-    VTRUNCS,
-
-    // Masked version of the above. Used when less than a 128-bit result is
-    // produced since the mask only applies to the lower elements and can't
-    // be represented by a select.
-    // SRC, PASSTHRU, MASK
-    VMTRUNC,
-    VMTRUNCUS,
-    VMTRUNCS,
-
-    // Vector FP extend.
-    VFPEXT,
-    VFPEXT_SAE,
-    VFPEXTS,
-    VFPEXTS_SAE,
-
-    // Vector FP round.
-    VFPROUND,
-    // Convert TWO packed single data to one packed data
-    VFPROUND2,
-    VFPROUND2_RND,
-    VFPROUND_RND,
-    VFPROUNDS,
-    VFPROUNDS_RND,
-
-    // Masked version of above. Used for v2f64->v4f32.
-    // SRC, PASSTHRU, MASK
-    VMFPROUND,
-
-    // 128-bit vector logical left / right shift
-    VSHLDQ,
-    VSRLDQ,
-
-    // Vector shift elements
-    VSHL,
-    VSRL,
-    VSRA,
-
-    // Vector variable shift
-    VSHLV,
-    VSRLV,
-    VSRAV,
-
-    // Vector shift elements by immediate
-    VSHLI,
-    VSRLI,
-    VSRAI,
-
-    // Shifts of mask registers.
-    KSHIFTL,
-    KSHIFTR,
-
-    // Bit rotate by immediate
-    VROTLI,
-    VROTRI,
-
-    // Vector packed double/float comparison.
-    CMPP,
-
-    // Vector integer comparisons.
-    PCMPEQ,
-    PCMPGT,
-
-    // v8i16 Horizontal minimum and position.
-    PHMINPOS,
-
-    MULTISHIFT,
-
-    /// Vector comparison generating mask bits for fp and
-    /// integer signed and unsigned data types.
-    CMPM,
-    // Vector mask comparison generating mask bits for FP values.
-    CMPMM,
-    // Vector mask comparison with SAE for FP values.
-    CMPMM_SAE,
-
-    // Arithmetic operations with FLAGS results.
-    ADD,
-    SUB,
-    ADC,
-    SBB,
-    SMUL,
-    UMUL,
-    OR,
-    XOR,
-    AND,
-
-    // Bit field extract.
-    BEXTR,
-    BEXTRI,
-
-    // Zero High Bits Starting with Specified Bit Position.
-    BZHI,
-
-    // Parallel extract and deposit.
-    PDEP,
-    PEXT,
-
-    // X86-specific multiply by immediate.
-    MUL_IMM,
-
-    // Vector sign bit extraction.
-    MOVMSK,
-
-    // Vector bitwise comparisons.
-    PTEST,
-
-    // Vector packed fp sign bitwise comparisons.
-    TESTP,
-
-    // OR/AND test for masks.
-    KORTEST,
-    KTEST,
-
-    // ADD for masks.
-    KADD,
-
-    // Several flavors of instructions with vector shuffle behaviors.
-    // Saturated signed/unnsigned packing.
-    PACKSS,
-    PACKUS,
-    // Intra-lane alignr.
-    PALIGNR,
-    // AVX512 inter-lane alignr.
-    VALIGN,
-    PSHUFD,
-    PSHUFHW,
-    PSHUFLW,
-    SHUFP,
-    // VBMI2 Concat & Shift.
-    VSHLD,
-    VSHRD,
-
-    // Shuffle Packed Values at 128-bit granularity.
-    SHUF128,
-    MOVDDUP,
-    MOVSHDUP,
-    MOVSLDUP,
-    MOVLHPS,
-    MOVHLPS,
-    MOVSD,
-    MOVSS,
-    MOVSH,
-    UNPCKL,
-    UNPCKH,
-    VPERMILPV,
-    VPERMILPI,
-    VPERMI,
-    VPERM2X128,
-
-    // Variable Permute (VPERM).
-    // Res = VPERMV MaskV, V0
-    VPERMV,
-
-    // 3-op Variable Permute (VPERMT2).
-    // Res = VPERMV3 V0, MaskV, V1
-    VPERMV3,
-
-    // Bitwise ternary logic.
-    VPTERNLOG,
-    // Fix Up Special Packed Float32/64 values.
-    VFIXUPIMM,
-    VFIXUPIMM_SAE,
-    VFIXUPIMMS,
-    VFIXUPIMMS_SAE,
-    // Range Restriction Calculation For Packed Pairs of Float32/64 values.
-    VRANGE,
-    VRANGE_SAE,
-    VRANGES,
-    VRANGES_SAE,
-    // Reduce - Perform Reduction Transformation on scalar\packed FP.
-    VREDUCE,
-    VREDUCE_SAE,
-    VREDUCES,
-    VREDUCES_SAE,
-    // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
-    // Also used by the legacy (V)ROUND intrinsics where we mask out the
-    // scaling part of the immediate.
-    VRNDSCALE,
-    VRNDSCALE_SAE,
-    VRNDSCALES,
-    VRNDSCALES_SAE,
-    // Tests Types Of a FP Values for packed types.
-    VFPCLASS,
-    // Tests Types Of a FP Values for scalar types.
-    VFPCLASSS,
-
-    // Broadcast (splat) scalar or element 0 of a vector. If the operand is
-    // a vector, this node may change the vector length as part of the splat.
-    VBROADCAST,
-    // Broadcast mask to vector.
-    VBROADCASTM,
-
-    /// SSE4A Extraction and Insertion.
-    EXTRQI,
-    INSERTQI,
-
-    // XOP arithmetic/logical shifts.
-    VPSHA,
-    VPSHL,
-    // XOP signed/unsigned integer comparisons.
-    VPCOM,
-    VPCOMU,
-    // XOP packed permute bytes.
-    VPPERM,
-    // XOP two source permutation.
-    VPERMIL2,
-
-    // Vector multiply packed unsigned doubleword integers.
-    PMULUDQ,
-    // Vector multiply packed signed doubleword integers.
-    PMULDQ,
-    // Vector Multiply Packed UnsignedIntegers with Round and Scale.
-    MULHRS,
-
-    // Multiply and Add Packed Integers.
-    VPMADDUBSW,
-    VPMADDWD,
-
-    // AVX512IFMA multiply and add.
-    // NOTE: These are different than the instruction and perform
-    // op0 x op1 + op2.
-    VPMADD52L,
-    VPMADD52H,
-
-    // VNNI
-    VPDPBUSD,
-    VPDPBUSDS,
-    VPDPWSSD,
-    VPDPWSSDS,
-
-    // FMA nodes.
-    // We use the target independent ISD::FMA for the non-inverted case.
-    FNMADD,
-    FMSUB,
-    FNMSUB,
-    FMADDSUB,
-    FMSUBADD,
-
-    // FMA with rounding mode.
-    FMADD_RND,
-    FNMADD_RND,
-    FMSUB_RND,
-    FNMSUB_RND,
-    FMADDSUB_RND,
-    FMSUBADD_RND,
-
-    // AVX512-FP16 complex addition and multiplication.
-    VFMADDC,
-    VFMADDC_RND,
-    VFCMADDC,
-    VFCMADDC_RND,
-
-    VFMULC,
-    VFMULC_RND,
-    VFCMULC,
-    VFCMULC_RND,
-
-    VFMADDCSH,
-    VFMADDCSH_RND,
-    VFCMADDCSH,
-    VFCMADDCSH_RND,
-
-    VFMULCSH,
-    VFMULCSH_RND,
-    VFCMULCSH,
-    VFCMULCSH_RND,
-
-    VPDPBSUD,
-    VPDPBSUDS,
-    VPDPBUUD,
-    VPDPBUUDS,
-    VPDPBSSD,
-    VPDPBSSDS,
-
-    VPDPWSUD,
-    VPDPWSUDS,
-    VPDPWUSD,
-    VPDPWUSDS,
-    VPDPWUUD,
-    VPDPWUUDS,
-
-    VMINMAX,
-    VMINMAX_SAE,
-    VMINMAXS,
-    VMINMAXS_SAE,
-
-    CVTP2IBS,
-    CVTP2IUBS,
-    CVTP2IBS_RND,
-    CVTP2IUBS_RND,
-    CVTTP2IBS,
-    CVTTP2IUBS,
-    CVTTP2IBS_SAE,
-    CVTTP2IUBS_SAE,
-
-    MPSADBW,
-
-    VCVT2PH2BF8,
-    VCVT2PH2BF8S,
-    VCVT2PH2HF8,
-    VCVT2PH2HF8S,
-    VCVTBIASPH2BF8,
-    VCVTBIASPH2BF8S,
-    VCVTBIASPH2HF8,
-    VCVTBIASPH2HF8S,
-    VCVTPH2BF8,
-    VCVTPH2BF8S,
-    VCVTPH2HF8,
-    VCVTPH2HF8S,
-    VMCVTBIASPH2BF8,
-    VMCVTBIASPH2BF8S,
-    VMCVTBIASPH2HF8,
-    VMCVTBIASPH2HF8S,
-    VMCVTPH2BF8,
-    VMCVTPH2BF8S,
-    VMCVTPH2HF8,
-    VMCVTPH2HF8S,
-    VCVTHF82PH,
-
-    // Compress and expand.
-    COMPRESS,
-    EXPAND,
-
-    // Bits shuffle
-    VPSHUFBITQMB,
-
-    // Convert Unsigned/Integer to Floating-Point Value with rounding mode.
-    SINT_TO_FP_RND,
-    UINT_TO_FP_RND,
-    SCALAR_SINT_TO_FP,
-    SCALAR_UINT_TO_FP,
-    SCALAR_SINT_TO_FP_RND,
-    SCALAR_UINT_TO_FP_RND,
-
-    // Vector float/double to signed/unsigned integer.
-    CVTP2SI,
-    CVTP2UI,
-    CVTP2SI_RND,
-    CVTP2UI_RND,
-    // Scalar float/double to signed/unsigned integer.
-    CVTS2SI,
-    CVTS2UI,
-    CVTS2SI_RND,
-    CVTS2UI_RND,
-
-    // Vector float/double to signed/unsigned integer with truncation.
-    CVTTP2SI,
-    CVTTP2UI,
-    CVTTP2SI_SAE,
-    CVTTP2UI_SAE,
-
-    // Saturation enabled Vector float/double to signed/unsigned
-    // integer with truncation.
-    CVTTP2SIS,
-    CVTTP2UIS,
-    CVTTP2SIS_SAE,
-    CVTTP2UIS_SAE,
-    // Masked versions of above. Used for v2f64 to v4i32.
-    // SRC, PASSTHRU, MASK
-    MCVTTP2SIS,
-    MCVTTP2UIS,
-
-    // Scalar float/double to signed/unsigned integer with truncation.
-    CVTTS2SI,
-    CVTTS2UI,
-    CVTTS2SI_SAE,
-    CVTTS2UI_SAE,
-
-    // Vector signed/unsigned integer to float/double.
-    CVTSI2P,
-    CVTUI2P,
-
-    // Scalar float/double to signed/unsigned integer with saturation.
-    CVTTS2SIS,
-    CVTTS2UIS,
-    CVTTS2SIS_SAE,
-    CVTTS2UIS_SAE,
-
-    // Masked versions of above. Used for v2f64->v4f32.
-    // SRC, PASSTHRU, MASK
-    MCVTP2SI,
-    MCVTP2UI,
-    MCVTTP2SI,
-    MCVTTP2UI,
-    MCVTSI2P,
-    MCVTUI2P,
-
-    // Custom handling for FP_TO_xINT_SAT
-    FP_TO_SINT_SAT,
-    FP_TO_UINT_SAT,
-
-    // Vector float to bfloat16.
-    // Convert packed single data to packed BF16 data
-    CVTNEPS2BF16,
-    // Masked version of above.
-    // SRC, PASSTHRU, MASK
-    MCVTNEPS2BF16,
-
-    // Dot product of BF16/FP16 pairs to accumulated into
-    // packed single precision.
-    DPBF16PS,
-    DPFP16PS,
-
-    // A stack checking function call. On Windows it's _chkstk call.
-    DYN_ALLOCA,
-
-    // For allocating variable amounts of stack space when using
-    // segmented stacks. Check if the current stacklet has enough space, and
-    // falls back to heap allocation if not.
-    SEG_ALLOCA,
-
-    // For allocating stack space when using stack clash protector.
-    // Allocation is performed by block, and each block is probed.
-    PROBED_ALLOCA,
-
-    // Memory barriers.
-    MFENCE,
-
-    // Get a random integer and indicate whether it is valid in CF.
-    RDRAND,
-
-    // Get a NIST SP800-90B & C compliant random integer and
-    // indicate whether it is valid in CF.
-    RDSEED,
-
-    // Protection keys
-    // RDPKRU - Operand 0 is chain. Operand 1 is value for ECX.
-    // WRPKRU - Operand 0 is chain. Operand 1 is value for EDX. Operand 2 is
-    // value for ECX.
-    RDPKRU,
-    WRPKRU,
-
-    // SSE42 string comparisons.
-    // These nodes produce 3 results, index, mask, and flags. X86ISelDAGToDAG
-    // will emit one or two instructions based on which results are used. If
-    // flags and index/mask this allows us to use a single instruction since
-    // we won't have to pick and opcode for flags. Instead we can rely on the
-    // DAG to CSE everything and decide at isel.
-    PCMPISTR,
-    PCMPESTR,
-
-    // Test if in transactional execution.
-    XTEST,
-
-    // Conversions between float and half-float.
-    CVTPS2PH,
-    CVTPS2PH_SAE,
-    CVTPH2PS,
-    CVTPH2PS_SAE,
-
-    // Masked version of above.
-    // SRC, RND, PASSTHRU, MASK
-    MCVTPS2PH,
-    MCVTPS2PH_SAE,
-
-    // Galois Field Arithmetic Instructions
-    GF2P8AFFINEINVQB,
-    GF2P8AFFINEQB,
-    GF2P8MULB,
-
-    // LWP insert record.
-    LWPINS,
-
-    // User level wait
-    UMWAIT,
-    TPAUSE,
-
-    // Enqueue Stores Instructions
-    ENQCMD,
-    ENQCMDS,
-
-    // For avx512-vp2intersect
-    VP2INTERSECT,
-
-    // User level interrupts - testui
-    TESTUI,
-
-    // Perform an FP80 add after changing precision control in FPCW.
-    FP80_ADD,
-
-    // Conditional compare instructions
-    CCMP,
-    CTEST,
-
-    /// X86 strict FP compare instructions.
-    FIRST_STRICTFP_OPCODE,
-    STRICT_FCMP = FIRST_STRICTFP_OPCODE,
-    STRICT_FCMPS,
-
-    // Vector packed double/float comparison.
-    STRICT_CMPP,
-
-    /// Vector comparison generating mask bits for fp and
-    /// integer signed and unsigned data types.
-    STRICT_CMPM,
-
-    // Vector float/double to signed/unsigned integer with truncation.
-    STRICT_CVTTP2SI,
-    STRICT_CVTTP2UI,
-
-    // Vector FP extend.
-    STRICT_VFPEXT,
-
-    // Vector FP round.
-    STRICT_VFPROUND,
-
-    // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
-    // Also used by the legacy (V)ROUND intrinsics where we mask out the
-    // scaling part of the immediate.
-    STRICT_VRNDSCALE,
-
-    // Vector signed/unsigned integer to float/double.
-    STRICT_CVTSI2P,
-    STRICT_CVTUI2P,
-
-    // Strict FMA nodes.
-    STRICT_FNMADD,
-    STRICT_FMSUB,
-    STRICT_FNMSUB,
-
-    // Conversions between float and half-float.
-    STRICT_CVTPS2PH,
-    STRICT_CVTPH2PS,
-
-    // Perform an FP80 add after changing precision control in FPCW.
-    STRICT_FP80_ADD,
-
-    /// Floating point max and min.
-    STRICT_FMAX,
-    STRICT_FMIN,
-    LAST_STRICTFP_OPCODE = STRICT_FMIN,
-
-    // Compare and swap.
-    FIRST_MEMORY_OPCODE,
-    LCMPXCHG_DAG = FIRST_MEMORY_OPCODE,
-    LCMPXCHG8_DAG,
-    LCMPXCHG16_DAG,
-    LCMPXCHG16_SAVE_RBX_DAG,
-
-    /// LOCK-prefixed arithmetic read-modify-write instructions.
-    /// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS)
-    LADD,
-    LSUB,
-    LOR,
-    LXOR,
-    LAND,
-    LBTS,
-    LBTC,
-    LBTR,
-    LBTS_RM,
-    LBTC_RM,
-    LBTR_RM,
-
-    /// RAO arithmetic instructions.
-    /// OUTCHAIN = AADD(INCHAIN, PTR, RHS)
-    AADD,
-    AOR,
-    AXOR,
-    AAND,
-
-    // Load, scalar_to_vector, and zero extend.
-    VZEXT_LOAD,
-
-    // extract_vector_elt, store.
-    VEXTRACT_STORE,
-
-    // scalar broadcast from memory.
-    VBROADCAST_LOAD,
-
-    // subvector broadcast from memory.
-    SUBV_BROADCAST_LOAD,
-
-    // Store FP control word into i16 memory.
-    FNSTCW16m,
-
-    // Load FP control word from i16 memory.
-    FLDCW16m,
-
-    // Store x87 FPU environment into memory.
-    FNSTENVm,
-
-    // Load x87 FPU environment from memory.
-    FLDENVm,
-
-    /// This instruction implements FP_TO_SINT with the
-    /// integer destination in memory and a FP reg source.  This corresponds
-    /// to the X86::FIST*m instructions and the rounding mode change stuff. It
-    /// has two inputs (token chain and address) and two outputs (int value
-    /// and token chain). Memory VT specifies the type to store to.
-    FP_TO_INT_IN_MEM,
-
-    /// This instruction implements SINT_TO_FP with the
-    /// integer source in memory and FP reg result.  This corresponds to the
-    /// X86::FILD*m instructions. It has two inputs (token chain and address)
-    /// and two outputs (FP value and token chain). The integer source type is
-    /// specified by the memory VT.
-    FILD,
-
-    /// This instruction implements a fp->int store from FP stack
-    /// slots. This corresponds to the fist instruction. It takes a
-    /// chain operand, value to store, address, and glue. The memory VT
-    /// specifies the type to store as.
-    FIST,
-
-    /// This instruction implements an extending load to FP stack slots.
-    /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
-    /// operand, and ptr to load from. The memory VT specifies the type to
-    /// load from.
-    FLD,
-
-    /// This instruction implements a truncating store from FP stack
-    /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
-    /// chain operand, value to store, address, and glue. The memory VT
-    /// specifies the type to store as.
-    FST,
-
-    /// These instructions grab the address of the next argument
-    /// from a va_list. (reads and modifies the va_list in memory)
-    VAARG_64,
-    VAARG_X32,
-
-    // Vector truncating store with unsigned/signed saturation
-    VTRUNCSTOREUS,
-    VTRUNCSTORES,
-    // Vector truncating masked store with unsigned/signed saturation
-    VMTRUNCSTOREUS,
-    VMTRUNCSTORES,
-
-    // X86 specific gather and scatter
-    MGATHER,
-    MSCATTER,
-
-    // Key locker nodes that produce flags.
-    AESENC128KL,
-    AESDEC128KL,
-    AESENC256KL,
-    AESDEC256KL,
-    AESENCWIDE128KL,
-    AESDECWIDE128KL,
-    AESENCWIDE256KL,
-    AESDECWIDE256KL,
-
-    /// Compare and Add if Condition is Met. Compare value in operand 2 with
-    /// value in memory of operand 1. If condition of operand 4 is met, add
-    /// value operand 3 to m32 and write new value in operand 1. Operand 2 is
-    /// always updated with the original value from operand 1.
-    CMPCCXADD,
-
-    // Save xmm argument registers to the stack, according to %al. An operator
-    // is needed so that this can be expanded with control flow.
-    VASTART_SAVE_XMM_REGS,
-
-    // Conditional load/store instructions
-    CLOAD,
-    CSTORE,
-    LAST_MEMORY_OPCODE = CSTORE,
-  };
-  } // end namespace X86ISD
-
-  namespace X86 {
-    /// Current rounding mode is represented in bits 11:10 of FPSR. These
-    /// values are same as corresponding constants for rounding mode used
-    /// in glibc.
-  enum RoundingMode {
-    rmInvalid = -1,         // For handle Invalid rounding mode
-    rmToNearest = 0,        // FE_TONEAREST
-    rmDownward = 1 << 10,   // FE_DOWNWARD
-    rmUpward = 2 << 10,     // FE_UPWARD
-    rmTowardZero = 3 << 10, // FE_TOWARDZERO
-    rmMask = 3 << 10        // Bit mask selecting rounding mode
-  };
+class X86Subtarget;
+class X86TargetMachine;
+
+namespace X86ISD {
+// X86 Specific DAG Nodes
+enum NodeType : unsigned {
+  // Start the numbering where the builtin ops leave off.
+  FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+  /// Bit scan forward.
+  BSF,
+  /// Bit scan reverse.
+  BSR,
+
+  /// X86 funnel/double shift i16 instructions. These correspond to
+  /// X86::SHLDW and X86::SHRDW instructions which have different amt
+  /// modulo rules to generic funnel shifts.
+  /// NOTE: The operand order matches ISD::FSHL/FSHR not SHLD/SHRD.
+  FSHL,
+  FSHR,
+
+  /// Bitwise logical AND of floating point values. This corresponds
+  /// to X86::ANDPS or X86::ANDPD.
+  FAND,
+
+  /// Bitwise logical OR of floating point values. This corresponds
+  /// to X86::ORPS or X86::ORPD.
+  FOR,
+
+  /// Bitwise logical XOR of floating point values. This corresponds
+  /// to X86::XORPS or X86::XORPD.
+  FXOR,
+
+  ///  Bitwise logical ANDNOT of floating point values. This
+  /// corresponds to X86::ANDNPS or X86::ANDNPD.
+  FANDN,
+
+  /// These operations represent an abstract X86 call
+  /// instruction, which includes a bunch of information.  In particular the
+  /// operands of these nodes are:
+  ///
+  ///     #0 - The incoming token chain
+  ///     #1 - The callee
+  ///     #2 - The number of arg bytes the caller pushes on the stack.
+  ///     #3 - The number of arg bytes the callee pops off the stack.
+  ///     #4 - The value to pass in AL/AX/EAX (optional)
+  ///     #5 - The value to pass in DL/DX/EDX (optional)
+  ///
+  /// The result values of these nodes are:
+  ///
+  ///     #0 - The outgoing token chain
+  ///     #1 - The first register result value (optional)
+  ///     #2 - The second register result value (optional)
+  ///
+  CALL,
+
+  /// Same as call except it adds the NoTrack prefix.
+  NT_CALL,
+
+  // Pseudo for a OBJC call that gets emitted together with a special
+  // marker instruction.
+  CALL_RVMARKER,
+
+  /// The same as ISD::CopyFromReg except that this node makes it explicit
+  /// that it may lower to an x87 FPU stack pop. Optimizations should be more
+  /// cautious when handling this node than a normal CopyFromReg to avoid
+  /// removing a required FPU stack pop. A key requirement is optimizations
+  /// should not optimize any users of a chain that contains a
+  /// POP_FROM_X87_REG to use a chain from a point earlier than the
+  /// POP_FROM_X87_REG (which may remove a required FPU stack pop).
+  POP_FROM_X87_REG,
+
+  // Pseudo for a call to an imported function to ensure the correct machine
+  // instruction is emitted for Import Call Optimization.
+  IMP_CALL,
+
+  /// X86 compare and logical compare instructions.
+  CMP,
+  FCMP,
+  COMI,
+  UCOMI,
+
+  // X86 compare with Intrinsics similar to COMI.
+  COMX,
+  UCOMX,
+
+  /// X86 bit-test instructions.
+  BT,
+
+  /// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS
+  /// operand, usually produced by a CMP instruction.
+  SETCC,
+
+  /// X86 Select
+  SELECTS,
+
+  /// X86 Constant-time Select, implemented with the CMOV instruction so that
+  /// the selection does not involve a data-dependent branch.
+  CTSELECT,
+
+  // Same as SETCC except it's materialized with an sbb and the value is all
+  // ones or all zeros.
+  SETCC_CARRY, // R = carry_bit ? ~0 : 0
+
+  /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
+  /// Operands are two FP values to compare; result is a mask of
+  /// 0s or 1s.  Generally DTRT for C/C++ with NaNs.
+  FSETCC,
+
+  /// X86 FP SETCC, similar to above, but with output as an i1 mask and
+  /// and a version with SAE.
+  FSETCCM,
+  FSETCCM_SAE,
+
+  /// X86 conditional moves. Operand 0 and operand 1 are the two values
+  /// to select from. Operand 2 is the condition code, and operand 3 is the
+  /// flag operand produced by a CMP or TEST instruction.
+  CMOV,
+
+  /// X86 conditional branches. Operand 0 is the chain operand, operand 1
+  /// is the block to branch if condition is true, operand 2 is the
+  /// condition code, and operand 3 is the flag operand produced by a CMP
+  /// or TEST instruction.
+  BRCOND,
+
+  /// BRIND node with NoTrack prefix. Operand 0 is the chain operand and
+  /// operand 1 is the target address.
+  NT_BRIND,
+
+  /// Return with a glue operand. Operand 0 is the chain operand, operand
+  /// 1 is the number of bytes of stack to pop.
+  RET_GLUE,
+
+  /// Return from interrupt. Operand 0 is the number of bytes to pop.
+  IRET,
+
+  /// Repeat fill, corresponds to X86::REP_STOSx.
+  REP_STOS,
+
+  /// Repeat move, corresponds to X86::REP_MOVSx.
+  REP_MOVS,
+
+  /// On Darwin, this node represents the result of the popl
+  /// at function entry, used for PIC code.
+  GlobalBaseReg,
+
+  /// A wrapper node for TargetConstantPool, TargetJumpTable,
+  /// TargetExternalSymbol, TargetGlobalAddress, TargetGlobalTLSAddress,
+  /// MCSymbol and TargetBlockAddress.
+  Wrapper,
+
+  /// Special wrapper used under X86-64 PIC mode for RIP
+  /// relative displacements.
+  WrapperRIP,
+
+  /// Copies a 64-bit value from an MMX vector to the low word
+  /// of an XMM vector, with the high word zero filled.
+  MOVQ2DQ,
+
+  /// Copies a 64-bit value from the low word of an XMM vector
+  /// to an MMX vector.
+  MOVDQ2Q,
+
+  /// Copies a 32-bit value from the low word of a MMX
+  /// vector to a GPR.
+  MMX_MOVD2W,
+
+  /// Copies a GPR into the low 32-bit word of a MMX vector
+  /// and zero out the high word.
+  MMX_MOVW2D,
+
+  /// Extract an 8-bit value from a vector and zero extend it to
+  /// i32, corresponds to X86::PEXTRB.
+  PEXTRB,
+
+  /// Extract a 16-bit value from a vector and zero extend it to
+  /// i32, corresponds to X86::PEXTRW.
+  PEXTRW,
+
+  /// Insert any element of a 4 x float vector into any element
+  /// of a destination 4 x float vector.
+  INSERTPS,
+
+  /// Insert the lower 8-bits of a 32-bit value to a vector,
+  /// corresponds to X86::PINSRB.
+  PINSRB,
+
+  /// Insert the lower 16-bits of a 32-bit value to a vector,
+  /// corresponds to X86::PINSRW.
+  PINSRW,
+
+  /// Shuffle 16 8-bit values within a vector.
+  PSHUFB,
+
+  /// Compute Sum of Absolute Differences.
+  PSADBW,
+  /// Compute Double Block Packed Sum-Absolute-Differences
+  DBPSADBW,
+
+  /// Bitwise Logical AND NOT of Packed FP values.
+  ANDNP,
+
+  /// Blend where the selector is an immediate.
+  BLENDI,
+
+  /// Dynamic (non-constant condition) vector blend where only the sign bits
+  /// of the condition elements are used. This is used to enforce that the
+  /// condition mask is not valid for generic VSELECT optimizations. This
+  /// is also used to implement the intrinsics.
+  /// Operands are in VSELECT order: MASK, TRUE, FALSE
+  BLENDV,
+
+  /// Combined add and sub on an FP vector.
+  ADDSUB,
+
+  //  FP vector ops with rounding mode.
+  FADD_RND,
+  FADDS,
+  FADDS_RND,
+  FSUB_RND,
+  FSUBS,
+  FSUBS_RND,
+  FMUL_RND,
+  FMULS,
+  FMULS_RND,
+  FDIV_RND,
+  FDIVS,
+  FDIVS_RND,
+  FMAX_SAE,
+  FMAXS_SAE,
+  FMIN_SAE,
+  FMINS_SAE,
+  FSQRT_RND,
+  FSQRTS,
+  FSQRTS_RND,
+
+  // FP vector get exponent.
+  FGETEXP,
+  FGETEXP_SAE,
+  FGETEXPS,
+  FGETEXPS_SAE,
+  // Extract Normalized Mantissas.
+  VGETMANT,
+  VGETMANT_SAE,
+  VGETMANTS,
+  VGETMANTS_SAE,
+  // FP Scale.
+  SCALEF,
+  SCALEF_RND,
+  SCALEFS,
+  SCALEFS_RND,
+
+  /// Integer horizontal add/sub.
+  HADD,
+  HSUB,
+
+  /// Floating point horizontal add/sub.
+  FHADD,
+  FHSUB,
+
+  // Detect Conflicts Within a Vector
+  CONFLICT,
+
+  /// Floating point max and min.
+  FMAX,
+  FMIN,
+
+  /// Commutative FMIN and FMAX.
+  FMAXC,
+  FMINC,
+
+  /// Scalar intrinsic floating point max and min.
+  FMAXS,
+  FMINS,
+
+  /// Floating point reciprocal-sqrt and reciprocal approximation.
+  /// Note that these typically require refinement
+  /// in order to obtain suitable precision.
+  FRSQRT,
+  FRCP,
+
+  // AVX-512 reciprocal approximations with a little more precision.
+  RSQRT14,
+  RSQRT14S,
+  RCP14,
+  RCP14S,
+
+  // Thread Local Storage.
+  TLSADDR,
+
+  // Thread Local Storage. A call to get the start address
+  // of the TLS block for the current module.
+  TLSBASEADDR,
+
+  // Thread Local Storage.  When calling to an OS provided
+  // thunk at the address from an earlier relocation.
+  TLSCALL,
+
+  // Thread Local Storage. A descriptor containing pointer to
+  // code and to argument to get the TLS offset for the symbol.
+  TLSDESC,
+
+  // Exception Handling helpers.
+  EH_RETURN,
+
+  // SjLj exception handling setjmp.
+  EH_SJLJ_SETJMP,
+
+  // SjLj exception handling longjmp.
+  EH_SJLJ_LONGJMP,
+
+  // SjLj exception handling dispatch.
+  EH_SJLJ_SETUP_DISPATCH,
+
+  /// Tail call return. See X86TargetLowering::LowerCall for
+  /// the list of operands.
+  TC_RETURN,
+
+  // Vector move to low scalar and zero higher vector elements.
+  VZEXT_MOVL,
+
+  // Vector integer truncate.
+  VTRUNC,
+  // Vector integer truncate with unsigned/signed saturation.
+  VTRUNCUS,
+  VTRUNCS,
+
+  // Masked version of the above. Used when less than a 128-bit result is
+  // produced since the mask only applies to the lower elements and can't
+  // be represented by a select.
+  // SRC, PASSTHRU, MASK
+  VMTRUNC,
+  VMTRUNCUS,
+  VMTRUNCS,
+
+  // Vector FP extend.
+  VFPEXT,
+  VFPEXT_SAE,
+  VFPEXTS,
+  VFPEXTS_SAE,
+
+  // Vector FP round.
+  VFPROUND,
+  // Convert TWO packed single data to one packed data
+  VFPROUND2,
+  VFPROUND2_RND,
+  VFPROUND_RND,
+  VFPROUNDS,
+  VFPROUNDS_RND,
+
+  // Masked version of above. Used for v2f64->v4f32.
+  // SRC, PASSTHRU, MASK
+  VMFPROUND,
+
+  // 128-bit vector logical left / right shift
+  VSHLDQ,
+  VSRLDQ,
+
+  // Vector shift elements
+  VSHL,
+  VSRL,
+  VSRA,
+
+  // Vector variable shift
+  VSHLV,
+  VSRLV,
+  VSRAV,
+
+  // Vector shift elements by immediate
+  VSHLI,
+  VSRLI,
+  VSRAI,
+
+  // Shifts of mask registers.
+  KSHIFTL,
+  KSHIFTR,
+
+  // Bit rotate by immediate
+  VROTLI,
+  VROTRI,
+
+  // Vector packed double/float comparison.
+  CMPP,
+
+  // Vector integer comparisons.
+  PCMPEQ,
+  PCMPGT,
+
+  // v8i16 Horizontal minimum and position.
+  PHMINPOS,
+
+  MULTISHIFT,
+
+  /// Vector comparison generating mask bits for fp and
+  /// integer signed and unsigned data types.
+  CMPM,
+  // Vector mask comparison generating mask bits for FP values.
+  CMPMM,
+  // Vector mask comparison with SAE for FP values.
+  CMPMM_SAE,
+
+  // Arithmetic operations with FLAGS results.
+  ADD,
+  SUB,
+  ADC,
+  SBB,
+  SMUL,
+  UMUL,
+  OR,
+  XOR,
+  AND,
+
+  // Bit field extract.
+  BEXTR,
+  BEXTRI,
+
+  // Zero High Bits Starting with Specified Bit Position.
+  BZHI,
+
+  // Parallel extract and deposit.
+  PDEP,
+  PEXT,
+
+  // X86-specific multiply by immediate.
+  MUL_IMM,
+
+  // Vector sign bit extraction.
+  MOVMSK,
+
+  // Vector bitwise comparisons.
+  PTEST,
+
+  // Vector packed fp sign bitwise comparisons.
+  TESTP,
+
+  // OR/AND test for masks.
+  KORTEST,
+  KTEST,
+
+  // ADD for masks.
+  KADD,
+
+  // Several flavors of instructions with vector shuffle behaviors.
+  // Saturated signed/unsigned packing.
+  PACKSS,
+  PACKUS,
+  // Intra-lane alignr.
+  PALIGNR,
+  // AVX512 inter-lane alignr.
+  VALIGN,
+  PSHUFD,
+  PSHUFHW,
+  PSHUFLW,
+  SHUFP,
+  // VBMI2 Concat & Shift.
+  VSHLD,
+  VSHRD,
+
+  // Shuffle Packed Values at 128-bit granularity.
+  SHUF128,
+  MOVDDUP,
+  MOVSHDUP,
+  MOVSLDUP,
+  MOVLHPS,
+  MOVHLPS,
+  MOVSD,
+  MOVSS,
+  MOVSH,
+  UNPCKL,
+  UNPCKH,
+  VPERMILPV,
+  VPERMILPI,
+  VPERMI,
+  VPERM2X128,
+
+  // Variable Permute (VPERM).
+  // Res = VPERMV MaskV, V0
+  VPERMV,
+
+  // 3-op Variable Permute (VPERMT2).
+  // Res = VPERMV3 V0, MaskV, V1
+  VPERMV3,
+
+  // Bitwise ternary logic.
+  VPTERNLOG,
+  // Fix Up Special Packed Float32/64 values.
+  VFIXUPIMM,
+  VFIXUPIMM_SAE,
+  VFIXUPIMMS,
+  VFIXUPIMMS_SAE,
+  // Range Restriction Calculation For Packed Pairs of Float32/64 values.
+  VRANGE,
+  VRANGE_SAE,
+  VRANGES,
+  VRANGES_SAE,
+  // Reduce - Perform Reduction Transformation on scalar\packed FP.
+  VREDUCE,
+  VREDUCE_SAE,
+  VREDUCES,
+  VREDUCES_SAE,
+  // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
+  // Also used by the legacy (V)ROUND intrinsics where we mask out the
+  // scaling part of the immediate.
+  VRNDSCALE,
+  VRNDSCALE_SAE,
+  VRNDSCALES,
+  VRNDSCALES_SAE,
+  // Tests Types Of a FP Values for packed types.
+  VFPCLASS,
+  // Tests Types Of a FP Values for scalar types.
+  VFPCLASSS,
+
+  // Broadcast (splat) scalar or element 0 of a vector. If the operand is
+  // a vector, this node may change the vector length as part of the splat.
+  VBROADCAST,
+  // Broadcast mask to vector.
+  VBROADCASTM,
+
+  /// SSE4A Extraction and Insertion.
+  EXTRQI,
+  INSERTQI,
+
+  // XOP arithmetic/logical shifts.
+  VPSHA,
+  VPSHL,
+  // XOP signed/unsigned integer comparisons.
+  VPCOM,
+  VPCOMU,
+  // XOP packed permute bytes.
+  VPPERM,
+  // XOP two source permutation.
+  VPERMIL2,
+
+  // Vector multiply packed unsigned doubleword integers.
+  PMULUDQ,
+  // Vector multiply packed signed doubleword integers.
+  PMULDQ,
+  // Vector Multiply Packed UnsignedIntegers with Round and Scale.
+  MULHRS,
+
+  // Multiply and Add Packed Integers.
+  VPMADDUBSW,
+  VPMADDWD,
+
+  // AVX512IFMA multiply and add.
+  // NOTE: These are different than the instruction and perform
+  // op0 x op1 + op2.
+  VPMADD52L,
+  VPMADD52H,
+
+  // VNNI
+  VPDPBUSD,
+  VPDPBUSDS,
+  VPDPWSSD,
+  VPDPWSSDS,
+
+  // FMA nodes.
+  // We use the target independent ISD::FMA for the non-inverted case.
+  FNMADD,
+  FMSUB,
+  FNMSUB,
+  FMADDSUB,
+  FMSUBADD,
+
+  // FMA with rounding mode.
+  FMADD_RND,
+  FNMADD_RND,
+  FMSUB_RND,
+  FNMSUB_RND,
+  FMADDSUB_RND,
+  FMSUBADD_RND,
+
+  // AVX512-FP16 complex addition and multiplication.
+  VFMADDC,
+  VFMADDC_RND,
+  VFCMADDC,
+  VFCMADDC_RND,
+
+  VFMULC,
+  VFMULC_RND,
+  VFCMULC,
+  VFCMULC_RND,
+
+  VFMADDCSH,
+  VFMADDCSH_RND,
+  VFCMADDCSH,
+  VFCMADDCSH_RND,
+
+  VFMULCSH,
+  VFMULCSH_RND,
+  VFCMULCSH,
+  VFCMULCSH_RND,
+
+  VPDPBSUD,
+  VPDPBSUDS,
+  VPDPBUUD,
+  VPDPBUUDS,
+  VPDPBSSD,
+  VPDPBSSDS,
+
+  VPDPWSUD,
+  VPDPWSUDS,
+  VPDPWUSD,
+  VPDPWUSDS,
+  VPDPWUUD,
+  VPDPWUUDS,
+
+  VMINMAX,
+  VMINMAX_SAE,
+  VMINMAXS,
+  VMINMAXS_SAE,
+
+  CVTP2IBS,
+  CVTP2IUBS,
+  CVTP2IBS_RND,
+  CVTP2IUBS_RND,
+  CVTTP2IBS,
+  CVTTP2IUBS,
+  CVTTP2IBS_SAE,
+  CVTTP2IUBS_SAE,
+
+  MPSADBW,
+
+  VCVT2PH2BF8,
+  VCVT2PH2BF8S,
+  VCVT2PH2HF8,
+  VCVT2PH2HF8S,
+  VCVTBIASPH2BF8,
+  VCVTBIASPH2BF8S,
+  VCVTBIASPH2HF8,
+  VCVTBIASPH2HF8S,
+  VCVTPH2BF8,
+  VCVTPH2BF8S,
+  VCVTPH2HF8,
+  VCVTPH2HF8S,
+  VMCVTBIASPH2BF8,
+  VMCVTBIASPH2BF8S,
+  VMCVTBIASPH2HF8,
+  VMCVTBIASPH2HF8S,
+  VMCVTPH2BF8,
+  VMCVTPH2BF8S,
+  VMCVTPH2HF8,
+  VMCVTPH2HF8S,
+  VCVTHF82PH,
+
+  // Compress and expand.
+  COMPRESS,
+  EXPAND,
+
+  // Bits shuffle
+  VPSHUFBITQMB,
+
+  // Convert Unsigned/Integer to Floating-Point Value with rounding mode.
+  SINT_TO_FP_RND,
+  UINT_TO_FP_RND,
+  SCALAR_SINT_TO_FP,
+  SCALAR_UINT_TO_FP,
+  SCALAR_SINT_TO_FP_RND,
+  SCALAR_UINT_TO_FP_RND,
+
+  // Vector float/double to signed/unsigned integer.
+  CVTP2SI,
+  CVTP2UI,
+  CVTP2SI_RND,
+  CVTP2UI_RND,
+  // Scalar float/double to signed/unsigned integer.
+  CVTS2SI,
+  CVTS2UI,
+  CVTS2SI_RND,
+  CVTS2UI_RND,
+
+  // Vector float/double to signed/unsigned integer with truncation.
+  CVTTP2SI,
+  CVTTP2UI,
+  CVTTP2SI_SAE,
+  CVTTP2UI_SAE,
+
+  // Saturation enabled Vector float/double to signed/unsigned
+  // integer with truncation.
+  CVTTP2SIS,
+  CVTTP2UIS,
+  CVTTP2SIS_SAE,
+  CVTTP2UIS_SAE,
+  // Masked versions of above. Used for v2f64 to v4i32.
+  // SRC, PASSTHRU, MASK
+  MCVTTP2SIS,
+  MCVTTP2UIS,
+
+  // Scalar float/double to signed/unsigned integer with truncation.
+  CVTTS2SI,
+  CVTTS2UI,
+  CVTTS2SI_SAE,
+  CVTTS2UI_SAE,
+
+  // Vector signed/unsigned integer to float/double.
+  CVTSI2P,
+  CVTUI2P,
+
+  // Scalar float/double to signed/unsigned integer with saturation.
+  CVTTS2SIS,
+  CVTTS2UIS,
+  CVTTS2SIS_SAE,
+  CVTTS2UIS_SAE,
+
+  // Masked versions of above. Used for v2f64->v4f32.
+  // SRC, PASSTHRU, MASK
+  MCVTP2SI,
+  MCVTP2UI,
+  MCVTTP2SI,
+  MCVTTP2UI,
+  MCVTSI2P,
+  MCVTUI2P,
+
+  // Custom handling for FP_TO_xINT_SAT
+  FP_TO_SINT_SAT,
+  FP_TO_UINT_SAT,
+
+  // Vector float to bfloat16.
+  // Convert packed single data to packed BF16 data
+  CVTNEPS2BF16,
+  // Masked version of above.
+  // SRC, PASSTHRU, MASK
+  MCVTNEPS2BF16,
+
+  // Dot product of BF16/FP16 pairs, accumulated into
+  // packed single precision.
+  DPBF16PS,
+  DPFP16PS,
+
+  // A stack checking function call. On Windows it's _chkstk call.
+  DYN_ALLOCA,
+
+  // For allocating variable amounts of stack space when using
+  // segmented stacks. Check if the current stacklet has enough space, and
+  // falls back to heap allocation if not.
+  SEG_ALLOCA,
+
+  // For allocating stack space when using stack clash protector.
+  // Allocation is performed by block, and each block is probed.
+  PROBED_ALLOCA,
+
+  // Memory barriers.
+  MFENCE,
+
+  // Get a random integer and indicate whether it is valid in CF.
+  RDRAND,
+
+  // Get a NIST SP800-90B & C compliant random integer and
+  // indicate whether it is valid in CF.
+  RDSEED,
+
+  // Protection keys
+  // RDPKRU - Operand 0 is chain. Operand 1 is value for ECX.
+  // WRPKRU - Operand 0 is chain. Operand 1 is value for EDX. Operand 2 is
+  // value for ECX.
+  RDPKRU,
+  WRPKRU,
+
+  // SSE42 string comparisons.
+  // These nodes produce 3 results, index, mask, and flags. X86ISelDAGToDAG
+  // will emit one or two instructions based on which results are used. If
+  // both flags and index/mask are used, we can use a single instruction since
+  // we won't have to pick an opcode for flags. Instead we can rely on the
+  // DAG to CSE everything and decide at isel.
+  PCMPISTR,
+  PCMPESTR,
+
+  // Test if in transactional execution.
+  XTEST,
+
+  // Conversions between float and half-float.
+  CVTPS2PH,
+  CVTPS2PH_SAE,
+  CVTPH2PS,
+  CVTPH2PS_SAE,
+
+  // Masked version of above.
+  // SRC, RND, PASSTHRU, MASK
+  MCVTPS2PH,
+  MCVTPS2PH_SAE,
+
+  // Galois Field Arithmetic Instructions
+  GF2P8AFFINEINVQB,
+  GF2P8AFFINEQB,
+  GF2P8MULB,
+
+  // LWP insert record.
+  LWPINS,
+
+  // User level wait
+  UMWAIT,
+  TPAUSE,
+
+  // Enqueue Stores Instructions
+  ENQCMD,
+  ENQCMDS,
+
+  // For avx512-vp2intersect
+  VP2INTERSECT,
+
+  // User level interrupts - testui
+  TESTUI,
+
+  // Perform an FP80 add after changing precision control in FPCW.
+  FP80_ADD,
+
+  // Conditional compare instructions
+  CCMP,
+  CTEST,
+
+  /// X86 strict FP compare instructions.
+  FIRST_STRICTFP_OPCODE,
+  STRICT_FCMP = FIRST_STRICTFP_OPCODE,
+  STRICT_FCMPS,
+
+  // Vector packed double/float comparison.
+  STRICT_CMPP,
+
+  /// Vector comparison generating mask bits for fp and
+  /// integer signed and unsigned data types.
+  STRICT_CMPM,
+
+  // Vector float/double to signed/unsigned integer with truncation.
+  STRICT_CVTTP2SI,
+  STRICT_CVTTP2UI,
+
+  // Vector FP extend.
+  STRICT_VFPEXT,
+
+  // Vector FP round.
+  STRICT_VFPROUND,
+
+  // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
+  // Also used by the legacy (V)ROUND intrinsics where we mask out the
+  // scaling part of the immediate.
+  STRICT_VRNDSCALE,
+
+  // Vector signed/unsigned integer to float/double.
+  STRICT_CVTSI2P,
+  STRICT_CVTUI2P,
+
+  // Strict FMA nodes.
+  STRICT_FNMADD,
+  STRICT_FMSUB,
+  STRICT_FNMSUB,
+
+  // Conversions between float and half-float.
+  STRICT_CVTPS2PH,
+  STRICT_CVTPH2PS,
+
+  // Perform an FP80 add after changing precision control in FPCW.
+  STRICT_FP80_ADD,
+
+  /// Floating point max and min.
+  STRICT_FMAX,
+  STRICT_FMIN,
+  LAST_STRICTFP_OPCODE = STRICT_FMIN,
+
+  // Compare and swap.
+  FIRST_MEMORY_OPCODE,
+  LCMPXCHG_DAG = FIRST_MEMORY_OPCODE,
+  LCMPXCHG8_DAG,
+  LCMPXCHG16_DAG,
+  LCMPXCHG16_SAVE_RBX_DAG,
+
+  /// LOCK-prefixed arithmetic read-modify-write instructions.
+  /// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS)
+  LADD,
+  LSUB,
+  LOR,
+  LXOR,
+  LAND,
+  LBTS,
+  LBTC,
+  LBTR,
+  LBTS_RM,
+  LBTC_RM,
+  LBTR_RM,
+
+  /// RAO arithmetic instructions.
+  /// OUTCHAIN = AADD(INCHAIN, PTR, RHS)
+  AADD,
+  AOR,
+  AXOR,
+  AAND,
+
+  // Load, scalar_to_vector, and zero extend.
+  VZEXT_LOAD,
+
+  // extract_vector_elt, store.
+  VEXTRACT_STORE,
+
+  // scalar broadcast from memory.
+  VBROADCAST_LOAD,
+
+  // subvector broadcast from memory.
+  SUBV_BROADCAST_LOAD,
+
+  // Store FP control word into i16 memory.
+  FNSTCW16m,
+
+  // Load FP control word from i16 memory.
+  FLDCW16m,
+
+  // Store x87 FPU environment into memory.
+  FNSTENVm,
+
+  // Load x87 FPU environment from memory.
+  FLDENVm,
+
+  /// This instruction implements FP_TO_SINT with the
+  /// integer destination in memory and a FP reg source.  This corresponds
+  /// to the X86::FIST*m instructions and the rounding mode change stuff. It
+  /// has two inputs (token chain and address) and two outputs (int value
+  /// and token chain). Memory VT specifies the type to store to.
+  FP_TO_INT_IN_MEM,
+
+  /// This instruction implements SINT_TO_FP with the
+  /// integer source in memory and FP reg result.  This corresponds to the
+  /// X86::FILD*m instructions. It has two inputs (token chain and address)
+  /// and two outputs (FP value and token chain). The integer source type is
+  /// specified by the memory VT.
+  FILD,
+
+  /// This instruction implements a fp->int store from FP stack
+  /// slots. This corresponds to the fist instruction. It takes a
+  /// chain operand, value to store, address, and glue. The memory VT
+  /// specifies the type to store as.
+  FIST,
+
+  /// This instruction implements an extending load to FP stack slots.
+  /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
+  /// operand, and ptr to load from. The memory VT specifies the type to
+  /// load from.
+  FLD,
+
+  /// This instruction implements a truncating store from FP stack
+  /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
+  /// chain operand, value to store, address, and glue. The memory VT
+  /// specifies the type to store as.
+  FST,
+
+  /// These instructions grab the address of the next argument
+  /// from a va_list. (reads and modifies the va_list in memory)
+  VAARG_64,
+  VAARG_X32,
+
+  // Vector truncating store with unsigned/signed saturation
+  VTRUNCSTOREUS,
+  VTRUNCSTORES,
+  // Vector truncating masked store with unsigned/signed saturation
+  VMTRUNCSTOREUS,
+  VMTRUNCSTORES,
+
+  // X86 specific gather and scatter
+  MGATHER,
+  MSCATTER,
+
+  // Key locker nodes that produce flags.
+  AESENC128KL,
+  AESDEC128KL,
+  AESENC256KL,
+  AESDEC256KL,
+  AESENCWIDE128KL,
+  AESDECWIDE128KL,
+  AESENCWIDE256KL,
+  AESDECWIDE256KL,
+
+  /// Compare and Add if Condition is Met. Compare the value in operand 2
+  /// with the value in memory at operand 1. If the condition of operand 4 is
+  /// met, add the value of operand 3 to m32 and write the new value to
+  /// operand 1. Operand 2 always receives the original value from operand 1.
+  CMPCCXADD,
+
+  // Save xmm argument registers to the stack, according to %al. An operator
+  // is needed so that this can be expanded with control flow.
+  VASTART_SAVE_XMM_REGS,
+
+  // Conditional load/store instructions
+  CLOAD,
+  CSTORE,
+  LAST_MEMORY_OPCODE = CSTORE,
+};
+} // end namespace X86ISD
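
As an illustrative aside (not part of the patch), the FIRST_MEMORY_OPCODE /
LAST_MEMORY_OPCODE markers declared in the enum above are what make cheap
opcode-range checks possible; a minimal sketch of such a helper, using only
names declared above (the helper itself is hypothetical):

  // Hypothetical helper, shown for illustration only: classify an X86ISD
  // opcode as a memory opcode using the range markers from the enum above.
  static bool isX86MemoryOpcode(unsigned Opc) {
    return Opc >= X86ISD::FIRST_MEMORY_OPCODE &&
           Opc <= X86ISD::LAST_MEMORY_OPCODE;
  }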
+
+namespace X86 {
+/// The current rounding mode is represented in bits 11:10 of the FP control
+/// word (FPCW). These values are the same as the corresponding rounding-mode
+/// constants used in glibc.
+enum RoundingMode {
+  rmInvalid = -1,         // For handling an invalid rounding mode
+  rmToNearest = 0,        // FE_TONEAREST
+  rmDownward = 1 << 10,   // FE_DOWNWARD
+  rmUpward = 2 << 10,     // FE_UPWARD
+  rmTowardZero = 3 << 10, // FE_TOWARDZERO
+  rmMask = 3 << 10        // Bit mask selecting rounding mode
+};
+} // namespace X86
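
A minimal sketch of how these enumerators relate to the control-word bits,
assuming a raw 16-bit control-word value obtained elsewhere (the helper name
is illustrative, not part of the patch):

  // Hypothetical helper: mask the rounding-control field (bits 11:10,
  // selected by rmMask) out of a raw control word and map it onto the enum.
  static X86::RoundingMode classifyRoundingBits(unsigned CW) {
    switch (CW & X86::rmMask) {
    case X86::rmToNearest:  return X86::rmToNearest;
    case X86::rmDownward:   return X86::rmDownward;
    case X86::rmUpward:     return X86::rmUpward;
    case X86::rmTowardZero: return X86::rmTowardZero;
    default:                return X86::rmInvalid; // unreachable after masking
    }
  }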
+
+/// Define some predicates that are used for node matching.
+namespace X86 {
+/// Returns true if Elt is a constant zero or floating point constant +0.0.
+bool isZeroNode(SDValue Elt);
+
+/// Returns true if the given offset can fit into the displacement field of
+/// the instruction.
+bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
+                                  bool hasSymbolicDisplacement);
+
+/// Determines whether the callee is required to pop its
+/// own arguments. Callee pop is necessary to support tail calls.
+bool isCalleePop(CallingConv::ID CallingConv, bool is64Bit, bool IsVarArg,
+                 bool GuaranteeTCO);
+
+/// If Op is a constant whose elements are all the same constant or
+/// undefined, return true and return the constant value in \p SplatVal.
+/// If we have undef bits that don't cover an entire element, we treat these
+/// as zero if AllowPartialUndefs is set, else we fail and return false.
+bool isConstantSplat(SDValue Op, APInt &SplatVal,
+                     bool AllowPartialUndefs = true);
+
+/// Check if Op is a load operation that could be folded into some other x86
+/// instruction as a memory operand. Example: vpaddd (%rdi), %xmm0, %xmm0.
+bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
+                 bool AssumeSingleUse = false);
+
+/// Check if Op is a load operation that could be folded into a vector splat
+/// instruction as a memory operand. Example: vbroadcastss 16(%rdi), %xmm2.
+bool mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
+                                     const X86Subtarget &Subtarget,
+                                     bool AssumeSingleUse = false);
+
+/// Check if Op is a value that could be used to fold a store into some
+/// other x86 instruction as a memory operand. Ex: pextrb $0, %xmm0, (%rdi).
+bool mayFoldIntoStore(SDValue Op);
+
+/// Check if Op is an operation that could be folded into a zero extend x86
+/// instruction.
+bool mayFoldIntoZeroExtend(SDValue Op);
+
+/// True if the target supports the extended frame for async Swift
+/// functions.
+bool isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget,
+                                        const MachineFunction &MF);
+
+/// Convert LLVM rounding mode to X86 rounding mode.
+int getRoundingModeX86(unsigned RM);
+
+} // end namespace X86
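
As a usage sketch for the predicates above (the wrapper is hypothetical and
only meant to show the calling convention of isConstantSplat):

  // Hypothetical wrapper: true when Op splats a single all-ones constant.
  // AllowPartialUndefs keeps its default of true.
  static bool isAllOnesSplat(SDValue Op) {
    APInt SplatVal;
    return X86::isConstantSplat(Op, SplatVal) && SplatVal.isAllOnes();
  }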
+
+//===--------------------------------------------------------------------===//
+//  X86 Implementation of the TargetLowering interface
+class X86TargetLowering final : public TargetLowering {
+public:
+  explicit X86TargetLowering(const X86TargetMachine &TM,
+                             const X86Subtarget &STI);
+
+  unsigned getJumpTableEncoding() const override;
+  bool useSoftFloat() const override;
+
+  void markLibCallAttributes(MachineFunction *MF, unsigned CC,
+                             ArgListTy &Args) const override;
+
+  MVT getScalarShiftAmountTy(const DataLayout &, EVT VT) const override {
+    return MVT::i8;
   }
 
-  /// Define some predicates that are used for node matching.
-  namespace X86 {
-    /// Returns true if Elt is a constant zero or floating point constant +0.0.
-    bool isZeroNode(SDValue Elt);
-
-    /// Returns true of the given offset can be
-    /// fit into displacement field of the instruction.
-    bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
-                                      bool hasSymbolicDisplacement);
-
-    /// Determines whether the callee is required to pop its
-    /// own arguments. Callee pop is necessary to support tail calls.
-    bool isCalleePop(CallingConv::ID CallingConv,
-                     bool is64Bit, bool IsVarArg, bool GuaranteeTCO);
-
-    /// If Op is a constant whose elements are all the same constant or
-    /// undefined, return true and return the constant value in \p SplatVal.
-    /// If we have undef bits that don't cover an entire element, we treat these
-    /// as zero if AllowPartialUndefs is set, else we fail and return false.
-    bool isConstantSplat(SDValue Op, APInt &SplatVal,
-                         bool AllowPartialUndefs = true);
-
-    /// Check if Op is a load operation that could be folded into some other x86
-    /// instruction as a memory operand. Example: vpaddd (%rdi), %xmm0, %xmm0.
-    bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
-                     bool AssumeSingleUse = false);
-
-    /// Check if Op is a load operation that could be folded into a vector splat
-    /// instruction as a memory operand. Example: vbroadcastss 16(%rdi), %xmm2.
-    bool mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
-                                         const X86Subtarget &Subtarget,
-                                         bool AssumeSingleUse = false);
-
-    /// Check if Op is a value that could be used to fold a store into some
-    /// other x86 instruction as a memory operand. Ex: pextrb $0, %xmm0, (%rdi).
-    bool mayFoldIntoStore(SDValue Op);
-
-    /// Check if Op is an operation that could be folded into a zero extend x86
-    /// instruction.
-    bool mayFoldIntoZeroExtend(SDValue Op);
-
-    /// True if the target supports the extended frame for async Swift
-    /// functions.
-    bool isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget,
-                                            const MachineFunction &MF);
-
-    /// Convert LLVM rounding mode to X86 rounding mode.
-    int getRoundingModeX86(unsigned RM);
-
-  } // end namespace X86
-
-  //===--------------------------------------------------------------------===//
-  //  X86 Implementation of the TargetLowering interface
-  class X86TargetLowering final : public TargetLowering {
-  public:
-    explicit X86TargetLowering(const X86TargetMachine &TM,
-                               const X86Subtarget &STI);
-
-    unsigned getJumpTableEncoding() const override;
-    bool useSoftFloat() const override;
-
-    void markLibCallAttributes(MachineFunction *MF, unsigned CC,
-                               ArgListTy &Args) const override;
-
-    MVT getScalarShiftAmountTy(const DataLayout &, EVT VT) const override {
-      return MVT::i8;
-    }
-
-    const MCExpr *
-    LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
-                              const MachineBasicBlock *MBB, unsigned uid,
-                              MCContext &Ctx) const override;
-
-    /// Returns relocation base for the given PIC jumptable.
-    SDValue getPICJumpTableRelocBase(SDValue Table,
-                                     SelectionDAG &DAG) const override;
-    const MCExpr *
-    getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
-                                 unsigned JTI, MCContext &Ctx) const override;
-
-    /// Return the desired alignment for ByVal aggregate
-    /// function arguments in the caller parameter area. For X86, aggregates
-    /// that contains are placed at 16-byte boundaries while the rest are at
-    /// 4-byte boundaries.
-    Align getByValTypeAlignment(Type *Ty, const DataLayout &DL) const override;
-
-    EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op,
-                            const AttributeList &FuncAttributes) const override;
-
-    /// Returns true if it's safe to use load / store of the
-    /// specified type to expand memcpy / memset inline. This is mostly true
-    /// for all types except for some special cases. For example, on X86
-    /// targets without SSE2 f64 load / store are done with fldl / fstpl which
-    /// also does type conversion. Note the specified type doesn't have to be
-    /// legal as the hook is used before type legalization.
-    bool isSafeMemOpType(MVT VT) const override;
-
-    bool isMemoryAccessFast(EVT VT, Align Alignment) const;
-
-    /// Returns true if the target allows unaligned memory accesses of the
-    /// specified type. Returns whether it is "fast" in the last argument.
-    bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, Align Alignment,
-                                        MachineMemOperand::Flags Flags,
-                                        unsigned *Fast) const override;
-
-    /// This function returns true if the memory access is aligned or if the
-    /// target allows this specific unaligned memory access. If the access is
-    /// allowed, the optional final parameter returns a relative speed of the
-    /// access (as defined by the target).
-    bool allowsMemoryAccess(
-        LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace,
-        Align Alignment,
-        MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
-        unsigned *Fast = nullptr) const override;
-
-    bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT,
-                            const MachineMemOperand &MMO,
-                            unsigned *Fast) const {
-      return allowsMemoryAccess(Context, DL, VT, MMO.getAddrSpace(),
-                                MMO.getAlign(), MMO.getFlags(), Fast);
-    }
-
-    /// Provide custom lowering hooks for some operations.
-    ///
-    SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
-
-    bool isSelectSupported(SelectSupportKind Kind) const override;
-
-    /// Replace the results of node with an illegal result
-    /// type with new values built out of custom code.
-    ///
-    void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
-                            SelectionDAG &DAG) const override;
-
-    SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
-
-    bool preferABDSToABSWithNSW(EVT VT) const override;
-
-    bool preferSextInRegOfTruncate(EVT TruncVT, EVT VT,
-                                   EVT ExtVT) const override;
-
-    bool isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond,
-                                           EVT VT) const override;
-
-    /// Return true if the target has native support for
-    /// the specified value type and it is 'desirable' to use the type for the
-    /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
-    /// instruction encodings are longer and some i16 instructions are slow.
-    bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override;
-
-    /// Return true if the target has native support for the
-    /// specified value type and it is 'desirable' to use the type. e.g. On x86
-    /// i16 is legal, but undesirable since i16 instruction encodings are longer
-    /// and some i16 instructions are slow.
-    bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override;
-
-    /// Return prefered fold type, Abs if this is a vector, AddAnd if its an
-    /// integer, None otherwise.
-    TargetLowering::AndOrSETCCFoldKind
-    isDesirableToCombineLogicOpOfSETCC(const SDNode *LogicOp,
-                                       const SDNode *SETCC0,
-                                       const SDNode *SETCC1) const override;
-
-    /// Return the newly negated expression if the cost is not expensive and
-    /// set the cost in \p Cost to indicate that if it is cheaper or neutral to
-    /// do the negation.
-    SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG,
-                                 bool LegalOperations, bool ForCodeSize,
-                                 NegatibleCost &Cost,
-                                 unsigned Depth) const override;
+  const MCExpr *LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
+                                          const MachineBasicBlock *MBB,
+                                          unsigned uid,
+                                          MCContext &Ctx) const override;
+
+  /// Returns relocation base for the given PIC jumptable.
+  SDValue getPICJumpTableRelocBase(SDValue Table,
+                                   SelectionDAG &DAG) const override;
+  const MCExpr *getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
+                                             unsigned JTI,
+                                             MCContext &Ctx) const override;
+
+  /// Return the desired alignment for ByVal aggregate
+  /// function arguments in the caller parameter area. For X86, aggregates
+  /// that contain SSE vectors are placed at 16-byte boundaries while the
+  /// rest are at 4-byte boundaries.
+  Align getByValTypeAlignment(Type *Ty, const DataLayout &DL) const override;
+
+  EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op,
+                          const AttributeList &FuncAttributes) const override;
+
+  /// Returns true if it's safe to use load / store of the
+  /// specified type to expand memcpy / memset inline. This is mostly true
+  /// for all types except for some special cases. For example, on X86
+  /// targets without SSE2 f64 load / store are done with fldl / fstpl which
+  /// also does type conversion. Note the specified type doesn't have to be
+  /// legal as the hook is used before type legalization.
+  bool isSafeMemOpType(MVT VT) const override;
+
+  bool isMemoryAccessFast(EVT VT, Align Alignment) const;
+
+  /// Returns true if the target allows unaligned memory accesses of the
+  /// specified type. Returns whether it is "fast" in the last argument.
+  bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, Align Alignment,
+                                      MachineMemOperand::Flags Flags,
+                                      unsigned *Fast) const override;
+
+  /// This function returns true if the memory access is aligned or if the
+  /// target allows this specific unaligned memory access. If the access is
+  /// allowed, the optional final parameter returns a relative speed of the
+  /// access (as defined by the target).
+  bool
+  allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT,
+                     unsigned AddrSpace, Align Alignment,
+                     MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
+                     unsigned *Fast = nullptr) const override;
+
+  bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT,
+                          const MachineMemOperand &MMO, unsigned *Fast) const {
+    return allowsMemoryAccess(Context, DL, VT, MMO.getAddrSpace(),
+                              MMO.getAlign(), MMO.getFlags(), Fast);
+  }
+
+  /// Provide custom lowering hooks for some operations.
+  ///
+  SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+  bool isSelectSupported(SelectSupportKind Kind) const override;
+
+  /// Replace the results of node with an illegal result
+  /// type with new values built out of custom code.
+  ///
+  void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+                          SelectionDAG &DAG) const override;
+
+  SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+
+  bool preferABDSToABSWithNSW(EVT VT) const override;
+
+  bool preferSextInRegOfTruncate(EVT TruncVT, EVT VT, EVT ExtVT) const override;
+
+  bool isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond,
+                                         EVT VT) const override;
+
+  /// Return true if the target has native support for
+  /// the specified value type and it is 'desirable' to use the type for the
+  /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
+  /// instruction encodings are longer and some i16 instructions are slow.
+  bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override;
+
+  /// Return true if the target has native support for the
+  /// specified value type and it is 'desirable' to use the type. e.g. On x86
+  /// i16 is legal, but undesirable since i16 instruction encodings are longer
+  /// and some i16 instructions are slow.
+  bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override;
+
+  /// Return the preferred fold type: Abs if this is a vector, AddAnd if it
+  /// is an integer, None otherwise.
+  TargetLowering::AndOrSETCCFoldKind
+  isDesirableToCombineLogicOpOfSETCC(const SDNode *LogicOp,
+                                     const SDNode *SETCC0,
+                                     const SDNode *SETCC1) const override;
+
+  /// Return the newly negated expression if the cost is not expensive and
+  /// set the cost in \p Cost to indicate whether it is cheaper or neutral to
+  /// do the negation.
+  SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG,
+                               bool LegalOperations, bool ForCodeSize,
+                               NegatibleCost &Cost,
+                               unsigned Depth) const override;
+
+  MachineBasicBlock *
+  EmitInstrWithCustomInserter(MachineInstr &MI,
+                              MachineBasicBlock *MBB) const override;
+
+  /// This method returns the name of a target specific DAG node.
+  const char *getTargetNodeName(unsigned Opcode) const override;
+
+  /// Do not merge vector stores after legalization because that may conflict
+  /// with x86-specific store splitting optimizations.
+  bool mergeStoresAfterLegalization(EVT MemVT) const override {
+    return !MemVT.isVector();
+  }
+
+  bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
+                        const MachineFunction &MF) const override;
+
+  bool isCheapToSpeculateCttz(Type *Ty) const override;
+
+  bool isCheapToSpeculateCtlz(Type *Ty) const override;
+
+  bool isCtlzFast() const override;
+
+  bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const override {
+    // If the pair to store is a mixture of float and int values, we will
+    // save two bitwise instructions and one float-to-int instruction and
+    // increase one store instruction. There is potentially a more
+    // significant benefit because it avoids the float->int domain switch
+    // for the input value. So it is more likely a win.
+    if ((LTy.isFloatingPoint() && HTy.isInteger()) ||
+        (LTy.isInteger() && HTy.isFloatingPoint()))
+      return true;
+    // If the pair only contains int values, we will save two bitwise
+    // instructions and increase one store instruction (costing one more
+    // store buffer). Since the benefit is less clear, we leave such pairs
+    // out until we get a testcase proving it is a win.
+    return false;
+  }
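
For context, a hedged source-level illustration of the mixed float/int case
the comment above describes (user code, not part of the patch): keeping the
two adjacent stores separate avoids pushing F through an integer register
when the DAG would otherwise merge them into one wide store.

  // Illustrative user code only.
  struct FloatIntPair { float F; int I; };
  void storePair(FloatIntPair *Dst, float F, int I) {
    Dst->F = F; // f32 store
    Dst->I = I; // i32 store
  }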
+
+  bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
+
+  bool hasAndNotCompare(SDValue Y) const override;
+
+  bool hasAndNot(SDValue Y) const override;
+
+  bool hasBitTest(SDValue X, SDValue Y) const override;
 
-    MachineBasicBlock *
-    EmitInstrWithCustomInserter(MachineInstr &MI,
-                                MachineBasicBlock *MBB) const override;
-
-    /// This method returns the name of a target specific DAG node.
-    const char *getTargetNodeName(unsigned Opcode) const override;
-
-    /// Do not merge vector stores after legalization because that may conflict
-    /// with x86-specific store splitting optimizations.
-    bool mergeStoresAfterLegalization(EVT MemVT) const override {
-      return !MemVT.isVector();
-    }
-
-    bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
-                          const MachineFunction &MF) const override;
-
-    bool isCheapToSpeculateCttz(Type *Ty) const override;
-
-    bool isCheapToSpeculateCtlz(Type *Ty) const override;
-
-    bool isCtlzFast() const override;
-
-    bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const override {
-      // If the pair to store is a mixture of float and int values, we will
-      // save two bitwise instructions and one float-to-int instruction and
-      // increase one store instruction. There is potentially a more
-      // significant benefit because it avoids the float->int domain switch
-      // for input value. So It is more likely a win.
-      if ((LTy.isFloatingPoint() && HTy.isInteger()) ||
-          (LTy.isInteger() && HTy.isFloatingPoint()))
-        return true;
-      // If the pair only contains int values, we will save two bitwise
-      // instructions and increase one store instruction (costing one more
-      // store buffer). Since the benefit is more blurred so we leave
-      // such pair out until we get testcase to prove it is a win.
+  bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
+      SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
+      unsigned OldShiftOpcode, unsigned NewShiftOpcode,
+      SelectionDAG &DAG) const override;
+
+  unsigned preferedOpcodeForCmpEqPiecesOfOperand(
+      EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
+      const APInt &ShiftOrRotateAmt,
+      const std::optional<APInt> &AndMask) const override;
+
+  bool preferScalarizeSplat(SDNode *N) const override;
+
+  CondMergingParams
+  getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs,
+                                const Value *Rhs) const override;
+
+  bool shouldFoldConstantShiftPairToMask(const SDNode *N) const override;
+
+  bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override;
+
+  bool shouldTransformSignedTruncationCheck(EVT XVT,
+                                            unsigned KeptBits) const override {
+    // For vectors, we don't have a preference.
+    if (XVT.isVector())
       return false;
-    }
 
-    bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
-
-    bool hasAndNotCompare(SDValue Y) const override;
-
-    bool hasAndNot(SDValue Y) const override;
-
-    bool hasBitTest(SDValue X, SDValue Y) const override;
-
-    bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
-        SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
-        unsigned OldShiftOpcode, unsigned NewShiftOpcode,
-        SelectionDAG &DAG) const override;
-
-    unsigned preferedOpcodeForCmpEqPiecesOfOperand(
-        EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
-        const APInt &ShiftOrRotateAmt,
-        const std::optional<APInt> &AndMask) const override;
-
-    bool preferScalarizeSplat(SDNode *N) const override;
-
-    CondMergingParams
-    getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs,
-                                  const Value *Rhs) const override;
-
-    bool shouldFoldConstantShiftPairToMask(const SDNode *N) const override;
-
-    bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override;
-
-    bool
-    shouldTransformSignedTruncationCheck(EVT XVT,
-                                         unsigned KeptBits) const override {
-      // For vectors, we don't have a preference..
-      if (XVT.isVector())
-        return false;
-
-      auto VTIsOk = [](EVT VT) -> bool {
-        return VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 ||
-               VT == MVT::i64;
-      };
-
-      // We are ok with KeptBitsVT being byte/word/dword, what MOVS supports.
-      // XVT will be larger than KeptBitsVT.
-      MVT KeptBitsVT = MVT::getIntegerVT(KeptBits);
-      return VTIsOk(XVT) && VTIsOk(KeptBitsVT);
-    }
-
-    ShiftLegalizationStrategy
-    preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N,
-                                       unsigned ExpansionFactor) const override;
-
-    bool shouldSplatInsEltVarIndex(EVT VT) const override;
-
-    bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override {
-      // Converting to sat variants holds little benefit on X86 as we will just
-      // need to saturate the value back using fp arithmatic.
-      return Op != ISD::FP_TO_UINT_SAT && isOperationLegalOrCustom(Op, VT);
-    }
-
-    bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
-      return VT.isScalarInteger();
-    }
-
-    /// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
-    MVT hasFastEqualityCompare(unsigned NumBits) const override;
-
-    /// Return the value type to use for ISD::SETCC.
-    EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
-                           EVT VT) const override;
-
-    bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits,
-                                      const APInt &DemandedElts,
-                                      TargetLoweringOpt &TLO) const override;
-
-    /// Determine which of the bits specified in Mask are known to be either
-    /// zero or one and return them in the KnownZero/KnownOne bitsets.
-    void computeKnownBitsForTargetNode(const SDValue Op,
-                                       KnownBits &Known,
-                                       const APInt &DemandedElts,
-                                       const SelectionDAG &DAG,
-                                       unsigned Depth = 0) const override;
-
-    /// Determine the number of bits in the operation that are sign bits.
-    unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
-                                             const APInt &DemandedElts,
-                                             const SelectionDAG &DAG,
-                                             unsigned Depth) const override;
-
-    bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op,
-                                                 const APInt &DemandedElts,
-                                                 APInt &KnownUndef,
-                                                 APInt &KnownZero,
-                                                 TargetLoweringOpt &TLO,
-                                                 unsigned Depth) const override;
-
-    bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op,
-                                                    const APInt &DemandedElts,
-                                                    unsigned MaskIndex,
-                                                    TargetLoweringOpt &TLO,
-                                                    unsigned Depth) const;
+    auto VTIsOk = [](EVT VT) -> bool {
+      return VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 ||
+             VT == MVT::i64;
+    };
+
+    // We are ok with KeptBitsVT being byte/word/dword, which is what MOVS
+    // supports. XVT will be larger than KeptBitsVT.
+    MVT KeptBitsVT = MVT::getIntegerVT(KeptBits);
+    return VTIsOk(XVT) && VTIsOk(KeptBitsVT);
+  }
+
+  ShiftLegalizationStrategy
+  preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N,
+                                     unsigned ExpansionFactor) const override;
 
-    bool SimplifyDemandedBitsForTargetNode(SDValue Op,
-                                           const APInt &DemandedBits,
+  bool shouldSplatInsEltVarIndex(EVT VT) const override;
+
+  bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override {
+    // Converting to sat variants holds little benefit on X86 as we will just
+    // need to saturate the value back using fp arithmetic.
+    return Op != ISD::FP_TO_UINT_SAT && isOperationLegalOrCustom(Op, VT);
+  }
+
+  bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
+    return VT.isScalarInteger();
+  }
+
+  /// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
+  MVT hasFastEqualityCompare(unsigned NumBits) const override;
+
+  /// Return the value type to use for ISD::SETCC.
+  EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
+                         EVT VT) const override;
+
+  bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits,
+                                    const APInt &DemandedElts,
+                                    TargetLoweringOpt &TLO) const override;
+
+  /// Determine which of the bits specified in Mask are known to be either
+  /// zero or one and return them in the KnownZero/KnownOne bitsets.
+  void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known,
+                                     const APInt &DemandedElts,
+                                     const SelectionDAG &DAG,
+                                     unsigned Depth = 0) const override;
+
+  /// Determine the number of bits in the operation that are sign bits.
+  unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
                                            const APInt &DemandedElts,
-                                           KnownBits &Known,
-                                           TargetLoweringOpt &TLO,
+                                           const SelectionDAG &DAG,
                                            unsigned Depth) const override;
 
-    SDValue SimplifyMultipleUseDemandedBitsForTargetNode(
-        SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
-        SelectionDAG &DAG, unsigned Depth) const override;
+  bool SimplifyDemandedVectorEltsForTargetNode(
+      SDValue Op, const APInt &DemandedElts, APInt &KnownUndef,
+      APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth) const override;
 
-    bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(
-        SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
-        bool PoisonOnly, unsigned Depth) const override;
+  bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op,
+                                                  const APInt &DemandedElts,
+                                                  unsigned MaskIndex,
+                                                  TargetLoweringOpt &TLO,
+                                                  unsigned Depth) const;
 
-    bool canCreateUndefOrPoisonForTargetNode(
-        SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
-        bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override;
+  bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits,
+                                         const APInt &DemandedElts,
+                                         KnownBits &Known,
+                                         TargetLoweringOpt &TLO,
+                                         unsigned Depth) const override;
 
-    bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts,
-                                   APInt &UndefElts, const SelectionDAG &DAG,
-                                   unsigned Depth) const override;
+  SDValue SimplifyMultipleUseDemandedBitsForTargetNode(
+      SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
+      SelectionDAG &DAG, unsigned Depth) const override;
 
-    bool isTargetCanonicalConstantNode(SDValue Op) const override {
-      // Peek through bitcasts/extracts/inserts to see if we have a vector
-      // load/broadcast from memory.
-      while (Op.getOpcode() == ISD::BITCAST ||
-             Op.getOpcode() == ISD::EXTRACT_SUBVECTOR ||
-             (Op.getOpcode() == ISD::INSERT_SUBVECTOR &&
-              Op.getOperand(0).isUndef()))
-        Op = Op.getOperand(Op.getOpcode() == ISD::INSERT_SUBVECTOR ? 1 : 0);
+  bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(
+      SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
+      bool PoisonOnly, unsigned Depth) const override;
 
-      return Op.getOpcode() == X86ISD::VBROADCAST_LOAD ||
-             Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
-             (Op.getOpcode() == ISD::LOAD &&
-              getTargetConstantFromLoad(cast<LoadSDNode>(Op))) ||
-             TargetLowering::isTargetCanonicalConstantNode(Op);
-    }
+  bool canCreateUndefOrPoisonForTargetNode(SDValue Op,
+                                           const APInt &DemandedElts,
+                                           const SelectionDAG &DAG,
+                                           bool PoisonOnly, bool ConsiderFlags,
+                                           unsigned Depth) const override;
+
+  bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts,
+                                 APInt &UndefElts, const SelectionDAG &DAG,
+                                 unsigned Depth) const override;
+
+  bool isTargetCanonicalConstantNode(SDValue Op) const override {
+    // Peek through bitcasts/extracts/inserts to see if we have a vector
+    // load/broadcast from memory.
+    while (
+        Op.getOpcode() == ISD::BITCAST ||
+        Op.getOpcode() == ISD::EXTRACT_SUBVECTOR ||
+        (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op.getOperand(0).isUndef()))
+      Op = Op.getOperand(Op.getOpcode() == ISD::INSERT_SUBVECTOR ? 1 : 0);
+
+    return Op.getOpcode() == X86ISD::VBROADCAST_LOAD ||
+           Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
+           (Op.getOpcode() == ISD::LOAD &&
+            getTargetConstantFromLoad(cast<LoadSDNode>(Op))) ||
+           TargetLowering::isTargetCanonicalConstantNode(Op);
+  }
 
-    bool isTargetCanonicalSelect(SDNode *N) const override;
+  bool isTargetCanonicalSelect(SDNode *N) const override;
 
-    const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const override;
+  const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const override;
 
-    SDValue unwrapAddress(SDValue N) const override;
+  SDValue unwrapAddress(SDValue N) const override;
 
-    SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
+  SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
 
-    ConstraintType getConstraintType(StringRef Constraint) const override;
+  ConstraintType getConstraintType(StringRef Constraint) const override;
 
-    /// Examine constraint string and operand type and determine a weight value.
-    /// The operand object must already have been set up with the operand type.
-    ConstraintWeight
-      getSingleConstraintMatchWeight(AsmOperandInfo &Info,
-                                     const char *Constraint) const override;
+  /// Examine constraint string and operand type and determine a weight value.
+  /// The operand object must already have been set up with the operand type.
+  ConstraintWeight
+  getSingleConstraintMatchWeight(AsmOperandInfo &Info,
+                                 const char *Constraint) const override;
 
-    const char *LowerXConstraint(EVT ConstraintVT) const override;
+  const char *LowerXConstraint(EVT ConstraintVT) const override;
 
-    /// Lower the specified operand into the Ops vector. If it is invalid, don't
-    /// add anything to Ops. If hasMemory is true it means one of the asm
-    /// constraint of the inline asm instruction being processed is 'm'.
-    void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint,
-                                      std::vector<SDValue> &Ops,
+  /// Lower the specified operand into the Ops vector. If it is invalid, don't
+  /// add anything to Ops. If hasMemory is true it means one of the asm
+  /// constraints of the inline asm instruction being processed is 'm'.
+  void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint,
+                                    std::vector<SDValue> &Ops,
+                                    SelectionDAG &DAG) const override;
+
+  InlineAsm::ConstraintCode
+  getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
+    if (ConstraintCode == "v")
+      return InlineAsm::ConstraintCode::v;
+    return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
+  }
+
+  /// Handle Lowering flag assembly outputs.
+  SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag,
+                                      const SDLoc &DL,
+                                      const AsmOperandInfo &Constraint,
                                       SelectionDAG &DAG) const override;
 
-    InlineAsm::ConstraintCode
-    getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
-      if (ConstraintCode == "v")
-        return InlineAsm::ConstraintCode::v;
-      return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
-    }
-
-    /// Handle Lowering flag assembly outputs.
-    SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag,
-                                        const SDLoc &DL,
-                                        const AsmOperandInfo &Constraint,
-                                        SelectionDAG &DAG) const override;
-
-    /// Given a physical register constraint
-    /// (e.g. {edx}), return the register number and the register class for the
-    /// register.  This should only be used for C_Register constraints.  On
-    /// error, this returns a register number of 0.
-    std::pair<unsigned, const TargetRegisterClass *>
-    getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
-                                 StringRef Constraint, MVT VT) const override;
-
-    /// Return true if the addressing mode represented
-    /// by AM is legal for this target, for a load/store of the specified type.
-    bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
-                               Type *Ty, unsigned AS,
-                               Instruction *I = nullptr) const override;
-
-    bool addressingModeSupportsTLS(const GlobalValue &GV) const override;
-
-    /// Return true if the specified immediate is legal
-    /// icmp immediate, that is the target has icmp instructions which can
-    /// compare a register against the immediate without having to materialize
-    /// the immediate into a register.
-    bool isLegalICmpImmediate(int64_t Imm) const override;
-
-    /// Return true if the specified immediate is legal
-    /// add immediate, that is the target has add instructions which can
-    /// add a register and the immediate without having to materialize
-    /// the immediate into a register.
-    bool isLegalAddImmediate(int64_t Imm) const override;
-
-    bool isLegalStoreImmediate(int64_t Imm) const override;
-
-    /// Add x86-specific opcodes to the default list.
-    bool isBinOp(unsigned Opcode) const override;
-
-    /// Returns true if the opcode is a commutative binary operation.
-    bool isCommutativeBinOp(unsigned Opcode) const override;
-
-    /// Return true if it's free to truncate a value of
-    /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in
-    /// register EAX to i16 by referencing its sub-register AX.
-    bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
-    bool isTruncateFree(EVT VT1, EVT VT2) const override;
-
-    bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;
-
-    /// Return true if any actual instruction that defines a
-    /// value of type Ty1 implicit zero-extends the value to Ty2 in the result
-    /// register. This does not necessarily include registers defined in
-    /// unknown ways, such as incoming arguments, or copies from unknown
-    /// virtual registers. Also, if isTruncateFree(Ty2, Ty1) is true, this
-    /// does not necessarily apply to truncate instructions. e.g. on x86-64,
-    /// all instructions that define 32-bit values implicit zero-extend the
-    /// result out to 64 bits.
-    bool isZExtFree(Type *Ty1, Type *Ty2) const override;
-    bool isZExtFree(EVT VT1, EVT VT2) const override;
-    bool isZExtFree(SDValue Val, EVT VT2) const override;
-
-    bool shouldConvertPhiType(Type *From, Type *To) const override;
-
-    /// Return true if folding a vector load into ExtVal (a sign, zero, or any
-    /// extend node) is profitable.
-    bool isVectorLoadExtDesirable(SDValue) const override;
-
-    /// Return true if an FMA operation is faster than a pair of fmul and fadd
-    /// instructions. fmuladd intrinsics will be expanded to FMAs when this
-    /// method returns true, otherwise fmuladd is expanded to fmul + fadd.
-    bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
+  /// Given a physical register constraint
+  /// (e.g. {edx}), return the register number and the register class for the
+  /// register.  This should only be used for C_Register constraints.  On
+  /// error, this returns a register number of 0.
+  std::pair<unsigned, const TargetRegisterClass *>
+  getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+                               StringRef Constraint, MVT VT) const override;
+
+  /// Return true if the addressing mode represented
+  /// by AM is legal for this target, for a load/store of the specified type.
+  bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
+                             unsigned AS,
+                             Instruction *I = nullptr) const override;
+
+  bool addressingModeSupportsTLS(const GlobalValue &GV) const override;
+
+  /// Return true if the specified immediate is legal
+  /// icmp immediate, that is the target has icmp instructions which can
+  /// compare a register against the immediate without having to materialize
+  /// the immediate into a register.
+  bool isLegalICmpImmediate(int64_t Imm) const override;
+
+  /// Return true if the specified immediate is legal
+  /// add immediate, that is the target has add instructions which can
+  /// add a register and the immediate without having to materialize
+  /// the immediate into a register.
+  bool isLegalAddImmediate(int64_t Imm) const override;
+
+  bool isLegalStoreImmediate(int64_t Imm) const override;
+
+  /// Add x86-specific opcodes to the default list.
+  bool isBinOp(unsigned Opcode) const override;
+
+  /// Returns true if the opcode is a commutative binary operation.
+  bool isCommutativeBinOp(unsigned Opcode) const override;
+
+  /// Return true if it's free to truncate a value of
+  /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate an i32 value in
+  /// register EAX to i16 by referencing its sub-register AX.
+  bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
+  bool isTruncateFree(EVT VT1, EVT VT2) const override;
+
+  bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;
+
+  /// Return true if any actual instruction that defines a value of type Ty1
+  /// implicitly zero-extends the value to Ty2 in the result
+  /// register. This does not necessarily include registers defined in
+  /// unknown ways, such as incoming arguments, or copies from unknown
+  /// virtual registers. Also, if isTruncateFree(Ty2, Ty1) is true, this
+  /// does not necessarily apply to truncate instructions. e.g. on x86-64,
+  /// all instructions that define 32-bit values implicitly zero-extend the
+  /// result out to 64 bits.
+  bool isZExtFree(Type *Ty1, Type *Ty2) const override;
+  bool isZExtFree(EVT VT1, EVT VT2) const override;
+  bool isZExtFree(SDValue Val, EVT VT2) const override;
+
+  bool shouldConvertPhiType(Type *From, Type *To) const override;
+
+  /// Return true if folding a vector load into ExtVal (a sign, zero, or any
+  /// extend node) is profitable.
+  bool isVectorLoadExtDesirable(SDValue) const override;
+
+  /// Return true if an FMA operation is faster than a pair of fmul and fadd
+  /// instructions. fmuladd intrinsics will be expanded to FMAs when this
+  /// method returns true, otherwise fmuladd is expanded to fmul + fadd.
+  bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
+                                  EVT VT) const override;
+
+  /// Return true if it's profitable to narrow operations of type SrcVT to
+  /// DestVT. e.g. on x86, it's profitable to narrow from i32 to i8 but not
+  /// from i32 to i16.
+  bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override;
+
+  bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT,
+                                            unsigned SelectOpcode, SDValue X,
+                                            SDValue Y) const override;
+
+  /// Given an intrinsic, checks if on the target the intrinsic will need to map
+  /// to a MemIntrinsicNode (touches memory). If this is the case, it returns
+  /// true and stores the intrinsic information into the IntrinsicInfo that was
+  /// passed to the function.
+  bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
+                          MachineFunction &MF,
+                          unsigned Intrinsic) const override;
+
+  /// Returns true if the target can instruction select the
+  /// specified FP immediate natively. If false, the legalizer will
+  /// materialize the FP immediate as a load from a constant pool.
+  bool isFPImmLegal(const APFloat &Imm, EVT VT,
+                    bool ForCodeSize) const override;
+
+  /// Targets can use this to indicate that they only support *some*
+  /// VECTOR_SHUFFLE operations, those with specific masks. By default, if a
+  /// target supports the VECTOR_SHUFFLE node, all mask values are assumed to
+  /// be legal.
+  bool isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const override;
+
+  /// Similar to isShuffleMaskLegal. Targets can use this to indicate if there
+  /// is a suitable VECTOR_SHUFFLE that can be used to replace a VAND with a
+  /// constant pool entry.
+  bool isVectorClearMaskLegal(ArrayRef<int> Mask, EVT VT) const override;
+
+  /// Returns true if lowering to a jump table is allowed.
+  bool areJTsAllowed(const Function *Fn) const override;
+
+  MVT getPreferredSwitchConditionType(LLVMContext &Context,
+                                      EVT ConditionVT) const override;
+
+  /// If true, then instruction selection should
+  /// seek to shrink the FP constant of the specified type to a smaller type
+  /// in order to save space and / or reduce runtime.
+  bool ShouldShrinkFPConstant(EVT VT) const override;
+
+  /// Return true if we believe it is correct and profitable to reduce the
+  /// load node to a smaller type.
+  bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
+                             std::optional<unsigned> ByteOffset) const override;
+
+  /// Return true if the specified scalar FP type is computed in an SSE
+  /// register, not on the X87 floating point stack.
+  bool isScalarFPTypeInSSEReg(EVT VT) const;
+
+  /// Returns true if it is beneficial to convert a load of a constant
+  /// to just the constant itself.
+  bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
+                                         Type *Ty) const override;
+
+  bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override;
+
+  bool convertSelectOfConstantsToMath(EVT VT) const override;
+
+  bool decomposeMulByConstant(LLVMContext &Context, EVT VT,
+                              SDValue C) const override;
+
+  /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
+  /// with this index.
+  bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
+                               unsigned Index) const override;
+
+  /// Scalar ops always have equal or better analysis/performance/power than
+  /// the vector equivalent, so this always makes sense if the scalar op is
+  /// supported.
+  bool shouldScalarizeBinop(SDValue) const override;
+
+  /// Extract of a scalar FP value from index 0 of a vector is free.
+  bool isExtractVecEltCheap(EVT VT, unsigned Index) const override {
+    EVT EltVT = VT.getScalarType();
+    return (EltVT == MVT::f32 || EltVT == MVT::f64) && Index == 0;
+  }
+
+  /// Overflow nodes should get combined/lowered to optimal instructions
+  /// (they should allow eliminating explicit compares by getting flags from
+  /// math ops).
+  bool shouldFormOverflowOp(unsigned Opcode, EVT VT,
+                            bool MathUsed) const override;
+
+  bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem,
+                                    unsigned AddrSpace) const override {
+    // If we can replace more than 2 scalar stores, there will be a reduction
+    // in instructions even after we add a vector constant load.
+    return IsZero || NumElem > 2;
+  }
+
+  bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
+                               const SelectionDAG &DAG,
+                               const MachineMemOperand &MMO) const override;
+
+  Register getRegisterByName(const char *RegName, LLT VT,
+                             const MachineFunction &MF) const override;
+
+  /// If a physical register, this returns the register that receives the
+  /// exception address on entry to an EH pad.
+  Register
+  getExceptionPointerRegister(const Constant *PersonalityFn) const override;
+
+  /// If a physical register, this returns the register that receives the
+  /// exception typeid on entry to a landing pad.
+  Register
+  getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
+
+  bool needsFixedCatchObjects() const override;
+
+  /// This method returns a target specific FastISel object,
+  /// or null if the target does not support "fast" ISel.
+  FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+                           const TargetLibraryInfo *libInfo) const override;
+
+  /// If the target has a standard location for the stack protector cookie,
+  /// returns the address of that location. Otherwise, returns nullptr.
+  Value *getIRStackGuard(IRBuilderBase &IRB) const override;
+
+  bool useLoadStackGuardNode(const Module &M) const override;
+  bool useStackGuardXorFP() const override;
+  void insertSSPDeclarations(Module &M) const override;
+  SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
+                              const SDLoc &DL) const override;
+
+  /// Return true if the target stores SafeStack pointer at a fixed offset in
+  /// some non-standard address space, and populates the address space and
+  /// offset as appropriate.
+  Value *getSafeStackPointerLocation(IRBuilderBase &IRB) const override;
+
+  std::pair<SDValue, SDValue> BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL,
+                                        SDValue Chain, SDValue Pointer,
+                                        MachinePointerInfo PtrInfo,
+                                        Align Alignment,
+                                        SelectionDAG &DAG) const;
+
+  /// Customize the preferred legalization strategy for certain types.
+  LegalizeTypeAction getPreferredVectorAction(MVT VT) const override;
+
+  bool softPromoteHalfType() const override { return true; }
+
+  MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
                                     EVT VT) const override;
 
-    /// Return true if it's profitable to narrow operations of type SrcVT to
-    /// DestVT. e.g. on x86, it's profitable to narrow from i32 to i8 but not
-    /// from i32 to i16.
-    bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override;
-
-    bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT,
-                                              unsigned SelectOpcode, SDValue X,
-                                              SDValue Y) const override;
-
-    /// Given an intrinsic, checks if on the target the intrinsic will need to map
-    /// to a MemIntrinsicNode (touches memory). If this is the case, it returns
-    /// true and stores the intrinsic information into the IntrinsicInfo that was
-    /// passed to the function.
-    bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
-                            MachineFunction &MF,
-                            unsigned Intrinsic) const override;
-
-    /// Returns true if the target can instruction select the
-    /// specified FP immediate natively. If false, the legalizer will
-    /// materialize the FP immediate as a load from a constant pool.
-    bool isFPImmLegal(const APFloat &Imm, EVT VT,
-                      bool ForCodeSize) const override;
-
-    /// Targets can use this to indicate that they only support *some*
-    /// VECTOR_SHUFFLE operations, those with specific masks. By default, if a
-    /// target supports the VECTOR_SHUFFLE node, all mask values are assumed to
-    /// be legal.
-    bool isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const override;
-
-    /// Similar to isShuffleMaskLegal. Targets can use this to indicate if there
-    /// is a suitable VECTOR_SHUFFLE that can be used to replace a VAND with a
-    /// constant pool entry.
-    bool isVectorClearMaskLegal(ArrayRef<int> Mask, EVT VT) const override;
-
-    /// Returns true if lowering to a jump table is allowed.
-    bool areJTsAllowed(const Function *Fn) const override;
-
-    MVT getPreferredSwitchConditionType(LLVMContext &Context,
-                                        EVT ConditionVT) const override;
-
-    /// If true, then instruction selection should
-    /// seek to shrink the FP constant of the specified type to a smaller type
-    /// in order to save space and / or reduce runtime.
-    bool ShouldShrinkFPConstant(EVT VT) const override;
-
-    /// Return true if we believe it is correct and profitable to reduce the
-    /// load node to a smaller type.
-    bool
-    shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
-                          std::optional<unsigned> ByteOffset) const override;
-
-    /// Return true if the specified scalar FP type is computed in an SSE
-    /// register, not on the X87 floating point stack.
-    bool isScalarFPTypeInSSEReg(EVT VT) const;
-
-    /// Returns true if it is beneficial to convert a load of a constant
-    /// to just the constant itself.
-    bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
-                                           Type *Ty) const override;
-
-    bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override;
-
-    bool convertSelectOfConstantsToMath(EVT VT) const override;
-
-    bool decomposeMulByConstant(LLVMContext &Context, EVT VT,
-                                SDValue C) const override;
-
-    /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
-    /// with this index.
-    bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
-                                 unsigned Index) const override;
-
-    /// Scalar ops always have equal or better analysis/performance/power than
-    /// the vector equivalent, so this always makes sense if the scalar op is
-    /// supported.
-    bool shouldScalarizeBinop(SDValue) const override;
-
-    /// Extract of a scalar FP value from index 0 of a vector is free.
-    bool isExtractVecEltCheap(EVT VT, unsigned Index) const override {
-      EVT EltVT = VT.getScalarType();
-      return (EltVT == MVT::f32 || EltVT == MVT::f64) && Index == 0;
-    }
+  unsigned getNumRegistersForCallingConv(LLVMContext &Context,
+                                         CallingConv::ID CC,
+                                         EVT VT) const override;
 
-    /// Overflow nodes should get combined/lowered to optimal instructions
-    /// (they should allow eliminating explicit compares by getting flags from
-    /// math ops).
-    bool shouldFormOverflowOp(unsigned Opcode, EVT VT,
-                              bool MathUsed) const override;
+  unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context,
+                                                CallingConv::ID CC, EVT VT,
+                                                EVT &IntermediateVT,
+                                                unsigned &NumIntermediates,
+                                                MVT &RegisterVT) const override;
 
-    bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem,
-                                      unsigned AddrSpace) const override {
-      // If we can replace more than 2 scalar stores, there will be a reduction
-      // in instructions even after we add a vector constant load.
-      return IsZero || NumElem > 2;
-    }
-
-    bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
-                                 const SelectionDAG &DAG,
-                                 const MachineMemOperand &MMO) const override;
-
-    Register getRegisterByName(const char* RegName, LLT VT,
-                               const MachineFunction &MF) const override;
-
-    /// If a physical register, this returns the register that receives the
-    /// exception address on entry to an EH pad.
-    Register
-    getExceptionPointerRegister(const Constant *PersonalityFn) const override;
-
-    /// If a physical register, this returns the register that receives the
-    /// exception typeid on entry to a landing pad.
-    Register
-    getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
-
-    bool needsFixedCatchObjects() const override;
-
-    /// This method returns a target specific FastISel object,
-    /// or null if the target does not support "fast" ISel.
-    FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
-                             const TargetLibraryInfo *libInfo) const override;
-
-    /// If the target has a standard location for the stack protector cookie,
-    /// returns the address of that location. Otherwise, returns nullptr.
-    Value *getIRStackGuard(IRBuilderBase &IRB) const override;
-
-    bool useLoadStackGuardNode(const Module &M) const override;
-    bool useStackGuardXorFP() const override;
-    void insertSSPDeclarations(Module &M) const override;
-    SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
-                                const SDLoc &DL) const override;
+  bool functionArgumentNeedsConsecutiveRegisters(
+      Type *Ty, CallingConv::ID CallConv, bool isVarArg,
+      const DataLayout &DL) const override;
 
+  bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
 
-    /// Return true if the target stores SafeStack pointer at a fixed offset in
-    /// some non-standard address space, and populates the address space and
-    /// offset as appropriate.
-    Value *getSafeStackPointerLocation(IRBuilderBase &IRB) const override;
+  bool supportSwiftError() const override;
 
-    std::pair<SDValue, SDValue> BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL,
-                                          SDValue Chain, SDValue Pointer,
-                                          MachinePointerInfo PtrInfo,
-                                          Align Alignment,
-                                          SelectionDAG &DAG) const;
-
-    /// Customize the preferred legalization strategy for certain types.
-    LegalizeTypeAction getPreferredVectorAction(MVT VT) const override;
+  bool supportKCFIBundles() const override { return true; }
 
-    bool softPromoteHalfType() const override { return true; }
-
-    MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
-                                      EVT VT) const override;
+  MachineInstr *EmitKCFICheck(MachineBasicBlock &MBB,
+                              MachineBasicBlock::instr_iterator &MBBI,
+                              const TargetInstrInfo *TII) const override;
 
-    unsigned getNumRegistersForCallingConv(LLVMContext &Context,
-                                           CallingConv::ID CC,
-                                           EVT VT) const override;
+  bool hasStackProbeSymbol(const MachineFunction &MF) const override;
+  bool hasInlineStackProbe(const MachineFunction &MF) const override;
+  StringRef getStackProbeSymbolName(const MachineFunction &MF) const override;
 
-    unsigned getVectorTypeBreakdownForCallingConv(
-        LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
-        unsigned &NumIntermediates, MVT &RegisterVT) const override;
+  unsigned getStackProbeSize(const MachineFunction &MF) const;
 
-    bool functionArgumentNeedsConsecutiveRegisters(
-        Type *Ty, CallingConv::ID CallConv, bool isVarArg,
-        const DataLayout &DL) const override;
+  bool hasVectorBlend() const override { return true; }
 
-    bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
-
-    bool supportSwiftError() const override;
-
-    bool supportKCFIBundles() const override { return true; }
-
-    MachineInstr *EmitKCFICheck(MachineBasicBlock &MBB,
-                                MachineBasicBlock::instr_iterator &MBBI,
-                                const TargetInstrInfo *TII) const override;
-
-    bool hasStackProbeSymbol(const MachineFunction &MF) const override;
-    bool hasInlineStackProbe(const MachineFunction &MF) const override;
-    StringRef getStackProbeSymbolName(const MachineFunction &MF) const override;
-
-    unsigned getStackProbeSize(const MachineFunction &MF) const;
-
-    bool hasVectorBlend() const override { return true; }
-
-    unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
+  unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
 
-    bool isInlineAsmTargetBranch(const SmallVectorImpl<StringRef> &AsmStrs,
-                                 unsigned OpNo) const override;
+  bool isInlineAsmTargetBranch(const SmallVectorImpl<StringRef> &AsmStrs,
+                               unsigned OpNo) const override;
 
-    SDValue visitMaskedLoad(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
-                            MachineMemOperand *MMO, SDValue &NewLoad,
-                            SDValue Ptr, SDValue PassThru,
-                            SDValue Mask) const override;
-    SDValue visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
-                             MachineMemOperand *MMO, SDValue Ptr, SDValue Val,
-                             SDValue Mask) const override;
+  SDValue visitMaskedLoad(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
+                          MachineMemOperand *MMO, SDValue &NewLoad, SDValue Ptr,
+                          SDValue PassThru, SDValue Mask) const override;
+  SDValue visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
+                           MachineMemOperand *MMO, SDValue Ptr, SDValue Val,
+                           SDValue Mask) const override;
 
-    /// Lower interleaved load(s) into target specific
-    /// instructions/intrinsics.
-    bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
-                              ArrayRef<ShuffleVectorInst *> Shuffles,
-                              ArrayRef<unsigned> Indices, unsigned Factor,
-                              const APInt &GapMask) const override;
+  /// Lower interleaved load(s) into target specific
+  /// instructions/intrinsics.
+  bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
+                            ArrayRef<ShuffleVectorInst *> Shuffles,
+                            ArrayRef<unsigned> Indices, unsigned Factor,
+                            const APInt &GapMask) const override;
 
-    /// Lower interleaved store(s) into target specific
-    /// instructions/intrinsics.
-    bool lowerInterleavedStore(Instruction *Store, Value *Mask,
-                               ShuffleVectorInst *SVI, unsigned Factor,
-                               const APInt &GapMask) const override;
+  /// Lower interleaved store(s) into target specific
+  /// instructions/intrinsics.
+  bool lowerInterleavedStore(Instruction *Store, Value *Mask,
+                             ShuffleVectorInst *SVI, unsigned Factor,
+                             const APInt &GapMask) const override;
 
-    SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr,
-                                   int JTI, SelectionDAG &DAG) const override;
+  SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr,
+                                 int JTI, SelectionDAG &DAG) const override;
 
-    Align getPrefLoopAlignment(MachineLoop *ML) const override;
+  Align getPrefLoopAlignment(MachineLoop *ML) const override;
 
-    EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const override {
-      if (VT == MVT::f80)
-        return EVT::getIntegerVT(Context, 96);
-      return TargetLoweringBase::getTypeToTransformTo(Context, VT);
-    }
+  EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const override {
+    if (VT == MVT::f80)
+      return EVT::getIntegerVT(Context, 96);
+    return TargetLoweringBase::getTypeToTransformTo(Context, VT);
+  }
 
-  protected:
-    std::pair<const TargetRegisterClass *, uint8_t>
-    findRepresentativeClass(const TargetRegisterInfo *TRI,
-                            MVT VT) const override;
+protected:
+  std::pair<const TargetRegisterClass *, uint8_t>
+  findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const override;
 
-  private:
-    /// Keep a reference to the X86Subtarget around so that we can
-    /// make the right decision when generating code for different targets.
-    const X86Subtarget &Subtarget;
+private:
+  /// Keep a reference to the X86Subtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const X86Subtarget &Subtarget;
 
-    /// A list of legal FP immediates.
-    std::vector<APFloat> LegalFPImmediates;
+  /// A list of legal FP immediates.
+  std::vector<APFloat> LegalFPImmediates;
 
-    /// Indicate that this x86 target can instruction
-    /// select the specified FP immediate natively.
-    void addLegalFPImmediate(const APFloat& Imm) {
-      LegalFPImmediates.push_back(Imm);
-    }
+  /// Indicate that this x86 target can instruction
+  /// select the specified FP immediate natively.
+  void addLegalFPImmediate(const APFloat &Imm) {
+    LegalFPImmediates.push_back(Imm);
+  }
 
-    SDValue LowerCallResult(SDValue Chain, SDValue InGlue,
-                            CallingConv::ID CallConv, bool isVarArg,
-                            const SmallVectorImpl<ISD::InputArg> &Ins,
-                            const SDLoc &dl, SelectionDAG &DAG,
-                            SmallVectorImpl<SDValue> &InVals,
-                            uint32_t *RegMask) const;
-    SDValue LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
-                             const SmallVectorImpl<ISD::InputArg> &ArgInfo,
-                             const SDLoc &dl, SelectionDAG &DAG,
-                             const CCValAssign &VA, MachineFrameInfo &MFI,
-                             unsigned i) const;
-    SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
-                             const SDLoc &dl, SelectionDAG &DAG,
-                             const CCValAssign &VA,
-                             ISD::ArgFlagsTy Flags, bool isByval) const;
-
-    // Call lowering helpers.
-
-    /// Check whether the call is eligible for tail call optimization. Targets
-    /// that want to do tail call optimization should implement this function.
-    bool IsEligibleForTailCallOptimization(
-        TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo,
-        SmallVectorImpl<CCValAssign> &ArgLocs, bool IsCalleePopSRet) const;
-    SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr,
-                                    SDValue Chain, bool IsTailCall,
-                                    bool Is64Bit, int FPDiff,
-                                    const SDLoc &dl) const;
-
-    unsigned GetAlignedArgumentStackSize(unsigned StackSize,
-                                         SelectionDAG &DAG) const;
-
-    unsigned getAddressSpace() const;
-
-    SDValue FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned,
-                            SDValue &Chain) const;
-    SDValue LRINT_LLRINTHelper(SDNode *N, SelectionDAG &DAG) const;
-
-    SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
-
-    unsigned getGlobalWrapperKind(const GlobalValue *GV,
-                                  const unsigned char OpFlags) const;
-    SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
-
-    /// Creates target global address or external symbol nodes for calls or
-    /// other uses.
-    SDValue LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG, bool ForCall,
-                                  bool *IsImpCall) const;
-
-    SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerLRINT_LLRINT(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
-    SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
-    SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
-    SDValue lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerGET_FPENV_MEM(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerSET_FPENV_MEM(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerRESET_FPENV(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerWin64_FP_TO_INT128(SDValue Op, SelectionDAG &DAG,
-                                    SDValue &Chain) const;
-    SDValue LowerWin64_INT128_TO_FP(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerGC_TRANSITION(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
-    SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerFP_TO_BF16(SDValue Op, SelectionDAG &DAG) const;
-
-    SDValue
-    LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
-                         const SmallVectorImpl<ISD::InputArg> &Ins,
-                         const SDLoc &dl, SelectionDAG &DAG,
-                         SmallVectorImpl<SDValue> &InVals) const override;
-    SDValue LowerCall(CallLoweringInfo &CLI,
-                      SmallVectorImpl<SDValue> &InVals) const override;
-
-    SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
-                        const SmallVectorImpl<ISD::OutputArg> &Outs,
-                        const SmallVectorImpl<SDValue> &OutVals,
-                        const SDLoc &dl, SelectionDAG &DAG) const override;
-
-    bool supportSplitCSR(MachineFunction *MF) const override {
-      return MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS &&
-          MF->getFunction().hasFnAttribute(Attribute::NoUnwind);
-    }
-    void initializeSplitCSR(MachineBasicBlock *Entry) const override;
-    void insertCopiesSplitCSR(
+  SDValue LowerCallResult(SDValue Chain, SDValue InGlue,
+                          CallingConv::ID CallConv, bool isVarArg,
+                          const SmallVectorImpl<ISD::InputArg> &Ins,
+                          const SDLoc &dl, SelectionDAG &DAG,
+                          SmallVectorImpl<SDValue> &InVals,
+                          uint32_t *RegMask) const;
+  SDValue LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
+                           const SmallVectorImpl<ISD::InputArg> &ArgInfo,
+                           const SDLoc &dl, SelectionDAG &DAG,
+                           const CCValAssign &VA, MachineFrameInfo &MFI,
+                           unsigned i) const;
+  SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
+                           const SDLoc &dl, SelectionDAG &DAG,
+                           const CCValAssign &VA, ISD::ArgFlagsTy Flags,
+                           bool isByval) const;
+
+  // Call lowering helpers.
+
+  /// Check whether the call is eligible for tail call optimization. Targets
+  /// that want to do tail call optimization should implement this function.
+  bool IsEligibleForTailCallOptimization(TargetLowering::CallLoweringInfo &CLI,
+                                         CCState &CCInfo,
+                                         SmallVectorImpl<CCValAssign> &ArgLocs,
+                                         bool IsCalleePopSRet) const;
+  SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr,
+                                  SDValue Chain, bool IsTailCall, bool Is64Bit,
+                                  int FPDiff, const SDLoc &dl) const;
+
+  unsigned GetAlignedArgumentStackSize(unsigned StackSize,
+                                       SelectionDAG &DAG) const;
+
+  unsigned getAddressSpace() const;
+
+  SDValue FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned,
+                          SDValue &Chain) const;
+  SDValue LRINT_LLRINTHelper(SDNode *N, SelectionDAG &DAG) const;
+
+  SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+
+  unsigned getGlobalWrapperKind(const GlobalValue *GV,
+                                const unsigned char OpFlags) const;
+  SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
+
+  /// Creates target global address or external symbol nodes for calls or
+  /// other uses.
+  SDValue LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG, bool ForCall,
+                                bool *IsImpCall) const;
+
+  SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerLRINT_LLRINT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerCTSELECT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerGET_FPENV_MEM(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSET_FPENV_MEM(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerRESET_FPENV(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerWin64_FP_TO_INT128(SDValue Op, SelectionDAG &DAG,
+                                  SDValue &Chain) const;
+  SDValue LowerWin64_INT128_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerGC_TRANSITION(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFP_TO_BF16(SDValue Op, SelectionDAG &DAG) const;
+
+  SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+                               bool isVarArg,
+                               const SmallVectorImpl<ISD::InputArg> &Ins,
+                               const SDLoc &dl, SelectionDAG &DAG,
+                               SmallVectorImpl<SDValue> &InVals) const override;
+  SDValue LowerCall(CallLoweringInfo &CLI,
+                    SmallVectorImpl<SDValue> &InVals) const override;
+
+  SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+                      const SmallVectorImpl<ISD::OutputArg> &Outs,
+                      const SmallVectorImpl<SDValue> &OutVals, const SDLoc &dl,
+                      SelectionDAG &DAG) const override;
+
+  bool supportSplitCSR(MachineFunction *MF) const override {
+    return MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS &&
+           MF->getFunction().hasFnAttribute(Attribute::NoUnwind);
+  }
+  void initializeSplitCSR(MachineBasicBlock *Entry) const override;
+  void insertCopiesSplitCSR(
       MachineBasicBlock *Entry,
       const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
 
-    bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
+  bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
 
-    bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
+  bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
 
-    EVT getTypeForExtReturn(LLVMContext &Context, EVT VT,
-                            ISD::NodeType ExtendKind) const override;
+  EVT getTypeForExtReturn(LLVMContext &Context, EVT VT,
+                          ISD::NodeType ExtendKind) const override;
 
-    bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
-                        bool isVarArg,
-                        const SmallVectorImpl<ISD::OutputArg> &Outs,
-                        LLVMContext &Context,
-                        const Type *RetTy) const override;
+  bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+                      bool isVarArg,
+                      const SmallVectorImpl<ISD::OutputArg> &Outs,
+                      LLVMContext &Context, const Type *RetTy) const override;
 
-    const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
-    ArrayRef<MCPhysReg> getRoundingControlRegisters() const override;
+  const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
+  ArrayRef<MCPhysReg> getRoundingControlRegisters() const override;
 
-    TargetLoweringBase::AtomicExpansionKind
-    shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
-    TargetLoweringBase::AtomicExpansionKind
-    shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
-    TargetLoweringBase::AtomicExpansionKind
-    shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
-    TargetLoweringBase::AtomicExpansionKind
-    shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const;
-    void emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const override;
-    void emitCmpArithAtomicRMWIntrinsic(AtomicRMWInst *AI) const override;
+  TargetLoweringBase::AtomicExpansionKind
+  shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
+  TargetLoweringBase::AtomicExpansionKind
+  shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
+  TargetLoweringBase::AtomicExpansionKind
+  shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
+  TargetLoweringBase::AtomicExpansionKind
+  shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const;
+  void emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const override;
+  void emitCmpArithAtomicRMWIntrinsic(AtomicRMWInst *AI) const override;
 
-    LoadInst *
-    lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
+  LoadInst *lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
 
-    bool needsCmpXchgNb(Type *MemType) const;
+  bool needsCmpXchgNb(Type *MemType) const;
 
-    void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB,
-                                MachineBasicBlock *DispatchBB, int FI) const;
+  void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB,
+                              MachineBasicBlock *DispatchBB, int FI) const;
 
-    // Utility function to emit the low-level va_arg code for X86-64.
-    MachineBasicBlock *
-    EmitVAARGWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const;
+  // Utility function to emit the low-level va_arg code for X86-64.
+  MachineBasicBlock *EmitVAARGWithCustomInserter(MachineInstr &MI,
+                                                 MachineBasicBlock *MBB) const;
+
+  /// Utility function to lower a cascaded pair of select (CMOV) pseudos.
+  MachineBasicBlock *EmitLoweredCascadedSelect(MachineInstr &MI1,
+                                               MachineInstr &MI2,
+                                               MachineBasicBlock *BB) const;
 
-    /// Utility function to emit the xmm reg save portion of va_start.
-    MachineBasicBlock *EmitLoweredCascadedSelect(MachineInstr &MI1,
-                                                 MachineInstr &MI2,
-                                                 MachineBasicBlock *BB) const;
+  MachineBasicBlock *EmitLoweredSelect(MachineInstr &I,
+                                       MachineBasicBlock *BB) const;
 
-    MachineBasicBlock *EmitLoweredSelect(MachineInstr &I,
+  MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
                                          MachineBasicBlock *BB) const;
 
-    MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
-                                           MachineBasicBlock *BB) const;
+  MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr &MI,
+                                          MachineBasicBlock *BB) const;
 
-    MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr &MI,
-                                            MachineBasicBlock *BB) const;
+  MachineBasicBlock *EmitLoweredProbedAlloca(MachineInstr &MI,
+                                             MachineBasicBlock *BB) const;
 
-    MachineBasicBlock *EmitLoweredProbedAlloca(MachineInstr &MI,
-                                               MachineBasicBlock *BB) const;
+  MachineBasicBlock *EmitLoweredTLSCall(MachineInstr &MI,
+                                        MachineBasicBlock *BB) const;
 
-    MachineBasicBlock *EmitLoweredTLSCall(MachineInstr &MI,
-                                          MachineBasicBlock *BB) const;
+  MachineBasicBlock *EmitLoweredIndirectThunk(MachineInstr &MI,
+                                              MachineBasicBlock *BB) const;
 
-    MachineBasicBlock *EmitLoweredIndirectThunk(MachineInstr &MI,
-                                                MachineBasicBlock *BB) const;
+  MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI,
+                                      MachineBasicBlock *MBB) const;
 
-    MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI,
-                                        MachineBasicBlock *MBB) const;
+  void emitSetJmpShadowStackFix(MachineInstr &MI, MachineBasicBlock *MBB) const;
 
-    void emitSetJmpShadowStackFix(MachineInstr &MI,
-                                  MachineBasicBlock *MBB) const;
+  MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr &MI,
+                                       MachineBasicBlock *MBB) const;
 
-    MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr &MI,
-                                         MachineBasicBlock *MBB) const;
+  MachineBasicBlock *emitLongJmpShadowStackFix(MachineInstr &MI,
+                                               MachineBasicBlock *MBB) const;
 
-    MachineBasicBlock *emitLongJmpShadowStackFix(MachineInstr &MI,
-                                                 MachineBasicBlock *MBB) const;
+  MachineBasicBlock *EmitSjLjDispatchBlock(MachineInstr &MI,
+                                           MachineBasicBlock *MBB) const;
+
+  MachineBasicBlock *emitPatchableEventCall(MachineInstr &MI,
+                                            MachineBasicBlock *MBB) const;
+
+  /// Emit flags for the given setcc condition and operands. Also returns the
+  /// corresponding X86 condition code constant in X86CC.
+  SDValue emitFlagsForSetcc(SDValue Op0, SDValue Op1, ISD::CondCode CC,
+                            const SDLoc &dl, SelectionDAG &DAG,
+                            SDValue &X86CC) const;
+
+  bool optimizeFMulOrFDivAsShiftAddBitcast(SDNode *N, SDValue FPConst,
+                                           SDValue IntPow2) const override;
+
+  /// Check if replacement of SQRT with RSQRT should be disabled.
+  bool isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const override;
+
+  /// Use rsqrt* to speed up sqrt calculations.
+  SDValue getSqrtEstimate(SDValue Op, SelectionDAG &DAG, int Enabled,
+                          int &RefinementSteps, bool &UseOneConstNR,
+                          bool Reciprocal) const override;
+
+  /// Use rcp* to speed up fdiv calculations.
+  SDValue getRecipEstimate(SDValue Op, SelectionDAG &DAG, int Enabled,
+                           int &RefinementSteps) const override;
+
+  /// Reassociate floating point divisions into multiply by reciprocal.
+  unsigned combineRepeatedFPDivisors() const override;
+
+  SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
+                        SmallVectorImpl<SDNode *> &Created) const override;
+
+  SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
+                  SDValue V2) const;
+};
+
+namespace X86 {
+FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+                         const TargetLibraryInfo *libInfo);
+} // end namespace X86
+
+// X86 specific Gather/Scatter nodes.
+// The class has the same order of operands as MaskedGatherScatterSDNode for
+// convenience.
+class X86MaskedGatherScatterSDNode : public MemIntrinsicSDNode {
+public:
+  // This is intended as a utility and should never be directly created.
+  X86MaskedGatherScatterSDNode() = delete;
+  ~X86MaskedGatherScatterSDNode() = delete;
+
+  const SDValue &getBasePtr() const { return getOperand(3); }
+  const SDValue &getIndex() const { return getOperand(4); }
+  const SDValue &getMask() const { return getOperand(2); }
+  const SDValue &getScale() const { return getOperand(5); }
+
+  static bool classof(const SDNode *N) {
+    return N->getOpcode() == X86ISD::MGATHER ||
+           N->getOpcode() == X86ISD::MSCATTER;
+  }
+};
+
+class X86MaskedGatherSDNode : public X86MaskedGatherScatterSDNode {
+public:
+  const SDValue &getPassThru() const { return getOperand(1); }
+
+  static bool classof(const SDNode *N) {
+    return N->getOpcode() == X86ISD::MGATHER;
+  }
+};
+
+class X86MaskedScatterSDNode : public X86MaskedGatherScatterSDNode {
+public:
+  const SDValue &getValue() const { return getOperand(1); }
+
+  static bool classof(const SDNode *N) {
+    return N->getOpcode() == X86ISD::MSCATTER;
+  }
+};
+
+/// Generate unpacklo/unpackhi shuffle mask.
+void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask, bool Lo,
+                             bool Unary);
 
-    MachineBasicBlock *EmitSjLjDispatchBlock(MachineInstr &MI,
-                                             MachineBasicBlock *MBB) const;
-
-    MachineBasicBlock *emitPatchableEventCall(MachineInstr &MI,
-                                              MachineBasicBlock *MBB) const;
-
-    /// Emit flags for the given setcc condition and operands. Also returns the
-    /// corresponding X86 condition code constant in X86CC.
-    SDValue emitFlagsForSetcc(SDValue Op0, SDValue Op1, ISD::CondCode CC,
-                              const SDLoc &dl, SelectionDAG &DAG,
-                              SDValue &X86CC) const;
-
-    bool optimizeFMulOrFDivAsShiftAddBitcast(SDNode *N, SDValue FPConst,
-                                             SDValue IntPow2) const override;
-
-    /// Check if replacement of SQRT with RSQRT should be disabled.
-    bool isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const override;
-
-    /// Use rsqrt* to speed up sqrt calculations.
-    SDValue getSqrtEstimate(SDValue Op, SelectionDAG &DAG, int Enabled,
-                            int &RefinementSteps, bool &UseOneConstNR,
-                            bool Reciprocal) const override;
-
-    /// Use rcp* to speed up fdiv calculations.
-    SDValue getRecipEstimate(SDValue Op, SelectionDAG &DAG, int Enabled,
-                             int &RefinementSteps) const override;
-
-    /// Reassociate floating point divisions into multiply by reciprocal.
-    unsigned combineRepeatedFPDivisors() const override;
-
-    SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
-                          SmallVectorImpl<SDNode *> &Created) const override;
-
-    SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
-                    SDValue V2) const;
-  };
-
-  namespace X86 {
-    FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
-                             const TargetLibraryInfo *libInfo);
-  } // end namespace X86
-
-  // X86 specific Gather/Scatter nodes.
-  // The class has the same order of operands as MaskedGatherScatterSDNode for
-  // convenience.
-  class X86MaskedGatherScatterSDNode : public MemIntrinsicSDNode {
-  public:
-    // This is a intended as a utility and should never be directly created.
-    X86MaskedGatherScatterSDNode() = delete;
-    ~X86MaskedGatherScatterSDNode() = delete;
-
-    const SDValue &getBasePtr() const { return getOperand(3); }
-    const SDValue &getIndex()   const { return getOperand(4); }
-    const SDValue &getMask()    const { return getOperand(2); }
-    const SDValue &getScale()   const { return getOperand(5); }
-
-    static bool classof(const SDNode *N) {
-      return N->getOpcode() == X86ISD::MGATHER ||
-             N->getOpcode() == X86ISD::MSCATTER;
-    }
-  };
-
-  class X86MaskedGatherSDNode : public X86MaskedGatherScatterSDNode {
-  public:
-    const SDValue &getPassThru() const { return getOperand(1); }
-
-    static bool classof(const SDNode *N) {
-      return N->getOpcode() == X86ISD::MGATHER;
-    }
-  };
-
-  class X86MaskedScatterSDNode : public X86MaskedGatherScatterSDNode {
-  public:
-    const SDValue &getValue() const { return getOperand(1); }
-
-    static bool classof(const SDNode *N) {
-      return N->getOpcode() == X86ISD::MSCATTER;
-    }
-  };
-
-  /// Generate unpacklo/unpackhi shuffle mask.
-  void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask, bool Lo,
-                               bool Unary);
-
-  /// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
-  /// imposed by AVX and specific to the unary pattern. Example:
-  /// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
-  /// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
-  void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo);
+/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
+/// imposed by AVX and specific to the unary pattern. Example:
+/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
+/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
+void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo);
 
 } // end namespace llvm
 
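The X86MaskedGatherScatterSDNode hierarchy above fixes the operand positions (mask at 2, base pointer at 3, index at 4, scale at 5) so DAG code can read them by name instead of by index. A hypothetical usage sketch, assuming a source file under lib/Target/X86 with the usual SelectionDAG includes; the helper name gatherHasUnitScale is illustrative and not part of this patch:

  #include "X86ISelLowering.h"
  using namespace llvm;

  // Returns true if N is an X86 masked gather whose scale operand is the
  // constant 1.
  static bool gatherHasUnitScale(const SDNode *N) {
    const auto *Gather = dyn_cast<X86MaskedGatherSDNode>(N);
    if (!Gather)
      return false;
    const auto *Scale = dyn_cast<ConstantSDNode>(Gather->getScale());
    return Scale && Scale->getZExtValue() == 1;
  }
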
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 765db86ffafb3..d73c3aa0e1e82 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -690,8 +690,7 @@ bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const {
                     .addImm(31));
   } else {
     // Negate to convert 1 -> 0xFFFFFFFF, 0 -> 0x00000000 (negl %eax)
-    recordInstr(BuildMI(*MBB, MI, DL, get(X86::NEG32r), TmpGPR)
-                    .addReg(TmpGPR));
+    recordInstr(BuildMI(*MBB, MI, DL, get(X86::NEG32r), TmpGPR).addReg(TmpGPR));
   }
 
   // Broadcast to TmpX (vector mask)
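Both paths in the hunk above end with an all-zeros or all-ones 32-bit mask in TmpGPR before it is broadcast into the vector register. A minimal C++ sketch of the two constructions, for illustration only; the shift variant assumes the .addImm(31) above belongs to an arithmetic right shift that replicates the condition's sign bit (the opcode itself sits outside this hunk):

  #include <cstdint>

  // NEG path (shown above): condition is 0 or 1 in the low bit.
  uint32_t maskFromBit(uint32_t CondBit) {
    return 0u - CondBit; // 1 -> 0xFFFFFFFF, 0 -> 0x00000000
  }

  // Assumed shift path: condition already placed in the sign bit.
  uint32_t maskFromSignBit(uint32_t CondInSignBit) {
    return (uint32_t)((int32_t)CondInSignBit >> 31); // broadcast bit 31 to all bits
  }
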
@@ -848,7 +847,8 @@ bool X86InstrInfo::expandCtSelectVector(MachineInstr &MI) const {
                     .setMIFlags(MachineInstr::MIFlag::NoMerge));
   }
 
-  assert(FirstInstr && LastInstr && "Expected at least one expanded instruction");
+  assert(FirstInstr && LastInstr &&
+         "Expected at least one expanded instruction");
   auto BundleEnd = LastInstr->getIterator();
   finalizeBundle(*MBB, FirstInstr->getIterator(), std::next(BundleEnd));
 
@@ -916,25 +916,28 @@ bool X86InstrInfo::expandCtSelectWithCMOV(MachineInstr &MI) const {
 
 /// Expand i386-specific CTSELECT pseudo instructions (post-RA, constant-time)
 /// These internal pseudos receive a pre-materialized condition byte from the
-/// custom inserter, avoiding EFLAGS corruption issues during i64 type legalization.
+/// custom inserter, avoiding EFLAGS corruption issues during i64 type
+/// legalization.
 bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const {
   MachineBasicBlock *MBB = MI.getParent();
   DebugLoc DL = MI.getDebugLoc();
 
   // CTSELECT_I386_INT_GRxxrr has operands: (outs dst, tmp_byte, tmp_mask),
   // (ins src1, src2, cond_byte)
-  // Note: cond_byte is pre-materialized by custom inserter, not EFLAGS-dependent
+  // Note: cond_byte is pre-materialized by custom inserter, not
+  // EFLAGS-dependent
   Register DstReg = MI.getOperand(0).getReg();
   Register TmpByteReg = MI.getOperand(1).getReg();
   Register TmpMaskReg = MI.getOperand(2).getReg();
   Register Src1Reg = MI.getOperand(3).getReg();
   Register Src2Reg = MI.getOperand(4).getReg();
-  Register CondByteReg = MI.getOperand(5).getReg();  // Pre-materialized condition byte
+  Register CondByteReg =
+      MI.getOperand(5).getReg(); // Pre-materialized condition byte
 
   // Determine instruction opcodes based on register width
   unsigned MovZXOp, NegOp, MovOp, AndOp, NotOp, OrOp;
   if (MI.getOpcode() == X86::CTSELECT_I386_INT_GR8rr) {
-    MovZXOp = 0;  // No zero-extend needed for GR8
+    MovZXOp = 0; // No zero-extend needed for GR8
     NegOp = X86::NEG8r;
     MovOp = X86::MOV8rr;
     AndOp = X86::AND8rr;
@@ -963,8 +966,8 @@ bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const {
   // Step 1: Copy pre-materialized condition byte to TmpByteReg
   // This allows the bundle to work with allocated temporaries
   auto I1 = BuildMI(*MBB, MI, DL, get(X86::MOV8rr), TmpByteReg)
-      .addReg(CondByteReg)
-      .setMIFlag(MachineInstr::MIFlag::NoMerge);
+                .addReg(CondByteReg)
+                .setMIFlag(MachineInstr::MIFlag::NoMerge);
   auto BundleStart = I1->getIterator();
 
   // Step 2: Zero-extend condition byte to register width (0 or 1)
@@ -975,7 +978,9 @@ bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const {
   }
 
   // Step 3: Convert condition to bitmask (NEG: 1 -> 0xFFFF..., 0 -> 0x0000...)
-  Register MaskReg = (MI.getOpcode() == X86::CTSELECT_I386_INT_GR8rr) ? TmpByteReg : TmpMaskReg;
+  Register MaskReg = (MI.getOpcode() == X86::CTSELECT_I386_INT_GR8rr)
+                         ? TmpByteReg
+                         : TmpMaskReg;
   BuildMI(*MBB, MI, DL, get(NegOp), MaskReg)
       .addReg(MaskReg)
       .setMIFlag(MachineInstr::MIFlag::NoMerge);
@@ -1003,9 +1008,9 @@ bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const {
 
   // Step 8: Final result: (src1 & mask) | (src2 & ~mask)
   auto LI = BuildMI(*MBB, MI, DL, get(OrOp), DstReg)
-      .addReg(DstReg)
-      .addReg(MaskReg)
-      .setMIFlag(MachineInstr::MIFlag::NoMerge);
+                .addReg(DstReg)
+                .addReg(MaskReg)
+                .setMIFlag(MachineInstr::MIFlag::NoMerge);
 
   // Bundle all generated instructions for atomic execution before removing MI
   auto BundleEnd = std::next(LI->getIterator());
@@ -1014,11 +1019,12 @@ bool X86InstrInfo::expandCtSelectIntWithoutCMOV(MachineInstr &MI) const {
     finalizeBundle(*MBB, BundleStart, BundleEnd);
   }
 
-  // TODO: Optimization opportunity - The register allocator may choose callee-saved
-  // registers (e.g., %ebx, %esi) for TmpByteReg/TmpMaskReg, causing unnecessary
-  // save/restore overhead. Consider constraining these to caller-saved register
-  // classes (e.g., GR8_AL, GR32_CallSaved) in the TableGen definitions to improve
-  // constant-time performance by eliminating prologue/epilogue instructions.
+  // TODO: Optimization opportunity - The register allocator may choose
+  // callee-saved registers (e.g., %ebx, %esi) for TmpByteReg/TmpMaskReg,
+  // causing unnecessary save/restore overhead. Consider constraining these to
+  // caller-saved register classes (e.g., GR8_AL, GR32_CallSaved) in the
+  // TableGen definitions to improve constant-time performance by eliminating
+  // prologue/epilogue instructions.
 
   // Remove the original pseudo instruction
   MI.eraseFromParent();
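Taken together, Steps 1-8 above implement a branch-free select through a full-width bitmask. A minimal standalone C++ sketch of the same computation for the 32-bit case, illustrative only and not part of the patch; Cond stands for the pre-materialized condition byte (0 or 1):

  #include <cstdint>

  uint32_t ctselect32(uint8_t Cond, uint32_t Src1, uint32_t Src2) {
    uint32_t Mask = 0u - (uint32_t)Cond; // zero-extend, then NEG (Steps 2-3)
    uint32_t KeepA = Src1 & Mask;        // selected when Cond == 1
    uint32_t KeepB = Src2 & ~Mask;       // selected when Cond == 0
    return KeepA | KeepB;                // Step 8: (src1 & mask) | (src2 & ~mask)
  }

Every operation executes regardless of Cond, which is what keeps the expansion constant-time once the instructions are bundled.
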
@@ -1306,8 +1312,7 @@ static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) {
   return isPICBase;
 }
 
-bool X86InstrInfo::isReMaterializableImpl(
-    const MachineInstr &MI) const {
+bool X86InstrInfo::isReMaterializableImpl(const MachineInstr &MI) const {
   switch (MI.getOpcode()) {
   default:
     // This function should only be called for opcodes with the ReMaterializable
@@ -1826,32 +1831,32 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
   switch (MIOpc) {
   default:
     llvm_unreachable("Unreachable!");
-  CASE_NF(SHL8ri)
-  CASE_NF(SHL16ri) {
-    unsigned ShAmt = MI.getOperand(2).getImm();
-    MIB.addReg(0)
-        .addImm(1LL << ShAmt)
-        .addReg(InRegLEA, RegState::Kill)
-        .addImm(0)
-        .addReg(0);
-    break;
-  }
-  CASE_NF(INC8r)
-  CASE_NF(INC16r)
+    CASE_NF(SHL8ri)
+    CASE_NF(SHL16ri) {
+      unsigned ShAmt = MI.getOperand(2).getImm();
+      MIB.addReg(0)
+          .addImm(1LL << ShAmt)
+          .addReg(InRegLEA, RegState::Kill)
+          .addImm(0)
+          .addReg(0);
+      break;
+    }
+    CASE_NF(INC8r)
+    CASE_NF(INC16r)
     addRegOffset(MIB, InRegLEA, true, 1);
     break;
-  CASE_NF(DEC8r)
-  CASE_NF(DEC16r)
+    CASE_NF(DEC8r)
+    CASE_NF(DEC16r)
     addRegOffset(MIB, InRegLEA, true, -1);
     break;
-  CASE_NF(ADD8ri)
-  CASE_NF(ADD16ri)
+    CASE_NF(ADD8ri)
+    CASE_NF(ADD16ri)
   case X86::ADD8ri_DB:
   case X86::ADD16ri_DB:
     addRegOffset(MIB, InRegLEA, true, MI.getOperand(2).getImm());
     break;
-  CASE_NF(ADD8rr)
-  CASE_NF(ADD16rr)
+    CASE_NF(ADD8rr)
+    CASE_NF(ADD16rr)
   case X86::ADD8rr_DB:
   case X86::ADD16rr_DB: {
     Src2 = MI.getOperand(2).getReg();
@@ -1989,128 +1994,129 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI,
   switch (MIOpc) {
   default:
     llvm_unreachable("Unreachable!");
-  CASE_NF(SHL64ri) {
-    assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
-    unsigned ShAmt = getTruncatedShiftCount(MI, 2);
-    if (!isTruncatedShiftCountForLEA(ShAmt))
-      return nullptr;
-
-    // LEA can't handle RSP.
-    if (Src.getReg().isVirtual() && !MF.getRegInfo().constrainRegClass(
-                                        Src.getReg(), &X86::GR64_NOSPRegClass))
-      return nullptr;
+    CASE_NF(SHL64ri) {
+      assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
+      unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+      if (!isTruncatedShiftCountForLEA(ShAmt))
+        return nullptr;
 
-    NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
-                .add(Dest)
-                .addReg(0)
-                .addImm(1LL << ShAmt)
-                .add(Src)
-                .addImm(0)
-                .addReg(0);
-    break;
-  }
-  CASE_NF(SHL32ri) {
-    assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
-    unsigned ShAmt = getTruncatedShiftCount(MI, 2);
-    if (!isTruncatedShiftCountForLEA(ShAmt))
-      return nullptr;
+      // LEA can't handle RSP.
+      if (Src.getReg().isVirtual() &&
+          !MF.getRegInfo().constrainRegClass(Src.getReg(),
+                                             &X86::GR64_NOSPRegClass))
+        return nullptr;
 
-    unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
+      NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
+                  .add(Dest)
+                  .addReg(0)
+                  .addImm(1LL << ShAmt)
+                  .add(Src)
+                  .addImm(0)
+                  .addReg(0);
+      break;
+    }
+    CASE_NF(SHL32ri) {
+      assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
+      unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+      if (!isTruncatedShiftCountForLEA(ShAmt))
+        return nullptr;
 
-    // LEA can't handle ESP.
-    bool isKill;
-    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
-    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
-                        isKill, ImplicitOp, LV, LIS))
-      return nullptr;
+      unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
 
-    MachineInstrBuilder MIB =
-        BuildMI(MF, MI.getDebugLoc(), get(Opc))
-            .add(Dest)
-            .addReg(0)
-            .addImm(1LL << ShAmt)
-            .addReg(SrcReg, getKillRegState(isKill), SrcSubReg)
-            .addImm(0)
-            .addReg(0);
-    if (ImplicitOp.getReg() != 0)
-      MIB.add(ImplicitOp);
-    NewMI = MIB;
+      // LEA can't handle ESP.
+      bool isKill;
+      MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+      if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
+                          isKill, ImplicitOp, LV, LIS))
+        return nullptr;
 
-    // Add kills if classifyLEAReg created a new register.
-    if (LV && SrcReg != Src.getReg())
-      LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
-    break;
-  }
-  CASE_NF(SHL8ri)
+      MachineInstrBuilder MIB =
+          BuildMI(MF, MI.getDebugLoc(), get(Opc))
+              .add(Dest)
+              .addReg(0)
+              .addImm(1LL << ShAmt)
+              .addReg(SrcReg, getKillRegState(isKill), SrcSubReg)
+              .addImm(0)
+              .addReg(0);
+      if (ImplicitOp.getReg() != 0)
+        MIB.add(ImplicitOp);
+      NewMI = MIB;
+
+      // Add kills if classifyLEAReg created a new register.
+      if (LV && SrcReg != Src.getReg())
+        LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
+      break;
+    }
+    CASE_NF(SHL8ri)
     Is8BitOp = true;
     [[fallthrough]];
-  CASE_NF(SHL16ri) {
-    assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
-    unsigned ShAmt = getTruncatedShiftCount(MI, 2);
-    if (!isTruncatedShiftCountForLEA(ShAmt))
-      return nullptr;
-    return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
-  }
-  CASE_NF(INC64r)
-  CASE_NF(INC32r) {
-    assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
-    unsigned Opc = (MIOpc == X86::INC64r || MIOpc == X86::INC64r_NF)
-                       ? X86::LEA64r
-                       : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
-    bool isKill;
-    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
-    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
-                        isKill, ImplicitOp, LV, LIS))
-      return nullptr;
-
-    MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
-                                  .add(Dest)
-                                  .addReg(SrcReg, getKillRegState(isKill));
-    if (ImplicitOp.getReg() != 0)
-      MIB.add(ImplicitOp);
+    CASE_NF(SHL16ri) {
+      assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
+      unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+      if (!isTruncatedShiftCountForLEA(ShAmt))
+        return nullptr;
+      return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
+    }
+    CASE_NF(INC64r)
+    CASE_NF(INC32r) {
+      assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
+      unsigned Opc = (MIOpc == X86::INC64r || MIOpc == X86::INC64r_NF)
+                         ? X86::LEA64r
+                         : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
+      bool isKill;
+      MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+      if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
+                          isKill, ImplicitOp, LV, LIS))
+        return nullptr;
 
-    NewMI = addOffset(MIB, 1);
+      MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
+                                    .add(Dest)
+                                    .addReg(SrcReg, getKillRegState(isKill));
+      if (ImplicitOp.getReg() != 0)
+        MIB.add(ImplicitOp);
 
-    // Add kills if classifyLEAReg created a new register.
-    if (LV && SrcReg != Src.getReg())
-      LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
-    break;
-  }
-  CASE_NF(DEC64r)
-  CASE_NF(DEC32r) {
-    assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
-    unsigned Opc = (MIOpc == X86::DEC64r || MIOpc == X86::DEC64r_NF)
-                       ? X86::LEA64r
-                       : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
+      NewMI = addOffset(MIB, 1);
 
-    bool isKill;
-    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
-    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
-                        isKill, ImplicitOp, LV, LIS))
-      return nullptr;
+      // Add kills if classifyLEAReg created a new register.
+      if (LV && SrcReg != Src.getReg())
+        LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
+      break;
+    }
+    CASE_NF(DEC64r)
+    CASE_NF(DEC32r) {
+      assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
+      unsigned Opc = (MIOpc == X86::DEC64r || MIOpc == X86::DEC64r_NF)
+                         ? X86::LEA64r
+                         : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
+
+      bool isKill;
+      MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+      if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg,
+                          isKill, ImplicitOp, LV, LIS))
+        return nullptr;
 
-    MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
-                                  .add(Dest)
-                                  .addReg(SrcReg, getKillRegState(isKill));
-    if (ImplicitOp.getReg() != 0)
-      MIB.add(ImplicitOp);
+      MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
+                                    .add(Dest)
+                                    .addReg(SrcReg, getKillRegState(isKill));
+      if (ImplicitOp.getReg() != 0)
+        MIB.add(ImplicitOp);
 
-    NewMI = addOffset(MIB, -1);
+      NewMI = addOffset(MIB, -1);
 
-    // Add kills if classifyLEAReg created a new register.
-    if (LV && SrcReg != Src.getReg())
-      LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
-    break;
-  }
-  CASE_NF(DEC8r)
-  CASE_NF(INC8r)
+      // Add kills if classifyLEAReg created a new register.
+      if (LV && SrcReg != Src.getReg())
+        LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
+      break;
+    }
+    CASE_NF(DEC8r)
+    CASE_NF(INC8r)
     Is8BitOp = true;
     [[fallthrough]];
-  CASE_NF(DEC16r)
-  CASE_NF(INC16r)
+    CASE_NF(DEC16r)
+    CASE_NF(INC16r)
     return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
-  CASE_NF(ADD64rr)
-  CASE_NF(ADD32rr)
+    CASE_NF(ADD64rr)
+    CASE_NF(ADD32rr)
   case X86::ADD64rr_DB:
   case X86::ADD32rr_DB: {
     assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
@@ -2161,21 +2167,21 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI,
     NumRegOperands = 3;
     break;
   }
-  CASE_NF(ADD8rr)
+    CASE_NF(ADD8rr)
   case X86::ADD8rr_DB:
     Is8BitOp = true;
     [[fallthrough]];
-  CASE_NF(ADD16rr)
+    CASE_NF(ADD16rr)
   case X86::ADD16rr_DB:
     return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
-  CASE_NF(ADD64ri32)
+    CASE_NF(ADD64ri32)
   case X86::ADD64ri32_DB:
     assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
     NewMI = addOffset(
         BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src),
         MI.getOperand(2));
     break;
-  CASE_NF(ADD32ri)
+    CASE_NF(ADD32ri)
   case X86::ADD32ri_DB: {
     assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
     unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
@@ -2200,62 +2206,62 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI,
       LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
     break;
   }
-  CASE_NF(ADD8ri)
+    CASE_NF(ADD8ri)
   case X86::ADD8ri_DB:
     Is8BitOp = true;
     [[fallthrough]];
-  CASE_NF(ADD16ri)
+    CASE_NF(ADD16ri)
   case X86::ADD16ri_DB:
     return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
-  CASE_NF(SUB8ri)
-  CASE_NF(SUB16ri)
+    CASE_NF(SUB8ri)
+    CASE_NF(SUB16ri)
     /// FIXME: Support these similar to ADD8ri/ADD16ri*.
     return nullptr;
-  CASE_NF(SUB32ri) {
-    if (!MI.getOperand(2).isImm())
-      return nullptr;
-    int64_t Imm = MI.getOperand(2).getImm();
-    if (!isInt<32>(-Imm))
-      return nullptr;
+    CASE_NF(SUB32ri) {
+      if (!MI.getOperand(2).isImm())
+        return nullptr;
+      int64_t Imm = MI.getOperand(2).getImm();
+      if (!isInt<32>(-Imm))
+        return nullptr;
 
-    assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
-    unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
+      assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
+      unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
 
-    bool isKill;
-    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
-    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, SrcSubReg,
-                        isKill, ImplicitOp, LV, LIS))
-      return nullptr;
+      bool isKill;
+      MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+      if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, SrcSubReg,
+                          isKill, ImplicitOp, LV, LIS))
+        return nullptr;
 
-    MachineInstrBuilder MIB =
-        BuildMI(MF, MI.getDebugLoc(), get(Opc))
-            .add(Dest)
-            .addReg(SrcReg, getKillRegState(isKill), SrcSubReg);
-    if (ImplicitOp.getReg() != 0)
-      MIB.add(ImplicitOp);
+      MachineInstrBuilder MIB =
+          BuildMI(MF, MI.getDebugLoc(), get(Opc))
+              .add(Dest)
+              .addReg(SrcReg, getKillRegState(isKill), SrcSubReg);
+      if (ImplicitOp.getReg() != 0)
+        MIB.add(ImplicitOp);
 
-    NewMI = addOffset(MIB, -Imm);
+      NewMI = addOffset(MIB, -Imm);
 
-    // Add kills if classifyLEAReg created a new register.
-    if (LV && SrcReg != Src.getReg())
-      LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
-    break;
-  }
+      // Add kills if classifyLEAReg created a new register.
+      if (LV && SrcReg != Src.getReg())
+        LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
+      break;
+    }
 
-  CASE_NF(SUB64ri32) {
-    if (!MI.getOperand(2).isImm())
-      return nullptr;
-    int64_t Imm = MI.getOperand(2).getImm();
-    if (!isInt<32>(-Imm))
-      return nullptr;
+    CASE_NF(SUB64ri32) {
+      if (!MI.getOperand(2).isImm())
+        return nullptr;
+      int64_t Imm = MI.getOperand(2).getImm();
+      if (!isInt<32>(-Imm))
+        return nullptr;
 
-    assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!");
+      assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!");
 
-    MachineInstrBuilder MIB =
-        BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src);
-    NewMI = addOffset(MIB, -Imm);
-    break;
-  }
+      MachineInstrBuilder MIB =
+          BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src);
+      NewMI = addOffset(MIB, -Imm);
+      break;
+    }
 
   case X86::VMOVDQU8Z128rmk:
   case X86::VMOVDQU8Z256rmk:
@@ -2855,17 +2861,17 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
   case X86::OP##_ND:
 
   switch (Opc) {
-  // SHLD B, C, I <-> SHRD C, B, (BitWidth - I)
-  CASE_ND(SHRD16rri8)
-  CASE_ND(SHLD16rri8)
-  CASE_ND(SHRD32rri8)
-  CASE_ND(SHLD32rri8)
-  CASE_ND(SHRD64rri8)
-  CASE_ND(SHLD64rri8) {
-    unsigned Size;
-    switch (Opc) {
-    default:
-      llvm_unreachable("Unreachable!");
+    // SHLD B, C, I <-> SHRD C, B, (BitWidth - I)
+    CASE_ND(SHRD16rri8)
+    CASE_ND(SHLD16rri8)
+    CASE_ND(SHRD32rri8)
+    CASE_ND(SHLD32rri8)
+    CASE_ND(SHRD64rri8)
+    CASE_ND(SHLD64rri8) {
+      unsigned Size;
+      switch (Opc) {
+      default:
+        llvm_unreachable("Unreachable!");
 #define FROM_TO_SIZE(A, B, S)                                                  \
   case X86::A:                                                                 \
     Opc = X86::B;                                                              \
@@ -2884,16 +2890,16 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
     Size = S;                                                                  \
     break;
 
-    FROM_TO_SIZE(SHRD16rri8, SHLD16rri8, 16)
-    FROM_TO_SIZE(SHRD32rri8, SHLD32rri8, 32)
-    FROM_TO_SIZE(SHRD64rri8, SHLD64rri8, 64)
+        FROM_TO_SIZE(SHRD16rri8, SHLD16rri8, 16)
+        FROM_TO_SIZE(SHRD32rri8, SHLD32rri8, 32)
+        FROM_TO_SIZE(SHRD64rri8, SHLD64rri8, 64)
 #undef FROM_TO_SIZE
+      }
+      WorkingMI = CloneIfNew(MI);
+      WorkingMI->setDesc(get(Opc));
+      WorkingMI->getOperand(3).setImm(Size - MI.getOperand(3).getImm());
+      break;
     }
-    WorkingMI = CloneIfNew(MI);
-    WorkingMI->setDesc(get(Opc));
-    WorkingMI->getOperand(3).setImm(Size - MI.getOperand(3).getImm());
-    break;
-  }
   case X86::PFSUBrr:
   case X86::PFSUBRrr:
     // PFSUB  x, y: x = x - y
@@ -3177,15 +3183,16 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
     WorkingMI = CloneIfNew(MI);
     WorkingMI->setDesc(get(Opc));
     break;
-  CASE_ND(CMOV16rr)
-  CASE_ND(CMOV32rr)
-  CASE_ND(CMOV64rr) {
-    WorkingMI = CloneIfNew(MI);
-    unsigned OpNo = MI.getDesc().getNumOperands() - 1;
-    X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm());
-    WorkingMI->getOperand(OpNo).setImm(X86::GetOppositeBranchCondition(CC));
-    break;
-  }
+    CASE_ND(CMOV16rr)
+    CASE_ND(CMOV32rr)
+    CASE_ND(CMOV64rr) {
+      WorkingMI = CloneIfNew(MI);
+      unsigned OpNo = MI.getDesc().getNumOperands() - 1;
+      X86::CondCode CC =
+          static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm());
+      WorkingMI->getOperand(OpNo).setImm(X86::GetOppositeBranchCondition(CC));
+      break;
+    }
   case X86::VPTERNLOGDZrri:
   case X86::VPTERNLOGDZrmi:
   case X86::VPTERNLOGDZ128rri:
@@ -5393,29 +5400,29 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
       CmpMask = CmpValue = 0;
     }
     return true;
-  // A SUB can be used to perform comparison.
-  CASE_ND(SUB64rm)
-  CASE_ND(SUB32rm)
-  CASE_ND(SUB16rm)
-  CASE_ND(SUB8rm)
+    // A SUB can be used to perform comparison.
+    CASE_ND(SUB64rm)
+    CASE_ND(SUB32rm)
+    CASE_ND(SUB16rm)
+    CASE_ND(SUB8rm)
     SrcReg = MI.getOperand(1).getReg();
     SrcReg2 = 0;
     CmpMask = 0;
     CmpValue = 0;
     return true;
-  CASE_ND(SUB64rr)
-  CASE_ND(SUB32rr)
-  CASE_ND(SUB16rr)
-  CASE_ND(SUB8rr)
+    CASE_ND(SUB64rr)
+    CASE_ND(SUB32rr)
+    CASE_ND(SUB16rr)
+    CASE_ND(SUB8rr)
     SrcReg = MI.getOperand(1).getReg();
     SrcReg2 = MI.getOperand(2).getReg();
     CmpMask = 0;
     CmpValue = 0;
     return true;
-  CASE_ND(SUB64ri32)
-  CASE_ND(SUB32ri)
-  CASE_ND(SUB16ri)
-  CASE_ND(SUB8ri)
+    CASE_ND(SUB64ri32)
+    CASE_ND(SUB32ri)
+    CASE_ND(SUB16ri)
+    CASE_ND(SUB8ri)
     SrcReg = MI.getOperand(1).getReg();
     SrcReg2 = 0;
     if (MI.getOperand(2).isImm()) {
@@ -5470,27 +5477,27 @@ bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI,
   case X86::CMP32rr:
   case X86::CMP16rr:
   case X86::CMP8rr:
-  CASE_ND(SUB64rr)
-  CASE_ND(SUB32rr)
-  CASE_ND(SUB16rr)
-  CASE_ND(SUB8rr) {
-    Register OISrcReg;
-    Register OISrcReg2;
-    int64_t OIMask;
-    int64_t OIValue;
-    if (!analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) ||
-        OIMask != ImmMask || OIValue != ImmValue)
+    CASE_ND(SUB64rr)
+    CASE_ND(SUB32rr)
+    CASE_ND(SUB16rr)
+    CASE_ND(SUB8rr) {
+      Register OISrcReg;
+      Register OISrcReg2;
+      int64_t OIMask;
+      int64_t OIValue;
+      if (!analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) ||
+          OIMask != ImmMask || OIValue != ImmValue)
+        return false;
+      if (SrcReg == OISrcReg && SrcReg2 == OISrcReg2) {
+        *IsSwapped = false;
+        return true;
+      }
+      if (SrcReg == OISrcReg2 && SrcReg2 == OISrcReg) {
+        *IsSwapped = true;
+        return true;
+      }
       return false;
-    if (SrcReg == OISrcReg && SrcReg2 == OISrcReg2) {
-      *IsSwapped = false;
-      return true;
-    }
-    if (SrcReg == OISrcReg2 && SrcReg2 == OISrcReg) {
-      *IsSwapped = true;
-      return true;
     }
-    return false;
-  }
   case X86::CMP64ri32:
   case X86::CMP32ri:
   case X86::CMP16ri:
@@ -5499,10 +5506,10 @@ bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI,
   case X86::TEST32ri:
   case X86::TEST16ri:
   case X86::TEST8ri:
-  CASE_ND(SUB64ri32)
-  CASE_ND(SUB32ri)
-  CASE_ND(SUB16ri)
-  CASE_ND(SUB8ri)
+    CASE_ND(SUB64ri32)
+    CASE_ND(SUB32ri)
+    CASE_ND(SUB16ri)
+    CASE_ND(SUB8ri)
   case X86::TEST64rr:
   case X86::TEST32rr:
   case X86::TEST16rr:
@@ -5559,98 +5566,98 @@ inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag,
   default:
     return false;
 
-  // The shift instructions only modify ZF if their shift count is non-zero.
-  // N.B.: The processor truncates the shift count depending on the encoding.
-  CASE_ND(SAR8ri)
-  CASE_ND(SAR16ri)
-  CASE_ND(SAR32ri)
-  CASE_ND(SAR64ri)
-  CASE_ND(SHR8ri)
-  CASE_ND(SHR16ri)
-  CASE_ND(SHR32ri)
-  CASE_ND(SHR64ri)
+    // The shift instructions only modify ZF if their shift count is non-zero.
+    // N.B.: The processor truncates the shift count depending on the encoding.
+    CASE_ND(SAR8ri)
+    CASE_ND(SAR16ri)
+    CASE_ND(SAR32ri)
+    CASE_ND(SAR64ri)
+    CASE_ND(SHR8ri)
+    CASE_ND(SHR16ri)
+    CASE_ND(SHR32ri)
+    CASE_ND(SHR64ri)
     return getTruncatedShiftCount(MI, 2) != 0;
 
-  // Some left shift instructions can be turned into LEA instructions but only
-  // if their flags aren't used. Avoid transforming such instructions.
-  CASE_ND(SHL8ri)
-  CASE_ND(SHL16ri)
-  CASE_ND(SHL32ri)
-  CASE_ND(SHL64ri) {
-    unsigned ShAmt = getTruncatedShiftCount(MI, 2);
-    if (isTruncatedShiftCountForLEA(ShAmt))
-      return false;
-    return ShAmt != 0;
-  }
+    // Some left shift instructions can be turned into LEA instructions but only
+    // if their flags aren't used. Avoid transforming such instructions.
+    CASE_ND(SHL8ri)
+    CASE_ND(SHL16ri)
+    CASE_ND(SHL32ri)
+    CASE_ND(SHL64ri) {
+      unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+      if (isTruncatedShiftCountForLEA(ShAmt))
+        return false;
+      return ShAmt != 0;
+    }
 
-  CASE_ND(SHRD16rri8)
-  CASE_ND(SHRD32rri8)
-  CASE_ND(SHRD64rri8)
-  CASE_ND(SHLD16rri8)
-  CASE_ND(SHLD32rri8)
-  CASE_ND(SHLD64rri8)
+    CASE_ND(SHRD16rri8)
+    CASE_ND(SHRD32rri8)
+    CASE_ND(SHRD64rri8)
+    CASE_ND(SHLD16rri8)
+    CASE_ND(SHLD32rri8)
+    CASE_ND(SHLD64rri8)
     return getTruncatedShiftCount(MI, 3) != 0;
 
-  CASE_ND(SUB64ri32)
-  CASE_ND(SUB32ri)
-  CASE_ND(SUB16ri)
-  CASE_ND(SUB8ri)
-  CASE_ND(SUB64rr)
-  CASE_ND(SUB32rr)
-  CASE_ND(SUB16rr)
-  CASE_ND(SUB8rr)
-  CASE_ND(SUB64rm)
-  CASE_ND(SUB32rm)
-  CASE_ND(SUB16rm)
-  CASE_ND(SUB8rm)
-  CASE_ND(DEC64r)
-  CASE_ND(DEC32r)
-  CASE_ND(DEC16r)
-  CASE_ND(DEC8r)
-  CASE_ND(ADD64ri32)
-  CASE_ND(ADD32ri)
-  CASE_ND(ADD16ri)
-  CASE_ND(ADD8ri)
-  CASE_ND(ADD64rr)
-  CASE_ND(ADD32rr)
-  CASE_ND(ADD16rr)
-  CASE_ND(ADD8rr)
-  CASE_ND(ADD64rm)
-  CASE_ND(ADD32rm)
-  CASE_ND(ADD16rm)
-  CASE_ND(ADD8rm)
-  CASE_ND(INC64r)
-  CASE_ND(INC32r)
-  CASE_ND(INC16r)
-  CASE_ND(INC8r)
-  CASE_ND(ADC64ri32)
-  CASE_ND(ADC32ri)
-  CASE_ND(ADC16ri)
-  CASE_ND(ADC8ri)
-  CASE_ND(ADC64rr)
-  CASE_ND(ADC32rr)
-  CASE_ND(ADC16rr)
-  CASE_ND(ADC8rr)
-  CASE_ND(ADC64rm)
-  CASE_ND(ADC32rm)
-  CASE_ND(ADC16rm)
-  CASE_ND(ADC8rm)
-  CASE_ND(SBB64ri32)
-  CASE_ND(SBB32ri)
-  CASE_ND(SBB16ri)
-  CASE_ND(SBB8ri)
-  CASE_ND(SBB64rr)
-  CASE_ND(SBB32rr)
-  CASE_ND(SBB16rr)
-  CASE_ND(SBB8rr)
-  CASE_ND(SBB64rm)
-  CASE_ND(SBB32rm)
-  CASE_ND(SBB16rm)
-  CASE_ND(SBB8rm)
-  CASE_ND(NEG8r)
-  CASE_ND(NEG16r)
-  CASE_ND(NEG32r)
-  CASE_ND(NEG64r)
+    CASE_ND(SUB64ri32)
+    CASE_ND(SUB32ri)
+    CASE_ND(SUB16ri)
+    CASE_ND(SUB8ri)
+    CASE_ND(SUB64rr)
+    CASE_ND(SUB32rr)
+    CASE_ND(SUB16rr)
+    CASE_ND(SUB8rr)
+    CASE_ND(SUB64rm)
+    CASE_ND(SUB32rm)
+    CASE_ND(SUB16rm)
+    CASE_ND(SUB8rm)
+    CASE_ND(DEC64r)
+    CASE_ND(DEC32r)
+    CASE_ND(DEC16r)
+    CASE_ND(DEC8r)
+    CASE_ND(ADD64ri32)
+    CASE_ND(ADD32ri)
+    CASE_ND(ADD16ri)
+    CASE_ND(ADD8ri)
+    CASE_ND(ADD64rr)
+    CASE_ND(ADD32rr)
+    CASE_ND(ADD16rr)
+    CASE_ND(ADD8rr)
+    CASE_ND(ADD64rm)
+    CASE_ND(ADD32rm)
+    CASE_ND(ADD16rm)
+    CASE_ND(ADD8rm)
+    CASE_ND(INC64r)
+    CASE_ND(INC32r)
+    CASE_ND(INC16r)
+    CASE_ND(INC8r)
+    CASE_ND(ADC64ri32)
+    CASE_ND(ADC32ri)
+    CASE_ND(ADC16ri)
+    CASE_ND(ADC8ri)
+    CASE_ND(ADC64rr)
+    CASE_ND(ADC32rr)
+    CASE_ND(ADC16rr)
+    CASE_ND(ADC8rr)
+    CASE_ND(ADC64rm)
+    CASE_ND(ADC32rm)
+    CASE_ND(ADC16rm)
+    CASE_ND(ADC8rm)
+    CASE_ND(SBB64ri32)
+    CASE_ND(SBB32ri)
+    CASE_ND(SBB16ri)
+    CASE_ND(SBB8ri)
+    CASE_ND(SBB64rr)
+    CASE_ND(SBB32rr)
+    CASE_ND(SBB16rr)
+    CASE_ND(SBB8rr)
+    CASE_ND(SBB64rm)
+    CASE_ND(SBB32rm)
+    CASE_ND(SBB16rm)
+    CASE_ND(SBB8rm)
+    CASE_ND(NEG8r)
+    CASE_ND(NEG16r)
+    CASE_ND(NEG32r)
+    CASE_ND(NEG64r)
   case X86::LZCNT16rr:
   case X86::LZCNT16rm:
   case X86::LZCNT32rr:
@@ -5670,42 +5677,42 @@ inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag,
   case X86::TZCNT64rr:
   case X86::TZCNT64rm:
     return true;
-  CASE_ND(AND64ri32)
-  CASE_ND(AND32ri)
-  CASE_ND(AND16ri)
-  CASE_ND(AND8ri)
-  CASE_ND(AND64rr)
-  CASE_ND(AND32rr)
-  CASE_ND(AND16rr)
-  CASE_ND(AND8rr)
-  CASE_ND(AND64rm)
-  CASE_ND(AND32rm)
-  CASE_ND(AND16rm)
-  CASE_ND(AND8rm)
-  CASE_ND(XOR64ri32)
-  CASE_ND(XOR32ri)
-  CASE_ND(XOR16ri)
-  CASE_ND(XOR8ri)
-  CASE_ND(XOR64rr)
-  CASE_ND(XOR32rr)
-  CASE_ND(XOR16rr)
-  CASE_ND(XOR8rr)
-  CASE_ND(XOR64rm)
-  CASE_ND(XOR32rm)
-  CASE_ND(XOR16rm)
-  CASE_ND(XOR8rm)
-  CASE_ND(OR64ri32)
-  CASE_ND(OR32ri)
-  CASE_ND(OR16ri)
-  CASE_ND(OR8ri)
-  CASE_ND(OR64rr)
-  CASE_ND(OR32rr)
-  CASE_ND(OR16rr)
-  CASE_ND(OR8rr)
-  CASE_ND(OR64rm)
-  CASE_ND(OR32rm)
-  CASE_ND(OR16rm)
-  CASE_ND(OR8rm)
+    CASE_ND(AND64ri32)
+    CASE_ND(AND32ri)
+    CASE_ND(AND16ri)
+    CASE_ND(AND8ri)
+    CASE_ND(AND64rr)
+    CASE_ND(AND32rr)
+    CASE_ND(AND16rr)
+    CASE_ND(AND8rr)
+    CASE_ND(AND64rm)
+    CASE_ND(AND32rm)
+    CASE_ND(AND16rm)
+    CASE_ND(AND8rm)
+    CASE_ND(XOR64ri32)
+    CASE_ND(XOR32ri)
+    CASE_ND(XOR16ri)
+    CASE_ND(XOR8ri)
+    CASE_ND(XOR64rr)
+    CASE_ND(XOR32rr)
+    CASE_ND(XOR16rr)
+    CASE_ND(XOR8rr)
+    CASE_ND(XOR64rm)
+    CASE_ND(XOR32rm)
+    CASE_ND(XOR16rm)
+    CASE_ND(XOR8rm)
+    CASE_ND(OR64ri32)
+    CASE_ND(OR32ri)
+    CASE_ND(OR16ri)
+    CASE_ND(OR8ri)
+    CASE_ND(OR64rr)
+    CASE_ND(OR32rr)
+    CASE_ND(OR16rr)
+    CASE_ND(OR8rr)
+    CASE_ND(OR64rm)
+    CASE_ND(OR32rm)
+    CASE_ND(OR16rm)
+    CASE_ND(OR8rm)
   case X86::ANDN32rr:
   case X86::ANDN32rm:
   case X86::ANDN64rr:
@@ -5783,15 +5790,17 @@ inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag,
 }
 
 /// Check whether the use can be converted to remove a comparison against zero.
-/// Returns the EFLAGS condition and the operand that we are comparing against zero.
-static std::pair<X86::CondCode, unsigned> isUseDefConvertible(const MachineInstr &MI) {
+/// Returns the EFLAGS condition and the operand that we are comparing against
+/// zero.
+static std::pair<X86::CondCode, unsigned>
+isUseDefConvertible(const MachineInstr &MI) {
   switch (MI.getOpcode()) {
   default:
     return std::make_pair(X86::COND_INVALID, ~0U);
-  CASE_ND(NEG8r)
-  CASE_ND(NEG16r)
-  CASE_ND(NEG32r)
-  CASE_ND(NEG64r)
+    CASE_ND(NEG8r)
+    CASE_ND(NEG16r)
+    CASE_ND(NEG32r)
+    CASE_ND(NEG64r)
     return std::make_pair(X86::COND_AE, 1U);
   case X86::LZCNT16rr:
   case X86::LZCNT32rr:
@@ -5835,51 +5844,53 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
   switch (CmpInstr.getOpcode()) {
   default:
     break;
-  CASE_ND(SUB64ri32)
-  CASE_ND(SUB32ri)
-  CASE_ND(SUB16ri)
-  CASE_ND(SUB8ri)
-  CASE_ND(SUB64rm)
-  CASE_ND(SUB32rm)
-  CASE_ND(SUB16rm)
-  CASE_ND(SUB8rm)
-  CASE_ND(SUB64rr)
-  CASE_ND(SUB32rr)
-  CASE_ND(SUB16rr)
-  CASE_ND(SUB8rr) {
-    if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
-      return false;
-    // There is no use of the destination register, we can replace SUB with CMP.
-    unsigned NewOpcode = 0;
+    CASE_ND(SUB64ri32)
+    CASE_ND(SUB32ri)
+    CASE_ND(SUB16ri)
+    CASE_ND(SUB8ri)
+    CASE_ND(SUB64rm)
+    CASE_ND(SUB32rm)
+    CASE_ND(SUB16rm)
+    CASE_ND(SUB8rm)
+    CASE_ND(SUB64rr)
+    CASE_ND(SUB32rr)
+    CASE_ND(SUB16rr)
+    CASE_ND(SUB8rr) {
+      if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
+        return false;
+      // There is no use of the destination register, we can replace SUB with
+      // CMP.
+      unsigned NewOpcode = 0;
 #define FROM_TO(A, B)                                                          \
   CASE_ND(A) NewOpcode = X86::B;                                               \
   break;
-    switch (CmpInstr.getOpcode()) {
-    default:
-      llvm_unreachable("Unreachable!");
-    FROM_TO(SUB64rm, CMP64rm)
-    FROM_TO(SUB32rm, CMP32rm)
-    FROM_TO(SUB16rm, CMP16rm)
-    FROM_TO(SUB8rm, CMP8rm)
-    FROM_TO(SUB64rr, CMP64rr)
-    FROM_TO(SUB32rr, CMP32rr)
-    FROM_TO(SUB16rr, CMP16rr)
-    FROM_TO(SUB8rr, CMP8rr)
-    FROM_TO(SUB64ri32, CMP64ri32)
-    FROM_TO(SUB32ri, CMP32ri)
-    FROM_TO(SUB16ri, CMP16ri)
-    FROM_TO(SUB8ri, CMP8ri)
-    }
+      switch (CmpInstr.getOpcode()) {
+      default:
+        llvm_unreachable("Unreachable!");
+        FROM_TO(SUB64rm, CMP64rm)
+        FROM_TO(SUB32rm, CMP32rm)
+        FROM_TO(SUB16rm, CMP16rm)
+        FROM_TO(SUB8rm, CMP8rm)
+        FROM_TO(SUB64rr, CMP64rr)
+        FROM_TO(SUB32rr, CMP32rr)
+        FROM_TO(SUB16rr, CMP16rr)
+        FROM_TO(SUB8rr, CMP8rr)
+        FROM_TO(SUB64ri32, CMP64ri32)
+        FROM_TO(SUB32ri, CMP32ri)
+        FROM_TO(SUB16ri, CMP16ri)
+        FROM_TO(SUB8ri, CMP8ri)
+      }
 #undef FROM_TO
-    CmpInstr.setDesc(get(NewOpcode));
-    CmpInstr.removeOperand(0);
-    // Mutating this instruction invalidates any debug data associated with it.
-    CmpInstr.dropDebugNumber();
-    // Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
-    if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
-        NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
-      return false;
-  }
+      CmpInstr.setDesc(get(NewOpcode));
+      CmpInstr.removeOperand(0);
+      // Mutating this instruction invalidates any debug data associated with
+      // it.
+      CmpInstr.dropDebugNumber();
+      // Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
+      if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
+          NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
+        return false;
+    }
   }
 
   // The following code tries to remove the comparison by re-using EFLAGS
@@ -6236,14 +6247,14 @@ static bool canConvert2Copy(unsigned Opc) {
   switch (Opc) {
   default:
     return false;
-  CASE_ND(ADD64ri32)
-  CASE_ND(SUB64ri32)
-  CASE_ND(OR64ri32)
-  CASE_ND(XOR64ri32)
-  CASE_ND(ADD32ri)
-  CASE_ND(SUB32ri)
-  CASE_ND(OR32ri)
-  CASE_ND(XOR32ri)
+    CASE_ND(ADD64ri32)
+    CASE_ND(SUB64ri32)
+    CASE_ND(OR64ri32)
+    CASE_ND(XOR64ri32)
+    CASE_ND(ADD32ri)
+    CASE_ND(SUB32ri)
+    CASE_ND(OR32ri)
+    CASE_ND(XOR32ri)
     return true;
   }
 }
@@ -9627,7 +9638,7 @@ Register X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
 
 static const uint16_t *lookup(unsigned opcode, unsigned domain,
                               ArrayRef<uint16_t[3]> Table) {
-  for (const uint16_t(&Row)[3] : Table)
+  for (const uint16_t (&Row)[3] : Table)
     if (Row[domain - 1] == opcode)
       return Row;
   return nullptr;
@@ -9636,7 +9647,7 @@ static const uint16_t *lookup(unsigned opcode, unsigned domain,
 static const uint16_t *lookupAVX512(unsigned opcode, unsigned domain,
                                     ArrayRef<uint16_t[4]> Table) {
   // If this is the integer domain make sure to check both integer columns.
-  for (const uint16_t(&Row)[4] : Table)
+  for (const uint16_t (&Row)[4] : Table)
     if (Row[domain - 1] == opcode || (domain == 3 && Row[3] == opcode))
       return Row;
   return nullptr;
@@ -10392,25 +10403,25 @@ bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
   if (Invert)
     return false;
   switch (Inst.getOpcode()) {
-  CASE_ND(ADD8rr)
-  CASE_ND(ADD16rr)
-  CASE_ND(ADD32rr)
-  CASE_ND(ADD64rr)
-  CASE_ND(AND8rr)
-  CASE_ND(AND16rr)
-  CASE_ND(AND32rr)
-  CASE_ND(AND64rr)
-  CASE_ND(OR8rr)
-  CASE_ND(OR16rr)
-  CASE_ND(OR32rr)
-  CASE_ND(OR64rr)
-  CASE_ND(XOR8rr)
-  CASE_ND(XOR16rr)
-  CASE_ND(XOR32rr)
-  CASE_ND(XOR64rr)
-  CASE_ND(IMUL16rr)
-  CASE_ND(IMUL32rr)
-  CASE_ND(IMUL64rr)
+    CASE_ND(ADD8rr)
+    CASE_ND(ADD16rr)
+    CASE_ND(ADD32rr)
+    CASE_ND(ADD64rr)
+    CASE_ND(AND8rr)
+    CASE_ND(AND16rr)
+    CASE_ND(AND32rr)
+    CASE_ND(AND64rr)
+    CASE_ND(OR8rr)
+    CASE_ND(OR16rr)
+    CASE_ND(OR32rr)
+    CASE_ND(OR64rr)
+    CASE_ND(XOR8rr)
+    CASE_ND(XOR16rr)
+    CASE_ND(XOR32rr)
+    CASE_ND(XOR64rr)
+    CASE_ND(IMUL16rr)
+    CASE_ND(IMUL32rr)
+    CASE_ND(IMUL64rr)
   case X86::PANDrr:
   case X86::PORrr:
   case X86::PXORrr:
@@ -11451,8 +11462,8 @@ bool X86InstrInfo::getMachineCombinerPatterns(
     break;
   }
   }
-  return TargetInstrInfo::getMachineCombinerPatterns(Root,
-                                                     Patterns, DoRegPressureReduce);
+  return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
+                                                     DoRegPressureReduce);
 }
 
 static void
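
The X86InstrInfo.cpp hunks in this part of the patch are formatting-only: CASE_NF/CASE_ND macro invocations are reindented to the column of the case labels they expand to, with no behavioral change. A minimal, self-contained sketch of the idiom (an illustration only; the macro body is assumed from the `case X86::OP##_ND:` context line visible above, not copied from the patch):

#include <cstdio>

namespace X86 {
enum Opcode { SUB32rr, SUB32rr_ND, ADD32rr, ADD32rr_ND };
} // namespace X86

// Assumed shape of a CASE_ND-style helper: one invocation stands for the base
// opcode and its _ND variant as two adjacent case labels.
#define CASE_ND(OP)                                                            \
  case X86::OP:                                                                \
  case X86::OP##_ND:

static bool isSub(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
    CASE_ND(SUB32rr) // expands to: case X86::SUB32rr: case X86::SUB32rr_ND:
    return true;
  }
}

int main() {
  std::printf("%d %d\n", isSub(X86::SUB32rr_ND), isSub(X86::ADD32rr)); // 1 0
}

Because each invocation stands for two adjacent labels, giving it the case-label indentation keeps the formatted switches readable without affecting the generated code.
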
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index ebd7e070d5fe8..93fcfa2f288f3 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -320,8 +320,7 @@ class X86InstrInfo final : public X86GenInstrInfo {
 
   Register isLoadFromStackSlot(const MachineInstr &MI,
                                int &FrameIndex) const override;
-  Register isLoadFromStackSlot(const MachineInstr &MI,
-                               int &FrameIndex,
+  Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex,
                                TypeSize &MemBytes) const override;
   /// isLoadFromStackSlotPostFE - Check for post-frame ptr elimination
   /// stack locations as well.  This uses a heuristic so it isn't
@@ -331,8 +330,7 @@ class X86InstrInfo final : public X86GenInstrInfo {
 
   Register isStoreToStackSlot(const MachineInstr &MI,
                               int &FrameIndex) const override;
-  Register isStoreToStackSlot(const MachineInstr &MI,
-                              int &FrameIndex,
+  Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex,
                               TypeSize &MemBytes) const override;
   /// isStoreToStackSlotPostFE - Check for post-frame ptr elimination
   /// stack locations as well.  This uses a heuristic so it isn't
@@ -494,12 +492,12 @@ class X86InstrInfo final : public X86GenInstrInfo {
   /// is likely that the referenced instruction has been changed.
   ///
   /// \returns true on success.
-  MachineInstr *
-  foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
-                        ArrayRef<unsigned> Ops,
-                        MachineBasicBlock::iterator InsertPt, int FrameIndex,
-                        LiveIntervals *LIS = nullptr,
-                        VirtRegMap *VRM = nullptr) const override;
+  MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
+                                      ArrayRef<unsigned> Ops,
+                                      MachineBasicBlock::iterator InsertPt,
+                                      int FrameIndex,
+                                      LiveIntervals *LIS = nullptr,
+                                      VirtRegMap *VRM = nullptr) const override;
 
   /// Same as the previous version except it allows folding of any load and
   /// store from / to any address, not just from a specific stack slot.
@@ -748,8 +746,7 @@ class X86InstrInfo final : public X86GenInstrInfo {
   ///
   /// If IsIntrinsic is set, operand 1 will be ignored for commuting.
   bool findThreeSrcCommutedOpIndices(const MachineInstr &MI,
-                                     unsigned &SrcOpIdx1,
-                                     unsigned &SrcOpIdx2,
+                                     unsigned &SrcOpIdx1, unsigned &SrcOpIdx2,
                                      bool IsIntrinsic = false) const;
 
   /// Returns true when instruction \p FlagI produces the same flags as \p OI.
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 66c9d75053640..33b5ae0eb8f7a 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -54,9 +54,10 @@
 
 using namespace llvm;
 
-static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner",
-                               cl::desc("Enable the machine combiner pass"),
-                               cl::init(true), cl::Hidden);
+static cl::opt<bool>
+    EnableMachineCombinerPass("x86-machine-combiner",
+                              cl::desc("Enable the machine combiner pass"),
+                              cl::init(true), cl::Hidden);
 
 static cl::opt<bool>
     EnableTileRAPass("x86-tile-ra",
@@ -362,7 +363,7 @@ namespace {
 class X86PassConfig : public TargetPassConfig {
 public:
   X86PassConfig(X86TargetMachine &TM, PassManagerBase &PM)
-    : TargetPassConfig(TM, PM) {}
+      : TargetPassConfig(TM, PM) {}
 
   X86TargetMachine &getX86TargetMachine() const {
     return getTM<X86TargetMachine>();
@@ -401,10 +402,10 @@ char X86ExecutionDomainFix::ID;
 } // end anonymous namespace
 
 INITIALIZE_PASS_BEGIN(X86ExecutionDomainFix, "x86-execution-domain-fix",
-  "X86 Execution Domain Fix", false, false)
+                      "X86 Execution Domain Fix", false, false)
 INITIALIZE_PASS_DEPENDENCY(ReachingDefInfoWrapperPass)
 INITIALIZE_PASS_END(X86ExecutionDomainFix, "x86-execution-domain-fix",
-  "X86 Execution Domain Fix", false, false)
+                    "X86 Execution Domain Fix", false, false)
 
 TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) {
   return new X86PassConfig(*this, PM);
@@ -621,7 +622,7 @@ void X86PassConfig::addPreEmitPass2() {
            (TT.isOSDarwin() &&
             (M->getFunction("objc_retainAutoreleasedReturnValue") ||
              M->getFunction("objc_unsafeClaimAutoreleasedReturnValue"))) ||
-             F.hasFnAttribute("ct-select");
+           F.hasFnAttribute("ct-select");
   }));
 
   // Analyzes and emits pseudos to support Win x64 Unwind V2. This pass must run
diff --git a/llvm/test/CodeGen/X86/ctselect-i386-fp.ll b/llvm/test/CodeGen/X86/ctselect-i386-fp.ll
index ea943307c644f..eec38fa581c6f 100644
--- a/llvm/test/CodeGen/X86/ctselect-i386-fp.ll
+++ b/llvm/test/CodeGen/X86/ctselect-i386-fp.ll
@@ -209,94 +209,84 @@ define double @test_ctselect_f64_basic(i1 %cond, double %a, double %b) nounwind
 define x86_fp80 @test_ctselect_f80_basic(i1 %cond, x86_fp80 %a, x86_fp80 %b) nounwind {
 ; I386-NOCMOV-LABEL: test_ctselect_f80_basic:
 ; I386-NOCMOV:       # %bb.0:
+; I386-NOCMOV-NEXT:    pushl %ebp
+; I386-NOCMOV-NEXT:    pushl %ebx
 ; I386-NOCMOV-NEXT:    pushl %edi
 ; I386-NOCMOV-NEXT:    pushl %esi
-; I386-NOCMOV-NEXT:    subl $12, %esp
-; I386-NOCMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
-; I386-NOCMOV-NEXT:    sete %al
-; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    subl $40, %esp
+; I386-NOCMOV-NEXT:    fldt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    fldt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    fstpt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    fstpt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    testb $1, %cl
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; I386-NOCMOV-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; I386-NOCMOV-NEXT:    movb %al, %ah
-; I386-NOCMOV-NEXT:    movzbl %ah, %edi
-; I386-NOCMOV-NEXT:    negl %edi
-; I386-NOCMOV-NEXT:    movl %edx, %esi
-; I386-NOCMOV-NEXT:    andl %edi, %esi
-; I386-NOCMOV-NEXT:    notl %edi
-; I386-NOCMOV-NEXT:    andl %ecx, %edi
-; I386-NOCMOV-NEXT:    orl %edi, %esi
-; I386-NOCMOV-NEXT:    movl %esi, (%esp)
-; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; I386-NOCMOV-NEXT:    sete %ch
+; I386-NOCMOV-NEXT:    movb %ch, %al
+; I386-NOCMOV-NEXT:    movzbl %al, %ebp
+; I386-NOCMOV-NEXT:    negl %ebp
+; I386-NOCMOV-NEXT:    movl %edi, %ebx
+; I386-NOCMOV-NEXT:    andl %ebp, %ebx
+; I386-NOCMOV-NEXT:    notl %ebp
+; I386-NOCMOV-NEXT:    andl %edx, %ebp
+; I386-NOCMOV-NEXT:    orl %ebp, %ebx
+; I386-NOCMOV-NEXT:    testb $1, %cl
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; I386-NOCMOV-NEXT:    movb %al, %ah
-; I386-NOCMOV-NEXT:    movzbl %ah, %edi
-; I386-NOCMOV-NEXT:    negl %edi
-; I386-NOCMOV-NEXT:    movl %edx, %esi
-; I386-NOCMOV-NEXT:    andl %edi, %esi
-; I386-NOCMOV-NEXT:    notl %edi
-; I386-NOCMOV-NEXT:    andl %ecx, %edi
-; I386-NOCMOV-NEXT:    orl %edi, %esi
-; I386-NOCMOV-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; I386-NOCMOV-NEXT:    movb %al, %ah
-; I386-NOCMOV-NEXT:    movzbl %ah, %edi
+; I386-NOCMOV-NEXT:    sete %ch
+; I386-NOCMOV-NEXT:    movb %ch, %cl
+; I386-NOCMOV-NEXT:    movzbl %cl, %ebp
+; I386-NOCMOV-NEXT:    negl %ebp
+; I386-NOCMOV-NEXT:    movl %edx, %edi
+; I386-NOCMOV-NEXT:    andl %ebp, %edi
+; I386-NOCMOV-NEXT:    notl %ebp
+; I386-NOCMOV-NEXT:    andl %eax, %ebp
+; I386-NOCMOV-NEXT:    orl %ebp, %edi
+; I386-NOCMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    sete %al
+; I386-NOCMOV-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; I386-NOCMOV-NEXT:    movb %al, %dl
+; I386-NOCMOV-NEXT:    movzbl %dl, %edi
 ; I386-NOCMOV-NEXT:    negl %edi
-; I386-NOCMOV-NEXT:    movl %edx, %esi
-; I386-NOCMOV-NEXT:    andl %edi, %esi
+; I386-NOCMOV-NEXT:    movl %esi, %ecx
+; I386-NOCMOV-NEXT:    andl %edi, %ecx
 ; I386-NOCMOV-NEXT:    notl %edi
-; I386-NOCMOV-NEXT:    andl %ecx, %edi
-; I386-NOCMOV-NEXT:    orl %edi, %esi
-; I386-NOCMOV-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; I386-NOCMOV-NEXT:    fldt (%esp)
-; I386-NOCMOV-NEXT:    addl $12, %esp
+; I386-NOCMOV-NEXT:    andl %ebx, %edi
+; I386-NOCMOV-NEXT:    orl %edi, %ecx
+; I386-NOCMOV-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    fldt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    addl $40, %esp
 ; I386-NOCMOV-NEXT:    popl %esi
 ; I386-NOCMOV-NEXT:    popl %edi
+; I386-NOCMOV-NEXT:    popl %ebx
+; I386-NOCMOV-NEXT:    popl %ebp
 ; I386-NOCMOV-NEXT:    retl
 ;
 ; I386-CMOV-LABEL: test_ctselect_f80_basic:
 ; I386-CMOV:       # %bb.0:
-; I386-CMOV-NEXT:    pushl %edi
-; I386-CMOV-NEXT:    pushl %esi
-; I386-CMOV-NEXT:    subl $12, %esp
+; I386-CMOV-NEXT:    subl $36, %esp
+; I386-CMOV-NEXT:    fldt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    fldt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    fstpt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    fstpt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; I386-CMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
-; I386-CMOV-NEXT:    sete %al
+; I386-CMOV-NEXT:    cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; I386-CMOV-NEXT:    movb %al, %ah
-; I386-CMOV-NEXT:    movzbl %ah, %edi
-; I386-CMOV-NEXT:    negl %edi
-; I386-CMOV-NEXT:    movl %edx, %esi
-; I386-CMOV-NEXT:    andl %edi, %esi
-; I386-CMOV-NEXT:    notl %edi
-; I386-CMOV-NEXT:    andl %ecx, %edi
-; I386-CMOV-NEXT:    orl %edi, %esi
-; I386-CMOV-NEXT:    movl %esi, (%esp)
-; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; I386-CMOV-NEXT:    movb %al, %ah
-; I386-CMOV-NEXT:    movzbl %ah, %edi
-; I386-CMOV-NEXT:    negl %edi
-; I386-CMOV-NEXT:    movl %edx, %esi
-; I386-CMOV-NEXT:    andl %edi, %esi
-; I386-CMOV-NEXT:    notl %edi
-; I386-CMOV-NEXT:    andl %ecx, %edi
-; I386-CMOV-NEXT:    orl %edi, %esi
-; I386-CMOV-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; I386-CMOV-NEXT:    movb %al, %ah
-; I386-CMOV-NEXT:    movzbl %ah, %edi
-; I386-CMOV-NEXT:    negl %edi
-; I386-CMOV-NEXT:    movl %edx, %esi
-; I386-CMOV-NEXT:    andl %edi, %esi
-; I386-CMOV-NEXT:    notl %edi
-; I386-CMOV-NEXT:    andl %ecx, %edi
-; I386-CMOV-NEXT:    orl %edi, %esi
-; I386-CMOV-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    cmovnel {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    movl %eax, (%esp)
 ; I386-CMOV-NEXT:    fldt (%esp)
-; I386-CMOV-NEXT:    addl $12, %esp
-; I386-CMOV-NEXT:    popl %esi
-; I386-CMOV-NEXT:    popl %edi
+; I386-CMOV-NEXT:    addl $36, %esp
 ; I386-CMOV-NEXT:    retl
   %result = call x86_fp80 @llvm.ct.select.f80(i1 %cond, x86_fp80 %a, x86_fp80 %b)
   ret x86_fp80 %result
@@ -543,94 +533,84 @@ define float @test_ctselect_f32_nan(i1 %cond) nounwind {
 define x86_fp80 @test_ctselect_f80_alignment(i1 %cond, x86_fp80 %a, x86_fp80 %b) nounwind {
 ; I386-NOCMOV-LABEL: test_ctselect_f80_alignment:
 ; I386-NOCMOV:       # %bb.0:
+; I386-NOCMOV-NEXT:    pushl %ebp
+; I386-NOCMOV-NEXT:    pushl %ebx
 ; I386-NOCMOV-NEXT:    pushl %edi
 ; I386-NOCMOV-NEXT:    pushl %esi
-; I386-NOCMOV-NEXT:    subl $12, %esp
-; I386-NOCMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
-; I386-NOCMOV-NEXT:    sete %al
-; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    subl $40, %esp
+; I386-NOCMOV-NEXT:    fldt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    fldt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    fstpt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    fstpt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    testb $1, %cl
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; I386-NOCMOV-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; I386-NOCMOV-NEXT:    movb %al, %ah
-; I386-NOCMOV-NEXT:    movzbl %ah, %edi
-; I386-NOCMOV-NEXT:    negl %edi
-; I386-NOCMOV-NEXT:    movl %edx, %esi
-; I386-NOCMOV-NEXT:    andl %edi, %esi
-; I386-NOCMOV-NEXT:    notl %edi
-; I386-NOCMOV-NEXT:    andl %ecx, %edi
-; I386-NOCMOV-NEXT:    orl %edi, %esi
-; I386-NOCMOV-NEXT:    movl %esi, (%esp)
-; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; I386-NOCMOV-NEXT:    sete %ch
+; I386-NOCMOV-NEXT:    movb %ch, %al
+; I386-NOCMOV-NEXT:    movzbl %al, %ebp
+; I386-NOCMOV-NEXT:    negl %ebp
+; I386-NOCMOV-NEXT:    movl %edi, %ebx
+; I386-NOCMOV-NEXT:    andl %ebp, %ebx
+; I386-NOCMOV-NEXT:    notl %ebp
+; I386-NOCMOV-NEXT:    andl %edx, %ebp
+; I386-NOCMOV-NEXT:    orl %ebp, %ebx
+; I386-NOCMOV-NEXT:    testb $1, %cl
+; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; I386-NOCMOV-NEXT:    movb %al, %ah
-; I386-NOCMOV-NEXT:    movzbl %ah, %edi
-; I386-NOCMOV-NEXT:    negl %edi
-; I386-NOCMOV-NEXT:    movl %edx, %esi
-; I386-NOCMOV-NEXT:    andl %edi, %esi
-; I386-NOCMOV-NEXT:    notl %edi
-; I386-NOCMOV-NEXT:    andl %ecx, %edi
-; I386-NOCMOV-NEXT:    orl %edi, %esi
-; I386-NOCMOV-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; I386-NOCMOV-NEXT:    movb %al, %ah
-; I386-NOCMOV-NEXT:    movzbl %ah, %edi
+; I386-NOCMOV-NEXT:    sete %ch
+; I386-NOCMOV-NEXT:    movb %ch, %cl
+; I386-NOCMOV-NEXT:    movzbl %cl, %ebp
+; I386-NOCMOV-NEXT:    negl %ebp
+; I386-NOCMOV-NEXT:    movl %edx, %edi
+; I386-NOCMOV-NEXT:    andl %ebp, %edi
+; I386-NOCMOV-NEXT:    notl %ebp
+; I386-NOCMOV-NEXT:    andl %eax, %ebp
+; I386-NOCMOV-NEXT:    orl %ebp, %edi
+; I386-NOCMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    sete %al
+; I386-NOCMOV-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; I386-NOCMOV-NEXT:    movb %al, %dl
+; I386-NOCMOV-NEXT:    movzbl %dl, %edi
 ; I386-NOCMOV-NEXT:    negl %edi
-; I386-NOCMOV-NEXT:    movl %edx, %esi
-; I386-NOCMOV-NEXT:    andl %edi, %esi
+; I386-NOCMOV-NEXT:    movl %esi, %ecx
+; I386-NOCMOV-NEXT:    andl %edi, %ecx
 ; I386-NOCMOV-NEXT:    notl %edi
-; I386-NOCMOV-NEXT:    andl %ecx, %edi
-; I386-NOCMOV-NEXT:    orl %edi, %esi
-; I386-NOCMOV-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; I386-NOCMOV-NEXT:    fldt (%esp)
-; I386-NOCMOV-NEXT:    addl $12, %esp
+; I386-NOCMOV-NEXT:    andl %ebx, %edi
+; I386-NOCMOV-NEXT:    orl %edi, %ecx
+; I386-NOCMOV-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    fldt {{[0-9]+}}(%esp)
+; I386-NOCMOV-NEXT:    addl $40, %esp
 ; I386-NOCMOV-NEXT:    popl %esi
 ; I386-NOCMOV-NEXT:    popl %edi
+; I386-NOCMOV-NEXT:    popl %ebx
+; I386-NOCMOV-NEXT:    popl %ebp
 ; I386-NOCMOV-NEXT:    retl
 ;
 ; I386-CMOV-LABEL: test_ctselect_f80_alignment:
 ; I386-CMOV:       # %bb.0:
-; I386-CMOV-NEXT:    pushl %edi
-; I386-CMOV-NEXT:    pushl %esi
-; I386-CMOV-NEXT:    subl $12, %esp
+; I386-CMOV-NEXT:    subl $36, %esp
+; I386-CMOV-NEXT:    fldt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    fldt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    fstpt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    fstpt {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; I386-CMOV-NEXT:    testb $1, {{[0-9]+}}(%esp)
-; I386-CMOV-NEXT:    sete %al
+; I386-CMOV-NEXT:    cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; I386-CMOV-NEXT:    movb %al, %ah
-; I386-CMOV-NEXT:    movzbl %ah, %edi
-; I386-CMOV-NEXT:    negl %edi
-; I386-CMOV-NEXT:    movl %edx, %esi
-; I386-CMOV-NEXT:    andl %edi, %esi
-; I386-CMOV-NEXT:    notl %edi
-; I386-CMOV-NEXT:    andl %ecx, %edi
-; I386-CMOV-NEXT:    orl %edi, %esi
-; I386-CMOV-NEXT:    movl %esi, (%esp)
-; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; I386-CMOV-NEXT:    movb %al, %ah
-; I386-CMOV-NEXT:    movzbl %ah, %edi
-; I386-CMOV-NEXT:    negl %edi
-; I386-CMOV-NEXT:    movl %edx, %esi
-; I386-CMOV-NEXT:    andl %edi, %esi
-; I386-CMOV-NEXT:    notl %edi
-; I386-CMOV-NEXT:    andl %ecx, %edi
-; I386-CMOV-NEXT:    orl %edi, %esi
-; I386-CMOV-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; I386-CMOV-NEXT:    movb %al, %ah
-; I386-CMOV-NEXT:    movzbl %ah, %edi
-; I386-CMOV-NEXT:    negl %edi
-; I386-CMOV-NEXT:    movl %edx, %esi
-; I386-CMOV-NEXT:    andl %edi, %esi
-; I386-CMOV-NEXT:    notl %edi
-; I386-CMOV-NEXT:    andl %ecx, %edi
-; I386-CMOV-NEXT:    orl %edi, %esi
-; I386-CMOV-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    cmovnel {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; I386-CMOV-NEXT:    cmovnel {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    movl %eax, (%esp)
 ; I386-CMOV-NEXT:    fldt (%esp)
-; I386-CMOV-NEXT:    addl $12, %esp
-; I386-CMOV-NEXT:    popl %esi
-; I386-CMOV-NEXT:    popl %edi
+; I386-CMOV-NEXT:    addl $36, %esp
 ; I386-CMOV-NEXT:    retl
   %result = call x86_fp80 @llvm.ct.select.f80(i1 %cond, x86_fp80 %a, x86_fp80 %b)
   ret x86_fp80 %result
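
The I386-NOCMOV checks above exercise the fallback path for i386 targets without CMOV: a condition byte is materialized with sete, widened with movzbl, and negated into an all-ones/all-zero mask, and the two operands are blended with and/not/or, one 32-bit chunk at a time across the x86_fp80 value; the I386-CMOV variant instead selects each chunk with cmovnel. A minimal C++ sketch of the mask-blend idea (an illustration of the pattern only, not the backend's actual emission):

#include <cstdint>
#include <cstdio>

// Branchless select: build an all-ones or all-zero mask from the condition
// and blend the operands with it, reading both inputs unconditionally.
static uint32_t ctSelectU32(bool Cond, uint32_t A, uint32_t B) {
  uint32_t Mask = 0u - static_cast<uint32_t>(Cond); // all-ones if Cond, else 0
  return (A & Mask) | (B & ~Mask);                  // A if Cond, else B
}

int main() {
  std::printf("%u %u\n", ctSelectU32(true, 7, 9), ctSelectU32(false, 7, 9)); // 7 9
}

The point of the pattern is that both inputs are read and combined unconditionally, so the selection introduces no branch and no data-dependent timing.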


