[llvm] [X86] Use GFNI for LZCNT vXi8 ops (PR #141888)

via llvm-commits llvm-commits at lists.llvm.org
Sat May 31 08:39:04 PDT 2025


https://github.com/houngkoungting updated https://github.com/llvm/llvm-project/pull/141888

From 539e08816f2da3d75560701405c58e47823f8b17 Mon Sep 17 00:00:00 2001
From: william <we3223 at gmail.com>
Date: Wed, 28 May 2025 23:47:45 +0800
Subject: [PATCH 1/2] Add GFNI-based LZCNT lowering for vXi8

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 5340 ++++++++++++-----------
 llvm/test/CodeGen/X86/ctlz-gfni.ll      |   15 +
 llvm/test/CodeGen/X86/gfni-lzcnt.ll     |  635 ++-
 3 files changed, 2996 insertions(+), 2994 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/ctlz-gfni.ll
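
A minimal scalar sketch of the identity this kind of GFNI lowering leans on, assuming the usual bit-reverse approach: for an 8-bit value, ctlz(x) == cttz(bitreverse(x)), and GFNI's GF2P8AFFINEQB can bit-reverse every byte of a vector in a single instruction, after which an existing vXi8 trailing-zero sequence applies. The helper names below (bitreverse8, cttz8, ctlz8) are illustrative only and are not the DAG code added in X86ISelLowering.cpp.

    #include <cassert>
    #include <cstdint>

    // Models a per-byte GF2P8AFFINEQB bit reversal in scalar form.
    static uint8_t bitreverse8(uint8_t X) {
      uint8_t R = 0;
      for (int I = 0; I < 8; ++I)
        if (X & (1u << I))
          R |= (uint8_t)(1u << (7 - I));
      return R;
    }

    // Trailing zero count of a byte; defined as 8 for X == 0.
    static unsigned cttz8(uint8_t X) {
      unsigned N = 0;
      while (N < 8 && !(X & (1u << N)))
        ++N;
      return N;
    }

    // Leading zero count of a byte; defined as 8 for X == 0.
    static unsigned ctlz8(uint8_t X) {
      unsigned N = 0;
      while (N < 8 && !(X & (0x80u >> N)))
        ++N;
      return N;
    }

    int main() {
      // ctlz(x) == cttz(bitreverse(x)) holds for every byte value, including 0.
      for (unsigned V = 0; V < 256; ++V)
        assert(ctlz8((uint8_t)V) == cttz8(bitreverse8((uint8_t)V)));
      return 0;
    }

On a vector of bytes the reversal is one GF2P8AFFINEQB, so the leading-zero count reduces to the cheaper trailing-zero sequence already available for vXi8.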

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 99a82cab384aa..7918a8e72adf6 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -170,14 +170,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
     static const struct {
       const RTLIB::Libcall Op;
-      const char * const Name;
+      const char *const Name;
       const CallingConv::ID CC;
     } LibraryCalls[] = {
-      { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
-      { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
-      { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
-      { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
-      { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
+        {RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall},
+        {RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall},
+        {RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall},
+        {RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall},
+        {RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall},
     };
 
     for (const auto &LC : LibraryCalls) {
@@ -210,10 +210,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   // We don't accept any truncstore of integer registers.
   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
-  setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
+  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
-  setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
-  setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
+  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
+  setTruncStoreAction(MVT::i16, MVT::i8, Expand);
 
   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 
@@ -225,106 +225,106 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 
   // Integer absolute.
   if (Subtarget.canUseCMOV()) {
-    setOperationAction(ISD::ABS            , MVT::i16  , Custom);
-    setOperationAction(ISD::ABS            , MVT::i32  , Custom);
+    setOperationAction(ISD::ABS, MVT::i16, Custom);
+    setOperationAction(ISD::ABS, MVT::i32, Custom);
     if (Subtarget.is64Bit())
-      setOperationAction(ISD::ABS          , MVT::i64  , Custom);
+      setOperationAction(ISD::ABS, MVT::i64, Custom);
   }
 
   // Absolute difference.
   for (auto Op : {ISD::ABDS, ISD::ABDU}) {
-    setOperationAction(Op                  , MVT::i8   , Custom);
-    setOperationAction(Op                  , MVT::i16  , Custom);
-    setOperationAction(Op                  , MVT::i32  , Custom);
+    setOperationAction(Op, MVT::i8, Custom);
+    setOperationAction(Op, MVT::i16, Custom);
+    setOperationAction(Op, MVT::i32, Custom);
     if (Subtarget.is64Bit())
-     setOperationAction(Op                 , MVT::i64  , Custom);
+      setOperationAction(Op, MVT::i64, Custom);
   }
 
   // Signed saturation subtraction.
-  setOperationAction(ISD::SSUBSAT          , MVT::i8   , Custom);
-  setOperationAction(ISD::SSUBSAT          , MVT::i16  , Custom);
-  setOperationAction(ISD::SSUBSAT          , MVT::i32  , Custom);
+  setOperationAction(ISD::SSUBSAT, MVT::i8, Custom);
+  setOperationAction(ISD::SSUBSAT, MVT::i16, Custom);
+  setOperationAction(ISD::SSUBSAT, MVT::i32, Custom);
   if (Subtarget.is64Bit())
-    setOperationAction(ISD::SSUBSAT        , MVT::i64  , Custom);
+    setOperationAction(ISD::SSUBSAT, MVT::i64, Custom);
 
   // Funnel shifts.
   for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
     // For slow shld targets we only lower for code size.
     LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
 
-    setOperationAction(ShiftOp             , MVT::i8   , Custom);
-    setOperationAction(ShiftOp             , MVT::i16  , Custom);
-    setOperationAction(ShiftOp             , MVT::i32  , ShiftDoubleAction);
+    setOperationAction(ShiftOp, MVT::i8, Custom);
+    setOperationAction(ShiftOp, MVT::i16, Custom);
+    setOperationAction(ShiftOp, MVT::i32, ShiftDoubleAction);
     if (Subtarget.is64Bit())
-      setOperationAction(ShiftOp           , MVT::i64  , ShiftDoubleAction);
+      setOperationAction(ShiftOp, MVT::i64, ShiftDoubleAction);
   }
 
   if (!Subtarget.useSoftFloat()) {
     // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
     // operation.
-    setOperationAction(ISD::UINT_TO_FP,        MVT::i8, Promote);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
-    setOperationAction(ISD::UINT_TO_FP,        MVT::i16, Promote);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
     // We have an algorithm for SSE2, and we turn this into a 64-bit
     // FILD or VCVTUSI2SS/SD for other targets.
-    setOperationAction(ISD::UINT_TO_FP,        MVT::i32, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
     // We have an algorithm for SSE2->double, and we turn this into a
     // 64-bit FILD followed by conditional FADD for other targets.
-    setOperationAction(ISD::UINT_TO_FP,        MVT::i64, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
 
     // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
     // this operation.
-    setOperationAction(ISD::SINT_TO_FP,        MVT::i8, Promote);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
     // SSE has no i16 to fp conversion, only i32. We promote in the handler
     // to allow f80 to use i16 and f64 to use i16 with sse1 only
-    setOperationAction(ISD::SINT_TO_FP,        MVT::i16, Custom);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
     // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
-    setOperationAction(ISD::SINT_TO_FP,        MVT::i32, Custom);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
     // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
     // are Legal, f80 is custom lowered.
-    setOperationAction(ISD::SINT_TO_FP,        MVT::i64, Custom);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
 
     // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
     // this operation.
-    setOperationAction(ISD::FP_TO_SINT,        MVT::i8,  Promote);
+    setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
     // FIXME: This doesn't generate invalid exception when it should. PR44019.
-    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8,  Promote);
-    setOperationAction(ISD::FP_TO_SINT,        MVT::i16, Custom);
+    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote);
+    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
-    setOperationAction(ISD::FP_TO_SINT,        MVT::i32, Custom);
+    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
     // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
     // are Legal, f80 is custom lowered.
-    setOperationAction(ISD::FP_TO_SINT,        MVT::i64, Custom);
+    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
 
     // Handle FP_TO_UINT by promoting the destination to a larger signed
     // conversion.
-    setOperationAction(ISD::FP_TO_UINT,        MVT::i8,  Promote);
+    setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
     // FIXME: This doesn't generate invalid exception when it should. PR44019.
-    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8,  Promote);
-    setOperationAction(ISD::FP_TO_UINT,        MVT::i16, Promote);
+    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote);
+    setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
     // FIXME: This doesn't generate invalid exception when it should. PR44019.
     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
-    setOperationAction(ISD::FP_TO_UINT,        MVT::i32, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
-    setOperationAction(ISD::FP_TO_UINT,        MVT::i64, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
 
-    setOperationAction(ISD::LRINT,             MVT::f32, Custom);
-    setOperationAction(ISD::LRINT,             MVT::f64, Custom);
-    setOperationAction(ISD::LLRINT,            MVT::f32, Custom);
-    setOperationAction(ISD::LLRINT,            MVT::f64, Custom);
+    setOperationAction(ISD::LRINT, MVT::f32, Custom);
+    setOperationAction(ISD::LRINT, MVT::f64, Custom);
+    setOperationAction(ISD::LLRINT, MVT::f32, Custom);
+    setOperationAction(ISD::LLRINT, MVT::f64, Custom);
 
     if (!Subtarget.is64Bit()) {
-      setOperationAction(ISD::LRINT,  MVT::i64, Custom);
+      setOperationAction(ISD::LRINT, MVT::i64, Custom);
       setOperationAction(ISD::LLRINT, MVT::i64, Custom);
     }
   }
@@ -332,7 +332,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   if (Subtarget.hasSSE2()) {
     // Custom lowering for saturating float to int conversions.
     // We handle promotion to larger result types manually.
-    for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
+    for (MVT VT : {MVT::i8, MVT::i16, MVT::i32}) {
       setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
       setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
     }
@@ -367,17 +367,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 
   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
   if (!Subtarget.hasSSE2()) {
-    setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
-    setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
+    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
+    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
     setOperationAction(ISD::FCANONICALIZE, MVT::f32, Custom);
     setOperationAction(ISD::FCANONICALIZE, MVT::f64, Custom);
     if (Subtarget.is64Bit()) {
-      setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
+      setOperationAction(ISD::BITCAST, MVT::f64, Expand);
       // Without SSE, i64->f64 goes through memory.
-      setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
+      setOperationAction(ISD::BITCAST, MVT::i64, Expand);
     }
   } else if (!Subtarget.is64Bit())
-    setOperationAction(ISD::BITCAST      , MVT::i64  , Custom);
+    setOperationAction(ISD::BITCAST, MVT::i64, Custom);
 
   // Scalar integer divide and remainder are lowered to use operations that
   // produce two results, to match the available instructions. This exposes
@@ -389,7 +389,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   // (low) operations are left as Legal, as there are single-result
   // instructions for this in x86. Using the two-result multiply instructions
   // when both high and low results are needed must be arranged by dagcombine.
-  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+  for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
     setOperationAction(ISD::MULHS, VT, Expand);
     setOperationAction(ISD::MULHU, VT, Expand);
     setOperationAction(ISD::SDIV, VT, Expand);
@@ -398,47 +398,47 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::UREM, VT, Expand);
   }
 
-  setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
-  setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
-  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
-                   MVT::i8,  MVT::i16, MVT::i32, MVT::i64 }) {
-    setOperationAction(ISD::BR_CC,     VT, Expand);
+  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
+  for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128, MVT::i8, MVT::i16,
+                  MVT::i32, MVT::i64}) {
+    setOperationAction(ISD::BR_CC, VT, Expand);
     setOperationAction(ISD::SELECT_CC, VT, Expand);
   }
   if (Subtarget.is64Bit())
     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
 
-  setOperationAction(ISD::FREM             , MVT::f32  , Expand);
-  setOperationAction(ISD::FREM             , MVT::f64  , Expand);
-  setOperationAction(ISD::FREM             , MVT::f80  , Expand);
-  setOperationAction(ISD::FREM             , MVT::f128 , Expand);
+  setOperationAction(ISD::FREM, MVT::f32, Expand);
+  setOperationAction(ISD::FREM, MVT::f64, Expand);
+  setOperationAction(ISD::FREM, MVT::f80, Expand);
+  setOperationAction(ISD::FREM, MVT::f128, Expand);
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
-    setOperationAction(ISD::GET_ROUNDING   , MVT::i32  , Custom);
-    setOperationAction(ISD::SET_ROUNDING   , MVT::Other, Custom);
-    setOperationAction(ISD::GET_FPENV_MEM  , MVT::Other, Custom);
-    setOperationAction(ISD::SET_FPENV_MEM  , MVT::Other, Custom);
-    setOperationAction(ISD::RESET_FPENV    , MVT::Other, Custom);
+    setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
+    setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
+    setOperationAction(ISD::GET_FPENV_MEM, MVT::Other, Custom);
+    setOperationAction(ISD::SET_FPENV_MEM, MVT::Other, Custom);
+    setOperationAction(ISD::RESET_FPENV, MVT::Other, Custom);
   }
 
   // Promote the i8 variants and force them on up to i32 which has a shorter
   // encoding.
-  setOperationPromotedToType(ISD::CTTZ           , MVT::i8   , MVT::i32);
-  setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
+  setOperationPromotedToType(ISD::CTTZ, MVT::i8, MVT::i32);
+  setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8, MVT::i32);
   // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
   // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
   // promote that too.
-  setOperationPromotedToType(ISD::CTTZ           , MVT::i16  , MVT::i32);
-  setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , MVT::i32);
+  setOperationPromotedToType(ISD::CTTZ, MVT::i16, MVT::i32);
+  setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16, MVT::i32);
 
   if (!Subtarget.hasBMI()) {
-    setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
-    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Legal);
+    setOperationAction(ISD::CTTZ, MVT::i32, Custom);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal);
     if (Subtarget.is64Bit()) {
-      setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
+      setOperationAction(ISD::CTTZ, MVT::i64, Custom);
       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
     }
   }
@@ -446,13 +446,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   if (Subtarget.hasLZCNT()) {
     // When promoting the i8 variants, force them to i32 for a shorter
     // encoding.
-    setOperationPromotedToType(ISD::CTLZ           , MVT::i8   , MVT::i32);
-    setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
+    setOperationPromotedToType(ISD::CTLZ, MVT::i8, MVT::i32);
+    setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8, MVT::i32);
   } else {
     for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
       if (VT == MVT::i64 && !Subtarget.is64Bit())
         continue;
-      setOperationAction(ISD::CTLZ           , VT, Custom);
+      setOperationAction(ISD::CTLZ, VT, Custom);
       setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
     }
   }
@@ -497,36 +497,36 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     // on the dest that popcntl hasn't had since Cannon Lake.
     setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
   } else {
-    setOperationAction(ISD::CTPOP          , MVT::i8   , Custom);
-    setOperationAction(ISD::CTPOP          , MVT::i16  , Custom);
-    setOperationAction(ISD::CTPOP          , MVT::i32  , Custom);
-    setOperationAction(ISD::CTPOP          , MVT::i64  , Custom);
+    setOperationAction(ISD::CTPOP, MVT::i8, Custom);
+    setOperationAction(ISD::CTPOP, MVT::i16, Custom);
+    setOperationAction(ISD::CTPOP, MVT::i32, Custom);
+    setOperationAction(ISD::CTPOP, MVT::i64, Custom);
   }
 
-  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
+  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
 
   if (!Subtarget.hasMOVBE())
-    setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
+    setOperationAction(ISD::BSWAP, MVT::i16, Expand);
 
   // X86 wants to expand cmov itself.
-  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
+  for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
     setOperationAction(ISD::SELECT, VT, Custom);
     setOperationAction(ISD::SETCC, VT, Custom);
     setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
     setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
   }
-  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+  for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
     if (VT == MVT::i64 && !Subtarget.is64Bit())
       continue;
     setOperationAction(ISD::SELECT, VT, Custom);
-    setOperationAction(ISD::SETCC,  VT, Custom);
+    setOperationAction(ISD::SETCC, VT, Custom);
   }
 
   // Custom action for SELECT MMX and expand action for SELECT_CC MMX
   setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
 
-  setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
+  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
   // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
   // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
@@ -536,19 +536,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
 
   // Darwin ABI issue.
-  for (auto VT : { MVT::i32, MVT::i64 }) {
+  for (auto VT : {MVT::i32, MVT::i64}) {
     if (VT == MVT::i64 && !Subtarget.is64Bit())
       continue;
-    setOperationAction(ISD::ConstantPool    , VT, Custom);
-    setOperationAction(ISD::JumpTable       , VT, Custom);
-    setOperationAction(ISD::GlobalAddress   , VT, Custom);
+    setOperationAction(ISD::ConstantPool, VT, Custom);
+    setOperationAction(ISD::JumpTable, VT, Custom);
+    setOperationAction(ISD::GlobalAddress, VT, Custom);
     setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
-    setOperationAction(ISD::ExternalSymbol  , VT, Custom);
-    setOperationAction(ISD::BlockAddress    , VT, Custom);
+    setOperationAction(ISD::ExternalSymbol, VT, Custom);
+    setOperationAction(ISD::BlockAddress, VT, Custom);
   }
 
   // 64-bit shl, sra, srl (iff 32-bit x86)
-  for (auto VT : { MVT::i32, MVT::i64 }) {
+  for (auto VT : {MVT::i32, MVT::i64}) {
     if (VT == MVT::i64 && !Subtarget.is64Bit())
       continue;
     setOperationAction(ISD::SHL_PARTS, VT, Custom);
@@ -557,12 +557,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   }
 
   if (Subtarget.hasSSEPrefetch())
-    setOperationAction(ISD::PREFETCH      , MVT::Other, Custom);
+    setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
 
-  setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
+  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
 
   // Expand certain atomics
-  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+  for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
     setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
@@ -606,14 +606,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
 
   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
-  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
-  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
+  setOperationAction(ISD::VASTART, MVT::Other, Custom);
+  setOperationAction(ISD::VAEND, MVT::Other, Expand);
   bool Is64Bit = Subtarget.is64Bit();
-  setOperationAction(ISD::VAARG,  MVT::Other, Is64Bit ? Custom : Expand);
+  setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
   setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
 
-  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
-  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
+  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
 
   setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
 
@@ -623,7 +623,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 
   setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
 
-  auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
+  auto setF16Action = [&](MVT VT, LegalizeAction Action) {
     setOperationAction(ISD::FABS, VT, Action);
     setOperationAction(ISD::FNEG, VT, Action);
     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
@@ -678,7 +678,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     // non-optsize case.
     setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
 
-    for (auto VT : { MVT::f32, MVT::f64 }) {
+    for (auto VT : {MVT::f32, MVT::f64}) {
       // Use ANDPD to simulate FABS.
       setOperationAction(ISD::FABS, VT, Custom);
 
@@ -693,8 +693,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::FSUB, VT, Custom);
 
       // We don't support sin/cos/fmod
-      setOperationAction(ISD::FSIN   , VT, Expand);
-      setOperationAction(ISD::FCOS   , VT, Expand);
+      setOperationAction(ISD::FSIN, VT, Expand);
+      setOperationAction(ISD::FCOS, VT, Expand);
       setOperationAction(ISD::FSINCOS, VT, Expand);
     }
 
@@ -757,10 +757,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       addRegisterClass(MVT::f64, &X86::RFP64RegClass);
 
     // Use ANDPS to simulate FABS.
-    setOperationAction(ISD::FABS , MVT::f32, Custom);
+    setOperationAction(ISD::FABS, MVT::f32, Custom);
 
     // Use XORP to simulate FNEG.
-    setOperationAction(ISD::FNEG , MVT::f32, Custom);
+    setOperationAction(ISD::FNEG, MVT::f32, Custom);
 
     if (UseX87)
       setOperationAction(ISD::UNDEF, MVT::f64, Expand);
@@ -771,8 +771,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 
     // We don't support sin/cos/fmod
-    setOperationAction(ISD::FSIN   , MVT::f32, Expand);
-    setOperationAction(ISD::FCOS   , MVT::f32, Expand);
+    setOperationAction(ISD::FSIN, MVT::f32, Expand);
+    setOperationAction(ISD::FCOS, MVT::f32, Expand);
     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 
     if (UseX87) {
@@ -787,13 +787,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
 
-    for (auto VT : { MVT::f32, MVT::f64 }) {
-      setOperationAction(ISD::UNDEF,     VT, Expand);
+    for (auto VT : {MVT::f32, MVT::f64}) {
+      setOperationAction(ISD::UNDEF, VT, Expand);
       setOperationAction(ISD::FCOPYSIGN, VT, Expand);
 
       // Always expand sin/cos functions even though x87 has an instruction.
-      setOperationAction(ISD::FSIN   , VT, Expand);
-      setOperationAction(ISD::FCOS   , VT, Expand);
+      setOperationAction(ISD::FSIN, VT, Expand);
+      setOperationAction(ISD::FCOS, VT, Expand);
       setOperationAction(ISD::FSINCOS, VT, Expand);
     }
   }
@@ -805,7 +805,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       addLegalFPImmediate(APFloat(+1.0f)); // FLD1
       addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
       addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
-    } else // SSE immediates.
+    } else                                 // SSE immediates.
       addLegalFPImmediate(APFloat(+0.0f)); // xorps
   }
   // Expand FP64 immediates into loads from the stack, save special cases.
@@ -815,7 +815,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       addLegalFPImmediate(APFloat(+1.0)); // FLD1
       addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
       addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
-    } else // SSE immediates.
+    } else                                // SSE immediates.
       addLegalFPImmediate(APFloat(+0.0)); // xorpd
   }
   // Support fp16 0 immediate.
@@ -823,18 +823,18 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
 
   // Handle constrained floating-point operations of scalar.
-  setOperationAction(ISD::STRICT_FADD,      MVT::f32, Legal);
-  setOperationAction(ISD::STRICT_FADD,      MVT::f64, Legal);
-  setOperationAction(ISD::STRICT_FSUB,      MVT::f32, Legal);
-  setOperationAction(ISD::STRICT_FSUB,      MVT::f64, Legal);
-  setOperationAction(ISD::STRICT_FMUL,      MVT::f32, Legal);
-  setOperationAction(ISD::STRICT_FMUL,      MVT::f64, Legal);
-  setOperationAction(ISD::STRICT_FDIV,      MVT::f32, Legal);
-  setOperationAction(ISD::STRICT_FDIV,      MVT::f64, Legal);
-  setOperationAction(ISD::STRICT_FP_ROUND,  MVT::f32, Legal);
-  setOperationAction(ISD::STRICT_FP_ROUND,  MVT::f64, Legal);
-  setOperationAction(ISD::STRICT_FSQRT,     MVT::f32, Legal);
-  setOperationAction(ISD::STRICT_FSQRT,     MVT::f64, Legal);
+  setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
+  setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
+  setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
+  setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
+  setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
+  setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
+  setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
+  setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
+  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
+  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
+  setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
+  setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
 
   // We don't support FMA.
   setOperationAction(ISD::FMA, MVT::f64, Expand);
@@ -843,21 +843,21 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   // f80 always uses X87.
   if (UseX87) {
     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
-    setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
+    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
     {
       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
-      addLegalFPImmediate(TmpFlt);  // FLD0
+      addLegalFPImmediate(TmpFlt); // FLD0
       TmpFlt.changeSign();
-      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
+      addLegalFPImmediate(TmpFlt); // FLD0/FCHS
 
       bool ignored;
       APFloat TmpFlt2(+1.0);
-      TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
-                      &ignored);
-      addLegalFPImmediate(TmpFlt2);  // FLD1
+      TmpFlt2.convert(APFloat::x87DoubleExtended(),
+                      APFloat::rmNearestTiesToEven, &ignored);
+      addLegalFPImmediate(TmpFlt2); // FLD1
       TmpFlt2.changeSign();
-      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
+      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
     }
 
     // Always expand sin/cos functions even though x87 has an instruction.
@@ -876,9 +876,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     // clang-format on
 
     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
-    setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
+    setOperationAction(ISD::FCEIL, MVT::f80, Expand);
     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
-    setOperationAction(ISD::FRINT,  MVT::f80, Expand);
+    setOperationAction(ISD::FRINT, MVT::f80, Expand);
     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
     setOperationAction(ISD::FROUNDEVEN, MVT::f80, Expand);
     setOperationAction(ISD::FMA, MVT::f80, Expand);
@@ -888,12 +888,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::LLRINT, MVT::f80, Custom);
 
     // Handle constrained floating-point operations of scalar.
-    setOperationAction(ISD::STRICT_FADD     , MVT::f80, Legal);
-    setOperationAction(ISD::STRICT_FSUB     , MVT::f80, Legal);
-    setOperationAction(ISD::STRICT_FMUL     , MVT::f80, Legal);
-    setOperationAction(ISD::STRICT_FDIV     , MVT::f80, Legal);
-    setOperationAction(ISD::STRICT_FSQRT    , MVT::f80, Legal);
-    setOperationAction(ISD::FCANONICALIZE   , MVT::f80, Custom);
+    setOperationAction(ISD::STRICT_FADD, MVT::f80, Legal);
+    setOperationAction(ISD::STRICT_FSUB, MVT::f80, Legal);
+    setOperationAction(ISD::STRICT_FMUL, MVT::f80, Legal);
+    setOperationAction(ISD::STRICT_FDIV, MVT::f80, Legal);
+    setOperationAction(ISD::STRICT_FSQRT, MVT::f80, Legal);
+    setOperationAction(ISD::FCANONICALIZE, MVT::f80, Custom);
     if (isTypeLegal(MVT::f16)) {
       setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
       setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom);
@@ -912,16 +912,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 
     addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
 
-    setOperationAction(ISD::FADD,        MVT::f128, LibCall);
+    setOperationAction(ISD::FADD, MVT::f128, LibCall);
     setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
-    setOperationAction(ISD::FSUB,        MVT::f128, LibCall);
+    setOperationAction(ISD::FSUB, MVT::f128, LibCall);
     setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
-    setOperationAction(ISD::FDIV,        MVT::f128, LibCall);
+    setOperationAction(ISD::FDIV, MVT::f128, LibCall);
     setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
-    setOperationAction(ISD::FMUL,        MVT::f128, LibCall);
+    setOperationAction(ISD::FMUL, MVT::f128, LibCall);
     setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
-    setOperationAction(ISD::FMA,         MVT::f128, LibCall);
-    setOperationAction(ISD::STRICT_FMA,  MVT::f128, LibCall);
+    setOperationAction(ISD::FMA, MVT::f128, LibCall);
+    setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall);
 
     setOperationAction(ISD::FABS, MVT::f128, Custom);
     setOperationAction(ISD::FNEG, MVT::f128, Custom);
@@ -937,10 +937,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::STRICT_FTAN,  MVT::f128, LibCall);
     // clang-format on
     // No STRICT_FSINCOS
-    setOperationAction(ISD::FSQRT,        MVT::f128, LibCall);
+    setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
     setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
 
-    setOperationAction(ISD::FP_EXTEND,        MVT::f128, Custom);
+    setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
     setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
     // We need to custom handle any FP_ROUND with an f128 input, but
     // LegalizeDAG uses the result type to know when to run a custom handler.
@@ -970,10 +970,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   }
 
   // Always use a library call for pow.
-  setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
-  setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
-  setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
-  setOperationAction(ISD::FPOW             , MVT::f128 , Expand);
+  setOperationAction(ISD::FPOW, MVT::f32, Expand);
+  setOperationAction(ISD::FPOW, MVT::f64, Expand);
+  setOperationAction(ISD::FPOW, MVT::f80, Expand);
+  setOperationAction(ISD::FPOW, MVT::f128, Expand);
 
   setOperationAction(ISD::FLOG, MVT::f80, Expand);
   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
@@ -985,9 +985,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
 
   // Some FP actions are always expanded for vector types.
-  for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
-                   MVT::v4f32, MVT::v8f32,  MVT::v16f32,
-                   MVT::v2f64, MVT::v4f64,  MVT::v8f64 }) {
+  for (auto VT : {MVT::v8f16, MVT::v16f16, MVT::v32f16, MVT::v4f32, MVT::v8f32,
+                  MVT::v16f32, MVT::v2f64, MVT::v4f64, MVT::v8f64}) {
     // clang-format off
     setOperationAction(ISD::FSIN,      VT, Expand);
     setOperationAction(ISD::FSINCOS,   VT, Expand);
@@ -1013,11 +1012,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::UDIV, VT, Expand);
     setOperationAction(ISD::SREM, VT, Expand);
     setOperationAction(ISD::UREM, VT, Expand);
-    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
-    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
-    setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
-    setOperationAction(ISD::FMA,  VT, Expand);
+    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand);
+    setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand);
+    setOperationAction(ISD::FMA, VT, Expand);
     setOperationAction(ISD::FFLOOR, VT, Expand);
     setOperationAction(ISD::FCEIL, VT, Expand);
     setOperationAction(ISD::FTRUNC, VT, Expand);
@@ -1041,7 +1040,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
-    setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
+    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
     setOperationAction(ISD::TRUNCATE, VT, Expand);
     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
@@ -1079,30 +1078,30 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                     : &X86::VR128RegClass);
 
-    setOperationAction(ISD::FMAXIMUM,           MVT::f32, Custom);
-    setOperationAction(ISD::FMINIMUM,           MVT::f32, Custom);
-    setOperationAction(ISD::FMAXIMUMNUM,        MVT::f32, Custom);
-    setOperationAction(ISD::FMINIMUMNUM,        MVT::f32, Custom);
-
-    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
-    setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
-    setOperationAction(ISD::FCOPYSIGN,          MVT::v4f32, Custom);
-    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
-    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
-    setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
+    setOperationAction(ISD::FMAXIMUM, MVT::f32, Custom);
+    setOperationAction(ISD::FMINIMUM, MVT::f32, Custom);
+    setOperationAction(ISD::FMAXIMUMNUM, MVT::f32, Custom);
+    setOperationAction(ISD::FMINIMUMNUM, MVT::f32, Custom);
+
+    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
+    setOperationAction(ISD::FABS, MVT::v4f32, Custom);
+    setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
+    setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
-    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
+    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
     setOperationAction(ISD::FCANONICALIZE, MVT::v4f32, Custom);
 
-    setOperationAction(ISD::LOAD,               MVT::v2f32, Custom);
-    setOperationAction(ISD::STORE,              MVT::v2f32, Custom);
+    setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
+    setOperationAction(ISD::STORE, MVT::v2f32, Custom);
     setOperationAction(ISD::FCANONICALIZE, MVT::v2f32, Custom);
 
-    setOperationAction(ISD::STRICT_FADD,        MVT::v4f32, Legal);
-    setOperationAction(ISD::STRICT_FSUB,        MVT::v4f32, Legal);
-    setOperationAction(ISD::STRICT_FMUL,        MVT::v4f32, Legal);
-    setOperationAction(ISD::STRICT_FDIV,        MVT::v4f32, Legal);
-    setOperationAction(ISD::STRICT_FSQRT,       MVT::v4f32, Legal);
+    setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
+    setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
+    setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
+    setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
+    setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
   }
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
@@ -1122,74 +1121,74 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                     : &X86::VR128RegClass);
 
-    for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
+    for (auto VT : {MVT::f64, MVT::v4f32, MVT::v2f64}) {
       setOperationAction(ISD::FMAXIMUM, VT, Custom);
       setOperationAction(ISD::FMINIMUM, VT, Custom);
       setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
       setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
     }
 
-    for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
-                     MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
+    for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16,
+                    MVT::v2i32}) {
       setOperationAction(ISD::SDIV, VT, Custom);
       setOperationAction(ISD::SREM, VT, Custom);
       setOperationAction(ISD::UDIV, VT, Custom);
       setOperationAction(ISD::UREM, VT, Custom);
     }
 
-    setOperationAction(ISD::MUL,                MVT::v2i8,  Custom);
-    setOperationAction(ISD::MUL,                MVT::v4i8,  Custom);
-    setOperationAction(ISD::MUL,                MVT::v8i8,  Custom);
-
-    setOperationAction(ISD::MUL,                MVT::v16i8, Custom);
-    setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
-    setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
-    setOperationAction(ISD::MULHU,              MVT::v4i32, Custom);
-    setOperationAction(ISD::MULHS,              MVT::v4i32, Custom);
-    setOperationAction(ISD::MULHU,              MVT::v16i8, Custom);
-    setOperationAction(ISD::MULHS,              MVT::v16i8, Custom);
-    setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
-    setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
-    setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
-    setOperationAction(ISD::AVGCEILU,           MVT::v16i8, Legal);
-    setOperationAction(ISD::AVGCEILU,           MVT::v8i16, Legal);
-
-    setOperationAction(ISD::SMULO,              MVT::v16i8, Custom);
-    setOperationAction(ISD::UMULO,              MVT::v16i8, Custom);
-    setOperationAction(ISD::UMULO,              MVT::v2i32, Custom);
-
-    setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
+    setOperationAction(ISD::MUL, MVT::v2i8, Custom);
+    setOperationAction(ISD::MUL, MVT::v4i8, Custom);
+    setOperationAction(ISD::MUL, MVT::v8i8, Custom);
+
+    setOperationAction(ISD::MUL, MVT::v16i8, Custom);
+    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
+    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+    setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
+    setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
+    setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
+    setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
+    setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
+    setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
+    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
+    setOperationAction(ISD::AVGCEILU, MVT::v16i8, Legal);
+    setOperationAction(ISD::AVGCEILU, MVT::v8i16, Legal);
+
+    setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
+    setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
+    setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
+
+    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
     setOperationAction(ISD::FCANONICALIZE, MVT::v2f64, Custom);
-    setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
-    setOperationAction(ISD::FCOPYSIGN,          MVT::v2f64, Custom);
+    setOperationAction(ISD::FABS, MVT::v2f64, Custom);
+    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
 
     setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
     setOperationAction(ISD::LRINT, MVT::v2i32, Custom);
 
-    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
+    for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
       setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
       setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
       setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
       setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
     }
 
-    setOperationAction(ISD::UADDSAT,            MVT::v16i8, Legal);
-    setOperationAction(ISD::SADDSAT,            MVT::v16i8, Legal);
-    setOperationAction(ISD::USUBSAT,            MVT::v16i8, Legal);
-    setOperationAction(ISD::SSUBSAT,            MVT::v16i8, Legal);
-    setOperationAction(ISD::UADDSAT,            MVT::v8i16, Legal);
-    setOperationAction(ISD::SADDSAT,            MVT::v8i16, Legal);
-    setOperationAction(ISD::USUBSAT,            MVT::v8i16, Legal);
-    setOperationAction(ISD::SSUBSAT,            MVT::v8i16, Legal);
-    setOperationAction(ISD::USUBSAT,            MVT::v4i32, Custom);
-    setOperationAction(ISD::USUBSAT,            MVT::v2i64, Custom);
+    setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
+    setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
+    setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
+    setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
+    setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
+    setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
+    setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
+    setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
+    setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
+    setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
 
-    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
-    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
-    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
-    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
 
-    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
+    for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
       setOperationAction(ISD::SETCC, VT, Custom);
       setOperationAction(ISD::CTPOP, VT, Custom);
       setOperationAction(ISD::ABS, VT, Custom);
@@ -1202,30 +1201,30 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setCondCodeAction(ISD::SETLE, VT, Custom);
     }
 
-    setOperationAction(ISD::SETCC,          MVT::v2f64, Custom);
-    setOperationAction(ISD::SETCC,          MVT::v4f32, Custom);
-    setOperationAction(ISD::STRICT_FSETCC,  MVT::v2f64, Custom);
-    setOperationAction(ISD::STRICT_FSETCC,  MVT::v4f32, Custom);
+    setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
+    setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
+    setOperationAction(ISD::STRICT_FSETCC, MVT::v2f64, Custom);
+    setOperationAction(ISD::STRICT_FSETCC, MVT::v4f32, Custom);
     setOperationAction(ISD::STRICT_FSETCCS, MVT::v2f64, Custom);
     setOperationAction(ISD::STRICT_FSETCCS, MVT::v4f32, Custom);
 
-    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
-      setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
-      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
-      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
-      setOperationAction(ISD::VSELECT,            VT, Custom);
+    for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32}) {
+      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
+      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+      setOperationAction(ISD::VSELECT, VT, Custom);
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
     }
 
-    for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
-      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
-      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
-      setOperationAction(ISD::VSELECT,            VT, Custom);
+    for (auto VT : {MVT::v8f16, MVT::v2f64, MVT::v2i64}) {
+      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+      setOperationAction(ISD::VSELECT, VT, Custom);
 
       if (VT == MVT::v2i64 && !Subtarget.is64Bit())
         continue;
 
-      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
+      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
     }
     setF16Action(MVT::v8f16, Expand);
@@ -1238,67 +1237,67 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Custom);
 
     // Custom lower v2i64 and v2f64 selects.
-    setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
-    setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
-    setOperationAction(ISD::SELECT,             MVT::v4i32, Custom);
-    setOperationAction(ISD::SELECT,             MVT::v8i16, Custom);
-    setOperationAction(ISD::SELECT,             MVT::v8f16, Custom);
-    setOperationAction(ISD::SELECT,             MVT::v16i8, Custom);
-
-    setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Custom);
-    setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Custom);
-    setOperationAction(ISD::FP_TO_SINT,         MVT::v2i32, Custom);
-    setOperationAction(ISD::FP_TO_UINT,         MVT::v2i32, Custom);
-    setOperationAction(ISD::STRICT_FP_TO_SINT,  MVT::v4i32, Custom);
-    setOperationAction(ISD::STRICT_FP_TO_SINT,  MVT::v2i32, Custom);
+    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
+    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
+    setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
+    setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
+    setOperationAction(ISD::SELECT, MVT::v8f16, Custom);
+    setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
+
+    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom);
+    setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
+    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Custom);
+    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
 
     // Custom legalize these to avoid over promotion or custom promotion.
     for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
-      setOperationAction(ISD::FP_TO_SINT,        VT, Custom);
-      setOperationAction(ISD::FP_TO_UINT,        VT, Custom);
+      setOperationAction(ISD::FP_TO_SINT, VT, Custom);
+      setOperationAction(ISD::FP_TO_UINT, VT, Custom);
       setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
       setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
     }
 
-    setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Custom);
-    setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v4i32, Custom);
-    setOperationAction(ISD::SINT_TO_FP,         MVT::v2i32, Custom);
-    setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v2i32, Custom);
+    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
+    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Custom);
+    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
+    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);
 
-    setOperationAction(ISD::UINT_TO_FP,         MVT::v2i32, Custom);
-    setOperationAction(ISD::STRICT_UINT_TO_FP,  MVT::v2i32, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
+    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom);
 
-    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
-    setOperationAction(ISD::STRICT_UINT_TO_FP,  MVT::v4i32, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
+    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom);
 
     // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
-    setOperationAction(ISD::SINT_TO_FP,         MVT::v2f32, Custom);
-    setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v2f32, Custom);
-    setOperationAction(ISD::UINT_TO_FP,         MVT::v2f32, Custom);
-    setOperationAction(ISD::STRICT_UINT_TO_FP,  MVT::v2f32, Custom);
+    setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
+    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
+    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom);
 
-    setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
-    setOperationAction(ISD::STRICT_FP_EXTEND,   MVT::v2f32, Custom);
-    setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
-    setOperationAction(ISD::STRICT_FP_ROUND,    MVT::v2f32, Custom);
+    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
+    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom);
+    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
+    setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom);
 
     // We want to legalize this to an f64 load rather than an i64 load on
     // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
     // store.
-    setOperationAction(ISD::LOAD,               MVT::v2i32, Custom);
-    setOperationAction(ISD::LOAD,               MVT::v4i16, Custom);
-    setOperationAction(ISD::LOAD,               MVT::v8i8,  Custom);
-    setOperationAction(ISD::STORE,              MVT::v2i32, Custom);
-    setOperationAction(ISD::STORE,              MVT::v4i16, Custom);
-    setOperationAction(ISD::STORE,              MVT::v8i8,  Custom);
+    setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
+    setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
+    setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
+    setOperationAction(ISD::STORE, MVT::v2i32, Custom);
+    setOperationAction(ISD::STORE, MVT::v4i16, Custom);
+    setOperationAction(ISD::STORE, MVT::v8i8, Custom);
 
     // Add 32-bit vector stores to help vectorization opportunities.
-    setOperationAction(ISD::STORE,              MVT::v2i16, Custom);
-    setOperationAction(ISD::STORE,              MVT::v4i8,  Custom);
+    setOperationAction(ISD::STORE, MVT::v2i16, Custom);
+    setOperationAction(ISD::STORE, MVT::v4i8, Custom);
 
-    setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
-    setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
-    setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
+    setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
+    setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
+    setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
     if (!Subtarget.hasAVX512())
       setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
 
@@ -1308,41 +1307,42 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 
     setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
 
-    setOperationAction(ISD::TRUNCATE,    MVT::v2i8,  Custom);
-    setOperationAction(ISD::TRUNCATE,    MVT::v2i16, Custom);
-    setOperationAction(ISD::TRUNCATE,    MVT::v2i32, Custom);
-    setOperationAction(ISD::TRUNCATE,    MVT::v2i64, Custom);
-    setOperationAction(ISD::TRUNCATE,    MVT::v4i8,  Custom);
-    setOperationAction(ISD::TRUNCATE,    MVT::v4i16, Custom);
-    setOperationAction(ISD::TRUNCATE,    MVT::v4i32, Custom);
-    setOperationAction(ISD::TRUNCATE,    MVT::v4i64, Custom);
-    setOperationAction(ISD::TRUNCATE,    MVT::v8i8,  Custom);
-    setOperationAction(ISD::TRUNCATE,    MVT::v8i16, Custom);
-    setOperationAction(ISD::TRUNCATE,    MVT::v8i32, Custom);
-    setOperationAction(ISD::TRUNCATE,    MVT::v8i64, Custom);
-    setOperationAction(ISD::TRUNCATE,    MVT::v16i8, Custom);
-    setOperationAction(ISD::TRUNCATE,    MVT::v16i16, Custom);
-    setOperationAction(ISD::TRUNCATE,    MVT::v16i32, Custom);
-    setOperationAction(ISD::TRUNCATE,    MVT::v16i64, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v2i64, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v4i64, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
 
     // In the customized shift lowering, the legal v4i32/v2i64 cases
     // in AVX2 will be recognized.
-    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
-      setOperationAction(ISD::SRL,              VT, Custom);
-      setOperationAction(ISD::SHL,              VT, Custom);
-      setOperationAction(ISD::SRA,              VT, Custom);
-      if (VT == MVT::v2i64) continue;
-      setOperationAction(ISD::ROTL,             VT, Custom);
-      setOperationAction(ISD::ROTR,             VT, Custom);
-      setOperationAction(ISD::FSHL,             VT, Custom);
-      setOperationAction(ISD::FSHR,             VT, Custom);
+    for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
+      setOperationAction(ISD::SRL, VT, Custom);
+      setOperationAction(ISD::SHL, VT, Custom);
+      setOperationAction(ISD::SRA, VT, Custom);
+      if (VT == MVT::v2i64)
+        continue;
+      setOperationAction(ISD::ROTL, VT, Custom);
+      setOperationAction(ISD::ROTR, VT, Custom);
+      setOperationAction(ISD::FSHL, VT, Custom);
+      setOperationAction(ISD::FSHR, VT, Custom);
     }
 
-    setOperationAction(ISD::STRICT_FSQRT,       MVT::v2f64, Legal);
-    setOperationAction(ISD::STRICT_FADD,        MVT::v2f64, Legal);
-    setOperationAction(ISD::STRICT_FSUB,        MVT::v2f64, Legal);
-    setOperationAction(ISD::STRICT_FMUL,        MVT::v2f64, Legal);
-    setOperationAction(ISD::STRICT_FDIV,        MVT::v2f64, Legal);
+    setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
+    setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
+    setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
+    setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
+    setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
   }
 
   if (Subtarget.hasGFNI()) {
@@ -1353,73 +1353,73 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   }
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
-    setOperationAction(ISD::ABS,                MVT::v16i8, Legal);
-    setOperationAction(ISD::ABS,                MVT::v8i16, Legal);
-    setOperationAction(ISD::ABS,                MVT::v4i32, Legal);
+    setOperationAction(ISD::ABS, MVT::v16i8, Legal);
+    setOperationAction(ISD::ABS, MVT::v8i16, Legal);
+    setOperationAction(ISD::ABS, MVT::v4i32, Legal);
 
     for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
-      setOperationAction(ISD::BITREVERSE,       VT, Custom);
-      setOperationAction(ISD::CTLZ,             VT, Custom);
+      setOperationAction(ISD::BITREVERSE, VT, Custom);
+      setOperationAction(ISD::CTLZ, VT, Custom);
     }
 
     // These might be better off as horizontal vector ops.
-    setOperationAction(ISD::ADD,                MVT::i16, Custom);
-    setOperationAction(ISD::ADD,                MVT::i32, Custom);
-    setOperationAction(ISD::SUB,                MVT::i16, Custom);
-    setOperationAction(ISD::SUB,                MVT::i32, Custom);
+    setOperationAction(ISD::ADD, MVT::i16, Custom);
+    setOperationAction(ISD::ADD, MVT::i32, Custom);
+    setOperationAction(ISD::SUB, MVT::i16, Custom);
+    setOperationAction(ISD::SUB, MVT::i32, Custom);
   }
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
     for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
-      setOperationAction(ISD::FFLOOR,            RoundedTy,  Legal);
-      setOperationAction(ISD::STRICT_FFLOOR,     RoundedTy,  Legal);
-      setOperationAction(ISD::FCEIL,             RoundedTy,  Legal);
-      setOperationAction(ISD::STRICT_FCEIL,      RoundedTy,  Legal);
-      setOperationAction(ISD::FTRUNC,            RoundedTy,  Legal);
-      setOperationAction(ISD::STRICT_FTRUNC,     RoundedTy,  Legal);
-      setOperationAction(ISD::FRINT,             RoundedTy,  Legal);
-      setOperationAction(ISD::STRICT_FRINT,      RoundedTy,  Legal);
-      setOperationAction(ISD::FNEARBYINT,        RoundedTy,  Legal);
-      setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy,  Legal);
-      setOperationAction(ISD::FROUNDEVEN,        RoundedTy,  Legal);
-      setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy,  Legal);
-
-      setOperationAction(ISD::FROUND,            RoundedTy,  Custom);
-    }
-
-    setOperationAction(ISD::SMAX,               MVT::v16i8, Legal);
-    setOperationAction(ISD::SMAX,               MVT::v4i32, Legal);
-    setOperationAction(ISD::UMAX,               MVT::v8i16, Legal);
-    setOperationAction(ISD::UMAX,               MVT::v4i32, Legal);
-    setOperationAction(ISD::SMIN,               MVT::v16i8, Legal);
-    setOperationAction(ISD::SMIN,               MVT::v4i32, Legal);
-    setOperationAction(ISD::UMIN,               MVT::v8i16, Legal);
-    setOperationAction(ISD::UMIN,               MVT::v4i32, Legal);
-
-    setOperationAction(ISD::UADDSAT,            MVT::v4i32, Custom);
-    setOperationAction(ISD::SADDSAT,            MVT::v2i64, Custom);
-    setOperationAction(ISD::SSUBSAT,            MVT::v2i64, Custom);
+      setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
+      setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal);
+      setOperationAction(ISD::FCEIL, RoundedTy, Legal);
+      setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);
+      setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
+      setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);
+      setOperationAction(ISD::FRINT, RoundedTy, Legal);
+      setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
+      setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
+      setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
+      setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
+      setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal);
+
+      setOperationAction(ISD::FROUND, RoundedTy, Custom);
+    }
+
+    setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
+    setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
+    setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
+    setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
+    setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
+    setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
+    setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
+    setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
+
+    setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
+    setOperationAction(ISD::SADDSAT, MVT::v2i64, Custom);
+    setOperationAction(ISD::SSUBSAT, MVT::v2i64, Custom);
 
     // FIXME: Do we need to handle scalar-to-vector here?
-    setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
-    setOperationAction(ISD::SMULO,              MVT::v2i32, Custom);
+    setOperationAction(ISD::MUL, MVT::v4i32, Legal);
+    setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
 
     // We directly match byte blends in the backend as they match the VSELECT
     // condition form.
-    setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
+    setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
 
     // SSE41 brings specific instructions for doing vector sign extend even in
     // cases where we don't have SRA.
-    for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
+    for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
       setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
     }
 
     // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
-    for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
-      setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8,  Legal);
-      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8,  Legal);
-      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8,  Legal);
+    for (auto LoadExtOp : {ISD::SEXTLOAD, ISD::ZEXTLOAD}) {
+      setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
+      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
+      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
       setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
       setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
       setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
@@ -1428,73 +1428,73 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
      // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
       // do the pre and post work in the vector domain.
-      setOperationAction(ISD::UINT_TO_FP,        MVT::v4i64, Custom);
+      setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);
       setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
       // We need to mark SINT_TO_FP as Custom even though we want to expand it
       // so that DAG combine doesn't try to turn it into uint_to_fp.
-      setOperationAction(ISD::SINT_TO_FP,        MVT::v4i64, Custom);
+      setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);
       setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
     }
   }
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
-    setOperationAction(ISD::UADDSAT,            MVT::v2i64, Custom);
+    setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
   }
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
-    for (auto VT : { MVT::v16i8, MVT::v8i16,  MVT::v4i32, MVT::v2i64,
-                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
+    for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v32i8,
+                    MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
       setOperationAction(ISD::ROTL, VT, Custom);
       setOperationAction(ISD::ROTR, VT, Custom);
     }
 
     // XOP can efficiently perform BITREVERSE with VPPERM.
-    for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
+    for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64})
       setOperationAction(ISD::BITREVERSE, VT, Custom);
   }
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
     bool HasInt256 = Subtarget.hasInt256();
 
-    addRegisterClass(MVT::v32i8,  Subtarget.hasVLX() ? &X86::VR256XRegClass
-                                                     : &X86::VR256RegClass);
+    addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
+                                                    : &X86::VR256RegClass);
     addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                      : &X86::VR256RegClass);
     addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                      : &X86::VR256RegClass);
-    addRegisterClass(MVT::v8i32,  Subtarget.hasVLX() ? &X86::VR256XRegClass
-                                                     : &X86::VR256RegClass);
-    addRegisterClass(MVT::v8f32,  Subtarget.hasVLX() ? &X86::VR256XRegClass
-                                                     : &X86::VR256RegClass);
-    addRegisterClass(MVT::v4i64,  Subtarget.hasVLX() ? &X86::VR256XRegClass
-                                                     : &X86::VR256RegClass);
-    addRegisterClass(MVT::v4f64,  Subtarget.hasVLX() ? &X86::VR256XRegClass
-                                                     : &X86::VR256RegClass);
-
-    for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
-      setOperationAction(ISD::FFLOOR,            VT, Legal);
-      setOperationAction(ISD::STRICT_FFLOOR,     VT, Legal);
-      setOperationAction(ISD::FCEIL,             VT, Legal);
-      setOperationAction(ISD::STRICT_FCEIL,      VT, Legal);
-      setOperationAction(ISD::FTRUNC,            VT, Legal);
-      setOperationAction(ISD::STRICT_FTRUNC,     VT, Legal);
-      setOperationAction(ISD::FRINT,             VT, Legal);
-      setOperationAction(ISD::STRICT_FRINT,      VT, Legal);
-      setOperationAction(ISD::FNEARBYINT,        VT, Legal);
+    addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
+                                                    : &X86::VR256RegClass);
+    addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
+                                                    : &X86::VR256RegClass);
+    addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
+                                                    : &X86::VR256RegClass);
+    addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
+                                                    : &X86::VR256RegClass);
+
+    for (auto VT : {MVT::v8f32, MVT::v4f64}) {
+      setOperationAction(ISD::FFLOOR, VT, Legal);
+      setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
+      setOperationAction(ISD::FCEIL, VT, Legal);
+      setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
+      setOperationAction(ISD::FTRUNC, VT, Legal);
+      setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
+      setOperationAction(ISD::FRINT, VT, Legal);
+      setOperationAction(ISD::STRICT_FRINT, VT, Legal);
+      setOperationAction(ISD::FNEARBYINT, VT, Legal);
       setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
-      setOperationAction(ISD::FROUNDEVEN,        VT, Legal);
+      setOperationAction(ISD::FROUNDEVEN, VT, Legal);
       setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
 
-      setOperationAction(ISD::FROUND,            VT, Custom);
+      setOperationAction(ISD::FROUND, VT, Custom);
 
-      setOperationAction(ISD::FNEG,              VT, Custom);
-      setOperationAction(ISD::FABS,              VT, Custom);
-      setOperationAction(ISD::FCOPYSIGN,         VT, Custom);
+      setOperationAction(ISD::FNEG, VT, Custom);
+      setOperationAction(ISD::FABS, VT, Custom);
+      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
 
-      setOperationAction(ISD::FMAXIMUM,          VT, Custom);
-      setOperationAction(ISD::FMINIMUM,          VT, Custom);
-      setOperationAction(ISD::FMAXIMUMNUM,       VT, Custom);
-      setOperationAction(ISD::FMINIMUMNUM,       VT, Custom);
+      setOperationAction(ISD::FMAXIMUM, VT, Custom);
+      setOperationAction(ISD::FMINIMUM, VT, Custom);
+      setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
+      setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
       setOperationAction(ISD::FCANONICALIZE, VT, Custom);
     }
 
@@ -1503,81 +1503,82 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 
     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
     // even though v8i16 is a legal type.
-    setOperationPromotedToType(ISD::FP_TO_SINT,        MVT::v8i16, MVT::v8i32);
-    setOperationPromotedToType(ISD::FP_TO_UINT,        MVT::v8i16, MVT::v8i32);
+    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
+    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
     setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
     setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
-    setOperationAction(ISD::FP_TO_SINT,                MVT::v8i32, Custom);
-    setOperationAction(ISD::FP_TO_UINT,                MVT::v8i32, Custom);
-    setOperationAction(ISD::STRICT_FP_TO_SINT,         MVT::v8i32, Custom);
-
-    setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Custom);
-    setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v8i32, Custom);
-    setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Expand);
-    setOperationAction(ISD::FP_ROUND,           MVT::v8f16, Expand);
-    setOperationAction(ISD::FP_EXTEND,          MVT::v4f64, Custom);
-    setOperationAction(ISD::STRICT_FP_EXTEND,   MVT::v4f64, Custom);
-
-    setOperationAction(ISD::STRICT_FP_ROUND,    MVT::v4f32, Legal);
-    setOperationAction(ISD::STRICT_FADD,        MVT::v8f32, Legal);
-    setOperationAction(ISD::STRICT_FADD,        MVT::v4f64, Legal);
-    setOperationAction(ISD::STRICT_FSUB,        MVT::v8f32, Legal);
-    setOperationAction(ISD::STRICT_FSUB,        MVT::v4f64, Legal);
-    setOperationAction(ISD::STRICT_FMUL,        MVT::v8f32, Legal);
-    setOperationAction(ISD::STRICT_FMUL,        MVT::v4f64, Legal);
-    setOperationAction(ISD::STRICT_FDIV,        MVT::v8f32, Legal);
-    setOperationAction(ISD::STRICT_FDIV,        MVT::v4f64, Legal);
-    setOperationAction(ISD::STRICT_FSQRT,       MVT::v8f32, Legal);
-    setOperationAction(ISD::STRICT_FSQRT,       MVT::v4f64, Legal);
+    setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom);
+    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Custom);
+
+    setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Custom);
+    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Custom);
+    setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Expand);
+    setOperationAction(ISD::FP_ROUND, MVT::v8f16, Expand);
+    setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
+    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Custom);
+
+    setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
+    setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
+    setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);
+    setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal);
+    setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal);
+    setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal);
+    setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
+    setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
+    setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
+    setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
+    setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
 
     if (!Subtarget.hasAVX512())
       setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
 
     // In the customized shift lowering, the legal v8i32/v4i64 cases
     // in AVX2 will be recognized.
-    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
-      setOperationAction(ISD::SRL,             VT, Custom);
-      setOperationAction(ISD::SHL,             VT, Custom);
-      setOperationAction(ISD::SRA,             VT, Custom);
-      setOperationAction(ISD::ABDS,            VT, Custom);
-      setOperationAction(ISD::ABDU,            VT, Custom);
-      if (VT == MVT::v4i64) continue;
-      setOperationAction(ISD::ROTL,            VT, Custom);
-      setOperationAction(ISD::ROTR,            VT, Custom);
-      setOperationAction(ISD::FSHL,            VT, Custom);
-      setOperationAction(ISD::FSHR,            VT, Custom);
+    for (auto VT : {MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
+      setOperationAction(ISD::SRL, VT, Custom);
+      setOperationAction(ISD::SHL, VT, Custom);
+      setOperationAction(ISD::SRA, VT, Custom);
+      setOperationAction(ISD::ABDS, VT, Custom);
+      setOperationAction(ISD::ABDU, VT, Custom);
+      if (VT == MVT::v4i64)
+        continue;
+      setOperationAction(ISD::ROTL, VT, Custom);
+      setOperationAction(ISD::ROTR, VT, Custom);
+      setOperationAction(ISD::FSHL, VT, Custom);
+      setOperationAction(ISD::FSHR, VT, Custom);
     }
 
     // These types need custom splitting if their input is a 128-bit vector.
-    setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i64,  Custom);
-    setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i32, Custom);
-    setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i64,  Custom);
-    setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i32, Custom);
-
-    setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
-    setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
-    setOperationAction(ISD::SELECT,            MVT::v8i32, Custom);
-    setOperationAction(ISD::SELECT,            MVT::v16i16, Custom);
-    setOperationAction(ISD::SELECT,            MVT::v16f16, Custom);
-    setOperationAction(ISD::SELECT,            MVT::v32i8, Custom);
-    setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
-
-    for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
-      setOperationAction(ISD::SIGN_EXTEND,     VT, Custom);
-      setOperationAction(ISD::ZERO_EXTEND,     VT, Custom);
-      setOperationAction(ISD::ANY_EXTEND,      VT, Custom);
-    }
-
-    setOperationAction(ISD::TRUNCATE,          MVT::v32i8, Custom);
-    setOperationAction(ISD::TRUNCATE,          MVT::v32i16, Custom);
-    setOperationAction(ISD::TRUNCATE,          MVT::v32i32, Custom);
-    setOperationAction(ISD::TRUNCATE,          MVT::v32i64, Custom);
-
-    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
-      setOperationAction(ISD::SETCC,           VT, Custom);
-      setOperationAction(ISD::CTPOP,           VT, Custom);
-      setOperationAction(ISD::CTLZ,            VT, Custom);
-      setOperationAction(ISD::BITREVERSE,      VT, Custom);
+    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
+    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
+    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
+    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
+
+    setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
+    setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
+    setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
+    setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
+    setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
+    setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
+    setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
+
+    for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
+      setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
+      setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
+      setOperationAction(ISD::ANY_EXTEND, VT, Custom);
+    }
+
+    setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v32i16, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v32i32, Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v32i64, Custom);
+
+    for (auto VT : {MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
+      setOperationAction(ISD::SETCC, VT, Custom);
+      setOperationAction(ISD::CTPOP, VT, Custom);
+      setOperationAction(ISD::CTLZ, VT, Custom);
+      setOperationAction(ISD::BITREVERSE, VT, Custom);
 
       // The condition codes aren't legal in SSE/AVX and under AVX512 we use
       // setcc all the way to isel and prefer SETGT in some isel patterns.
@@ -1585,64 +1586,64 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setCondCodeAction(ISD::SETLE, VT, Custom);
     }
 
-    setOperationAction(ISD::SETCC,          MVT::v4f64, Custom);
-    setOperationAction(ISD::SETCC,          MVT::v8f32, Custom);
-    setOperationAction(ISD::STRICT_FSETCC,  MVT::v4f64, Custom);
-    setOperationAction(ISD::STRICT_FSETCC,  MVT::v8f32, Custom);
+    setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
+    setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
+    setOperationAction(ISD::STRICT_FSETCC, MVT::v4f64, Custom);
+    setOperationAction(ISD::STRICT_FSETCC, MVT::v8f32, Custom);
     setOperationAction(ISD::STRICT_FSETCCS, MVT::v4f64, Custom);
     setOperationAction(ISD::STRICT_FSETCCS, MVT::v8f32, Custom);
 
     if (Subtarget.hasAnyFMA()) {
-      for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
-                       MVT::v2f64, MVT::v4f64 }) {
+      for (auto VT : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32, MVT::v2f64,
+                      MVT::v4f64}) {
         setOperationAction(ISD::FMA, VT, Legal);
         setOperationAction(ISD::STRICT_FMA, VT, Legal);
       }
     }
 
-    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
+    for (auto VT : {MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
       setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
       setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
     }
 
-    setOperationAction(ISD::MUL,       MVT::v4i64,  Custom);
-    setOperationAction(ISD::MUL,       MVT::v8i32,  HasInt256 ? Legal : Custom);
-    setOperationAction(ISD::MUL,       MVT::v16i16, HasInt256 ? Legal : Custom);
-    setOperationAction(ISD::MUL,       MVT::v32i8,  Custom);
-
-    setOperationAction(ISD::MULHU,     MVT::v8i32,  Custom);
-    setOperationAction(ISD::MULHS,     MVT::v8i32,  Custom);
-    setOperationAction(ISD::MULHU,     MVT::v16i16, HasInt256 ? Legal : Custom);
-    setOperationAction(ISD::MULHS,     MVT::v16i16, HasInt256 ? Legal : Custom);
-    setOperationAction(ISD::MULHU,     MVT::v32i8,  Custom);
-    setOperationAction(ISD::MULHS,     MVT::v32i8,  Custom);
-    setOperationAction(ISD::AVGCEILU,  MVT::v16i16, HasInt256 ? Legal : Custom);
-    setOperationAction(ISD::AVGCEILU,  MVT::v32i8,  HasInt256 ? Legal : Custom);
-
-    setOperationAction(ISD::SMULO,     MVT::v32i8, Custom);
-    setOperationAction(ISD::UMULO,     MVT::v32i8, Custom);
-
-    setOperationAction(ISD::ABS,       MVT::v4i64,  Custom);
-    setOperationAction(ISD::SMAX,      MVT::v4i64,  Custom);
-    setOperationAction(ISD::UMAX,      MVT::v4i64,  Custom);
-    setOperationAction(ISD::SMIN,      MVT::v4i64,  Custom);
-    setOperationAction(ISD::UMIN,      MVT::v4i64,  Custom);
-
-    setOperationAction(ISD::UADDSAT,   MVT::v32i8,  HasInt256 ? Legal : Custom);
-    setOperationAction(ISD::SADDSAT,   MVT::v32i8,  HasInt256 ? Legal : Custom);
-    setOperationAction(ISD::USUBSAT,   MVT::v32i8,  HasInt256 ? Legal : Custom);
-    setOperationAction(ISD::SSUBSAT,   MVT::v32i8,  HasInt256 ? Legal : Custom);
-    setOperationAction(ISD::UADDSAT,   MVT::v16i16, HasInt256 ? Legal : Custom);
-    setOperationAction(ISD::SADDSAT,   MVT::v16i16, HasInt256 ? Legal : Custom);
-    setOperationAction(ISD::USUBSAT,   MVT::v16i16, HasInt256 ? Legal : Custom);
-    setOperationAction(ISD::SSUBSAT,   MVT::v16i16, HasInt256 ? Legal : Custom);
-    setOperationAction(ISD::UADDSAT,   MVT::v8i32, Custom);
-    setOperationAction(ISD::USUBSAT,   MVT::v8i32, Custom);
-    setOperationAction(ISD::UADDSAT,   MVT::v4i64, Custom);
-    setOperationAction(ISD::USUBSAT,   MVT::v4i64, Custom);
-
-    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
-      setOperationAction(ISD::ABS,  VT, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::MUL, MVT::v4i64, Custom);
+    setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::MUL, MVT::v32i8, Custom);
+
+    setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
+    setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
+    setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
+    setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
+    setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
+
+    setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
+    setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
+
+    setOperationAction(ISD::ABS, MVT::v4i64, Custom);
+    setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
+    setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
+    setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
+    setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
+
+    setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
+    setOperationAction(ISD::UADDSAT, MVT::v8i32, Custom);
+    setOperationAction(ISD::USUBSAT, MVT::v8i32, Custom);
+    setOperationAction(ISD::UADDSAT, MVT::v4i64, Custom);
+    setOperationAction(ISD::USUBSAT, MVT::v4i64, Custom);
+
+    for (auto VT : {MVT::v32i8, MVT::v16i16, MVT::v8i32}) {
+      setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
       setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
       setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
       setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
@@ -1661,41 +1662,41 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
 
       // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
-      for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
+      for (auto LoadExtOp : {ISD::SEXTLOAD, ISD::ZEXTLOAD}) {
         setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
-        setLoadExtAction(LoadExtOp, MVT::v8i32,  MVT::v8i8,  Legal);
-        setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i8,  Legal);
-        setLoadExtAction(LoadExtOp, MVT::v8i32,  MVT::v8i16, Legal);
-        setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i16, Legal);
-        setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i32, Legal);
+        setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
+        setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
+        setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
+        setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
+        setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
       }
     }
 
-    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
-                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
-      setOperationAction(ISD::MLOAD,  VT, Subtarget.hasVLX() ? Legal : Custom);
+    for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32,
+                    MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
+      setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
       setOperationAction(ISD::MSTORE, VT, Legal);
     }
 
     // Extract subvector is special because the value type
     // (result) is 128-bit but the source is 256-bit wide.
-    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
-                     MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
+    for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v8f16,
+                    MVT::v4f32, MVT::v2f64}) {
       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
     }
 
     // Custom lower several nodes for 256-bit types.
-    for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
-                    MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
-      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
-      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
-      setOperationAction(ISD::VSELECT,            VT, Custom);
-      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
+    for (MVT VT : {MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64, MVT::v16f16,
+                   MVT::v8f32, MVT::v4f64}) {
+      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+      setOperationAction(ISD::VSELECT, VT, Custom);
+      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
-      setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
-      setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Legal);
-      setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
-      setOperationAction(ISD::STORE,              VT, Custom);
+      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
+      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
+      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+      setOperationAction(ISD::STORE, VT, Custom);
     }
     setF16Action(MVT::v16f16, Expand);
     setOperationAction(ISD::FNEG, MVT::v16f16, Custom);
@@ -1713,21 +1714,21 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
       setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
 
-      for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
-                       MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
-        setOperationAction(ISD::MGATHER,  VT, Custom);
+      for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
+                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64})
+        setOperationAction(ISD::MGATHER, VT, Custom);
     }
   }
 
   if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
       Subtarget.hasF16C()) {
-    for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
-      setOperationAction(ISD::FP_ROUND,           VT, Custom);
-      setOperationAction(ISD::STRICT_FP_ROUND,    VT, Custom);
+    for (MVT VT : {MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16}) {
+      setOperationAction(ISD::FP_ROUND, VT, Custom);
+      setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom);
     }
-    for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
-      setOperationAction(ISD::FP_EXTEND,          VT, Custom);
-      setOperationAction(ISD::STRICT_FP_EXTEND,   VT, Custom);
+    for (MVT VT : {MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32}) {
+      setOperationAction(ISD::FP_EXTEND, VT, Custom);
+      setOperationAction(ISD::STRICT_FP_EXTEND, VT, Custom);
     }
     for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
       setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
@@ -1741,28 +1742,28 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   // available with AVX512. 512-bit vectors are in a separate block controlled
   // by useAVX512Regs.
   if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
-    addRegisterClass(MVT::v1i1,   &X86::VK1RegClass);
-    addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
-    addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
-    addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
-    addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
+    addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
+    addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
+    addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
+    addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
+    addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
 
-    setOperationAction(ISD::SELECT,             MVT::v1i1, Custom);
+    setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
-    setOperationAction(ISD::BUILD_VECTOR,       MVT::v1i1, Custom);
-
-    setOperationPromotedToType(ISD::FP_TO_SINT,        MVT::v8i1,  MVT::v8i32);
-    setOperationPromotedToType(ISD::FP_TO_UINT,        MVT::v8i1,  MVT::v8i32);
-    setOperationPromotedToType(ISD::FP_TO_SINT,        MVT::v4i1,  MVT::v4i32);
-    setOperationPromotedToType(ISD::FP_TO_UINT,        MVT::v4i1,  MVT::v4i32);
-    setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1,  MVT::v8i32);
-    setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1,  MVT::v8i32);
-    setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1,  MVT::v4i32);
-    setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1,  MVT::v4i32);
-    setOperationAction(ISD::FP_TO_SINT,                MVT::v2i1,  Custom);
-    setOperationAction(ISD::FP_TO_UINT,                MVT::v2i1,  Custom);
-    setOperationAction(ISD::STRICT_FP_TO_SINT,         MVT::v2i1,  Custom);
-    setOperationAction(ISD::STRICT_FP_TO_UINT,         MVT::v2i1,  Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
+
+    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
+    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
+    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
+    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
+    setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
+    setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
+    setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
+    setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
+    setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
+    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
+    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);
     setOperationAction(ISD::FCANONICALIZE, MVT::v8f16, Custom);
     setOperationAction(ISD::FCANONICALIZE, MVT::v16f16, Custom);
     setOperationAction(ISD::FCANONICALIZE, MVT::v32f16, Custom);
@@ -1781,29 +1782,29 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     }
 
     // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
-    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
+    for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
       setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
       setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
-      setOperationAction(ISD::ANY_EXTEND,  VT, Custom);
+      setOperationAction(ISD::ANY_EXTEND, VT, Custom);
     }
 
-    for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
-      setOperationAction(ISD::VSELECT,          VT, Expand);
+    for (auto VT : {MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1})
+      setOperationAction(ISD::VSELECT, VT, Expand);
 
-    for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
-      setOperationAction(ISD::SETCC,            VT, Custom);
-      setOperationAction(ISD::SELECT,           VT, Custom);
-      setOperationAction(ISD::TRUNCATE,         VT, Custom);
+    for (auto VT : {MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1}) {
+      setOperationAction(ISD::SETCC, VT, Custom);
+      setOperationAction(ISD::SELECT, VT, Custom);
+      setOperationAction(ISD::TRUNCATE, VT, Custom);
 
-      setOperationAction(ISD::BUILD_VECTOR,     VT, Custom);
-      setOperationAction(ISD::CONCAT_VECTORS,   VT, Custom);
+      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
       setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
       setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
-      setOperationAction(ISD::VECTOR_SHUFFLE,   VT,  Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
     }
 
-    for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
+    for (auto VT : {MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1})
       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
   }
   if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
@@ -1821,30 +1822,30 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 
     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
-    addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
-    addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
+    addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
+    addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
     addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
     addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
-    addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
+    addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
 
     for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
-      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8,  Legal);
+      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
       setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
-      setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i8,   Legal);
-      setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i16,  Legal);
-      setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i32,  Legal);
+      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
+      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
+      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
       if (HasBWI)
         setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
     }
 
-    for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
+    for (MVT VT : {MVT::v16f32, MVT::v8f64}) {
       setOperationAction(ISD::FMAXIMUM, VT, Custom);
       setOperationAction(ISD::FMINIMUM, VT, Custom);
       setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
       setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
-      setOperationAction(ISD::FNEG,  VT, Custom);
-      setOperationAction(ISD::FABS,  VT, Custom);
-      setOperationAction(ISD::FMA,   VT, Legal);
+      setOperationAction(ISD::FNEG, VT, Custom);
+      setOperationAction(ISD::FABS, VT, Custom);
+      setOperationAction(ISD::FMA, VT, Legal);
       setOperationAction(ISD::STRICT_FMA, VT, Legal);
       setOperationAction(ISD::FCOPYSIGN, VT, Custom);
       setOperationAction(ISD::FCANONICALIZE, VT, Custom);
@@ -1856,93 +1857,93 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     if (Subtarget.hasDQI())
       setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
 
-    for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
-      setOperationPromotedToType(ISD::FP_TO_SINT       , VT, MVT::v16i32);
-      setOperationPromotedToType(ISD::FP_TO_UINT       , VT, MVT::v16i32);
+    for (MVT VT : {MVT::v16i1, MVT::v16i8}) {
+      setOperationPromotedToType(ISD::FP_TO_SINT, VT, MVT::v16i32);
+      setOperationPromotedToType(ISD::FP_TO_UINT, VT, MVT::v16i32);
       setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
       setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
     }
 
-    for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
-      setOperationAction(ISD::FP_TO_SINT,        VT, Custom);
-      setOperationAction(ISD::FP_TO_UINT,        VT, Custom);
+    for (MVT VT : {MVT::v16i16, MVT::v16i32}) {
+      setOperationAction(ISD::FP_TO_SINT, VT, Custom);
+      setOperationAction(ISD::FP_TO_UINT, VT, Custom);
       setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
       setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
     }
 
-    setOperationAction(ISD::SINT_TO_FP,        MVT::v16i32, Custom);
-    setOperationAction(ISD::UINT_TO_FP,        MVT::v16i32, Custom);
+    setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Custom);
     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Custom);
     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Custom);
-    setOperationAction(ISD::FP_EXTEND,         MVT::v8f64,  Custom);
-    setOperationAction(ISD::STRICT_FP_EXTEND,  MVT::v8f64,  Custom);
-
-    setOperationAction(ISD::STRICT_FADD,      MVT::v16f32, Legal);
-    setOperationAction(ISD::STRICT_FADD,      MVT::v8f64,  Legal);
-    setOperationAction(ISD::STRICT_FSUB,      MVT::v16f32, Legal);
-    setOperationAction(ISD::STRICT_FSUB,      MVT::v8f64,  Legal);
-    setOperationAction(ISD::STRICT_FMUL,      MVT::v16f32, Legal);
-    setOperationAction(ISD::STRICT_FMUL,      MVT::v8f64,  Legal);
-    setOperationAction(ISD::STRICT_FDIV,      MVT::v16f32, Legal);
-    setOperationAction(ISD::STRICT_FDIV,      MVT::v8f64,  Legal);
-    setOperationAction(ISD::STRICT_FSQRT,     MVT::v16f32, Legal);
-    setOperationAction(ISD::STRICT_FSQRT,     MVT::v8f64,  Legal);
-    setOperationAction(ISD::STRICT_FP_ROUND,  MVT::v8f32,  Legal);
-
-    setTruncStoreAction(MVT::v8i64,   MVT::v8i8,   Legal);
-    setTruncStoreAction(MVT::v8i64,   MVT::v8i16,  Legal);
-    setTruncStoreAction(MVT::v8i64,   MVT::v8i32,  Legal);
-    setTruncStoreAction(MVT::v16i32,  MVT::v16i8,  Legal);
-    setTruncStoreAction(MVT::v16i32,  MVT::v16i16, Legal);
+    setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
+    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Custom);
+
+    setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
+    setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
+    setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal);
+    setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal);
+    setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal);
+    setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal);
+    setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal);
+    setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
+    setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
+    setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
+    setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
+
+    setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
+    setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
+    setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
+    setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
+    setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
     if (HasBWI)
-      setTruncStoreAction(MVT::v32i16,  MVT::v32i8, Legal);
+      setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
 
     // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
     // to 512-bit rather than use the AVX2 instructions so that we can use
     // k-masks.
     if (!Subtarget.hasVLX()) {
       for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
-           MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
-        setOperationAction(ISD::MLOAD,  VT, Custom);
+                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
+        setOperationAction(ISD::MLOAD, VT, Custom);
         setOperationAction(ISD::MSTORE, VT, Custom);
       }
     }
 
-    setOperationAction(ISD::TRUNCATE,    MVT::v8i32,  Legal);
-    setOperationAction(ISD::TRUNCATE,    MVT::v16i16, Legal);
-    setOperationAction(ISD::TRUNCATE,    MVT::v32i8,  HasBWI ? Legal : Custom);
+    setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal);
+    setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
+    setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
     setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
     setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
-    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64,  Custom);
-    setOperationAction(ISD::ANY_EXTEND,  MVT::v32i16, Custom);
-    setOperationAction(ISD::ANY_EXTEND,  MVT::v16i32, Custom);
-    setOperationAction(ISD::ANY_EXTEND,  MVT::v8i64,  Custom);
+    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
+    setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
+    setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
+    setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
     setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
     setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
-    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64,  Custom);
+    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
 
     if (HasBWI) {
       // Extends from v64i1 masks to 512-bit vectors.
-      setOperationAction(ISD::SIGN_EXTEND,        MVT::v64i8, Custom);
-      setOperationAction(ISD::ZERO_EXTEND,        MVT::v64i8, Custom);
-      setOperationAction(ISD::ANY_EXTEND,         MVT::v64i8, Custom);
-    }
-
-    for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
-      setOperationAction(ISD::FFLOOR,            VT, Legal);
-      setOperationAction(ISD::STRICT_FFLOOR,     VT, Legal);
-      setOperationAction(ISD::FCEIL,             VT, Legal);
-      setOperationAction(ISD::STRICT_FCEIL,      VT, Legal);
-      setOperationAction(ISD::FTRUNC,            VT, Legal);
-      setOperationAction(ISD::STRICT_FTRUNC,     VT, Legal);
-      setOperationAction(ISD::FRINT,             VT, Legal);
-      setOperationAction(ISD::STRICT_FRINT,      VT, Legal);
-      setOperationAction(ISD::FNEARBYINT,        VT, Legal);
+      setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
+      setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
+      setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
+    }
+
+    for (auto VT : {MVT::v16f32, MVT::v8f64}) {
+      setOperationAction(ISD::FFLOOR, VT, Legal);
+      setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
+      setOperationAction(ISD::FCEIL, VT, Legal);
+      setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
+      setOperationAction(ISD::FTRUNC, VT, Legal);
+      setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
+      setOperationAction(ISD::FRINT, VT, Legal);
+      setOperationAction(ISD::STRICT_FRINT, VT, Legal);
+      setOperationAction(ISD::FNEARBYINT, VT, Legal);
       setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
-      setOperationAction(ISD::FROUNDEVEN,        VT, Legal);
+      setOperationAction(ISD::FROUNDEVEN, VT, Legal);
       setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
 
-      setOperationAction(ISD::FROUND,            VT, Custom);
+      setOperationAction(ISD::FROUND, VT, Custom);
     }
 
     for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
@@ -1952,36 +1953,36 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 
     setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
     setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
-    setOperationAction(ISD::ADD, MVT::v64i8,  HasBWI ? Legal : Custom);
-    setOperationAction(ISD::SUB, MVT::v64i8,  HasBWI ? Legal : Custom);
+    setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
+    setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
 
-    setOperationAction(ISD::MUL, MVT::v8i64,  Custom);
+    setOperationAction(ISD::MUL, MVT::v8i64, Custom);
     setOperationAction(ISD::MUL, MVT::v16i32, Legal);
     setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
-    setOperationAction(ISD::MUL, MVT::v64i8,  Custom);
+    setOperationAction(ISD::MUL, MVT::v64i8, Custom);
 
     setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
     setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
     setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
     setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
-    setOperationAction(ISD::MULHS, MVT::v64i8,  Custom);
-    setOperationAction(ISD::MULHU, MVT::v64i8,  Custom);
+    setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
+    setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
     setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
-    setOperationAction(ISD::AVGCEILU, MVT::v64i8,  HasBWI ? Legal : Custom);
+    setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
 
     setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
     setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
 
-    for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
-      setOperationAction(ISD::SRL,              VT, Custom);
-      setOperationAction(ISD::SHL,              VT, Custom);
-      setOperationAction(ISD::SRA,              VT, Custom);
-      setOperationAction(ISD::ROTL,             VT, Custom);
-      setOperationAction(ISD::ROTR,             VT, Custom);
-      setOperationAction(ISD::SETCC,            VT, Custom);
-      setOperationAction(ISD::ABDS,             VT, Custom);
-      setOperationAction(ISD::ABDU,             VT, Custom);
-      setOperationAction(ISD::BITREVERSE,       VT, Custom);
+    for (auto VT : {MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
+      setOperationAction(ISD::SRL, VT, Custom);
+      setOperationAction(ISD::SHL, VT, Custom);
+      setOperationAction(ISD::SRA, VT, Custom);
+      setOperationAction(ISD::ROTL, VT, Custom);
+      setOperationAction(ISD::ROTR, VT, Custom);
+      setOperationAction(ISD::SETCC, VT, Custom);
+      setOperationAction(ISD::ABDS, VT, Custom);
+      setOperationAction(ISD::ABDU, VT, Custom);
+      setOperationAction(ISD::BITREVERSE, VT, Custom);
 
       // The condition codes aren't legal in SSE/AVX and under AVX512 we use
       // setcc all the way to isel and prefer SETGT in some isel patterns.
@@ -1989,82 +1990,83 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setCondCodeAction(ISD::SETLE, VT, Custom);
     }
 
-    setOperationAction(ISD::SETCC,          MVT::v8f64, Custom);
-    setOperationAction(ISD::SETCC,          MVT::v16f32, Custom);
-    setOperationAction(ISD::STRICT_FSETCC,  MVT::v8f64, Custom);
-    setOperationAction(ISD::STRICT_FSETCC,  MVT::v16f32, Custom);
+    setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
+    setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
+    setOperationAction(ISD::STRICT_FSETCC, MVT::v8f64, Custom);
+    setOperationAction(ISD::STRICT_FSETCC, MVT::v16f32, Custom);
     setOperationAction(ISD::STRICT_FSETCCS, MVT::v8f64, Custom);
     setOperationAction(ISD::STRICT_FSETCCS, MVT::v16f32, Custom);
 
-    for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
-      setOperationAction(ISD::SMAX,             VT, Legal);
-      setOperationAction(ISD::UMAX,             VT, Legal);
-      setOperationAction(ISD::SMIN,             VT, Legal);
-      setOperationAction(ISD::UMIN,             VT, Legal);
-      setOperationAction(ISD::ABS,              VT, Legal);
-      setOperationAction(ISD::CTPOP,            VT, Custom);
-    }
-
-    for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
-      setOperationAction(ISD::ABS,     VT, HasBWI ? Legal : Custom);
-      setOperationAction(ISD::CTPOP,   VT, Subtarget.hasBITALG() ? Legal : Custom);
-      setOperationAction(ISD::CTLZ,    VT, Custom);
-      setOperationAction(ISD::SMAX,    VT, HasBWI ? Legal : Custom);
-      setOperationAction(ISD::UMAX,    VT, HasBWI ? Legal : Custom);
-      setOperationAction(ISD::SMIN,    VT, HasBWI ? Legal : Custom);
-      setOperationAction(ISD::UMIN,    VT, HasBWI ? Legal : Custom);
+    for (auto VT : {MVT::v16i32, MVT::v8i64}) {
+      setOperationAction(ISD::SMAX, VT, Legal);
+      setOperationAction(ISD::UMAX, VT, Legal);
+      setOperationAction(ISD::SMIN, VT, Legal);
+      setOperationAction(ISD::UMIN, VT, Legal);
+      setOperationAction(ISD::ABS, VT, Legal);
+      setOperationAction(ISD::CTPOP, VT, Custom);
+    }
+
+    for (auto VT : {MVT::v64i8, MVT::v32i16}) {
+      setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
+      setOperationAction(ISD::CTPOP, VT,
+                         Subtarget.hasBITALG() ? Legal : Custom);
+      setOperationAction(ISD::CTLZ, VT, Custom);
+      setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
+      setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
+      setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
+      setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
       setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
       setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
       setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
       setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
     }
 
-    setOperationAction(ISD::FSHL,       MVT::v64i8, Custom);
-    setOperationAction(ISD::FSHR,       MVT::v64i8, Custom);
-    setOperationAction(ISD::FSHL,      MVT::v32i16, Custom);
-    setOperationAction(ISD::FSHR,      MVT::v32i16, Custom);
-    setOperationAction(ISD::FSHL,      MVT::v16i32, Custom);
-    setOperationAction(ISD::FSHR,      MVT::v16i32, Custom);
+    setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
+    setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
+    setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
+    setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
+    setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
+    setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
 
     if (Subtarget.hasDQI() || Subtarget.hasFP16())
       for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
                        ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
                        ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT})
-        setOperationAction(Opc,           MVT::v8i64, Custom);
+        setOperationAction(Opc, MVT::v8i64, Custom);
 
     if (Subtarget.hasDQI())
-      setOperationAction(ISD::MUL,        MVT::v8i64, Legal);
+      setOperationAction(ISD::MUL, MVT::v8i64, Legal);
 
     if (Subtarget.hasCDI()) {
       // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
-      for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
-        setOperationAction(ISD::CTLZ,            VT, Legal);
+      for (auto VT : {MVT::v16i32, MVT::v8i64}) {
+        setOperationAction(ISD::CTLZ, VT, Legal);
       }
     } // Subtarget.hasCDI()
 
     if (Subtarget.hasVPOPCNTDQ()) {
-      for (auto VT : { MVT::v16i32, MVT::v8i64 })
+      for (auto VT : {MVT::v16i32, MVT::v8i64})
         setOperationAction(ISD::CTPOP, VT, Legal);
     }
 
     // Extract subvector is special because the value type
     // (result) is 256-bit but the source is 512-bit wide.
     // 128-bit was made Legal under AVX1.
-    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
-                     MVT::v16f16, MVT::v8f32, MVT::v4f64 })
+    for (auto VT : {MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
+                    MVT::v16f16, MVT::v8f32, MVT::v4f64})
       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
 
-    for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
-                     MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
-      setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
-      setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Legal);
-      setOperationAction(ISD::SELECT,             VT, Custom);
-      setOperationAction(ISD::VSELECT,            VT, Custom);
-      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
+    for (auto VT : {MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
+                    MVT::v32f16, MVT::v16f32, MVT::v8f64}) {
+      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
+      setOperationAction(ISD::SELECT, VT, Custom);
+      setOperationAction(ISD::VSELECT, VT, Custom);
+      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
-      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
-      setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
-      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
+      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
     }
     setF16Action(MVT::v32f16, Expand);
     setOperationAction(ISD::FP_ROUND, MVT::v16f16, Custom);
@@ -2075,20 +2077,20 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
     setOperationAction(ISD::SETCC, MVT::v32f16, Custom);
 
-    for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
-      setOperationAction(ISD::MLOAD,               VT, Legal);
-      setOperationAction(ISD::MSTORE,              VT, Legal);
-      setOperationAction(ISD::MGATHER,             VT, Custom);
-      setOperationAction(ISD::MSCATTER,            VT, Custom);
+    for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64}) {
+      setOperationAction(ISD::MLOAD, VT, Legal);
+      setOperationAction(ISD::MSTORE, VT, Legal);
+      setOperationAction(ISD::MGATHER, VT, Custom);
+      setOperationAction(ISD::MSCATTER, VT, Custom);
     }
     if (HasBWI) {
-      for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
-        setOperationAction(ISD::MLOAD,        VT, Legal);
-        setOperationAction(ISD::MSTORE,       VT, Legal);
+      for (auto VT : {MVT::v64i8, MVT::v32i16}) {
+        setOperationAction(ISD::MLOAD, VT, Legal);
+        setOperationAction(ISD::MSTORE, VT, Legal);
       }
     } else {
       setOperationAction(ISD::STORE, MVT::v32i16, Custom);
-      setOperationAction(ISD::STORE, MVT::v64i8,  Custom);
+      setOperationAction(ISD::STORE, MVT::v64i8, Custom);
     }
 
     if (Subtarget.hasVBMI2()) {
@@ -2104,7 +2106,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
     setOperationAction(ISD::FABS, MVT::v32f16, Custom);
     setOperationAction(ISD::FCOPYSIGN, MVT::v32f16, Custom);
-  }// useAVX512Regs
+  } // useAVX512Regs
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
     for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
@@ -2121,9 +2123,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     // These operations are handled on non-VLX by artificially widening in
     // isel patterns.
 
-    setOperationAction(ISD::STRICT_FP_TO_UINT,  MVT::v8i32, Custom);
-    setOperationAction(ISD::STRICT_FP_TO_UINT,  MVT::v4i32, Custom);
-    setOperationAction(ISD::STRICT_FP_TO_UINT,  MVT::v2i32, Custom);
+    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, Custom);
+    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Custom);
+    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
 
     if (Subtarget.hasDQI()) {
       // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
@@ -2132,31 +2134,31 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
              isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
              "Unexpected operation action!");
       // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
-      setOperationAction(ISD::FP_TO_SINT,        MVT::v2f32, Custom);
-      setOperationAction(ISD::FP_TO_UINT,        MVT::v2f32, Custom);
+      setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
+      setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
       setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
       setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
     }
 
-    for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
+    for (auto VT : {MVT::v2i64, MVT::v4i64}) {
       setOperationAction(ISD::SMAX, VT, Legal);
       setOperationAction(ISD::UMAX, VT, Legal);
       setOperationAction(ISD::SMIN, VT, Legal);
       setOperationAction(ISD::UMIN, VT, Legal);
-      setOperationAction(ISD::ABS,  VT, Legal);
+      setOperationAction(ISD::ABS, VT, Legal);
     }
 
-    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
-      setOperationAction(ISD::ROTL,     VT, Custom);
-      setOperationAction(ISD::ROTR,     VT, Custom);
+    for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64}) {
+      setOperationAction(ISD::ROTL, VT, Custom);
+      setOperationAction(ISD::ROTR, VT, Custom);
     }
 
     // Custom legalize 2x32 to get a little better code.
     setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
     setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
 
-    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
-                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
+    for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32,
+                    MVT::v8f32, MVT::v2f64, MVT::v4f64})
       setOperationAction(ISD::MSCATTER, VT, Custom);
 
     if (Subtarget.hasDQI()) {
@@ -2171,13 +2173,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     }
 
     if (Subtarget.hasCDI()) {
-      for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
-        setOperationAction(ISD::CTLZ,            VT, Legal);
+      for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64}) {
+        setOperationAction(ISD::CTLZ, VT, Legal);
       }
     } // Subtarget.hasCDI()
 
     if (Subtarget.hasVPOPCNTDQ()) {
-      for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
+      for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64})
         setOperationAction(ISD::CTPOP, VT, Legal);
     }
 
@@ -2214,32 +2216,32 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   // This block controls legalization of v32i1/v64i1, which are available with
   // AVX512BW.
   // This block controls legalization of v32i1/v64i1, which are available with
   if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
-    addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
-    addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
+    addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
+    addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
 
-    for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
-      setOperationAction(ISD::VSELECT,            VT, Expand);
-      setOperationAction(ISD::TRUNCATE,           VT, Custom);
-      setOperationAction(ISD::SETCC,              VT, Custom);
+    for (auto VT : {MVT::v32i1, MVT::v64i1}) {
+      setOperationAction(ISD::VSELECT, VT, Expand);
+      setOperationAction(ISD::TRUNCATE, VT, Custom);
+      setOperationAction(ISD::SETCC, VT, Custom);
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
-      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
-      setOperationAction(ISD::SELECT,             VT, Custom);
-      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
-      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
-      setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
-      setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
+      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+      setOperationAction(ISD::SELECT, VT, Custom);
+      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
     }
 
-    for (auto VT : { MVT::v16i1, MVT::v32i1 })
+    for (auto VT : {MVT::v16i1, MVT::v32i1})
       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
 
     // Extends from v32i1 masks to 256-bit vectors.
-    setOperationAction(ISD::SIGN_EXTEND,        MVT::v32i8, Custom);
-    setOperationAction(ISD::ZERO_EXTEND,        MVT::v32i8, Custom);
-    setOperationAction(ISD::ANY_EXTEND,         MVT::v32i8, Custom);
+    setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
+    setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
+    setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
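(The v32i1-to-v32i8 extension mentioned in the comment is a single VPMOVM2B when both BW and VL are available; a small sketch, illustrative only, assuming AVX512BW+AVX512VL.)

#include <immintrin.h>

// Sign-extend a 32-bit mask register to 32 bytes of 0x00/0xFF.
static inline __m256i mask_to_bytes(__mmask32 k) {
  return _mm256_movm_epi8(k);
}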
 
-    for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
-      setOperationAction(ISD::MLOAD,  VT, Subtarget.hasVLX() ? Legal : Custom);
+    for (auto VT : {MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16}) {
+      setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
       setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
     }
 
@@ -2248,119 +2250,119 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
 
     if (Subtarget.hasBITALG()) {
-      for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
+      for (auto VT : {MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16})
         setOperationAction(ISD::CTPOP, VT, Legal);
     }
   }
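(With BITALG the Legal ISD::CTPOP on byte/word vectors is a single VPOPCNTB/VPOPCNTW; an intrinsic-level sketch, illustrative only, assuming AVX512BITALG+AVX512VL.)

#include <immintrin.h>

// Per-byte population count across a 256-bit vector: one VPOPCNTB.
static inline __m256i popcnt_bytes(__m256i v) {
  return _mm256_popcnt_epi8(v);
}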
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
-    auto setGroup = [&] (MVT VT) {
-      setOperationAction(ISD::FADD,               VT, Legal);
-      setOperationAction(ISD::STRICT_FADD,        VT, Legal);
-      setOperationAction(ISD::FSUB,               VT, Legal);
-      setOperationAction(ISD::STRICT_FSUB,        VT, Legal);
-      setOperationAction(ISD::FMUL,               VT, Legal);
-      setOperationAction(ISD::STRICT_FMUL,        VT, Legal);
-      setOperationAction(ISD::FDIV,               VT, Legal);
-      setOperationAction(ISD::STRICT_FDIV,        VT, Legal);
-      setOperationAction(ISD::FSQRT,              VT, Legal);
-      setOperationAction(ISD::STRICT_FSQRT,       VT, Legal);
-
-      setOperationAction(ISD::FFLOOR,             VT, Legal);
-      setOperationAction(ISD::STRICT_FFLOOR,      VT, Legal);
-      setOperationAction(ISD::FCEIL,              VT, Legal);
-      setOperationAction(ISD::STRICT_FCEIL,       VT, Legal);
-      setOperationAction(ISD::FTRUNC,             VT, Legal);
-      setOperationAction(ISD::STRICT_FTRUNC,      VT, Legal);
-      setOperationAction(ISD::FRINT,              VT, Legal);
-      setOperationAction(ISD::STRICT_FRINT,       VT, Legal);
-      setOperationAction(ISD::FNEARBYINT,         VT, Legal);
-      setOperationAction(ISD::STRICT_FNEARBYINT,  VT, Legal);
+    auto setGroup = [&](MVT VT) {
+      setOperationAction(ISD::FADD, VT, Legal);
+      setOperationAction(ISD::STRICT_FADD, VT, Legal);
+      setOperationAction(ISD::FSUB, VT, Legal);
+      setOperationAction(ISD::STRICT_FSUB, VT, Legal);
+      setOperationAction(ISD::FMUL, VT, Legal);
+      setOperationAction(ISD::STRICT_FMUL, VT, Legal);
+      setOperationAction(ISD::FDIV, VT, Legal);
+      setOperationAction(ISD::STRICT_FDIV, VT, Legal);
+      setOperationAction(ISD::FSQRT, VT, Legal);
+      setOperationAction(ISD::STRICT_FSQRT, VT, Legal);
+
+      setOperationAction(ISD::FFLOOR, VT, Legal);
+      setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
+      setOperationAction(ISD::FCEIL, VT, Legal);
+      setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
+      setOperationAction(ISD::FTRUNC, VT, Legal);
+      setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
+      setOperationAction(ISD::FRINT, VT, Legal);
+      setOperationAction(ISD::STRICT_FRINT, VT, Legal);
+      setOperationAction(ISD::FNEARBYINT, VT, Legal);
+      setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
       setOperationAction(ISD::FROUNDEVEN, VT, Legal);
       setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
 
-      setOperationAction(ISD::FROUND,             VT, Custom);
+      setOperationAction(ISD::FROUND, VT, Custom);
 
-      setOperationAction(ISD::LOAD,               VT, Legal);
-      setOperationAction(ISD::STORE,              VT, Legal);
+      setOperationAction(ISD::LOAD, VT, Legal);
+      setOperationAction(ISD::STORE, VT, Legal);
 
-      setOperationAction(ISD::FMA,                VT, Legal);
-      setOperationAction(ISD::STRICT_FMA,         VT, Legal);
-      setOperationAction(ISD::VSELECT,            VT, Legal);
-      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
-      setOperationAction(ISD::SELECT,             VT, Custom);
+      setOperationAction(ISD::FMA, VT, Legal);
+      setOperationAction(ISD::STRICT_FMA, VT, Legal);
+      setOperationAction(ISD::VSELECT, VT, Legal);
+      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+      setOperationAction(ISD::SELECT, VT, Custom);
 
-      setOperationAction(ISD::FNEG,               VT, Custom);
-      setOperationAction(ISD::FABS,               VT, Custom);
-      setOperationAction(ISD::FCOPYSIGN,          VT, Custom);
+      setOperationAction(ISD::FNEG, VT, Custom);
+      setOperationAction(ISD::FABS, VT, Custom);
+      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
-      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
 
-      setOperationAction(ISD::SETCC,              VT, Custom);
-      setOperationAction(ISD::STRICT_FSETCC,      VT, Custom);
-      setOperationAction(ISD::STRICT_FSETCCS,     VT, Custom);
+      setOperationAction(ISD::SETCC, VT, Custom);
+      setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
+      setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
     };
 
     // AVX512_FP16 scalar operations
     setGroup(MVT::f16);
-    setOperationAction(ISD::FREM,                 MVT::f16, Promote);
-    setOperationAction(ISD::STRICT_FREM,          MVT::f16, Promote);
-    setOperationAction(ISD::SELECT_CC,            MVT::f16, Expand);
-    setOperationAction(ISD::BR_CC,                MVT::f16, Expand);
-    setOperationAction(ISD::STRICT_FROUND,        MVT::f16, Promote);
-    setOperationAction(ISD::FROUNDEVEN,           MVT::f16, Legal);
-    setOperationAction(ISD::STRICT_FROUNDEVEN,    MVT::f16, Legal);
-    setOperationAction(ISD::FP_ROUND,             MVT::f16, Custom);
-    setOperationAction(ISD::STRICT_FP_ROUND,      MVT::f16, Custom);
-    setOperationAction(ISD::FMAXIMUM,             MVT::f16, Custom);
-    setOperationAction(ISD::FMINIMUM,             MVT::f16, Custom);
-    setOperationAction(ISD::FMAXIMUMNUM,          MVT::f16, Custom);
-    setOperationAction(ISD::FMINIMUMNUM,          MVT::f16, Custom);
-    setOperationAction(ISD::FP_EXTEND,            MVT::f32, Legal);
-    setOperationAction(ISD::STRICT_FP_EXTEND,     MVT::f32, Legal);
-    setOperationAction(ISD::LRINT,                MVT::f16, Legal);
-    setOperationAction(ISD::LLRINT,               MVT::f16, Legal);
+    setOperationAction(ISD::FREM, MVT::f16, Promote);
+    setOperationAction(ISD::STRICT_FREM, MVT::f16, Promote);
+    setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
+    setOperationAction(ISD::BR_CC, MVT::f16, Expand);
+    setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote);
+    setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
+    setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Legal);
+    setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
+    setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
+    setOperationAction(ISD::FMAXIMUM, MVT::f16, Custom);
+    setOperationAction(ISD::FMINIMUM, MVT::f16, Custom);
+    setOperationAction(ISD::FMAXIMUMNUM, MVT::f16, Custom);
+    setOperationAction(ISD::FMINIMUMNUM, MVT::f16, Custom);
+    setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
+    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
+    setOperationAction(ISD::LRINT, MVT::f16, Legal);
+    setOperationAction(ISD::LLRINT, MVT::f16, Legal);
 
     setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand);
     setCondCodeAction(ISD::SETUNE, MVT::f16, Expand);
 
     if (Subtarget.useAVX512Regs()) {
       setGroup(MVT::v32f16);
-      setOperationAction(ISD::SCALAR_TO_VECTOR,       MVT::v32f16, Custom);
-      setOperationAction(ISD::SINT_TO_FP,             MVT::v32i16, Legal);
-      setOperationAction(ISD::STRICT_SINT_TO_FP,      MVT::v32i16, Legal);
-      setOperationAction(ISD::UINT_TO_FP,             MVT::v32i16, Legal);
-      setOperationAction(ISD::STRICT_UINT_TO_FP,      MVT::v32i16, Legal);
-      setOperationAction(ISD::FP_ROUND,               MVT::v16f16, Legal);
-      setOperationAction(ISD::STRICT_FP_ROUND,        MVT::v16f16, Legal);
-      setOperationAction(ISD::FP_EXTEND,              MVT::v16f32, Custom);
-      setOperationAction(ISD::STRICT_FP_EXTEND,       MVT::v16f32, Legal);
-      setOperationAction(ISD::FP_EXTEND,              MVT::v8f64,  Custom);
-      setOperationAction(ISD::STRICT_FP_EXTEND,       MVT::v8f64,  Legal);
-      setOperationAction(ISD::INSERT_VECTOR_ELT,      MVT::v32f16, Custom);
-
-      setOperationAction(ISD::FP_TO_SINT,             MVT::v32i16, Custom);
-      setOperationAction(ISD::STRICT_FP_TO_SINT,      MVT::v32i16, Custom);
-      setOperationAction(ISD::FP_TO_UINT,             MVT::v32i16, Custom);
-      setOperationAction(ISD::STRICT_FP_TO_UINT,      MVT::v32i16, Custom);
-      setOperationPromotedToType(ISD::FP_TO_SINT,     MVT::v32i8,  MVT::v32i16);
+      setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32f16, Custom);
+      setOperationAction(ISD::SINT_TO_FP, MVT::v32i16, Legal);
+      setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v32i16, Legal);
+      setOperationAction(ISD::UINT_TO_FP, MVT::v32i16, Legal);
+      setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v32i16, Legal);
+      setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
+      setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Legal);
+      setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
+      setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
+      setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
+      setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
+      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom);
+
+      setOperationAction(ISD::FP_TO_SINT, MVT::v32i16, Custom);
+      setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v32i16, Custom);
+      setOperationAction(ISD::FP_TO_UINT, MVT::v32i16, Custom);
+      setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v32i16, Custom);
+      setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
       setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i8,
                                  MVT::v32i16);
-      setOperationPromotedToType(ISD::FP_TO_UINT,     MVT::v32i8,  MVT::v32i16);
+      setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
       setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i8,
                                  MVT::v32i16);
-      setOperationPromotedToType(ISD::FP_TO_SINT,     MVT::v32i1,  MVT::v32i16);
+      setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
       setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i1,
                                  MVT::v32i16);
-      setOperationPromotedToType(ISD::FP_TO_UINT,     MVT::v32i1,  MVT::v32i16);
+      setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
       setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i1,
                                  MVT::v32i16);
 
-      setOperationAction(ISD::EXTRACT_SUBVECTOR,      MVT::v16f16, Legal);
-      setOperationAction(ISD::INSERT_SUBVECTOR,       MVT::v32f16, Legal);
-      setOperationAction(ISD::CONCAT_VECTORS,         MVT::v32f16, Custom);
+      setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f16, Legal);
+      setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32f16, Legal);
+      setOperationAction(ISD::CONCAT_VECTORS, MVT::v32f16, Custom);
 
-      setLoadExtAction(ISD::EXTLOAD, MVT::v8f64,  MVT::v8f16,  Legal);
+      setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
       setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
 
       setOperationAction(ISD::FMINIMUM, MVT::v32f16, Custom);
@@ -2375,35 +2377,35 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setGroup(MVT::v8f16);
       setGroup(MVT::v16f16);
 
-      setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8f16,  Legal);
-      setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16f16, Custom);
-      setOperationAction(ISD::SINT_TO_FP,         MVT::v16i16, Legal);
-      setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v16i16, Legal);
-      setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16,  Legal);
-      setOperationAction(ISD::STRICT_SINT_TO_FP,  MVT::v8i16,  Legal);
-      setOperationAction(ISD::UINT_TO_FP,         MVT::v16i16, Legal);
-      setOperationAction(ISD::STRICT_UINT_TO_FP,  MVT::v16i16, Legal);
-      setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16,  Legal);
-      setOperationAction(ISD::STRICT_UINT_TO_FP,  MVT::v8i16,  Legal);
-
-      setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Custom);
-      setOperationAction(ISD::STRICT_FP_TO_SINT,  MVT::v8i16, Custom);
-      setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Custom);
-      setOperationAction(ISD::STRICT_FP_TO_UINT,  MVT::v8i16, Custom);
-      setOperationAction(ISD::FP_ROUND,           MVT::v8f16, Legal);
-      setOperationAction(ISD::STRICT_FP_ROUND,    MVT::v8f16, Legal);
-      setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Custom);
-      setOperationAction(ISD::STRICT_FP_EXTEND,   MVT::v8f32, Legal);
-      setOperationAction(ISD::FP_EXTEND,          MVT::v4f64, Custom);
-      setOperationAction(ISD::STRICT_FP_EXTEND,   MVT::v4f64, Legal);
+      setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8f16, Legal);
+      setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16f16, Custom);
+      setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Legal);
+      setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i16, Legal);
+      setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Legal);
+      setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i16, Legal);
+      setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Legal);
+      setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i16, Legal);
+      setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Legal);
+      setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i16, Legal);
+
+      setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
+      setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom);
+      setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
+      setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom);
+      setOperationAction(ISD::FP_ROUND, MVT::v8f16, Legal);
+      setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal);
+      setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Custom);
+      setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
+      setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
+      setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
 
       // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
-      setOperationAction(ISD::INSERT_VECTOR_ELT,    MVT::v8f16,  Custom);
-      setOperationAction(ISD::INSERT_VECTOR_ELT,    MVT::v16f16, Custom);
+      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f16, Custom);
+      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16f16, Custom);
 
-      setOperationAction(ISD::EXTRACT_SUBVECTOR,    MVT::v8f16, Legal);
-      setOperationAction(ISD::INSERT_SUBVECTOR,     MVT::v16f16, Legal);
-      setOperationAction(ISD::CONCAT_VECTORS,       MVT::v16f16, Custom);
+      setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f16, Legal);
+      setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16f16, Legal);
+      setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f16, Custom);
 
       setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
       setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
@@ -2411,7 +2413,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
 
       // Need to custom widen these to prevent scalarization.
-      setOperationAction(ISD::LOAD,  MVT::v4f16, Custom);
+      setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
       setOperationAction(ISD::STORE, MVT::v4f16, Custom);
 
       setOperationAction(ISD::FMINIMUM, MVT::v8f16, Custom);
@@ -2506,52 +2508,52 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   }
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
-    setTruncStoreAction(MVT::v4i64, MVT::v4i8,  Legal);
+    setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
     setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
     setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
-    setTruncStoreAction(MVT::v8i32, MVT::v8i8,  Legal);
+    setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
     setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
 
-    setTruncStoreAction(MVT::v2i64, MVT::v2i8,  Legal);
+    setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
     setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
     setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
-    setTruncStoreAction(MVT::v4i32, MVT::v4i8,  Legal);
+    setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
     setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
 
     if (Subtarget.hasBWI()) {
-      setTruncStoreAction(MVT::v16i16,  MVT::v16i8, Legal);
-      setTruncStoreAction(MVT::v8i16,   MVT::v8i8,  Legal);
+      setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
+      setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
     }
 
     if (Subtarget.hasFP16()) {
       // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
-      setOperationAction(ISD::FP_TO_SINT,        MVT::v2f16, Custom);
+      setOperationAction(ISD::FP_TO_SINT, MVT::v2f16, Custom);
       setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f16, Custom);
-      setOperationAction(ISD::FP_TO_UINT,        MVT::v2f16, Custom);
+      setOperationAction(ISD::FP_TO_UINT, MVT::v2f16, Custom);
       setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f16, Custom);
-      setOperationAction(ISD::FP_TO_SINT,        MVT::v4f16, Custom);
+      setOperationAction(ISD::FP_TO_SINT, MVT::v4f16, Custom);
       setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4f16, Custom);
-      setOperationAction(ISD::FP_TO_UINT,        MVT::v4f16, Custom);
+      setOperationAction(ISD::FP_TO_UINT, MVT::v4f16, Custom);
       setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4f16, Custom);
       // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
-      setOperationAction(ISD::SINT_TO_FP,        MVT::v2f16, Custom);
+      setOperationAction(ISD::SINT_TO_FP, MVT::v2f16, Custom);
       setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f16, Custom);
-      setOperationAction(ISD::UINT_TO_FP,        MVT::v2f16, Custom);
+      setOperationAction(ISD::UINT_TO_FP, MVT::v2f16, Custom);
       setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f16, Custom);
-      setOperationAction(ISD::SINT_TO_FP,        MVT::v4f16, Custom);
+      setOperationAction(ISD::SINT_TO_FP, MVT::v4f16, Custom);
       setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4f16, Custom);
-      setOperationAction(ISD::UINT_TO_FP,        MVT::v4f16, Custom);
+      setOperationAction(ISD::UINT_TO_FP, MVT::v4f16, Custom);
       setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4f16, Custom);
       // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
-      setOperationAction(ISD::FP_ROUND,          MVT::v2f16, Custom);
-      setOperationAction(ISD::STRICT_FP_ROUND,   MVT::v2f16, Custom);
-      setOperationAction(ISD::FP_ROUND,          MVT::v4f16, Custom);
-      setOperationAction(ISD::STRICT_FP_ROUND,   MVT::v4f16, Custom);
+      setOperationAction(ISD::FP_ROUND, MVT::v2f16, Custom);
+      setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f16, Custom);
+      setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
+      setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f16, Custom);
       // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
-      setOperationAction(ISD::FP_EXTEND,         MVT::v2f16, Custom);
-      setOperationAction(ISD::STRICT_FP_EXTEND,  MVT::v2f16, Custom);
-      setOperationAction(ISD::FP_EXTEND,         MVT::v4f16, Custom);
-      setOperationAction(ISD::STRICT_FP_EXTEND,  MVT::v4f16, Custom);
+      setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom);
+      setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f16, Custom);
+      setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom);
+      setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f16, Custom);
     }
   }
 
@@ -2573,7 +2575,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   // FIXME: We really should do custom legalization for addition and
   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
   // than generic legalization for 64-bit multiplication-with-overflow, though.
-  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
+  for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
     if (VT == MVT::i64 && !Subtarget.is64Bit())
       continue;
     // Add/Sub/Mul with overflow operations are custom lowered.
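(The custom-lowered *_WITH_OVERFLOW nodes referenced here are what the usual overflow builtins compile down to; a minimal source-level example, not part of this patch.)

#include <cstdint>

// llvm.sadd.with.overflow.i32: on x86 the boolean comes straight from EFLAGS
// (ADD followed by SETO/JO), which is why these nodes are custom lowered.
bool add_overflows(int32_t a, int32_t b, int32_t &sum) {
  return __builtin_add_overflow(a, b, &sum);
}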
@@ -2853,8 +2855,9 @@ static bool isLogicOp(unsigned Opcode) {
 }
 
 static bool isTargetShuffle(unsigned Opcode) {
-  switch(Opcode) {
-  default: return false;
+  switch (Opcode) {
+  default:
+    return false;
   case X86ISD::BLENDI:
   case X86ISD::PSHUFB:
   case X86ISD::PSHUFD:
@@ -2895,7 +2898,8 @@ static bool isTargetShuffle(unsigned Opcode) {
 
 static bool isTargetShuffleVariableMask(unsigned Opcode) {
   switch (Opcode) {
-  default: return false;
+  default:
+    return false;
   // Target Shuffles.
   case X86ISD::PSHUFB:
   case X86ISD::VPERMILPV:
@@ -2921,9 +2925,8 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
   if (ReturnAddrIndex == 0) {
     // Set up a frame object for the return address.
     unsigned SlotSize = RegInfo->getSlotSize();
-    ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
-                                                          -(int64_t)SlotSize,
-                                                          false);
+    ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(
+        SlotSize, -(int64_t)SlotSize, false);
     FuncInfo->setRAIndex(ReturnAddrIndex);
   }
 
@@ -2981,7 +2984,7 @@ static bool isX86CCSigned(X86::CondCode X86CC) {
 
 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
   switch (SetCCOpcode) {
-  // clang-format off
+    // clang-format off
   default: llvm_unreachable("Invalid integer condition!");
   case ISD::SETEQ:  return X86::COND_E;
   case ISD::SETGT:  return X86::COND_G;
@@ -2993,7 +2996,7 @@ static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
   case ISD::SETUGT: return X86::COND_A;
   case ISD::SETULE: return X86::COND_BE;
   case ISD::SETUGE: return X86::COND_AE;
-  // clang-format on
+    // clang-format on
   }
 }
 
@@ -3031,14 +3034,14 @@ static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
   // First determine if it is required or is profitable to flip the operands.
 
   // If LHS is a foldable load, but RHS is not, flip the condition.
-  if (ISD::isNON_EXTLoad(LHS.getNode()) &&
-      !ISD::isNON_EXTLoad(RHS.getNode())) {
+  if (ISD::isNON_EXTLoad(LHS.getNode()) && !ISD::isNON_EXTLoad(RHS.getNode())) {
     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
     std::swap(LHS, RHS);
   }
 
   switch (SetCCOpcode) {
-  default: break;
+  default:
+    break;
   case ISD::SETOLT:
   case ISD::SETOLE:
   case ISD::SETUGT:
@@ -3054,7 +3057,7 @@ static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
   //  1 | 0 | 0 | X == Y
   //  1 | 1 | 1 | unordered
   switch (SetCCOpcode) {
-  // clang-format off
+    // clang-format off
   default: llvm_unreachable("Condcode should be pre-legalized away");
   case ISD::SETUEQ:
   case ISD::SETEQ:   return X86::COND_E;
@@ -3076,7 +3079,7 @@ static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
   case ISD::SETO:    return X86::COND_NP;
   case ISD::SETOEQ:
   case ISD::SETUNE:  return X86::COND_INVALID;
-  // clang-format on
+    // clang-format on
   }
 }
 
@@ -3111,7 +3114,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   Info.flags = MachineMemOperand::MONone;
   Info.offset = 0;
 
-  const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
+  const IntrinsicData *IntrData = getIntrinsicWithChain(Intrinsic);
   if (!IntrData) {
     switch (Intrinsic) {
     case Intrinsic::x86_aesenc128kl:
@@ -3204,7 +3207,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   case TRUNCATE_TO_MEM_VI32: {
     Info.opc = ISD::INTRINSIC_VOID;
     Info.ptrVal = I.getArgOperand(0);
-    MVT VT  = MVT::getVT(I.getArgOperand(1)->getType());
+    MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
     MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
     if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
       ScalarVT = MVT::i8;
@@ -3224,8 +3227,8 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.ptrVal = nullptr;
     MVT DataVT = MVT::getVT(I.getType());
     MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
-    unsigned NumElts = std::min(DataVT.getVectorNumElements(),
-                                IndexVT.getVectorNumElements());
+    unsigned NumElts =
+        std::min(DataVT.getVectorNumElements(), IndexVT.getVectorNumElements());
     Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
     Info.align = Align(1);
     Info.flags |= MachineMemOperand::MOLoad;
@@ -3236,8 +3239,8 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.ptrVal = nullptr;
     MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
     MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
-    unsigned NumElts = std::min(DataVT.getVectorNumElements(),
-                                IndexVT.getVectorNumElements());
+    unsigned NumElts =
+        std::min(DataVT.getVectorNumElements(), IndexVT.getVectorNumElements());
     Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
     Info.align = Align(1);
     Info.flags |= MachineMemOperand::MOStore;
@@ -3396,8 +3399,9 @@ bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
   // Mask vectors support all subregister combinations and operations that
   // extract half of vector.
   if (ResVT.getVectorElementType() == MVT::i1)
-    return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
-                          (Index == ResVT.getVectorNumElements()));
+    return Index == 0 ||
+           ((ResVT.getSizeInBits() == SrcVT.getSizeInBits() * 2) &&
+            (Index == ResVT.getVectorNumElements()));
 
   return (Index % ResVT.getVectorNumElements()) == 0;
 }
@@ -3457,9 +3461,9 @@ bool X86TargetLowering::isScalarFPTypeInSSEReg(EVT VT) const {
          (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
 }
 
-bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
-                                                const SelectionDAG &DAG,
-                                                const MachineMemOperand &MMO) const {
+bool X86TargetLowering::isLoadBitCastBeneficial(
+    EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG,
+    const MachineMemOperand &MMO) const {
   if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
       BitcastVT.getVectorElementType() == MVT::i1)
     return false;
@@ -3468,8 +3472,8 @@ bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
     return false;
 
   // If both types are legal vectors, it's always ok to convert them.
-  if (LoadVT.isVector() && BitcastVT.isVector() &&
-      isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
+  if (LoadVT.isVector() && BitcastVT.isVector() && isTypeLegal(LoadVT) &&
+      isTypeLegal(BitcastVT))
     return true;
 
   return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
@@ -3493,9 +3497,7 @@ bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
   return true;
 }
 
-bool X86TargetLowering::isCtlzFast() const {
-  return Subtarget.hasFastLZCNT();
-}
+bool X86TargetLowering::isCtlzFast() const { return Subtarget.hasFastLZCNT(); }
 
 bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
     const Instruction &AndI) const {
@@ -3917,8 +3919,7 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask,
   return true;
 }
 
-static bool canWidenShuffleElements(ArrayRef<int> Mask,
-                                    const APInt &Zeroable,
+static bool canWidenShuffleElements(ArrayRef<int> Mask, const APInt &Zeroable,
                                     bool V2IsZero,
                                     SmallVectorImpl<int> &WidenedMask) {
   // Create an alternative mask with info about zeroable elements.
@@ -4002,7 +4003,7 @@ bool X86::isZeroNode(SDValue Elt) {
 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
                               const SDLoc &dl, bool IsMask = false) {
 
-  SmallVector<SDValue, 32>  Ops;
+  SmallVector<SDValue, 32> Ops;
   bool Split = false;
 
   MVT ConstVecVT = VT;
@@ -4016,12 +4017,12 @@ static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
   MVT EltVT = ConstVecVT.getVectorElementType();
   for (unsigned i = 0; i < NumElts; ++i) {
     bool IsUndef = Values[i] < 0 && IsMask;
-    SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
-      DAG.getConstant(Values[i], dl, EltVT);
+    SDValue OpNode =
+        IsUndef ? DAG.getUNDEF(EltVT) : DAG.getConstant(Values[i], dl, EltVT);
     Ops.push_back(OpNode);
     if (Split)
-      Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
-                    DAG.getConstant(0, dl, EltVT));
+      Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT)
+                            : DAG.getConstant(0, dl, EltVT));
   }
   SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
   if (Split)
@@ -4029,8 +4030,8 @@ static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
   return ConstsNode;
 }
 
-static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
-                              MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
+static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs, MVT VT,
+                              SelectionDAG &DAG, const SDLoc &dl) {
   assert(Bits.size() == Undefs.getBitWidth() &&
          "Unequal constant and undef arrays");
   SmallVector<SDValue, 32> Ops;
@@ -4065,8 +4066,8 @@ static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
   return DAG.getBitcast(VT, ConstsNode);
 }
 
-static SDValue getConstVector(ArrayRef<APInt> Bits, MVT VT,
-                              SelectionDAG &DAG, const SDLoc &dl) {
+static SDValue getConstVector(ArrayRef<APInt> Bits, MVT VT, SelectionDAG &DAG,
+                              const SDLoc &dl) {
   APInt Undefs = APInt::getZero(Bits.size());
   return getConstVector(Bits, Undefs, VT, DAG, dl);
 }
@@ -4165,7 +4166,8 @@ static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
                                    SelectionDAG &DAG, const SDLoc &dl) {
   assert((Vec.getValueType().is256BitVector() ||
-          Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
+          Vec.getValueType().is512BitVector()) &&
+         "Unexpected vector size!");
   return extractSubVector(Vec, IdxVal, DAG, dl, 128);
 }
 
@@ -4189,7 +4191,7 @@ static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
   EVT ResultVT = Result.getValueType();
 
   // Insert the relevant vectorWidth bits.
-  unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
+  unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
   assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
 
   // This is the index of the first element of the vectorWidth-bit chunk
@@ -4587,8 +4589,7 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
   if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
     // May need to promote to a legal type.
     Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
-                     DAG.getConstant(0, dl, WideOpVT),
-                     SubVec, Idx);
+                     DAG.getConstant(0, dl, WideOpVT), SubVec, Idx);
     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
   }
 
@@ -4603,20 +4604,18 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
   if (IdxVal == 0) {
     // Zero lower bits of the Vec
     SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
-    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
-                      ZeroIdx);
+    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
     Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
     Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
     // Merge them together, SubVec should be zero extended.
     SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
-                         DAG.getConstant(0, dl, WideOpVT),
-                         SubVec, ZeroIdx);
+                         DAG.getConstant(0, dl, WideOpVT), SubVec, ZeroIdx);
     Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
   }
 
-  SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
-                       Undef, SubVec, ZeroIdx);
+  SubVec =
+      DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, SubVec, ZeroIdx);
 
   if (Vec.isUndef()) {
     assert(IdxVal != 0 && "Unexpected index");
@@ -4654,12 +4653,11 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
       // isel to optimize when bits are known zero.
       Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
       Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
-                        DAG.getConstant(0, dl, WideOpVT),
-                        Vec, ZeroIdx);
+                        DAG.getConstant(0, dl, WideOpVT), Vec, ZeroIdx);
     } else {
       // Otherwise use explicit shifts to zero the bits.
-      Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
-                        Undef, Vec, ZeroIdx);
+      Vec =
+          DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
       NumElems = WideOpVT.getVectorNumElements();
       SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
       Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
@@ -4712,9 +4710,9 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
   // Isolate the bits after the last inserted bit.
   unsigned HighShift = IdxVal + SubVecNumElems;
   SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
-                            DAG.getTargetConstant(HighShift, dl, MVT::i8));
+                             DAG.getTargetConstant(HighShift, dl, MVT::i8));
   High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
-                    DAG.getTargetConstant(HighShift, dl, MVT::i8));
+                     DAG.getTargetConstant(HighShift, dl, MVT::i8));
 
   // Now OR all 3 pieces together.
   Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
@@ -4795,8 +4793,8 @@ static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS,
   return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
 }
 
-void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
-                                   bool Lo, bool Unary) {
+void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask, bool Lo,
+                                   bool Unary) {
   assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
          "Illegal vector type to unpack");
   assert(Mask.empty() && "Expected an empty shuffle mask vector");
@@ -4933,13 +4931,12 @@ static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
 /// This produces a shuffle where the low element of V2 is swizzled into the
 /// zero/undef vector, landing at element Idx.
 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
-static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
-                                           bool IsZero,
+static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, bool IsZero,
                                            const X86Subtarget &Subtarget,
                                            SelectionDAG &DAG) {
   MVT VT = V2.getSimpleValueType();
-  SDValue V1 = IsZero
-    ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
+  SDValue V1 =
+      IsZero ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
   int NumElems = VT.getVectorNumElements();
   SmallVector<int, 16> MaskVec(NumElems);
   for (int i = 0; i != NumElems; ++i)
@@ -5832,9 +5829,9 @@ static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
 /// as many lanes with this technique as possible to simplify the remaining
 /// shuffle.
-static void computeZeroableShuffleElements(ArrayRef<int> Mask,
-                                           SDValue V1, SDValue V2,
-                                           APInt &KnownUndef, APInt &KnownZero) {
+static void computeZeroableShuffleElements(ArrayRef<int> Mask, SDValue V1,
+                                           SDValue V2, APInt &KnownUndef,
+                                           APInt &KnownZero) {
   int Size = Mask.size();
   KnownUndef = KnownZero = APInt::getZero(Size);
 
@@ -6020,7 +6017,7 @@ static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
 static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
                                               const APInt &KnownUndef,
                                               const APInt &KnownZero,
-                                              bool ResolveKnownZeros= true) {
+                                              bool ResolveKnownZeros = true) {
   unsigned NumElts = Mask.size();
   assert(KnownUndef.getBitWidth() == NumElts &&
          KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
@@ -6990,8 +6987,8 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL,
     MVT EltVT = VT.getVectorElementType();
     // Create a new build vector with the first 2 elements followed by undef
     // padding, bitcast to v2f64, duplicate, and bitcast back.
-    SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
-                       DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
+    SDValue Ops[4] = {Op.getOperand(0), Op.getOperand(1), DAG.getUNDEF(EltVT),
+                      DAG.getUNDEF(EltVT)};
     SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
     SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
     return DAG.getBitcast(VT, Dup);
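(The bitcast-and-duplicate trick described in the comment above has a direct intrinsic equivalent; a minimal sketch, illustrative only, assuming SSE3.)

#include <immintrin.h>

// Repeat the low two f32 lanes across the vector: bitcast to v2f64,
// MOVDDUP the low double, and bitcast back.
static inline __m128 dup_low_pair(__m128 v) {
  __m128d d = _mm_movedup_pd(_mm_castps_pd(v));
  return _mm_castpd_ps(d);
}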
@@ -7039,7 +7036,7 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL,
   for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
     if (Zeroable[EltIdx]) {
       // The zero vector will be on the right hand side.
-      Mask[EltIdx] = EltIdx+4;
+      Mask[EltIdx] = EltIdx + 4;
       continue;
     }
 
@@ -7250,7 +7247,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
   APInt ZeroMask = APInt::getZero(NumElems);
   APInt UndefMask = APInt::getZero(NumElems);
 
-  SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
+  SmallVector<LoadSDNode *, 8> Loads(NumElems, nullptr);
   SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
 
   // For each element in the initializer, see if we've found a load, zero or an
@@ -7345,8 +7342,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
 
   auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
     auto MMOFlags = LDBase->getMemOperand()->getFlags();
-    assert(LDBase->isSimple() &&
-           "Cannot merge volatile or atomic loads.");
+    assert(LDBase->isSimple() && "Cannot merge volatile or atomic loads.");
     SDValue NewLd =
         DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
                     LDBase->getPointerInfo(), LDBase->getBaseAlign(), MMOFlags);
@@ -7434,7 +7430,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
       VecVT = MVT::v4f32;
     if (TLI.isTypeLegal(VecVT)) {
       SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
-      SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
+      SDValue Ops[] = {LDBase->getChain(), LDBase->getBasePtr()};
       SDValue ResNode = DAG.getMemIntrinsicNode(
           X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
           LDBase->getBaseAlign(), MachineMemOperand::MOLoad);
@@ -8038,9 +8034,9 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
       return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
     } else {
       MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
-      SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
-                                     DAG.getAllOnesConstant(dl, ImmVT),
-                                     DAG.getConstant(0, dl, ImmVT));
+      SDValue Select =
+          DAG.getSelect(dl, ImmVT, Cond, DAG.getAllOnesConstant(dl, ImmVT),
+                        DAG.getConstant(0, dl, ImmVT));
       MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
       Select = DAG.getBitcast(VecVT, Select);
       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
@@ -8150,10 +8146,10 @@ static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
     // Try to match the following pattern:
     // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
-        Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
-        Op0.getOperand(0) == Op1.getOperand(0) &&
-        isa<ConstantSDNode>(Op0.getOperand(1)) &&
-        isa<ConstantSDNode>(Op1.getOperand(1)));
+               Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+               Op0.getOperand(0) == Op1.getOperand(0) &&
+               isa<ConstantSDNode>(Op0.getOperand(1)) &&
+               isa<ConstantSDNode>(Op1.getOperand(1)));
     if (!CanFold)
       break;
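(The (extract I, extract I+1) pattern being matched here is exactly what the horizontal-add instructions compute; a sketch with the SSE3 intrinsic, illustrative only.)

#include <immintrin.h>

// HADDPS: { a0+a1, a2+a3, b0+b1, b2+b3 }.
static inline __m128 hadd_pairs(__m128 a, __m128 b) {
  return _mm_hadd_ps(a, b);
}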
 
@@ -8233,9 +8229,9 @@ static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
 
   unsigned NumElts = VT.getVectorNumElements();
   SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
-  SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
+  SDValue V0_HI = extract128BitVector(V0, NumElts / 2, DAG, DL);
   SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
-  SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
+  SDValue V1_HI = extract128BitVector(V1, NumElts / 2, DAG, DL);
   MVT NewVT = V0_LO.getSimpleValueType();
 
   SDValue LO = DAG.getUNDEF(NewVT);
@@ -8267,8 +8263,7 @@ static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
 static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
                              const X86Subtarget &Subtarget, SelectionDAG &DAG,
                              SDValue &Opnd0, SDValue &Opnd1,
-                             unsigned &NumExtracts,
-                             bool &IsSubAdd) {
+                             unsigned &NumExtracts, bool &IsSubAdd) {
 
   MVT VT = BV->getSimpleValueType(0);
   if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
@@ -8354,8 +8349,8 @@ static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
   // Ensure we have found an opcode for both parities and that they are
   // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
   // inputs are undef.
-  if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
-      InVec0.isUndef() || InVec1.isUndef())
+  if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] || InVec0.isUndef() ||
+      InVec1.isUndef())
     return false;
 
   IsSubAdd = Opc[0] == ISD::FADD;
@@ -8368,7 +8363,8 @@ static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
 /// Returns true if it is possible to fold MUL and an idiom that has already been
 /// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
 /// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
-/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
+/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1,
+/// \p Opnd2.
 ///
 /// Prior to calling this function it should be known that there is some
 /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
@@ -8392,8 +8388,8 @@ static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
 /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
 /// FMADDSUB is.
 static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
-                                 SelectionDAG &DAG,
-                                 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
+                                 SelectionDAG &DAG, SDValue &Opnd0,
+                                 SDValue &Opnd1, SDValue &Opnd2,
                                  unsigned ExpectedUses) {
   if (Opnd0.getOpcode() != ISD::FMUL ||
       !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
@@ -8448,8 +8444,8 @@ static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
   if (VT.is512BitVector()) {
     SmallVector<int> Mask;
     for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
-        Mask.push_back(I);
-        Mask.push_back(I + E + 1);
+      Mask.push_back(I);
+      Mask.push_back(I + E + 1);
     }
     SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
     SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
@@ -8490,13 +8486,13 @@ static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
       if (HOpcode == ISD::DELETED_NODE) {
         GenericOpcode = Op.getOpcode();
         switch (GenericOpcode) {
-        // clang-format off
+          // clang-format off
         case ISD::ADD: HOpcode = X86ISD::HADD; break;
         case ISD::SUB: HOpcode = X86ISD::HSUB; break;
         case ISD::FADD: HOpcode = X86ISD::FHADD; break;
         case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
         default: return false;
-        // clang-format on
+          // clang-format on
         }
       }
 
@@ -8526,8 +8522,7 @@ static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
       // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
       unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
       unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
-      unsigned ExpectedIndex = i * NumEltsIn128Bits +
-                               (j % NumEltsIn64Bits) * 2;
+      unsigned ExpectedIndex = i * NumEltsIn128Bits + (j % NumEltsIn64Bits) * 2;
       if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
         continue;
 
@@ -9171,8 +9166,8 @@ LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL,
   return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
 }
 
-SDValue
-X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op,
+                                             SelectionDAG &DAG) const {
   SDLoc dl(Op);
 
   MVT VT = Op.getSimpleValueType();
@@ -9396,14 +9391,13 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
     }
 
     // Is it a vector logical left shift?
-    if (NumElems == 2 && Idx == 1 &&
-        X86::isZeroNode(Op.getOperand(0)) &&
+    if (NumElems == 2 && Idx == 1 && X86::isZeroNode(Op.getOperand(0)) &&
         !X86::isZeroNode(Op.getOperand(1))) {
       unsigned NumBits = VT.getSizeInBits();
-      return getVShift(true, VT,
-                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
-                                   VT, Op.getOperand(1)),
-                       NumBits/2, DAG, *this, dl);
+      return getVShift(
+          true, VT,
+          DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(1)),
+          NumBits / 2, DAG, *this, dl);
     }
 
     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
@@ -9416,7 +9410,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
     // place.
     if (EVTBits == 32) {
       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
-      return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
+      return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget,
+                                         DAG);
     }
   }
 
@@ -9455,8 +9450,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   // build_vector and broadcast it.
   // TODO: We could probably generalize this more.
   if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
-    SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
-                       DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
+    SDValue Ops[4] = {Op.getOperand(0), Op.getOperand(1), DAG.getUNDEF(EltVT),
+                      DAG.getUNDEF(EltVT)};
     auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
       // Make sure all the even/odd operands match.
       for (unsigned i = 2; i != NumElems; ++i)
@@ -9472,8 +9467,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
                                      DAG.getBuildVector(NarrowVT, dl, Ops));
       // Broadcast from v2i64/v2f64 and cast to final VT.
       MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
-      return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
-                                            NewBV));
+      return DAG.getBitcast(
+          VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT, NewBV));
     }
   }
 
@@ -9486,7 +9481,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
     SDValue Lower =
         DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
     SDValue Upper = DAG.getBuildVector(
-        HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
+        HVT, dl, Op->ops().slice(NumElems / 2, NumElems / 2));
 
     // Recreate the wider vector with the lower and upper part.
     return concatSubVectors(Lower, Upper, DAG, dl);
@@ -9497,8 +9492,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
     if (NumNonZero == 1) {
       // One half is zero or undef.
       unsigned Idx = NonZeroMask.countr_zero();
-      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
-                               Op.getOperand(Idx));
+      SDValue V2 =
+          DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(Idx));
       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
     }
     return SDValue();
@@ -9533,30 +9528,28 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
 
     for (unsigned i = 0; i < 2; ++i) {
       switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
-        default: llvm_unreachable("Unexpected NonZero count");
-        case 0:
-          Ops[i] = Ops[i*2];  // Must be a zero vector.
-          break;
-        case 1:
-          Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
-          break;
-        case 2:
-          Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
-          break;
-        case 3:
-          Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
-          break;
+      default:
+        llvm_unreachable("Unexpected NonZero count");
+      case 0:
+        Ops[i] = Ops[i * 2]; // Must be a zero vector.
+        break;
+      case 1:
+        Ops[i] = getMOVL(DAG, dl, VT, Ops[i * 2 + 1], Ops[i * 2]);
+        break;
+      case 2:
+        Ops[i] = getMOVL(DAG, dl, VT, Ops[i * 2], Ops[i * 2 + 1]);
+        break;
+      case 3:
+        Ops[i] = getUnpackl(DAG, dl, VT, Ops[i * 2], Ops[i * 2 + 1]);
+        break;
       }
     }
 
     bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
     bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
-    int MaskVec[] = {
-      Reverse1 ? 1 : 0,
-      Reverse1 ? 0 : 1,
-      static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
-      static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
-    };
+    int MaskVec[] = {Reverse1 ? 1 : 0, Reverse1 ? 0 : 1,
+                     static_cast<int>(Reverse2 ? NumElems + 1 : NumElems),
+                     static_cast<int>(Reverse2 ? NumElems : NumElems + 1)};
     return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
   }
 
@@ -9575,7 +9568,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
       Result = DAG.getUNDEF(VT);
 
     for (unsigned i = 1; i < NumElems; ++i) {
-      if (Op.getOperand(i).isUndef()) continue;
+      if (Op.getOperand(i).isUndef())
+        continue;
       Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
                            Op.getOperand(i), DAG.getVectorIdxConstant(i, dl));
     }
@@ -9600,14 +9594,14 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
     // Generate scaled UNPCKL shuffle mask.
     SmallVector<int, 16> Mask;
-    for(unsigned i = 0; i != Scale; ++i)
+    for (unsigned i = 0; i != Scale; ++i)
       Mask.push_back(i);
     for (unsigned i = 0; i != Scale; ++i)
-      Mask.push_back(NumElems+i);
+      Mask.push_back(NumElems + i);
     Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
 
     for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
-      Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
+      Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2 * i], Ops[(2 * i) + 1], Mask);
   }
   return Ops[0];
 }
@@ -9620,8 +9614,8 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
   SDLoc dl(Op);
   MVT ResVT = Op.getSimpleValueType();
 
-  assert((ResVT.is256BitVector() ||
-          ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
+  assert((ResVT.is256BitVector() || ResVT.is512BitVector()) &&
+         "Value type must be 256-/512-bit wide");
 
   unsigned NumOperands = Op.getNumOperands();
   unsigned NumFreezeUndef = 0;
@@ -9634,15 +9628,14 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
     if (SubVec.isUndef())
       continue;
     if (ISD::isFreezeUndef(SubVec.getNode())) {
-        // If the freeze(undef) has multiple uses then we must fold to zero.
-        if (SubVec.hasOneUse()) {
-          ++NumFreezeUndef;
-        } else {
-          ++NumZero;
-          Undefs.insert(SubVec);
-        }
-    }
-    else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
+      // If the freeze(undef) has multiple uses then we must fold to zero.
+      if (SubVec.hasOneUse()) {
+        ++NumFreezeUndef;
+      } else {
+        ++NumZero;
+        Undefs.insert(SubVec);
+      }
+    } else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
       ++NumZero;
     else {
       assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
@@ -9656,9 +9649,9 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
     MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
     ArrayRef<SDUse> Ops = Op->ops();
     SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
-                             Ops.slice(0, NumOperands/2));
+                             Ops.slice(0, NumOperands / 2));
     SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
-                             Ops.slice(NumOperands/2));
+                             Ops.slice(NumOperands / 2));
     return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
   }
 
@@ -9691,7 +9684,7 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
 // TODO: Merge this with LowerAVXCONCAT_VECTORS?
 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
                                        const X86Subtarget &Subtarget,
-                                       SelectionDAG & DAG) {
+                                       SelectionDAG &DAG) {
   SDLoc dl(Op);
   MVT ResVT = Op.getSimpleValueType();
   unsigned NumOperands = Op.getNumOperands();
@@ -9764,16 +9757,15 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
                      DAG.getVectorIdxConstant(NumElems / 2, dl));
 }
 
-static SDValue LowerCONCAT_VECTORS(SDValue Op,
-                                   const X86Subtarget &Subtarget,
+static SDValue LowerCONCAT_VECTORS(SDValue Op, const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
   if (VT.getVectorElementType() == MVT::i1)
     return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
 
   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
-         (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
-          Op.getNumOperands() == 4)));
+         (VT.is512BitVector() &&
+          (Op.getNumOperands() == 2 || Op.getNumOperands() == 4)));
 
   // AVX can use the vinsertf128 instruction to create 256-bit vectors
   // from two other 128-bit ones.
@@ -9890,8 +9882,8 @@ static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
 
     // Ok, handle the in-lane shuffles by detecting if and when they repeat.
     // Adjust second vector indices to start at LaneSize instead of Size.
-    int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
-                                : Mask[i] % LaneSize + LaneSize;
+    int LocalM =
+        Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
     if (RepeatedMask[i % LaneSize] < 0)
       // This is the first non-undef entry in this slot of a 128-bit lane.
       RepeatedMask[i % LaneSize] = LocalM;
@@ -9909,8 +9901,7 @@ is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
   return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
 }
 
-static bool
-is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
+static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
   SmallVector<int, 32> RepeatedMask;
   return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
 }
@@ -10296,8 +10287,8 @@ static SDValue getSHUFPDImmForMask(ArrayRef<int> Mask, const SDLoc &DL,
 //
 // The function looks for a sub-mask that the nonzero elements are in
 // increasing order. If such sub-mask exist. The function returns true.
-static bool isNonZeroElementsInOrder(const APInt &Zeroable,
-                                     ArrayRef<int> Mask, const EVT &VectorType,
+static bool isNonZeroElementsInOrder(const APInt &Zeroable, ArrayRef<int> Mask,
+                                     const EVT &VectorType,
                                      bool &IsZeroSideLeft) {
   int NextElement = -1;
   // Check if the Mask's nonzero elements are in increasing order.
@@ -11082,7 +11073,7 @@ static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
       if (M == SM_SentinelUndef)
         continue;
       if (M == Elt || (0 <= M && M < NumElts &&
-                     IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
+                       IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
         Mask[Elt] = Elt;
         LaneV1InUse = true;
         continue;
@@ -11215,8 +11206,7 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
 
     // If we have VPTERNLOG, we can use that as a bit blend.
     if (Subtarget.hasVLX())
-      if (SDValue BitBlend =
-              lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
+      if (SDValue BitBlend = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
         return BitBlend;
 
     // Scale the blend by the number of bytes per element.
@@ -11524,9 +11514,11 @@ static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
 
 /// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
 /// permuting the elements of the result in place.
-static SDValue lowerShuffleAsByteRotateAndPermute(
-    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
-    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+static SDValue lowerShuffleAsByteRotateAndPermute(const SDLoc &DL, MVT VT,
+                                                  SDValue V1, SDValue V2,
+                                                  ArrayRef<int> Mask,
+                                                  const X86Subtarget &Subtarget,
+                                                  SelectionDAG &DAG) {
   if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
       (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
       (VT.is512BitVector() && !Subtarget.hasBWI()))
@@ -11709,15 +11701,15 @@ static SDValue lowerShuffleAsDecomposedShuffleMerge(
   // pre-shuffle first is a better strategy.
   if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
     // Only prefer immediate blends to unpack/rotate.
-    if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
-                                                          DAG, true))
+    if (SDValue BlendPerm =
+            lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG, true))
       return BlendPerm;
     // If either input vector provides only a single element which is repeated
     // multiple times, unpacking from both input vectors would generate worse
     // code. e.g. for
-    // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
-    // it is better to process t4 first to create a vector of t4[0], then unpack
-    // that vector with t2.
+    // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7>
+    // t2, t4, it is better to process t4 first to create a vector of t4[0],
+    // then unpack that vector with t2.
     if (!V1Zero && !V2Zero && !isSingleElementRepeatedMask(V1Mask) &&
         !isSingleElementRepeatedMask(V2Mask))
       if (SDValue UnpackPerm =
@@ -11727,8 +11719,8 @@ static SDValue lowerShuffleAsDecomposedShuffleMerge(
             DL, VT, V1, V2, Mask, Subtarget, DAG))
       return RotatePerm;
     // Unpack/rotate failed - try again with variable blends.
-    if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
-                                                          DAG))
+    if (SDValue BlendPerm =
+            lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
       return BlendPerm;
     if (VT.getScalarSizeInBits() >= 32)
       if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
@@ -11841,7 +11833,7 @@ static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
   SDValue Lo, Hi;
   for (int i = 0; i < NumElts; ++i) {
     int M = Mask[i];
-    assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
+    assert((M == SM_SentinelUndef || (0 <= M && M < (2 * NumElts))) &&
            "Unexpected mask index.");
     if (M < 0)
       continue;
@@ -11963,8 +11955,7 @@ static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
          "Rotate-based lowering only supports 128-bit lowering!");
   assert(Mask.size() <= 16 &&
          "Can shuffle at most 16 bytes in a 128-bit vector!");
-  assert(ByteVT == MVT::v16i8 &&
-         "SSE2 rotate lowering only needed for v16i8!");
+  assert(ByteVT == MVT::v16i8 && "SSE2 rotate lowering only needed for v16i8!");
 
   // Default SSE2 implementation
   int LoByteShift = 16 - ByteRotation;
@@ -11999,8 +11990,9 @@ static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
          "Only 32-bit and 64-bit elements are supported!");
 
   // 128/256-bit vectors are only supported with VLX.
-  assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
-         && "VLX required for 128/256-bit vectors");
+  assert(
+      (Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector())) &&
+      "VLX required for 128/256-bit vectors");
 
   SDValue Lo = V1, Hi = V2;
   int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
@@ -12552,8 +12544,7 @@ static SDValue lowerShuffleAsSpecificExtension(const SDLoc &DL, MVT VT,
 /// are both incredibly common and often quite performance sensitive.
 static SDValue lowerShuffleAsZeroOrAnyExtend(
     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
-    const APInt &Zeroable, const X86Subtarget &Subtarget,
-    SelectionDAG &DAG) {
+    const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
   int Bits = VT.getSizeInBits();
   int NumLanes = Bits / 128;
   int NumElements = VT.getVectorNumElements();
@@ -12679,7 +12670,8 @@ static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
   // If the bitcasts shift the element size, we can't extract an equivalent
   // element from it.
   MVT NewVT = V.getSimpleValueType();
-  if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
+  if (!NewVT.isVector() ||
+      NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
     return SDValue();
 
   if (V.getOpcode() == ISD::BUILD_VECTOR ||
@@ -12703,7 +12695,7 @@ static bool isShuffleFoldableLoad(SDValue V) {
          ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
 }
 
-template<typename T>
+template <typename T>
 static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
   T EltVT = VT.getScalarType();
   return (EltVT == MVT::bf16 && !Subtarget.hasAVX10_2()) ||
@@ -12716,8 +12708,7 @@ static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
 /// across all subtarget feature sets.
 static SDValue lowerShuffleAsElementInsertion(
     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
-    const APInt &Zeroable, const X86Subtarget &Subtarget,
-    SelectionDAG &DAG) {
+    const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
   MVT ExtVT = VT;
   MVT EltVT = VT.getVectorElementType();
   unsigned NumElts = VT.getVectorNumElements();
@@ -12750,8 +12741,8 @@ static SDValue lowerShuffleAsElementInsertion(
   // all the smarts here sunk into that routine. However, the current
   // lowering of BUILD_VECTOR makes that nearly impossible until the old
   // vector shuffle lowering is dead.
-  SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
-                                               DAG);
+  SDValue V2S =
+      getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(), DAG);
   if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
     // We need to zext the scalar if it is smaller than an i32.
     V2S = DAG.getBitcast(EltVT, V2S);
@@ -12954,8 +12945,8 @@ static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
   // Check that both sources are extracts of the same source vector.
   if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
       N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
-      N0.getOperand(0) != N1.getOperand(0) ||
-      !N0.hasOneUse() || !N1.hasOneUse())
+      N0.getOperand(0) != N1.getOperand(0) || !N0.hasOneUse() ||
+      !N1.hasOneUse())
     return SDValue();
 
   SDValue WideVec = N0.getOperand(0);
@@ -12985,8 +12976,8 @@ static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
   NewMask.append(NumElts, -1);
 
   // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
-  SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
-                                      NewMask);
+  SDValue Shuf =
+      DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT), NewMask);
   // This is free: ymm -> xmm.
   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
                      DAG.getVectorIdxConstant(0, DL));
@@ -13185,8 +13176,8 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
   if (!V.getValueType().isVector()) {
     assert(V.getScalarValueSizeInBits() == NumEltBits &&
            "Unexpected scalar size");
-    MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
-                                       VT.getVectorNumElements());
+    MVT BroadcastVT =
+        MVT::getVectorVT(V.getSimpleValueType(), VT.getVectorNumElements());
     return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
   }
 
@@ -13211,8 +13202,8 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
 // elements are zeroable.
 static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
                                    unsigned &InsertPSMask,
-                                   const APInt &Zeroable,
-                                   ArrayRef<int> Mask, SelectionDAG &DAG) {
+                                   const APInt &Zeroable, ArrayRef<int> Mask,
+                                   SelectionDAG &DAG) {
   assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
   assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
@@ -13664,8 +13655,8 @@ static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
   // when the V2 input is targeting element 0 of the mask -- that is the fast
   // case here.
   if (NumV2Elements == 1 && Mask[0] >= 4)
-    if (SDValue V = lowerShuffleAsElementInsertion(
-            DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
+    if (SDValue V = lowerShuffleAsElementInsertion(DL, MVT::v4f32, V1, V2, Mask,
+                                                   Zeroable, Subtarget, DAG))
       return V;
 
   if (Subtarget.hasSSE41()) {
@@ -13674,8 +13665,8 @@ static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
       return V;
 
     if (!isSingleSHUFPSMask(Mask))
-      if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
-                                                            V2, Mask, DAG))
+      if (SDValue BlendPerm =
+              lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1, V2, Mask, DAG))
         return BlendPerm;
   }
 
@@ -13765,8 +13756,8 @@ static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
   // There are special ways we can lower some single-element blends.
   if (NumV2Elements == 1)
-    if (SDValue V = lowerShuffleAsElementInsertion(
-            DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
+    if (SDValue V = lowerShuffleAsElementInsertion(DL, MVT::v4i32, V1, V2, Mask,
+                                                   Zeroable, Subtarget, DAG))
       return V;
 
   // We have different paths for blend lowering, but they all must use the
@@ -13896,7 +13887,7 @@ static SDValue lowerV8I16GeneralSingleInputShuffle(
   };
 
   if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
-    int PSHUFDMask[4] = { -1, -1, -1, -1 };
+    int PSHUFDMask[4] = {-1, -1, -1, -1};
     SmallVector<std::pair<int, int>, 4> DWordPairs;
     int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
 
@@ -14000,7 +13991,8 @@ static SDValue lowerV8I16GeneralSingleInputShuffle(
     int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
     int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
     int TripleNonInputIdx =
-        TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
+        TripleInputSum -
+        std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
     TripleDWord = TripleNonInputIdx / 2;
 
     // We use xor with one to compute the adjacent DWord to whichever one the
@@ -14078,9 +14070,9 @@ static SDValue lowerV8I16GeneralSingleInputShuffle(
 
     // Adjust the mask to match the new locations of A and B.
     for (int &M : Mask)
-      if (M >= 0 && M/2 == ADWord)
+      if (M >= 0 && M / 2 == ADWord)
         M = 2 * BDWord + M % 2;
-      else if (M >= 0 && M/2 == BDWord)
+      else if (M >= 0 && M / 2 == BDWord)
         M = 2 * ADWord + M % 2;
 
     // Recurse back into this routine to re-compute state now that this isn't
@@ -14108,33 +14100,33 @@ static SDValue lowerV8I16GeneralSingleInputShuffle(
       [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
                     MutableArrayRef<int> SourceHalfMask,
                     MutableArrayRef<int> HalfMask, int HalfOffset) {
-    if (InPlaceInputs.empty())
-      return;
-    if (InPlaceInputs.size() == 1) {
-      SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
-          InPlaceInputs[0] - HalfOffset;
-      PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
-      return;
-    }
-    if (IncomingInputs.empty()) {
-      // Just fix all of the in place inputs.
-      for (int Input : InPlaceInputs) {
-        SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
-        PSHUFDMask[Input / 2] = Input / 2;
-      }
-      return;
-    }
+        if (InPlaceInputs.empty())
+          return;
+        if (InPlaceInputs.size() == 1) {
+          SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
+              InPlaceInputs[0] - HalfOffset;
+          PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
+          return;
+        }
+        if (IncomingInputs.empty()) {
+          // Just fix all of the in place inputs.
+          for (int Input : InPlaceInputs) {
+            SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
+            PSHUFDMask[Input / 2] = Input / 2;
+          }
+          return;
+        }
 
-    assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
-    SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
-        InPlaceInputs[0] - HalfOffset;
-    // Put the second input next to the first so that they are packed into
-    // a dword. We find the adjacent index by toggling the low bit.
-    int AdjIndex = InPlaceInputs[0] ^ 1;
-    SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
-    llvm::replace(HalfMask, InPlaceInputs[1], AdjIndex);
-    PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
-  };
+        assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
+        SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
+            InPlaceInputs[0] - HalfOffset;
+        // Put the second input next to the first so that they are packed into
+        // a dword. We find the adjacent index by toggling the low bit.
+        int AdjIndex = InPlaceInputs[0] ^ 1;
+        SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
+        llvm::replace(HalfMask, InPlaceInputs[1], AdjIndex);
+        PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
+      };
   fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
   fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
 
@@ -14143,10 +14135,12 @@ static SDValue lowerV8I16GeneralSingleInputShuffle(
   // FIXME: This operation could almost certainly be simplified dramatically to
   // look more like the 3-1 fixing operation.
   auto moveInputsToRightHalf = [&PSHUFDMask](
-      MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
-      MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
-      MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
-      int DestOffset) {
+                                   MutableArrayRef<int> IncomingInputs,
+                                   ArrayRef<int> ExistingInputs,
+                                   MutableArrayRef<int> SourceHalfMask,
+                                   MutableArrayRef<int> HalfMask,
+                                   MutableArrayRef<int> FinalSourceHalfMask,
+                                   int SourceOffset, int DestOffset) {
     auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
       return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
     };
@@ -14342,9 +14336,11 @@ static SDValue lowerV8I16GeneralSingleInputShuffle(
 
 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
 /// blend if only one input is used.
-static SDValue lowerShuffleAsBlendOfPSHUFBs(
-    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
-    const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
+static SDValue lowerShuffleAsBlendOfPSHUFBs(const SDLoc &DL, MVT VT, SDValue V1,
+                                            SDValue V2, ArrayRef<int> Mask,
+                                            const APInt &Zeroable,
+                                            SelectionDAG &DAG, bool &V1InUse,
+                                            bool &V2InUse) {
   assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
          "Lane crossing shuffle masks not supported");
 
@@ -14439,8 +14435,8 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
       return Broadcast;
 
     // Try to use bit rotation instructions.
-    if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
-                                                 Subtarget, DAG))
+    if (SDValue Rotate =
+            lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask, Subtarget, DAG))
       return Rotate;
 
     // Use dedicated unpack instructions for masks that match their pattern.
@@ -14475,14 +14471,14 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
   // See if we can use SSE4A Extraction / Insertion.
   if (Subtarget.hasSSE4A())
-    if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
-                                          Zeroable, DAG))
+    if (SDValue V =
+            lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, Zeroable, DAG))
       return V;
 
   // There are special ways we can lower some single-element blends.
   if (NumV2Inputs == 1)
-    if (SDValue V = lowerShuffleAsElementInsertion(
-            DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
+    if (SDValue V = lowerShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2, Mask,
+                                                   Zeroable, Subtarget, DAG))
       return V;
 
   // We have different paths for blend lowering, but they all must use the
@@ -14598,8 +14594,8 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
   // can both shuffle and set up the inefficient blend.
   if (!IsBlendSupported && Subtarget.hasSSSE3()) {
     bool V1InUse, V2InUse;
-    return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
-                                        Zeroable, DAG, V1InUse, V2InUse);
+    return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
+                                        DAG, V1InUse, V2InUse);
   }
 
   // We can always bit-blend if we have to so the fallback strategy is to
@@ -14732,8 +14728,8 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
   // See if we can use SSE4A Extraction / Insertion.
   if (Subtarget.hasSSE4A())
-    if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
-                                          Zeroable, DAG))
+    if (SDValue V =
+            lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG))
       return V;
 
   int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
@@ -14746,8 +14742,8 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
       return Broadcast;
 
     // Try to use bit rotation instructions.
-    if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
-                                                 Subtarget, DAG))
+    if (SDValue Rotate =
+            lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask, Subtarget, DAG))
       return Rotate;
 
     if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
@@ -14788,7 +14784,7 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
       int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
       SmallDenseMap<int, int, 8> LaneMap;
       for (int I : InPlaceInputs) {
-        PreDupI16Shuffle[I/2] = I/2;
+        PreDupI16Shuffle[I / 2] = I / 2;
         LaneMap[I] = I;
       }
       int j = TargetLo ? 0 : 4, je = j + 4;
@@ -14802,7 +14798,8 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
             ++j;
 
           if (j == je)
-            // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
+            // We can't place the inputs into a single half with a simple i16
+            // shuffle, so bail.
             return SDValue();
 
           // Map this input with the i16 shuffle.
@@ -14923,8 +14920,8 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
       // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
       // PALIGNR will be cheaper than the second PSHUFB+OR.
-      if (SDValue V = lowerShuffleAsByteRotateAndPermute(
-              DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
+      if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v16i8, V1, V2,
+                                                         Mask, Subtarget, DAG))
         return V;
     }
 
@@ -14933,8 +14930,8 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
   // There are special ways we can lower some single-element blends.
   if (NumV2Elements == 1)
-    if (SDValue V = lowerShuffleAsElementInsertion(
-            DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
+    if (SDValue V = lowerShuffleAsElementInsertion(DL, MVT::v16i8, V1, V2, Mask,
+                                                   Zeroable, Subtarget, DAG))
       return V;
 
   if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
@@ -15026,8 +15023,8 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
       if (M >= 0)
         M /= 2;
   } else {
-    // Otherwise just unpack the low half of V into VLoHalf and the high half into
-    // VHiHalf so that we can blend them as i16s.
+    // Otherwise just unpack the low half of V into VLoHalf and the high half
+    // into VHiHalf so that we can blend them as i16s.
     SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
 
     VLoHalf = DAG.getBitcast(
@@ -15036,8 +15033,10 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
         MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
   }
 
-  SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
-  SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
+  SDValue LoV =
+      DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
+  SDValue HiV =
+      DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
 
   return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
 }
@@ -15046,9 +15045,8 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
 ///
 /// This routine breaks down the specific type of 128-bit shuffle and
 /// dispatches to the lowering routines accordingly.
-static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
-                                  MVT VT, SDValue V1, SDValue V2,
-                                  const APInt &Zeroable,
+static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
+                                  SDValue V1, SDValue V2, const APInt &Zeroable,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
   if (VT == MVT::v8bf16) {
@@ -15219,7 +15217,7 @@ static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
                                           const X86Subtarget &Subtarget,
                                           SelectionDAG &DAG) {
   assert(!V2.isUndef() && "This routine must not be used to lower single-input "
-         "shuffles as it could then recurse on itself.");
+                          "shuffles as it could then recurse on itself.");
   int Size = Mask.size();
 
   // If this can be modeled as a broadcast of two elements followed by a blend,
@@ -15558,8 +15556,8 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
   // instruction bytes needed to explicitly generate the zero vector.
 
   // Blends are faster and handle all the non-lane-crossing cases.
-  if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
-                                          Subtarget, DAG))
+  if (SDValue Blend =
+          lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
     return Blend;
 
   // If either input operand is a zero vector, use VPERM2X128 because its mask
@@ -15585,8 +15583,8 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
     // Try to use SHUF128 if possible.
     if (Subtarget.hasVLX()) {
       if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
-        unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
-                            ((WidenedMask[1] % 2) << 1);
+        unsigned PermMask =
+            ((WidenedMask[0] % 2) << 0) | ((WidenedMask[1] % 2) << 1);
         return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
                            DAG.getTargetConstant(PermMask, DL, MVT::i8));
       }
@@ -15610,7 +15608,7 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
          (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
 
   unsigned PermMask = 0;
-  PermMask |= IsLowZero  ? 0x08 : (WidenedMask[0] << 0);
+  PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
   PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
 
   // Check the immediate mask and replace unused sources with undef.
@@ -15802,9 +15800,9 @@ static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
 /// adjusted to access the extracted halves of the original shuffle operands is
 /// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
 /// lower half of each input operand is accessed.
-static bool
-getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
-                   int &HalfIdx1, int &HalfIdx2) {
+static bool getHalfShuffleMask(ArrayRef<int> Mask,
+                               MutableArrayRef<int> HalfMask, int &HalfIdx1,
+                               int &HalfIdx2) {
   assert((Mask.size() == HalfMask.size() * 2) &&
          "Expected input mask to be twice as long as output");
 
@@ -15857,7 +15855,8 @@ getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
 static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
                                      ArrayRef<int> HalfMask, int HalfIdx1,
                                      int HalfIdx2, bool UndefLower,
-                                     SelectionDAG &DAG, bool UseConcat = false) {
+                                     SelectionDAG &DAG,
+                                     bool UseConcat = false) {
   assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
   assert(V1.getValueType().isSimple() && "Expecting only simple types");
 
@@ -16219,7 +16218,7 @@ static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
   assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
          "Illegal shuffle mask");
 
-  bool ZeroLane[2] = { true, true };
+  bool ZeroLane[2] = {true, true};
   for (int i = 0; i < NumElts; ++i)
     ZeroLane[i & 1] &= Zeroable[i];
 
@@ -16304,9 +16303,9 @@ static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
 
   // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
   // the upper bits of the result using an unpckldq.
-  SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
-                                        { 0, 1, 2, 3, 16, 17, 18, 19,
-                                          4, 5, 6, 7, 20, 21, 22, 23 });
+  SDValue Unpack = DAG.getVectorShuffle(
+      MVT::v16i8, DL, V1, V2,
+      {0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23});
   // Insert the unpckldq into a zero vector to widen to v32i8.
   return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
                      DAG.getConstant(0, DL, MVT::v32i8), Unpack,
@@ -16543,8 +16542,8 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
     return Blend;
 
   // Check for being able to broadcast a single element.
-  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
-                                                  Subtarget, DAG))
+  if (SDValue Broadcast =
+          lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
     return Broadcast;
 
   // Try to use shift instructions if fast.
@@ -16651,8 +16650,8 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
     return Blend;
 
   // Check for being able to broadcast a single element.
-  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
-                                                  Subtarget, DAG))
+  if (SDValue Broadcast =
+          lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
     return Broadcast;
 
   if (!Subtarget.hasAVX2()) {
@@ -16799,8 +16798,8 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
     return Blend;
 
   // Check for being able to broadcast a single element.
-  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
-                                                  Subtarget, DAG))
+  if (SDValue Broadcast =
+          lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
     return Broadcast;
 
   // Try to use shift instructions if fast.
@@ -16967,7 +16966,8 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
     // Try to produce a fixed cross-128-bit lane permute followed by unpack
     // because that should be faster than the variable permute alternatives.
-    if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, V1, V2, Mask, DAG))
+    if (SDValue V =
+            lowerShuffleWithUNPCK256(DL, MVT::v16i16, V1, V2, Mask, DAG))
       return V;
 
     // There are no generalized cross-lane shuffle operations available on i16
@@ -16986,8 +16986,8 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
       // As this is a single-input shuffle, the repeated mask should be
       // a strictly valid v8i16 mask that we can pass through to the v8i16
       // lowering to handle even the v16 case.
-      return lowerV8I16GeneralSingleInputShuffle(
-          DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
+      return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v16i16, V1,
+                                                 RepeatedMask, Subtarget, DAG);
     }
   }
 
@@ -17006,8 +17006,8 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
     return Result;
 
   // Try to permute the lanes and then use a per-lane permute.
-  if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
-          DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
+  if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v16i16, V1, V2,
+                                                      Mask, DAG, Subtarget))
     return V;
 
   // Try to match an interleave of two v16i16s and lower them as unpck and
@@ -17043,8 +17043,8 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
     return ZExt;
 
   // Check for being able to broadcast a single element.
-  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
-                                                  Subtarget, DAG))
+  if (SDValue Broadcast =
+          lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
     return Broadcast;
 
   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
@@ -17096,8 +17096,8 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
     if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, V1, V2, Mask, DAG))
       return V;
 
-    if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
-            DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
+    if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v32i8, V1, V2,
+                                                        Mask, DAG, Subtarget))
       return V;
 
     return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
@@ -17119,16 +17119,16 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
     return Result;
 
   // Try to permute the lanes and then use a per-lane permute.
-  if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
-          DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
+  if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v32i8, V1, V2,
+                                                      Mask, DAG, Subtarget))
     return V;
 
   // Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
   // by zeroable elements in the remaining 24 elements. Turn this into two
   // vmovqb instructions shuffled together.
   if (Subtarget.hasVLX())
-    if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
-                                                  Mask, Zeroable, DAG))
+    if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2, Mask,
+                                                  Zeroable, DAG))
       return V;
 
   // Try to match an interleave of two v32i8s and lower them as unpck and
@@ -17183,7 +17183,8 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
         return V;
       if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
         return V;
-      return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
+      return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
+                                  /*SimpleOnly*/ false);
     }
 
     MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
@@ -17432,8 +17433,7 @@ static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
   // If we have a single input shuffle with different shuffle patterns in the
   // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
-  if (V2.isUndef() &&
-      !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
+  if (V2.isUndef() && !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
     SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
     return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
   }
@@ -17700,8 +17700,8 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
   // Whenever we can lower this as a zext, that instruction is strictly faster
   // than any alternative. It also allows us to fold memory operands into the
   // shuffle in many cases.
-  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
-          DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
+  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v64i8, V1, V2, Mask,
+                                                   Zeroable, Subtarget, DAG))
     return ZExt;
 
   // Use dedicated unpack instructions for masks that match their pattern.
@@ -17778,7 +17778,8 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
   if (Subtarget.hasVBMI())
     return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
 
-  return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
+  return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG,
+                              /*SimpleOnly*/ false);
 }
 
 /// High-level routine to lower various 512-bit x86 vector shuffles.
@@ -17786,13 +17787,11 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
 /// This routine either breaks down the specific type of a 512-bit x86 vector
 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
 /// together based on the available instructions.
-static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
-                                  MVT VT, SDValue V1, SDValue V2,
-                                  const APInt &Zeroable,
+static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
+                                  SDValue V1, SDValue V2, const APInt &Zeroable,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
-  assert(Subtarget.hasAVX512() &&
-         "Cannot lower 512-bit vectors w/ basic ISA!");
+  assert(Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/ basic ISA!");
 
   // If we have a single input to the zero element, insert that into V1 if we
   // can do so cheaply.
@@ -17810,8 +17809,8 @@ static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
     return V;
 
   // Check for being able to broadcast a single element.
-  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
-                                                  Subtarget, DAG))
+  if (SDValue Broadcast =
+          lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
     return Broadcast;
 
   if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
@@ -17823,7 +17822,8 @@ static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
     if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
       return V;
 
-    return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
+    return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
+                                /*SimpleOnly*/ false);
   }
 
   if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
@@ -17930,14 +17930,12 @@ static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
   return -1;
 }
 
-
 // Lower vXi1 vector shuffles.
 // There is no a dedicated instruction on AVX-512 that shuffles the masks.
 // The only way to shuffle bits is to sign-extend the mask vector to SIMD
 // vector, shuffle and then truncate it back.
-static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
-                                MVT VT, SDValue V1, SDValue V2,
-                                const APInt &Zeroable,
+static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
+                                SDValue V1, SDValue V2, const APInt &Zeroable,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
   assert(Subtarget.hasAVX512() &&
@@ -18068,8 +18066,8 @@ static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   int NumElems = VT.getVectorNumElements();
   if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
       (Subtarget.hasDQI() && (NumElems < 32)))
-    return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
-                       Shuffle, ISD::SETGT);
+    return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT), Shuffle,
+                        ISD::SETGT);
 
   return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
 }
@@ -18196,7 +18194,7 @@ static SDValue canonicalizeShuffleMaskWithHorizOp(
     unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
     const X86Subtarget &Subtarget);
 
-    /// Top-level lowering for x86 vector shuffles.
+/// Top-level lowering for x86 vector shuffles.
 ///
 /// This handles decomposition, canonicalization, and lowering of all x86
 /// vector shuffles. Most of the specific lowering strategies are encapsulated
@@ -18272,8 +18270,8 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
     // by obfuscating the operands with bitcasts.
     // TODO: Avoid lowering directly from this top-level function: make this
     // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
-    if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
-                                                    Subtarget, DAG))
+    if (SDValue Broadcast =
+            lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask, Subtarget, DAG))
       return Broadcast;
 
     MVT NewEltVT = VT.isFloatingPoint()
@@ -18496,8 +18494,7 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
     // Build a mask by testing the condition against zero.
     MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
     SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
-                                DAG.getConstant(0, dl, CondVT),
-                                ISD::SETNE);
+                                DAG.getConstant(0, dl, CondVT), ISD::SETNE);
     // Now return a new VSELECT using the mask.
     return DAG.getSelect(dl, VT, Mask, LHS, RHS);
   }
@@ -18602,7 +18599,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
   }
 
   if (VT == MVT::i32 || VT == MVT::i64)
-      return Op;
+    return Op;
 
   return SDValue();
 }
@@ -18615,7 +18612,7 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
   SDLoc dl(Vec);
   MVT VecVT = Vec.getSimpleValueType();
   SDValue Idx = Op.getOperand(1);
-  auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
+  auto *IdxC = dyn_cast<ConstantSDNode>(Idx);
   MVT EltVT = Op.getSimpleValueType();
 
   assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
@@ -18630,7 +18627,8 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
     if (NumElts == 1) {
       Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
       MVT IntVT = MVT::getIntegerVT(Vec.getValueType().getVectorNumElements());
-      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getBitcast(IntVT, Vec));
+      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
+                         DAG.getBitcast(IntVT, Vec));
     }
     MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
     MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
@@ -18688,14 +18686,13 @@ static APInt getExtractedDemandedElts(SDNode *N) {
   return DemandedElts;
 }
 
-SDValue
-X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
-                                           SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
+                                                   SelectionDAG &DAG) const {
   SDLoc dl(Op);
   SDValue Vec = Op.getOperand(0);
   MVT VecVT = Vec.getSimpleValueType();
   SDValue Idx = Op.getOperand(1);
-  auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
+  auto *IdxC = dyn_cast<ConstantSDNode>(Idx);
 
   if (VecVT.getVectorElementType() == MVT::i1)
     return ExtractBitFromMaskVector(Op, DAG, Subtarget);
@@ -18726,10 +18723,10 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
     // |    |  Ports pressure in cycles   |  |
     // |Uops| 1 | 2 - D  |3 -  D  | 4 | 5 |  |
     // ---------------------------------------------------------
-    // |2^  |   | 0.5    | 0.5    |1.0|   |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
-    // |1   |0.5|        |        |   |0.5|  | lea rax, ptr [rsp-0x18]
-    // |1   |   |0.5, 0.5|0.5, 0.5|   |   |CP| mov al, byte ptr [rdi+rax*1]
-    // Total Num Of Uops: 4
+    // |2^  |   | 0.5    | 0.5    |1.0|   |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
+    // |1   |0.5|        |        |   |0.5|  | lea rax, ptr [rsp-0x18]
+    // |1   |   |0.5, 0.5|0.5, 0.5|   |   |CP| mov al, byte ptr [rdi+rax*1]
+    // Total Num Of Uops: 4
 
     return SDValue();
   }
@@ -18834,7 +18831,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
     // UNPCKHPD the element to the lowest double word, then movsd.
     // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
     // to a f64mem, the whole operation is folded into a single MOVHPDmr.
-    int Mask[2] = { 1, -1 };
+    int Mask[2] = {1, -1};
     Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                        DAG.getVectorIdxConstant(0, dl));
@@ -18859,9 +18856,10 @@ static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
     unsigned NumElts = VecVT.getVectorNumElements();
     MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
     MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
-    SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
-      DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
-      DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
+    SDValue ExtOp =
+        DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
+                    DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
+                    DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
     return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
   }
 
@@ -18888,9 +18886,9 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
 
   if (EltVT == MVT::bf16) {
     MVT IVT = VT.changeVectorElementTypeToInteger();
-    SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT,
-                              DAG.getBitcast(IVT, N0),
-                              DAG.getBitcast(MVT::i16, N1), N2);
+    SDValue Res =
+        DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT, DAG.getBitcast(IVT, N0),
+                    DAG.getBitcast(MVT::i16, N1), N2);
     return DAG.getBitcast(VT, Res);
   }
 
@@ -19151,8 +19149,9 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
 }
 
 // Returns the appropriate wrapper opcode for a global reference.
-unsigned X86TargetLowering::getGlobalWrapperKind(
-    const GlobalValue *GV, const unsigned char OpFlags) const {
+unsigned
+X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV,
+                                        const unsigned char OpFlags) const {
   // References to absolute symbols are never PC-relative.
   if (GV && GV->isAbsoluteSymbolRef())
     return X86ISD::Wrapper;
@@ -19176,8 +19175,8 @@ unsigned X86TargetLowering::getGlobalWrapperKind(
 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
 // be used to form addressing mode. These wrapped nodes will be selected
 // into MOV32ri.
-SDValue
-X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerConstantPool(SDValue Op,
+                                             SelectionDAG &DAG) const {
   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
 
   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
@@ -19227,11 +19226,10 @@ SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
   return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
 }
 
-SDValue
-X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerBlockAddress(SDValue Op,
+                                             SelectionDAG &DAG) const {
   // Create the TargetBlockAddressAddress node.
-  unsigned char OpFlags =
-    Subtarget.classifyBlockAddressReference();
+  unsigned char OpFlags = Subtarget.classifyBlockAddressReference();
   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
   int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
   SDLoc dl(Op);
@@ -19336,8 +19334,8 @@ SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
   return Result;
 }
 
-SDValue
-X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerGlobalAddress(SDValue Op,
+                                              SelectionDAG &DAG) const {
   return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false, nullptr);
 }
 
@@ -19415,24 +19413,24 @@ static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA,
 }
 
 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
-static SDValue
-LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
-                                const EVT PtrVT) {
+static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA,
+                                               SelectionDAG &DAG,
+                                               const EVT PtrVT) {
   return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD,
                     /*LoadGlobalBaseReg=*/true);
 }
 
 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
-static SDValue
-LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
-                                const EVT PtrVT) {
+static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA,
+                                               SelectionDAG &DAG,
+                                               const EVT PtrVT) {
   return GetTLSADDR(DAG, GA, PtrVT, X86::RAX, X86II::MO_TLSGD);
 }
 
 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
-static SDValue
-LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
-                                 const EVT PtrVT) {
+static SDValue LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA,
+                                                SelectionDAG &DAG,
+                                                const EVT PtrVT) {
   return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD);
 }
 
@@ -19464,9 +19462,8 @@ static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
   // Build x@dtpoff.
   unsigned char OperandFlags = X86II::MO_DTPOFF;
   unsigned WrapperKind = X86ISD::Wrapper;
-  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
-                                           GA->getValueType(0),
-                                           GA->getOffset(), OperandFlags);
+  SDValue TGA = DAG.getTargetGlobalAddress(
+      GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags);
   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
 
   // Add x@dtpoff with the base.
@@ -19507,9 +19504,8 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
   // emit "addl x@ntpoff,%eax" (local exec)
   // or "addl x@indntpoff,%eax" (initial exec)
   // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
-  SDValue TGA =
-      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
-                                 GA->getOffset(), OperandFlags);
+  SDValue TGA = DAG.getTargetGlobalAddress(
+      GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags);
   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
 
   if (model == TLSModel::InitialExec) {
@@ -19528,8 +19524,8 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
 }
 
-SDValue
-X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerGlobalTLSAddress(SDValue Op,
+                                                 SelectionDAG &DAG) const {
 
   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
 
@@ -19543,20 +19539,20 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
   if (Subtarget.isTargetELF()) {
     TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
     switch (model) {
-      case TLSModel::GeneralDynamic:
-        if (Subtarget.is64Bit()) {
-          if (Subtarget.isTarget64BitLP64())
-            return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
-          return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
-        }
-        return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
-      case TLSModel::LocalDynamic:
-        return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
-                                           Subtarget.isTarget64BitLP64());
-      case TLSModel::InitialExec:
-      case TLSModel::LocalExec:
-        return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
-                                   PositionIndependent);
+    case TLSModel::GeneralDynamic:
+      if (Subtarget.is64Bit()) {
+        if (Subtarget.isTarget64BitLP64())
+          return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
+        return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
+      }
+      return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
+    case TLSModel::LocalDynamic:
+      return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
+                                         Subtarget.isTarget64BitLP64());
+    case TLSModel::InitialExec:
+    case TLSModel::LocalExec:
+      return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
+                                 PositionIndependent);
     }
     llvm_unreachable("Unknown TLS model.");
   }
@@ -19577,9 +19573,8 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
       WrapperKind = X86ISD::WrapperRIP;
     }
     SDLoc DL(Op);
-    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
-                                                GA->getValueType(0),
-                                                GA->getOffset(), OpFlag);
+    SDValue Result = DAG.getTargetGlobalAddress(
+        GA->getGlobal(), DL, GA->getValueType(0), GA->getOffset(), OpFlag);
     SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
 
     // With PIC32, the address is actually $g + Offset.
@@ -19593,7 +19588,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
     SDValue Chain = DAG.getEntryNode();
     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
     Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
-    SDValue Args[] = { Chain, Offset };
+    SDValue Args[] = {Chain, Offset};
     Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
     Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
 
@@ -19661,9 +19656,9 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
     res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
 
     // Get the offset of start of .tls section
-    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
-                                             GA->getValueType(0),
-                                             GA->getOffset(), X86II::MO_SECREL);
+    SDValue TGA =
+        DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
+                                   GA->getOffset(), X86II::MO_SECREL);
     SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
 
     // The address of the thread local variable is the add of the thread
@@ -19723,8 +19718,8 @@ static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl,
   MVT SrcVT = Src.getSimpleValueType();
   MVT VT = Op.getSimpleValueType();
 
-   if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
-       (VT != MVT::f32 && VT != MVT::f64))
+  if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
+      (VT != MVT::f32 && VT != MVT::f64))
     return SDValue();
 
   // Pack the i64 into a vector, do the operation and extract.
@@ -19789,22 +19784,22 @@ static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
 static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
                           const X86Subtarget &Subtarget) {
   switch (Opcode) {
-    case ISD::SINT_TO_FP:
-      // TODO: Handle wider types with AVX/AVX512.
-      if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
-        return false;
-      // CVTDQ2PS or (V)CVTDQ2PD
-      return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
-
-    case ISD::UINT_TO_FP:
-      // TODO: Handle wider types and i64 elements.
-      if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
-        return false;
-      // VCVTUDQ2PS or VCVTUDQ2PD
-      return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
+  case ISD::SINT_TO_FP:
+    // TODO: Handle wider types with AVX/AVX512.
+    if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
+      return false;
+    // CVTDQ2PS or (V)CVTDQ2PD
+    return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
 
-    default:
+  case ISD::UINT_TO_FP:
+    // TODO: Handle wider types and i64 elements.
+    if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
       return false;
+    // VCVTUDQ2PS or VCVTUDQ2PD
+    return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
+
+  default:
+    return false;
   }
 }
 
@@ -19948,7 +19943,7 @@ static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL,
     return SDValue();
 
   SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
-  SDValue One  = DAG.getConstant(1, DL, MVT::v4i64);
+  SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
   SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
                              DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
                              DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
@@ -20166,7 +20161,7 @@ std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
     Chain = Result.getValue(1);
   }
 
-  return { Result, Chain };
+  return {Result, Chain};
 }
 
 /// Horizontal vector math instructions may be slower than normal math with
@@ -20203,18 +20198,18 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl,
   LLVMContext *Context = DAG.getContext();
 
   // Build some magic constants.
-  static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
+  static const uint32_t CV0[] = {0x43300000, 0x45300000, 0, 0};
   Constant *C0 = ConstantDataVector::get(*Context, CV0);
   auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
   SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
 
-  SmallVector<Constant*,2> CV1;
+  SmallVector<Constant *, 2> CV1;
   CV1.push_back(
-    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
-                                      APInt(64, 0x4330000000000000ULL))));
+      ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
+                                        APInt(64, 0x4330000000000000ULL))));
   CV1.push_back(
-    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
-                                      APInt(64, 0x4530000000000000ULL))));
+      ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
+                                        APInt(64, 0x4530000000000000ULL))));
   Constant *C1 = ConstantVector::get(CV1);
   SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
 
@@ -20235,11 +20230,10 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl,
   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
   SDValue Result;
 
-  if (Subtarget.hasSSE3() &&
-      shouldUseHorizontalOp(true, DAG, Subtarget)) {
+  if (Subtarget.hasSSE3() && shouldUseHorizontalOp(true, DAG, Subtarget)) {
     Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
   } else {
-    SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
+    SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1, -1});
     Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
   }
   Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
@@ -20265,8 +20259,7 @@ static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl,
 
   // Or the load with the bias.
   SDValue Or = DAG.getNode(
-      ISD::OR, dl, MVT::v2i64,
-      DAG.getBitcast(MVT::v2i64, Load),
+      ISD::OR, dl, MVT::v2i64, DAG.getBitcast(MVT::v2i64, Load),
       DAG.getBitcast(MVT::v2i64,
                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
   Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
@@ -20464,8 +20457,9 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
     SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
     // Low will be bitcasted right away, so do not bother bitcasting back to its
     // original type.
-    Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
-                      VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
+    Low =
+        DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast, VecCstLowBitcast,
+                    DAG.getTargetConstant(0xaa, DL, MVT::i8));
     //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
     //                                 (uint4) 0x53000000, 0xaa);
     SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
@@ -20473,7 +20467,8 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
     // High will be bitcasted right away, so do not bother bitcasting back to
     // its original type.
     High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
-                       VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
+                       VecCstHighBitcast,
+                       DAG.getTargetConstant(0xaa, DL, MVT::i8));
   } else {
     SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
     //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
@@ -20509,7 +20504,8 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
   return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
 }
 
-static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
+static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl,
+                                   SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
   unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
   SDValue N0 = Op.getOperand(OpNo);
@@ -20720,8 +20716,7 @@ SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
     DstTy = MVT::i64;
   }
 
-  assert(DstTy.getSimpleVT() <= MVT::i64 &&
-         DstTy.getSimpleVT() >= MVT::i16 &&
+  assert(DstTy.getSimpleVT() <= MVT::i64 && DstTy.getSimpleVT() >= MVT::i16 &&
          "Unknown FP_TO_INT to lower!");
 
   // We lower FP->int64 into FISTP64 followed by a load from a temporary
@@ -20759,8 +20754,8 @@ SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
     bool LosesInfo = false;
     if (TheVT == MVT::f64)
       // The rounding mode is irrelevant as the conversion should be exact.
-      Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
-                              &LosesInfo);
+      Status = Thresh.convert(APFloat::IEEEdouble(),
+                              APFloat::rmNearestTiesToEven, &LosesInfo);
     else if (TheVT == MVT::f80)
       Status = Thresh.convert(APFloat::x87DoubleExtended(),
                               APFloat::rmNearestTiesToEven, &LosesInfo);
@@ -20770,8 +20765,8 @@ SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
 
     SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
 
-    EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
-                                   *DAG.getContext(), TheVT);
+    EVT ResVT =
+        getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), TheVT);
     SDValue Cmp;
     if (IsStrict) {
       Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
@@ -20800,8 +20795,8 @@ SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
                                    DAG.getConstantFP(0.0, DL, TheVT));
 
     if (IsStrict) {
-      Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
-                          { Chain, Value, FltOfs });
+      Value = DAG.getNode(ISD::STRICT_FSUB, DL, {TheVT, MVT::Other},
+                          {Chain, Value, FltOfs});
       Chain = Value.getValue(1);
     } else
       Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
@@ -20815,7 +20810,7 @@ SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
     assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
     Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
     SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
-    SDValue Ops[] = { Chain, StackSlot };
+    SDValue Ops[] = {Chain, StackSlot};
 
     unsigned FLDSize = TheVT.getStoreSize();
     assert(FLDSize <= MemSize && "Stack slot not big enough");
@@ -20828,10 +20823,9 @@ SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
   // Build the FP_TO_INT*_IN_MEM
   MachineMemOperand *MMO = MF.getMachineMemOperand(
       MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
-  SDValue Ops[] = { Chain, Value, StackSlot };
-  SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
-                                         DAG.getVTList(MVT::Other),
-                                         Ops, DstTy, MMO);
+  SDValue Ops[] = {Chain, Value, StackSlot};
+  SDValue FIST = DAG.getMemIntrinsicNode(
+      X86ISD::FP_TO_INT_IN_MEM, DL, DAG.getVTList(MVT::Other), Ops, DstTy, MMO);
 
   SDValue Res = DAG.getLoad(Op.getValueType(), DL, FIST, StackSlot, MPI);
   Chain = Res.getValue(1);
@@ -21010,7 +21004,7 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
     return In;
 
   unsigned NumElems = SrcVT.getVectorNumElements();
-  if (NumElems < 2 || !isPowerOf2_32(NumElems) )
+  if (NumElems < 2 || !isPowerOf2_32(NumElems))
     return SDValue();
 
   unsigned DstSizeInBits = DstVT.getSizeInBits();
@@ -21081,7 +21075,7 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
     // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
     SmallVector<int, 64> Mask;
     int Scale = 64 / OutVT.getScalarSizeInBits();
-    narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
+    narrowShuffleMaskElts(Scale, {0, 2, 1, 3}, Mask);
     Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
 
     if (DstVT.is256BitVector())
@@ -21325,14 +21319,12 @@ static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL,
       if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
         // We need to shift to get the lsb into sign position.
         // Shift packed bytes not supported natively, bitcast to word
-        MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
-        In = DAG.getNode(ISD::SHL, DL, ExtVT,
-                         DAG.getBitcast(ExtVT, In),
+        MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits() / 16);
+        In = DAG.getNode(ISD::SHL, DL, ExtVT, DAG.getBitcast(ExtVT, In),
                          DAG.getConstant(ShiftInx, DL, ExtVT));
         In = DAG.getBitcast(InVT, In);
       }
-      return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
-                          In, ISD::SETGT);
+      return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
     }
     // Use TESTD/Q, extended vector to packed dword/qword.
     assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
@@ -21370,7 +21362,8 @@ static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL,
     // We either have 8 elements or we're allowed to use 512-bit vectors.
     // If we have VLX, we want to use the narrowest vector that can get the
     // job done so we use vXi32.
-    MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
+    MVT EltVT =
+        Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512 / NumElts);
     MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
     In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
     InVT = ExtVT;
@@ -21484,10 +21477,9 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
     // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
     if (Subtarget.hasInt256()) {
       // The PSHUFB mask:
-      static const int ShufMask1[] = { 0,  1,  4,  5,  8,  9, 12, 13,
-                                      -1, -1, -1, -1, -1, -1, -1, -1,
-                                      16, 17, 20, 21, 24, 25, 28, 29,
-                                      -1, -1, -1, -1, -1, -1, -1, -1 };
+      static const int ShufMask1[] = {
+          0,  1,  4,  5,  8,  9,  12, 13, -1, -1, -1, -1, -1, -1, -1, -1,
+          16, 17, 20, 21, 24, 25, 28, 29, -1, -1, -1, -1, -1, -1, -1, -1};
       In = DAG.getBitcast(MVT::v32i8, In);
       In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
       In = DAG.getBitcast(MVT::v4i64, In);
@@ -21665,8 +21657,8 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
                           dl, {NVT, MVT::Other}, {Chain, Src});
         Chain = Res.getValue(1);
       } else {
-        Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
-                          NVT, Src);
+        Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl, NVT,
+                          Src);
       }
 
       // TODO: Need to add exception check code for strict FP.
@@ -21768,8 +21760,8 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
       SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
                                 DAG.getUNDEF(MVT::v2f32));
       if (IsStrict) {
-        unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
-                                : X86ISD::STRICT_CVTTP2UI;
+        unsigned Opc =
+            IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
         return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
       }
       unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
@@ -21894,7 +21886,7 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
         makeLibCall(DAG, LC, VT, Src, CallOptions, dl, Chain);
 
     if (IsStrict)
-      return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
+      return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
 
     return Tmp.first;
   }
@@ -21957,7 +21949,7 @@ SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
     assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
     Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
     SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
-    SDValue Ops[] = { Chain, StackPtr };
+    SDValue Ops[] = {Chain, StackPtr};
 
     Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
                                   /*Align*/ std::nullopt,
@@ -21965,7 +21957,7 @@ SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
     Chain = Src.getValue(1);
   }
 
-  SDValue StoreOps[] = { Chain, Src, StackPtr };
+  SDValue StoreOps[] = {Chain, Src, StackPtr};
   Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
                                   StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
                                   MachineMemOperand::MOStore);
@@ -21973,8 +21965,8 @@ SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
   return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
 }
 
-SDValue
-X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
+                                              SelectionDAG &DAG) const {
   // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
   // but making use of X86 specifics to produce better instruction sequences.
   SDNode *Node = Op.getNode();
@@ -22036,12 +22028,12 @@ X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
   APFloat MinFloat(Sem);
   APFloat MaxFloat(Sem);
 
-  APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
-    MinInt, IsSigned, APFloat::rmTowardZero);
-  APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
-    MaxInt, IsSigned, APFloat::rmTowardZero);
-  bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
-                          && !(MaxStatus & APFloat::opStatus::opInexact);
+  APFloat::opStatus MinStatus =
+      MinFloat.convertFromAPInt(MinInt, IsSigned, APFloat::rmTowardZero);
+  APFloat::opStatus MaxStatus =
+      MaxFloat.convertFromAPInt(MaxInt, IsSigned, APFloat::rmTowardZero);
+  bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact) &&
+                             !(MaxStatus & APFloat::opStatus::opInexact);
 
   SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
   SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
@@ -22051,11 +22043,11 @@ X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
   if (AreExactFloatBounds) {
     if (DstVT != TmpVT) {
       // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
-      SDValue MinClamped = DAG.getNode(
-        X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
+      SDValue MinClamped =
+          DAG.getNode(X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
       // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
-      SDValue BothClamped = DAG.getNode(
-        X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
+      SDValue BothClamped =
+          DAG.getNode(X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
       // Convert clamped value to integer.
       SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
 
@@ -22065,11 +22057,11 @@ X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
     }
 
     // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
-    SDValue MinClamped = DAG.getNode(
-      X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
+    SDValue MinClamped =
+        DAG.getNode(X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
     // Clamp by MaxFloat from above. NaN cannot occur.
-    SDValue BothClamped = DAG.getNode(
-      X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
+    SDValue BothClamped =
+        DAG.getNode(X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
     // Convert clamped value to integer.
     SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
 
@@ -22081,8 +22073,8 @@ X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
 
     // Otherwise, select zero if Src is NaN.
     SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
-    return DAG.getSelectCC(
-      dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
+    return DAG.getSelectCC(dl, Src, Src, ZeroInt, FpToInt,
+                           ISD::CondCode::SETUO);
   }
 
   SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
@@ -22104,13 +22096,13 @@ X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
   if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
     // If Src ULT MinFloat, select MinInt. In particular, this also selects
     // MinInt if Src is NaN.
-    Select = DAG.getSelectCC(
-      dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
+    Select = DAG.getSelectCC(dl, Src, MinFloatNode, MinIntNode, Select,
+                             ISD::CondCode::SETULT);
   }
 
   // If Src OGT MaxFloat, select MaxInt.
-  Select = DAG.getSelectCC(
-    dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
+  Select = DAG.getSelectCC(dl, Src, MaxFloatNode, MaxIntNode, Select,
+                           ISD::CondCode::SETOGT);
 
   // In the unsigned case we are done, because we mapped NaN to MinInt, which
   // is already zero. The promoted case was already handled above.
@@ -22120,8 +22112,7 @@ X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
 
   // Otherwise, select 0 if Src is NaN.
   SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
-  return DAG.getSelectCC(
-    dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
+  return DAG.getSelectCC(dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
 }
 
 SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
@@ -22177,15 +22168,15 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
       Entry.IsZExt = true;
       Args.push_back(Entry);
 
-      SDValue Callee = DAG.getExternalSymbol(
-          getLibcallName(RTLIB::FPEXT_F16_F32),
-          getPointerTy(DAG.getDataLayout()));
+      SDValue Callee =
+          DAG.getExternalSymbol(getLibcallName(RTLIB::FPEXT_F16_F32),
+                                getPointerTy(DAG.getDataLayout()));
       CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
           CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
           std::move(Args));
 
       SDValue Res;
-      std::tie(Res,Chain) = LowerCallTo(CLI);
+      std::tie(Res, Chain) = LowerCallTo(CLI);
       if (IsStrict)
         Res = DAG.getMergeValues({Res, Chain}, DL);
 
@@ -22453,14 +22444,14 @@ static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL,
   // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
   unsigned HOpcode;
   switch (Op.getOpcode()) {
-  // clang-format off
+    // clang-format off
   case ISD::ADD: HOpcode = X86ISD::HADD; break;
   case ISD::SUB: HOpcode = X86ISD::HSUB; break;
   case ISD::FADD: HOpcode = X86ISD::FHADD; break;
   case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
   default:
     llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
-  // clang-format on
+    // clang-format on
   }
   unsigned LExtIndex = LHS.getConstantOperandVal(1);
   unsigned RExtIndex = RHS.getConstantOperandVal(1);
@@ -22518,7 +22509,7 @@ static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
   bool Ignored;
   APFloat Point5Pred = APFloat(0.5f);
   Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
-  Point5Pred.next(/*nextDown*/true);
+  Point5Pred.next(/*nextDown*/ true);
 
   SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
                               DAG.getConstantFP(Point5Pred, dl, VT), N0);
@@ -22568,16 +22559,16 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
 
   unsigned EltBits = VT.getScalarSizeInBits();
   // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
-  APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
-                           APInt::getSignMask(EltBits);
+  APInt MaskElt =
+      IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
   const fltSemantics &Sem = VT.getFltSemantics();
   SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
 
   SDValue Op0 = Op.getOperand(0);
   bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
-  unsigned LogicOp = IsFABS  ? X86ISD::FAND :
-                     IsFNABS ? X86ISD::FOR  :
-                               X86ISD::FXOR;
+  unsigned LogicOp = IsFABS    ? X86ISD::FAND
+                     : IsFNABS ? X86ISD::FOR
+                               : X86ISD::FXOR;
   SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
 
   if (VT.isVector() || IsF128)
@@ -22680,7 +22671,8 @@ static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
 }
 
 /// Helper for attempting to create a X86ISD::BT node.
-static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
+static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL,
+                     SelectionDAG &DAG) {
   // If Src is i8, promote it to i32 with any_extend.  There is no i8 BT
   // instruction.  Since the shift amount is in-range-or-undefined, we know
   // that doing a bittest on the i32 value is ok.  We extend to i32 because
@@ -23285,8 +23277,7 @@ static bool hasNonFlagsUse(SDValue Op) {
 // the node alone and emit a 'cmp' or 'test' instruction.
 static bool isProfitableToUseFlagOp(SDValue Op) {
   for (SDNode *U : Op->users())
-    if (U->getOpcode() != ISD::CopyToReg &&
-        U->getOpcode() != ISD::SETCC &&
+    if (U->getOpcode() != ISD::CopyToReg && U->getOpcode() != ISD::SETCC &&
         U->getOpcode() != ISD::STORE)
       return false;
 
@@ -23302,14 +23293,20 @@ static SDValue EmitTest(SDValue Op, X86::CondCode X86CC, const SDLoc &dl,
   bool NeedCF = false;
   bool NeedOF = false;
   switch (X86CC) {
-  default: break;
-  case X86::COND_A: case X86::COND_AE:
-  case X86::COND_B: case X86::COND_BE:
+  default:
+    break;
+  case X86::COND_A:
+  case X86::COND_AE:
+  case X86::COND_B:
+  case X86::COND_BE:
     NeedCF = true;
     break;
-  case X86::COND_G: case X86::COND_GE:
-  case X86::COND_L: case X86::COND_LE:
-  case X86::COND_O: case X86::COND_NO: {
+  case X86::COND_G:
+  case X86::COND_GE:
+  case X86::COND_L:
+  case X86::COND_LE:
+  case X86::COND_O:
+  case X86::COND_NO: {
     // Check if we really need to set the
     // Overflow flag. If NoSignedWrap is present
     // that is not actually needed.
@@ -23361,14 +23358,14 @@ static SDValue EmitTest(SDValue Op, X86::CondCode X86CC, const SDLoc &dl,
 
     // Otherwise use a regular EFLAGS-setting instruction.
     switch (ArithOp.getOpcode()) {
-    // clang-format off
+      // clang-format off
     default: llvm_unreachable("unexpected operator!");
     case ISD::ADD: Opcode = X86ISD::ADD; break;
     case ISD::SUB: Opcode = X86ISD::SUB; break;
     case ISD::XOR: Opcode = X86ISD::XOR; break;
     case ISD::AND: Opcode = X86ISD::AND; break;
     case ISD::OR:  Opcode = X86ISD::OR;  break;
-    // clang-format on
+      // clang-format on
     }
 
     NumOperands = 2;
@@ -23383,8 +23380,9 @@ static SDValue EmitTest(SDValue Op, X86::CondCode X86CC, const SDLoc &dl,
   case ISD::USUBO: {
     // /USUBO/SSUBO will become a X86ISD::SUB and we can use its Z flag.
     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
-    return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
-                       Op->getOperand(1)).getValue(1);
+    return DAG
+        .getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0), Op->getOperand(1))
+        .getValue(1);
   }
   default:
     break;
@@ -23413,8 +23411,9 @@ static SDValue EmitCmp(SDValue Op0, SDValue Op1, X86::CondCode X86CC,
 
   EVT CmpVT = Op0.getValueType();
 
-  assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
-          CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
+  assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 || CmpVT == MVT::i32 ||
+          CmpVT == MVT::i64) &&
+         "Unexpected VT!");
 
   // Only promote the compare up to I32 if it is a 16 bit operation
   // with an immediate. 16 bit immediates are to be avoided unless the target
@@ -23532,9 +23531,8 @@ bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
 
 /// The minimum architected relative accuracy is 2^-12. We need one
 /// Newton-Raphson step to have a good float result (24 bits of precision).
-SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
-                                           SelectionDAG &DAG, int Enabled,
-                                           int &RefinementSteps,
+SDValue X86TargetLowering::getSqrtEstimate(SDValue Op, SelectionDAG &DAG,
+                                           int Enabled, int &RefinementSteps,
                                            bool &UseOneConstNR,
                                            bool Reciprocal) const {
   SDLoc DL(Op);
@@ -23641,9 +23639,7 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
 /// This is because we still need one division to calculate the reciprocal and
 /// then we need two multiplies by that reciprocal as replacements for the
 /// original divisions.
-unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
-  return 2;
-}
+unsigned X86TargetLowering::combineRepeatedFPDivisors() const { return 2; }
 
 SDValue
 X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
@@ -23651,7 +23647,7 @@ X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
                                  SmallVectorImpl<SDNode *> &Created) const {
   AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
   if (isIntDivCheap(N->getValueType(0), Attr))
-    return SDValue(N,0); // Lower SDIV as SDIV
+    return SDValue(N, 0); // Lower SDIV as SDIV
 
   assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
          "Unexpected divisor!");
@@ -23720,8 +23716,8 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
       if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
           isPowerOf2_64(AndRHSVal)) {
         Src = AndLHS;
-        BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
-                                Src.getValueType());
+        BitNo =
+            DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, Src.getValueType());
       }
     }
   }
@@ -23767,7 +23763,7 @@ static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
   //  6 - NLE
   //  7 - ORD
   switch (SetCCOpcode) {
-  // clang-format off
+    // clang-format off
   default: llvm_unreachable("Unexpected SETCC condition");
   case ISD::SETOEQ:
   case ISD::SETEQ:  SSECC = 0; break;
@@ -23789,7 +23785,7 @@ static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
   case ISD::SETO:   SSECC = 7; break;
   case ISD::SETUEQ: SSECC = 8; break;
   case ISD::SETONE: SSECC = 12; break;
-  // clang-format on
+    // clang-format on
   }
   if (Swap)
     std::swap(Op0, Op1);
@@ -24074,13 +24070,13 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
           Cmp1 = DAG.getNode(
               Opc, dl, {VT, MVT::Other},
               {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
-          Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
-                              Cmp1.getValue(1));
+          Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                              Cmp0.getValue(1), Cmp1.getValue(1));
         } else {
-          Cmp0 = DAG.getNode(
-              Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
-          Cmp1 = DAG.getNode(
-              Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
+          Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
+                             DAG.getTargetConstant(CC0, dl, MVT::i8));
+          Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
+                             DAG.getTargetConstant(CC1, dl, MVT::i8));
         }
         Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
       } else {
@@ -24090,8 +24086,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
               {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
           Chain = Cmp.getValue(1);
         } else
-          Cmp = DAG.getNode(
-              Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
+          Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
+                            DAG.getTargetConstant(SSECC, dl, MVT::i8));
       }
     } else {
       // Handle all other FP comparisons here.
@@ -24103,8 +24099,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
             {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
         Chain = Cmp.getValue(1);
       } else
-        Cmp = DAG.getNode(
-            Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
+        Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
+                          DAG.getTargetConstant(SSECC, dl, MVT::i8));
     }
 
     if (VT.getFixedSizeInBits() >
@@ -24155,7 +24151,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
     // Translate compare code to XOP PCOM compare mode.
     unsigned CmpMode = 0;
     switch (Cond) {
-    // clang-format off
+      // clang-format off
     default: llvm_unreachable("Unexpected SETCC condition");
     case ISD::SETULT:
     case ISD::SETLT: CmpMode = 0x00; break;
@@ -24167,7 +24163,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
     case ISD::SETGE: CmpMode = 0x03; break;
     case ISD::SETEQ: CmpMode = 0x04; break;
     case ISD::SETNE: CmpMode = 0x05; break;
-    // clang-format on
+      // clang-format on
     }
 
     // Are we comparing unsigned or signed integers?
@@ -24265,13 +24261,13 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
     bool Invert = false;
     unsigned Opc;
     switch (Cond) {
-    // clang-format off
+      // clang-format off
     default: llvm_unreachable("Unexpected condition code");
     case ISD::SETUGT: Invert = true; [[fallthrough]];
     case ISD::SETULE: Opc = ISD::UMIN; break;
     case ISD::SETULT: Invert = true; [[fallthrough]];
     case ISD::SETUGE: Opc = ISD::UMAX; break;
-    // clang-format on
+      // clang-format on
     }
 
     SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
@@ -24295,10 +24291,10 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
   // operations may be required for some comparisons.
   unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
                                                             : X86ISD::PCMPGT;
-  bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
-              Cond == ISD::SETGE || Cond == ISD::SETUGE;
-  bool Invert = Cond == ISD::SETNE ||
-                (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
+  bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT || Cond == ISD::SETGE ||
+              Cond == ISD::SETUGE;
+  bool Invert =
+      Cond == ISD::SETNE || (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
 
   if (Swap)
     std::swap(Op0, Op1);
@@ -24316,7 +24312,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
         Op1 = DAG.getBitcast(MVT::v4i32, Op1);
 
         SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
-        static const int MaskHi[] = { 1, 1, 3, 3 };
+        static const int MaskHi[] = {1, 1, 3, 3};
         SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
 
         return DAG.getBitcast(VT, Result);
@@ -24327,7 +24323,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
         Op1 = DAG.getAllOnesConstant(dl, MVT::v4i32);
 
         SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
-        static const int MaskHi[] = { 1, 1, 3, 3 };
+        static const int MaskHi[] = {1, 1, 3, 3};
         SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
 
         return DAG.getBitcast(VT, Result);
@@ -24366,8 +24362,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
       SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
 
       // Create masks for only the low parts/high parts of the 64 bit integers.
-      static const int MaskHi[] = { 1, 1, 3, 3 };
-      static const int MaskLo[] = { 0, 0, 2, 2 };
+      static const int MaskHi[] = {1, 1, 3, 3};
+      static const int MaskLo[] = {0, 0, 2, 2};
       SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
       SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
       SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
@@ -24394,7 +24390,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
       SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
 
       // Make sure the lower and upper halves are both all-ones.
-      static const int Mask[] = { 1, 0, 3, 2 };
+      static const int Mask[] = {1, 0, 3, 2};
       SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
       Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
 
@@ -24409,8 +24405,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
   // bits of the inputs before performing those operations.
   if (FlipSigns) {
     MVT EltVT = VT.getVectorElementType();
-    SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
-                                 VT);
+    SDValue SM =
+        DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl, VT);
     Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
     Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
   }
@@ -24427,8 +24423,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
 // Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
 static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
                               const SDLoc &dl, SelectionDAG &DAG,
-                              const X86Subtarget &Subtarget,
-                              SDValue &X86CC) {
+                              const X86Subtarget &Subtarget, SDValue &X86CC) {
   assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
 
   // Must be a bitcast from vXi1.
@@ -24575,7 +24570,8 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
                   Op.getOpcode() == ISD::STRICT_FSETCCS;
   MVT VT = Op->getSimpleValueType(0);
 
-  if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
+  if (VT.isVector())
+    return LowerVSETCC(Op, Subtarget, DAG);
 
   assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
   SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
@@ -24670,7 +24666,8 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
   return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
 }
 
-SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op,
+                                           SelectionDAG &DAG) const {
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
   SDValue Carry = Op.getOperand(2);
@@ -24682,8 +24679,8 @@ SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const
 
   // Recreate the carry if needed.
   EVT CarryVT = Carry.getValueType();
-  Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
-                      Carry, DAG.getAllOnesConstant(DL, CarryVT));
+  Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), Carry,
+                      DAG.getAllOnesConstant(DL, CarryVT));
 
   SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
   SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
@@ -24703,7 +24700,8 @@ getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
   unsigned BaseOp = 0;
   SDLoc DL(Op);
   switch (Op.getOpcode()) {
-  default: llvm_unreachable("Unknown ovf instruction!");
+  default:
+    llvm_unreachable("Unknown ovf instruction!");
   case ISD::SADDO:
     BaseOp = X86ISD::ADD;
     Cond = X86::COND_O;
@@ -24777,7 +24775,8 @@ static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
   SDValue VOp0 = V.getOperand(0);
   unsigned InBits = VOp0.getValueSizeInBits();
   unsigned Bits = V.getValueSizeInBits();
-  return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
+  return DAG.MaskedValueIsZero(VOp0,
+                               APInt::getHighBitsSet(InBits, InBits - Bits));
 }
 
 // Lower various (select (icmp CmpVal, 0), LHS, RHS) custom patterns.
@@ -24915,7 +24914,7 @@ static SDValue LowerSELECTWithCmpZero(SDValue CmpVal, SDValue LHS, SDValue RHS,
 
 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   bool AddTest = true;
-  SDValue Cond  = Op.getOperand(0);
+  SDValue Cond = Op.getOperand(0);
   SDValue Op1 = Op.getOperand(1);
   SDValue Op2 = Op.getOperand(2);
   SDLoc DL(Op);
@@ -25066,14 +25065,13 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   // If condition flag is set by a X86ISD::CMP, then use it as the condition
   // setting operand in place of the X86ISD::SETCC.
   unsigned CondOpcode = Cond.getOpcode();
-  if (CondOpcode == X86ISD::SETCC ||
-      CondOpcode == X86ISD::SETCC_CARRY) {
+  if (CondOpcode == X86ISD::SETCC || CondOpcode == X86ISD::SETCC_CARRY) {
     CC = Cond.getOperand(0);
 
     SDValue Cmp = Cond.getOperand(1);
     bool IllegalFPCMov = false;
-    if (VT.isFloatingPoint() && !VT.isVector() &&
-        !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV())  // FPStack?
+    if (VT.isFloatingPoint() && !VT.isVector() && !isScalarFPTypeInSSEReg(VT) &&
+        Subtarget.canUseCMOV()) // FPStack?
       IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
 
     if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
@@ -25136,14 +25134,15 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
   // widen the cmov and push the truncate through. This avoids introducing a new
   // branch during isel and doesn't add any extensions.
-  if (Op.getValueType() == MVT::i8 &&
-      Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
+  if (Op.getValueType() == MVT::i8 && Op1.getOpcode() == ISD::TRUNCATE &&
+      Op2.getOpcode() == ISD::TRUNCATE) {
     SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
     if (T1.getValueType() == T2.getValueType() &&
         // Exclude CopyFromReg to avoid partial register stalls.
-        T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
-      SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
-                                 CC, Cond);
+        T1.getOpcode() != ISD::CopyFromReg &&
+        T2.getOpcode() != ISD::CopyFromReg) {
+      SDValue Cmov =
+          DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1, CC, Cond);
       return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
     }
   }
@@ -25159,14 +25158,14 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
        !X86::mayFoldLoad(Op2, Subtarget))) {
     Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
     Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
-    SDValue Ops[] = { Op2, Op1, CC, Cond };
+    SDValue Ops[] = {Op2, Op1, CC, Cond};
     SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
     return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
   }
 
   // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
   // condition is true.
-  SDValue Ops[] = { Op2, Op1, CC, Cond };
+  SDValue Ops[] = {Op2, Op1, CC, Cond};
   return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops, Op->getFlags());
 }
 
@@ -25276,9 +25275,9 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
     InVT = In.getSimpleValueType();
   }
 
-  // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
-  // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
-  // need to be handled here for 256/512-bit results.
+  // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit
+  // results, so are legal and shouldn't occur here. AVX2/AVX512 pmovsx*
+  // instructions still need to be handled here for 256/512-bit results.
   if (Subtarget.hasInt256()) {
     assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
 
@@ -25287,9 +25286,8 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
 
     // FIXME: Apparently we create inreg operations that could be regular
     // extends.
-    unsigned ExtOpc =
-        Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
-                                             : ISD::ZERO_EXTEND;
+    unsigned ExtOpc = Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
+                                                           : ISD::ZERO_EXTEND;
     return DAG.getNode(ExtOpc, dl, VT, In);
   }
 
@@ -25407,9 +25405,9 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
   SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
 
   unsigned NumElems = InVT.getVectorNumElements();
-  SmallVector<int,8> ShufMask(NumElems, -1);
-  for (unsigned i = 0; i != NumElems/2; ++i)
-    ShufMask[i] = i + NumElems/2;
+  SmallVector<int, 8> ShufMask(NumElems, -1);
+  for (unsigned i = 0; i != NumElems / 2; ++i)
+    ShufMask[i] = i + NumElems / 2;
 
   SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
   OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
@@ -25573,11 +25571,10 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
 // TODO: It is possible to support ZExt by zeroing the undef values during
 // the shuffle phase or after the shuffle.
 static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
-                                 SelectionDAG &DAG) {
+                         SelectionDAG &DAG) {
   MVT RegVT = Op.getSimpleValueType();
   assert(RegVT.isVector() && "We only custom lower vector loads.");
-  assert(RegVT.isInteger() &&
-         "We only custom lower integer vector loads.");
+  assert(RegVT.isInteger() && "We only custom lower integer vector loads.");
 
   LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
   SDLoc dl(Ld);
@@ -25620,8 +25617,8 @@ static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
 
 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
   SDValue Chain = Op.getOperand(0);
-  SDValue Cond  = Op.getOperand(1);
-  SDValue Dest  = Op.getOperand(2);
+  SDValue Cond = Op.getOperand(1);
+  SDValue Dest = Op.getOperand(2);
   SDLoc dl(Op);
 
   // Bail out when we don't have native compare instructions.
@@ -25671,7 +25668,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
         if (User->getOpcode() == ISD::BR) {
           SDValue FalseBB = User->getOperand(1);
           SDNode *NewBR =
-            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
+              DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
           assert(NewBR == User);
           (void)NewBR;
           Dest = FalseBB;
@@ -25742,9 +25739,8 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
 // that the guard pages used by the OS virtual memory manager are allocated in
 // correct sequence.
-SDValue
-X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
-                                           SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+                                                   SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
   bool SplitStack = MF.shouldSplitStack();
   bool EmitStackProbeCall = hasStackProbeSymbol(MF);
@@ -25755,7 +25751,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
   // Get the inputs.
   SDNode *Node = Op.getNode();
   SDValue Chain = Op.getOperand(0);
-  SDValue Size  = Op.getOperand(1);
+  SDValue Size = Op.getOperand(1);
   MaybeAlign Alignment(Op.getConstantOperandVal(2));
   EVT VT = Node->getValueType(0);
 
@@ -25877,8 +25873,9 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
   MemOps.push_back(Store);
 
   // Store ptr to reg_save_area.
-  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
-      Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
+  FIN = DAG.getNode(
+      ISD::ADD, DL, PtrVT, FIN,
+      DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
   SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
   Store = DAG.getStore(
       Op.getOperand(0), DL, RSFIN, FIN,
@@ -25888,8 +25885,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
 }
 
 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
-  assert(Subtarget.is64Bit() &&
-         "LowerVAARG only handles 64-bit va_arg!");
+  assert(Subtarget.is64Bit() && "LowerVAARG only handles 64-bit va_arg!");
   assert(Op.getNumOperands() == 4);
 
   MachineFunction &MF = DAG.getMachineFunction();
@@ -25913,11 +25909,11 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
   // selection mechanism works only for the basic types.
   assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
   if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
-    ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
+    ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
   } else {
     assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
            "Unhandled argument type in LowerVAARG");
-    ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
+    ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
   }
 
   if (ArgMode == 2) {
@@ -25951,7 +25947,7 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
   // where a va_list is still an i8*.
   assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
   if (Subtarget.isCallingConvWin64(
-        DAG.getMachineFunction().getFunction().getCallingConv()))
+          DAG.getMachineFunction().getFunction().getCallingConv()))
     // Probably a Win64 va_copy.
     return DAG.expandVACopy(Op.getNode());
 
@@ -26013,15 +26009,17 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
       return DAG.getConstant(0, dl, VT);
   }
 
-  assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
-         && "Unknown target vector shift-by-constant node");
+  assert(
+      (Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI) &&
+      "Unknown target vector shift-by-constant node");
 
   // Fold this packed vector shift into a build vector if SrcOp is a
   // vector of Constants or UNDEFs.
   if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
     unsigned ShiftOpc;
     switch (Opc) {
-    default: llvm_unreachable("Unknown opcode!");
+    default:
+      llvm_unreachable("Unknown opcode!");
     case X86ISD::VSHLI:
       ShiftOpc = ISD::SHL;
       break;
@@ -26161,8 +26159,8 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
     Hi = DAG.getBitcast(MVT::v32i1, Hi);
     return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
   } else {
-    MVT BitcastVT = MVT::getVectorVT(MVT::i1,
-                                     Mask.getSimpleValueType().getSizeInBits());
+    MVT BitcastVT =
+        MVT::getVectorVT(MVT::i1, Mask.getSimpleValueType().getSizeInBits());
     // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
     // are extracted by EXTRACT_SUBVECTOR.
     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
@@ -26233,9 +26231,12 @@ static int getSEHRegistrationNodeSize(const Function *Fn) {
   // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
   // WinEHStatePass for the full struct definition.
   switch (classifyEHPersonality(Fn->getPersonalityFn())) {
-  case EHPersonality::MSVC_X86SEH: return 24;
-  case EHPersonality::MSVC_CXX: return 16;
-  default: break;
+  case EHPersonality::MSVC_X86SEH:
+    return 24;
+  case EHPersonality::MSVC_CXX:
+    return 16;
+  default:
+    break;
   }
   report_fatal_error(
       "can only recover FP for 32-bit MSVC EH personality functions");
@@ -26327,13 +26328,13 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   SDLoc dl(Op);
   unsigned IntNo = Op.getConstantOperandVal(0);
   MVT VT = Op.getSimpleValueType();
-  const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
+  const IntrinsicData *IntrData = getIntrinsicWithoutChain(IntNo);
 
   // Propagate flags from original node to transformed node(s).
   SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
 
   if (IntrData) {
-    switch(IntrData->Type) {
+    switch (IntrData->Type) {
     case INTR_TYPE_1OP: {
       // We specify 2 possible opcodes for intrinsics with rounding modes.
       // First, we check if the intrinsic may have non-default rounding mode,
@@ -26459,9 +26460,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
         if (!isRoundModeCurDirection(Rnd))
           return SDValue();
       }
-      return getVectorMaskingNode(
-          DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
-          Subtarget, DAG);
+      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
+                                  Mask, PassThru, Subtarget, DAG);
     }
     case INTR_TYPE_1OP_MASK_SAE: {
       SDValue Src = Op.getOperand(1);
@@ -26502,9 +26502,9 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
           if (!isRoundModeCurDirection(Rnd))
             return SDValue();
         }
-        return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
-                                                Src2),
-                                    Mask, passThru, Subtarget, DAG);
+        return getScalarMaskingNode(
+            DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2), Mask, passThru,
+            Subtarget, DAG);
       }
 
       assert(Op.getNumOperands() == (6U + HasRounding) &&
@@ -26518,9 +26518,9 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
         else if (!isRoundModeCurDirection(Sae))
           return SDValue();
       }
-      return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
-                                              Src2, RoundingMode),
-                                  Mask, passThru, Subtarget, DAG);
+      return getScalarMaskingNode(
+          DAG.getNode(Opc, dl, VT, Src1, Src2, RoundingMode), Mask, passThru,
+          Subtarget, DAG);
     }
     case INTR_TYPE_SCALAR_MASK_RND: {
       SDValue Src1 = Op.getOperand(1);
@@ -26555,8 +26555,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
       else
         return SDValue();
 
-      return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
-                                  Mask, passThru, Subtarget, DAG);
+      return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2), Mask,
+                                  passThru, Subtarget, DAG);
     }
     case INTR_TYPE_2OP_MASK: {
       SDValue Src1 = Op.getOperand(1);
@@ -26592,8 +26592,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
           return SDValue();
       }
 
-      return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
-                                  Mask, PassThru, Subtarget, DAG);
+      return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2), Mask,
+                                  PassThru, Subtarget, DAG);
     }
     case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
       SDValue Src1 = Op.getOperand(1);
@@ -26642,12 +26642,12 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
       // Reverse the operands to match VSELECT order.
       return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
     }
-    case VPERM_2OP : {
+    case VPERM_2OP: {
       SDValue Src1 = Op.getOperand(1);
       SDValue Src2 = Op.getOperand(2);
 
       // Swap Src1 and Src2 in the node creation
-      return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
+      return DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1);
     }
     case CFMA_OP_MASKZ:
     case CFMA_OP_MASK: {
@@ -26691,8 +26691,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
       SDValue Imm = Op.getOperand(2);
       SDValue Mask = Op.getOperand(3);
       SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
-      SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
-                                                 Subtarget, DAG);
+      SDValue FPclassMask =
+          getScalarMaskingNode(FPclass, Mask, SDValue(), Subtarget, DAG);
       // Need to fill with zeros to ensure the bitcast will produce zeroes
       // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
       SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
@@ -26716,7 +26716,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
         if (!isRoundModeCurDirection(Sae))
           return SDValue();
       }
-      //default rounding mode
+      // default rounding mode
       return DAG.getNode(IntrData->Opc0, dl, MaskVT,
                          {Op.getOperand(1), Op.getOperand(2), CC, Mask});
     }
@@ -26734,12 +26734,12 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
         else if (!isRoundModeCurDirection(Sae))
           return SDValue();
       }
-      //default rounding mode
+      // default rounding mode
       if (!Cmp.getNode())
         Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
 
-      SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
-                                             Subtarget, DAG);
+      SDValue CmpMask =
+          getScalarMaskingNode(Cmp, Mask, SDValue(), Subtarget, DAG);
       // Need to fill with zeros to ensure the bitcast will produce zeroes
       // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
       SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
@@ -26907,8 +26907,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
       assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
 
       uint64_t Imm = Op.getConstantOperandVal(2);
-      SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
-                                              Op.getValueType());
+      SDValue Control =
+          DAG.getTargetConstant(Imm & 0xffff, dl, Op.getValueType());
       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
                          Op.getOperand(1), Control);
     }
@@ -26930,7 +26930,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                           Op.getOperand(3), GenCF.getValue(1));
       }
       SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
-      SDValue Results[] = { SetCC, Res };
+      SDValue Results[] = {SetCC, Res};
       return DAG.getMergeValues(Results, dl);
     }
     case CVTPD2PS_MASK:
@@ -27013,7 +27013,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   }
 
   switch (IntNo) {
-  default: return SDValue();    // Don't custom lower most intrinsics.
+  default:
+    return SDValue(); // Don't custom lower most intrinsics.
 
   // ptest and testp intrinsics. The intrinsic these come from are designed to
   // return an integer value, not just an instruction so lower it to the ptest
@@ -27047,7 +27048,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     unsigned TestOpc = X86ISD::PTEST;
     X86::CondCode X86CC;
     switch (IntNo) {
-    default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
+    default:
+      llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
     case Intrinsic::x86_avx512_ktestc_b:
     case Intrinsic::x86_avx512_ktestc_w:
     case Intrinsic::x86_avx512_ktestc_d:
@@ -27118,7 +27120,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     unsigned Opcode;
     X86::CondCode X86CC;
     switch (IntNo) {
-    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    default:
+      llvm_unreachable("Impossible intrinsic"); // Can't reach here.
     case Intrinsic::x86_sse42_pcmpistria128:
       Opcode = X86ISD::PCMPISTR;
       X86CC = X86::COND_A;
@@ -27288,7 +27291,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
 
     unsigned NewIntrinsic;
     switch (IntNo) {
-    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    default:
+      llvm_unreachable("Impossible intrinsic"); // Can't reach here.
     case Intrinsic::x86_mmx_pslli_w:
       NewIntrinsic = Intrinsic::x86_mmx_psll_w;
       break;
@@ -27365,16 +27369,16 @@ static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
 
   MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
 
-  SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
+  SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
   SDValue Res =
       DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
                               MemIntr->getMemoryVT(), MemIntr->getMemOperand());
   return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
 }
 
-static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
-                             SDValue Src, SDValue Mask, SDValue Base,
-                             SDValue Index, SDValue ScaleOp, SDValue Chain,
+static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, SDValue Src,
+                             SDValue Mask, SDValue Base, SDValue Index,
+                             SDValue ScaleOp, SDValue Chain,
                              const X86Subtarget &Subtarget) {
   MVT VT = Op.getSimpleValueType();
   SDLoc dl(Op);
@@ -27403,7 +27407,7 @@ static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
 
   MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
 
-  SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
+  SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
   SDValue Res =
       DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
                               MemIntr->getMemoryVT(), MemIntr->getMemOperand());
@@ -27411,9 +27415,9 @@ static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
 }
 
 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
-                               SDValue Src, SDValue Mask, SDValue Base,
-                               SDValue Index, SDValue ScaleOp, SDValue Chain,
-                               const X86Subtarget &Subtarget) {
+                              SDValue Src, SDValue Mask, SDValue Base,
+                              SDValue Index, SDValue ScaleOp, SDValue Chain,
+                              const X86Subtarget &Subtarget) {
   SDLoc dl(Op);
   auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
   // Scale must be constant.
@@ -27455,8 +27459,8 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                                         TLI.getPointerTy(DAG.getDataLayout()));
   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
   SDValue Segment = DAG.getRegister(0, MVT::i32);
-  MVT MaskVT =
-    MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
+  MVT MaskVT = MVT::getVectorVT(
+      MVT::i1, Index.getSimpleValueType().getVectorNumElements());
   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
   SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
   SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
@@ -27472,11 +27476,11 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
 /// expanded intrinsics implicitly defines extra registers (i.e. not just
 /// EDX:EAX).
 static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
-                                        SelectionDAG &DAG,
-                                        unsigned TargetOpcode,
-                                        unsigned SrcReg,
-                                        const X86Subtarget &Subtarget,
-                                        SmallVectorImpl<SDValue> &Results) {
+                                           SelectionDAG &DAG,
+                                           unsigned TargetOpcode,
+                                           unsigned SrcReg,
+                                           const X86Subtarget &Subtarget,
+                                           SmallVectorImpl<SDValue> &Results) {
   SDValue Chain = N->getOperand(0);
   SDValue Glue;
 
@@ -27516,7 +27520,7 @@ static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
   }
 
   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
-  SDValue Ops[] = { LO, HI };
+  SDValue Ops[] = {LO, HI};
   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
   Results.push_back(Pair);
   Results.push_back(Chain);
@@ -27533,9 +27537,9 @@ static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
   // The processor's time-stamp counter (a 64-bit MSR) is stored into the
   // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
   // and the EAX register is loaded with the low-order 32 bits.
-  SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
-                                             /* NoRegister */0, Subtarget,
-                                             Results);
+  SDValue Glue =
+      expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
+                                  /* NoRegister */ 0, Subtarget, Results);
   if (Opcode != X86::RDTSCP)
     return;
 
@@ -27593,24 +27597,24 @@ static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
 }
 
 /// Emit Truncating Store with signed or unsigned saturation.
-static SDValue
-EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val,
-                SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
-                SelectionDAG &DAG) {
+static SDValue EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL,
+                               SDValue Val, SDValue Ptr, EVT MemVT,
+                               MachineMemOperand *MMO, SelectionDAG &DAG) {
   SDVTList VTs = DAG.getVTList(MVT::Other);
   SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
-  SDValue Ops[] = { Chain, Val, Ptr, Undef };
+  SDValue Ops[] = {Chain, Val, Ptr, Undef};
   unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
   return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
 }
 
 /// Emit Masked Truncating Store with signed or unsigned saturation.
 static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain,
-                                     const SDLoc &DL,
-                      SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
-                      MachineMemOperand *MMO, SelectionDAG &DAG) {
+                                     const SDLoc &DL, SDValue Val, SDValue Ptr,
+                                     SDValue Mask, EVT MemVT,
+                                     MachineMemOperand *MMO,
+                                     SelectionDAG &DAG) {
   SDVTList VTs = DAG.getVTList(MVT::Other);
-  SDValue Ops[] = { Chain, Val, Ptr, Mask };
+  SDValue Ops[] = {Chain, Val, Ptr, Mask};
   unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
   return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
 }
@@ -27678,9 +27682,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
       SDLoc dl(Op);
       // Create a WRPKRU node, pass the input to the EAX parameter,  and pass 0
       // to the EDX and ECX parameters.
-      return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
-                         Op.getOperand(0), Op.getOperand(2),
-                         DAG.getConstant(0, dl, MVT::i32),
+      return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other, Op.getOperand(0),
+                         Op.getOperand(2), DAG.getConstant(0, dl, MVT::i32),
                          DAG.getConstant(0, dl, MVT::i32));
     }
     case llvm::Intrinsic::asan_check_memaccess: {
@@ -27711,7 +27714,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
       unsigned Opcode;
 
       switch (IntNo) {
-      default: llvm_unreachable("Impossible intrinsic");
+      default:
+        llvm_unreachable("Impossible intrinsic");
       case Intrinsic::x86_umwait:
         Opcode = X86ISD::UMWAIT;
         break;
@@ -27724,9 +27728,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
         break;
       }
 
-      SDValue Operation =
-          DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
-                      Op->getOperand(3), Op->getOperand(4));
+      SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
+                                      Op->getOperand(3), Op->getOperand(4));
       SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
       return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
                          Operation.getValue(1));
@@ -27738,7 +27741,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
       SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
       unsigned Opcode;
       switch (IntNo) {
-      default: llvm_unreachable("Impossible intrinsic!");
+      default:
+        llvm_unreachable("Impossible intrinsic!");
       case Intrinsic::x86_enqcmd:
         Opcode = X86ISD::ENQCMD;
         break;
@@ -27762,7 +27766,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
       unsigned Opcode;
 
       switch (IntNo) {
-      default: llvm_unreachable("Impossible intrinsic");
+      default:
+        llvm_unreachable("Impossible intrinsic");
       case Intrinsic::x86_aesenc128kl:
         Opcode = X86ISD::AESENC128KL;
         break;
@@ -27800,7 +27805,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
       unsigned Opcode;
 
       switch (IntNo) {
-      default: llvm_unreachable("Impossible intrinsic");
+      default:
+        llvm_unreachable("Impossible intrinsic");
       case Intrinsic::x86_aesencwide128kl:
         Opcode = X86ISD::AESENCWIDE128KL;
         break;
@@ -27955,9 +27961,9 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
       SDValue Src2 = Op.getOperand(4);
       SDValue CC = Op.getOperand(5);
       MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
-      SDValue Operation = DAG.getMemIntrinsicNode(
-          X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
-          MVT::i32, MMO);
+      SDValue Operation =
+          DAG.getMemIntrinsicNode(X86ISD::CMPCCXADD, DL, Op->getVTList(),
+                                  {Chain, Addr, Src1, Src2, CC}, MVT::i32, MMO);
       return Operation;
     }
     case Intrinsic::x86_aadd32:
@@ -28041,8 +28047,9 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
   }
 
   SDLoc dl(Op);
-  switch(IntrData->Type) {
-  default: llvm_unreachable("Unknown Intrinsic Type");
+  switch (IntrData->Type) {
+  default:
+    llvm_unreachable("Unknown Intrinsic Type");
   case RDSEED:
   case RDRAND: {
     // Emit the node with the right value type.
@@ -28063,32 +28070,32 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
   }
   case GATHER_AVX2: {
     SDValue Chain = Op.getOperand(0);
-    SDValue Src   = Op.getOperand(2);
-    SDValue Base  = Op.getOperand(3);
+    SDValue Src = Op.getOperand(2);
+    SDValue Base = Op.getOperand(3);
     SDValue Index = Op.getOperand(4);
-    SDValue Mask  = Op.getOperand(5);
+    SDValue Mask = Op.getOperand(5);
     SDValue Scale = Op.getOperand(6);
     return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
                              Scale, Chain, Subtarget);
   }
   case GATHER: {
-  //gather(v1, mask, index, base, scale);
+    // gather(v1, mask, index, base, scale);
     SDValue Chain = Op.getOperand(0);
-    SDValue Src   = Op.getOperand(2);
-    SDValue Base  = Op.getOperand(3);
+    SDValue Src = Op.getOperand(2);
+    SDValue Base = Op.getOperand(3);
     SDValue Index = Op.getOperand(4);
-    SDValue Mask  = Op.getOperand(5);
+    SDValue Mask = Op.getOperand(5);
     SDValue Scale = Op.getOperand(6);
-    return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
-                         Chain, Subtarget);
+    return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale, Chain,
+                         Subtarget);
   }
   case SCATTER: {
-  //scatter(base, mask, index, v1, scale);
+    // scatter(base, mask, index, v1, scale);
     SDValue Chain = Op.getOperand(0);
-    SDValue Base  = Op.getOperand(2);
-    SDValue Mask  = Op.getOperand(3);
+    SDValue Base = Op.getOperand(2);
+    SDValue Mask = Op.getOperand(3);
     SDValue Index = Op.getOperand(4);
-    SDValue Src   = Op.getOperand(5);
+    SDValue Src = Op.getOperand(5);
     SDValue Scale = Op.getOperand(6);
     return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
                           Scale, Chain, Subtarget);
@@ -28099,9 +28106,9 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
            "Wrong prefetch hint in intrinsic: should be 2 or 3");
     unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
     SDValue Chain = Op.getOperand(0);
-    SDValue Mask  = Op.getOperand(2);
+    SDValue Mask = Op.getOperand(2);
     SDValue Index = Op.getOperand(3);
-    SDValue Base  = Op.getOperand(4);
+    SDValue Base = Op.getOperand(4);
     SDValue Scale = Op.getOperand(5);
     return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
                            Subtarget);
@@ -28136,8 +28143,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
 
     SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
     SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
-    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
-                       Ret, SDValue(InTrans.getNode(), 1));
+    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Ret,
+                       SDValue(InTrans.getNode(), 1));
   }
   case TRUNCATE_TO_MEM_VI8:
   case TRUNCATE_TO_MEM_VI16:
@@ -28150,7 +28157,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
     MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
     assert(MemIntr && "Expected MemIntrinsicSDNode!");
 
-    EVT MemVT  = MemIntr->getMemoryVT();
+    EVT MemVT = MemIntr->getMemoryVT();
 
     uint16_t TruncationOp = IntrData->Opc0;
     switch (TruncationOp) {
@@ -28248,7 +28255,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
 
   Register FrameReg =
       RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
-  SDLoc dl(Op);  // FIXME probably not meaningful
+  SDLoc dl(Op); // FIXME probably not meaningful
   unsigned Depth = Op.getConstantOperandVal(0);
   assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
           (FrameReg == X86::EBP && VT == MVT::i32)) &&
@@ -28262,7 +28269,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
 
 // FIXME? Maybe this could be a TableGen attribute on some registers and
 // this table could be generated automatically from RegInfo.
-Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
+Register X86TargetLowering::getRegisterByName(const char *RegName, LLT VT,
                                               const MachineFunction &MF) const {
   const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
 
@@ -28322,10 +28329,10 @@ bool X86TargetLowering::needsFixedCatchObjects() const {
 }
 
 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
-  SDValue Chain     = Op.getOperand(0);
-  SDValue Offset    = Op.getOperand(1);
-  SDValue Handler   = Op.getOperand(2);
-  SDLoc dl      (Op);
+  SDValue Chain = Op.getOperand(0);
+  SDValue Offset = Op.getOperand(1);
+  SDValue Handler = Op.getOperand(2);
+  SDLoc dl(Op);
 
   EVT PtrVT = getPointerTy(DAG.getDataLayout());
   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
@@ -28336,9 +28343,9 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
   SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
   Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
 
-  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
-                                 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
-                                                       dl));
+  SDValue StoreAddr =
+      DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
+                  DAG.getIntPtrConstant(RegInfo->getSlotSize(), dl));
   StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
   Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
   Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
@@ -28361,19 +28368,20 @@ SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
     (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
   }
   return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
-                     DAG.getVTList(MVT::i32, MVT::Other),
-                     Op.getOperand(0), Op.getOperand(1));
+                     DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
+                     Op.getOperand(1));
 }
 
 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
                                                 SelectionDAG &DAG) const {
   SDLoc DL(Op);
-  return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
-                     Op.getOperand(0), Op.getOperand(1));
+  return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other, Op.getOperand(0),
+                     Op.getOperand(1));
 }
 
-SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
-                                                       SelectionDAG &DAG) const {
+SDValue
+X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
+                                               SelectionDAG &DAG) const {
   SDLoc DL(Op);
   return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
                      Op.getOperand(0));
@@ -28389,7 +28397,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
   SDValue Trmp = Op.getOperand(1); // trampoline
   SDValue FPtr = Op.getOperand(2); // nested function
   SDValue Nest = Op.getOperand(3); // 'nest' parameter value
-  SDLoc dl (Op);
+  SDLoc dl(Op);
 
   const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
@@ -28398,7 +28406,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
     SDValue OutChains[6];
 
     // Large code-model.
-    const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
+    const unsigned char JMP64r = 0xFF;  // 64-bit jmp through register opcode.
     const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
 
     const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
@@ -28446,7 +28454,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
   } else {
     const Function *Func =
-      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
+        cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
     CallingConv::ID CC = Func->getCallingConv();
     unsigned NestReg;
 
@@ -28468,7 +28476,8 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
         unsigned Idx = 0;
 
         for (FunctionType::param_iterator I = FTy->param_begin(),
-             E = FTy->param_end(); I != E; ++I, ++Idx)
+                                          E = FTy->param_end();
+             I != E; ++I, ++Idx)
           if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
             const DataLayout &DL = DAG.getDataLayout();
             // FIXME: should only count parameters that are lowered to integers.
@@ -28574,18 +28583,16 @@ SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
   Chain = CWD.getValue(1);
 
   // Mask and turn the control bits into a shift for the lookup table.
-  SDValue Shift =
-    DAG.getNode(ISD::SRL, DL, MVT::i16,
-                DAG.getNode(ISD::AND, DL, MVT::i16,
-                            CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
-                DAG.getConstant(9, DL, MVT::i8));
+  SDValue Shift = DAG.getNode(ISD::SRL, DL, MVT::i16,
+                              DAG.getNode(ISD::AND, DL, MVT::i16, CWD,
+                                          DAG.getConstant(0xc00, DL, MVT::i16)),
+                              DAG.getConstant(9, DL, MVT::i8));
   Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
 
   SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
-  SDValue RetVal =
-    DAG.getNode(ISD::AND, DL, MVT::i32,
-                DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
-                DAG.getConstant(3, DL, MVT::i32));
+  SDValue RetVal = DAG.getNode(ISD::AND, DL, MVT::i32,
+                               DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
+                               DAG.getConstant(3, DL, MVT::i32));
 
   RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
 
@@ -28625,14 +28632,14 @@ SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
     uint64_t RM = CVal->getZExtValue();
     int FieldVal;
     switch (static_cast<RoundingMode>(RM)) {
-    // clang-format off
+      // clang-format off
     case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
     case RoundingMode::TowardNegative:    FieldVal = X86::rmDownward; break;
     case RoundingMode::TowardPositive:    FieldVal = X86::rmUpward; break;
     case RoundingMode::TowardZero:        FieldVal = X86::rmTowardZero; break;
     default:
       llvm_unreachable("rounding mode is not supported by X86 hardware");
-    // clang-format on
+      // clang-format on
     }
     RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
   } else {
@@ -28873,17 +28880,15 @@ static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
   MVT EltVT = VT.getVectorElementType();
   unsigned NumElems = VT.getVectorNumElements();
 
-  assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
-          "Unsupported element type");
+  assert((EltVT == MVT::i8 || EltVT == MVT::i16) && "Unsupported element type");
 
   // Split vector, it's Lo and Hi parts will be handled in next iteration.
-  if (NumElems > 16 ||
-      (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
+  if (NumElems > 16 || (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
     return splitVectorIntUnary(Op, DAG, dl);
 
   MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
   assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
-          "Unsupported value type for operation");
+         "Unsupported value type for operation");
 
   // Use native supported vector instruction vplzcntd.
   Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
@@ -28998,7 +29003,35 @@ static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
   assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
   return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
 }
+static SDValue LowerVectorCTLZ_GFNI(SDValue Op, SelectionDAG &DAG,
+                                    const X86Subtarget &Subtarget) {
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+  SDValue Input = Op.getOperand(0);
 
+  if (!VT.isVector() || VT.getVectorElementType() != MVT::i8)
+    return SDValue();
+  SmallVector<SDValue, 16> MatrixVals;
+  for (unsigned i = 0; i != VT.getVectorNumElements(); ++i) {
+    uint8_t Mask = 1 << (7 - (i % 8));
+    MatrixVals.push_back(DAG.getConstant(Mask, dl, MVT::i8));
+  }
+
+  SDValue Matrix = DAG.getBuildVector(VT, dl, MatrixVals);
+  SDValue Reversed = DAG.getNode(X86ISD::GF2P8AFFINEQB, dl, VT, Input, Matrix,
+                                 DAG.getTargetConstant(0, dl, MVT::i8));
+  SDValue AddMask = DAG.getConstant(0xFF, dl, MVT::i8);
+  SDValue AddVec = DAG.getSplatBuildVector(VT, dl, AddMask);
+  // Isolate the lowest set bit: Filtered = Reversed & ~(Reversed - 1).
+  SDValue Summed = DAG.getNode(ISD::ADD, dl, VT, Reversed, AddVec);
+  SDValue NotSummed = DAG.getNode(ISD::XOR, dl, VT, Summed, AddVec);
+  SDValue Filtered = DAG.getNode(ISD::AND, dl, VT, NotSummed, Reversed);
+  // Reuse the same affine matrix; imm8 = 8 is XORed into each result byte.
+  SDValue LZCNT =
+      DAG.getNode(X86ISD::GF2P8AFFINEQB, dl, VT, Filtered, Matrix,
+                  DAG.getTargetConstant(8, dl, MVT::i8));
+  return LZCNT;
+}
 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
@@ -29007,6 +29040,10 @@ static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
   SDLoc dl(Op);
   unsigned Opc = Op.getOpcode();
 
+  // Prefer the GFNI-based lowering for vXi8 leading-zero counts.
+  if (VT.isVector() && VT.getScalarType() == MVT::i8 && Subtarget.hasGFNI())
+    return LowerVectorCTLZ_GFNI(Op, DAG, Subtarget);
+
   if (VT.isVector())
     return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
 
@@ -29525,10 +29562,10 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
       SmallVector<SDValue, 16> LoOps, HiOps;
       for (unsigned i = 0; i != NumElts; i += 16) {
         for (unsigned j = 0; j != 8; ++j) {
-          LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
-                                               MVT::i16));
-          HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
-                                               MVT::i16));
+          LoOps.push_back(
+              DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl, MVT::i16));
+          HiOps.push_back(
+              DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl, MVT::i16));
         }
       }
 
@@ -29569,7 +29606,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
 
     // Merge the two vectors back together with a shuffle. This expands into 2
     // shuffles.
-    static const int ShufMask[] = { 0, 4, 2, 6 };
+    static const int ShufMask[] = {0, 4, 2, 6};
     return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
   }
 
@@ -29734,7 +29771,7 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
     //
     // Place the odd value at an even position (basically, shift all values 1
     // step to the left):
-    const int Mask[] = {1, -1,  3, -1,  5, -1,  7, -1,
+    const int Mask[] = {1, -1, 3,  -1, 5,  -1, 7,  -1,
                         9, -1, 11, -1, 13, -1, 15, -1};
     // <a|b|c|d> => <b|undef|d|undef>
     SDValue Odd0 =
@@ -29784,7 +29821,7 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
 
   // Only i8 vectors should need custom lowering after this.
   assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
-         (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
+          (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
          "Unsupported vector type");
 
   // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
@@ -29939,7 +29976,8 @@ static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
   return DAG.getMergeValues({Low, Ovf}, dl);
 }
 
-SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op,
+                                             SelectionDAG &DAG) const {
   assert(Subtarget.isTargetWin64() && "Unexpected target");
   EVT VT = Op.getValueType();
   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
@@ -29954,13 +29992,13 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
   RTLIB::Libcall LC;
   bool isSigned;
   switch (Op->getOpcode()) {
-  // clang-format off
+    // clang-format off
   default: llvm_unreachable("Unexpected request for libcall!");
   case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
   case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
   case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
   case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
-  // clang-format on
+    // clang-format on
   }
 
   SDLoc dl(Op);
@@ -30104,9 +30142,9 @@ static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
 
 // The shift amount is a variable, but it is the same for all vector lanes.
 // These instructions are defined together with shift-immediate.
-static
-bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget,
-                                      unsigned Opcode) {
+static bool supportedVectorShiftWithBaseAmnt(EVT VT,
+                                             const X86Subtarget &Subtarget,
+                                             unsigned Opcode) {
   return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
 }
 
@@ -30135,7 +30173,7 @@ static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget,
     return true;
 
   bool LShift = VT.is128BitVector() || VT.is256BitVector();
-  bool AShift = LShift &&  VT != MVT::v2i64 && VT != MVT::v4i64;
+  bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
   return (Opcode == ISD::SRA) ? AShift : LShift;
 }
 
@@ -32089,7 +32127,8 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
   auto SSID = AI->getSyncScopeID();
   // We must restrict the ordering to avoid generating loads with Release or
   // ReleaseAcquire orderings.
-  auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
+  auto Order =
+      AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
 
   // Before the load we need a fence. Here is an example lifted from
   // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
@@ -32158,31 +32197,28 @@ static SDValue emitLockedStackOp(SelectionDAG &DAG,
 
   if (Subtarget.is64Bit()) {
     SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
-    SDValue Ops[] = {
-      DAG.getRegister(X86::RSP, MVT::i64),                  // Base
-      DAG.getTargetConstant(1, DL, MVT::i8),                // Scale
-      DAG.getRegister(0, MVT::i64),                         // Index
-      DAG.getTargetConstant(SPOffset, DL, MVT::i32),        // Disp
-      DAG.getRegister(0, MVT::i16),                         // Segment.
-      Zero,
-      Chain};
-    SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
-                                     MVT::Other, Ops);
+    SDValue Ops[] = {DAG.getRegister(X86::RSP, MVT::i64),           // Base
+                     DAG.getTargetConstant(1, DL, MVT::i8),         // Scale
+                     DAG.getRegister(0, MVT::i64),                  // Index
+                     DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
+                     DAG.getRegister(0, MVT::i16),                  // Segment.
+                     Zero,
+                     Chain};
+    SDNode *Res =
+        DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32, MVT::Other, Ops);
     return SDValue(Res, 1);
   }
 
   SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
-  SDValue Ops[] = {
-    DAG.getRegister(X86::ESP, MVT::i32),            // Base
-    DAG.getTargetConstant(1, DL, MVT::i8),          // Scale
-    DAG.getRegister(0, MVT::i32),                   // Index
-    DAG.getTargetConstant(SPOffset, DL, MVT::i32),  // Disp
-    DAG.getRegister(0, MVT::i16),                   // Segment.
-    Zero,
-    Chain
-  };
-  SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
-                                   MVT::Other, Ops);
+  SDValue Ops[] = {DAG.getRegister(X86::ESP, MVT::i32),           // Base
+                   DAG.getTargetConstant(1, DL, MVT::i8),         // Scale
+                   DAG.getRegister(0, MVT::i32),                  // Index
+                   DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
+                   DAG.getRegister(0, MVT::i16),                  // Segment.
+                   Zero,
+                   Chain};
+  SDNode *Res =
+      DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32, MVT::Other, Ops);
   return SDValue(Res, 1);
 }
 
@@ -32215,36 +32251,44 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
   SDLoc DL(Op);
   unsigned Reg = 0;
   unsigned size = 0;
-  switch(T.SimpleTy) {
-  default: llvm_unreachable("Invalid value type!");
-  case MVT::i8:  Reg = X86::AL;  size = 1; break;
-  case MVT::i16: Reg = X86::AX;  size = 2; break;
-  case MVT::i32: Reg = X86::EAX; size = 4; break;
+  switch (T.SimpleTy) {
+  default:
+    llvm_unreachable("Invalid value type!");
+  case MVT::i8:
+    Reg = X86::AL;
+    size = 1;
+    break;
+  case MVT::i16:
+    Reg = X86::AX;
+    size = 2;
+    break;
+  case MVT::i32:
+    Reg = X86::EAX;
+    size = 4;
+    break;
   case MVT::i64:
     assert(Subtarget.is64Bit() && "Node not type legal!");
-    Reg = X86::RAX; size = 8;
+    Reg = X86::RAX;
+    size = 8;
     break;
   }
-  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
-                                  Op.getOperand(2), SDValue());
-  SDValue Ops[] = { cpIn.getValue(0),
-                    Op.getOperand(1),
-                    Op.getOperand(3),
-                    DAG.getTargetConstant(size, DL, MVT::i8),
-                    cpIn.getValue(1) };
+  SDValue cpIn =
+      DAG.getCopyToReg(Op.getOperand(0), DL, Reg, Op.getOperand(2), SDValue());
+  SDValue Ops[] = {cpIn.getValue(0), Op.getOperand(1), Op.getOperand(3),
+                   DAG.getTargetConstant(size, DL, MVT::i8), cpIn.getValue(1)};
   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
   MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
-  SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
-                                           Ops, T, MMO);
+  SDValue Result =
+      DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, Ops, T, MMO);
 
   SDValue cpOut =
-    DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
+      DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
   SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
                                       MVT::i32, cpOut.getValue(2));
   SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
 
-  return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
-                     cpOut, Success, EFLAGS.getValue(1));
+  return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), cpOut, Success,
+                     EFLAGS.getValue(1));
 }
 
 // Create MOVMSKB, taking into account whether we need to split for AVX1.
@@ -32306,7 +32350,8 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
   }
 
   assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
-          SrcVT == MVT::i64) && "Unexpected VT!");
+          SrcVT == MVT::i64) &&
+         "Unexpected VT!");
 
   assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
   if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
@@ -32320,8 +32365,7 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
     // Example: from MVT::v2i32 to MVT::v4i32.
     MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
                                  SrcVT.getVectorNumElements() * 2);
-    Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
-                      DAG.getUNDEF(SrcVT));
+    Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src, DAG.getUNDEF(SrcVT));
   } else {
     assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
            "Unexpected source type in LowerBITCAST");
@@ -32467,7 +32511,8 @@ static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL,
   if (Subtarget.hasVPOPCNTDQ()) {
     unsigned NumElems = VT.getVectorNumElements();
     assert((VT.getVectorElementType() == MVT::i8 ||
-            VT.getVectorElementType() == MVT::i16) && "Unexpected type");
+            VT.getVectorElementType() == MVT::i16) &&
+           "Unexpected type");
     if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
       MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
       Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
@@ -32865,16 +32910,16 @@ static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
     SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
     assert(!N->hasAnyUseOfValue(0));
     // NOTE: The getUNDEF is needed to give something for the unused result 0.
-    return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
-                       DAG.getUNDEF(VT), NewChain);
+    return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), DAG.getUNDEF(VT),
+                       NewChain);
   }
 
   SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
   // RAUW the chain, but don't worry about the result, as it's unused.
   assert(!N->hasAnyUseOfValue(0));
   // NOTE: The getUNDEF is needed to give something for the unused result 0.
-  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
-                     DAG.getUNDEF(VT), LockOp.getValue(1));
+  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), DAG.getUNDEF(VT),
+                     LockOp.getValue(1));
 }
 
 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
@@ -32974,17 +33019,17 @@ static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
   // Set the carry flag.
   SDValue Carry = Op.getOperand(2);
   EVT CarryVT = Carry.getValueType();
-  Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
-                      Carry, DAG.getAllOnesConstant(DL, CarryVT));
+  Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), Carry,
+                      DAG.getAllOnesConstant(DL, CarryVT));
 
   bool IsAdd = Opc == ISD::UADDO_CARRY || Opc == ISD::SADDO_CARRY;
-  SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
-                            Op.getOperand(0), Op.getOperand(1),
-                            Carry.getValue(1));
+  SDValue Sum =
+      DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs, Op.getOperand(0),
+                  Op.getOperand(1), Carry.getValue(1));
 
   bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
-  SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
-                           Sum.getValue(1), DL, DAG);
+  SDValue SetCC =
+      getSETCC(IsSigned ? X86::COND_O : X86::COND_B, Sum.getValue(1), DL, DAG);
   if (N->getValueType(1) == MVT::i1)
     SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
 
@@ -33136,8 +33181,8 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
   if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
       !Index.getSimpleValueType().is512BitVector()) {
     // Determine how much we need to widen by to get a 512-bit type.
-    unsigned Factor = std::min(512/VT.getSizeInBits(),
-                               512/IndexVT.getSizeInBits());
+    unsigned Factor =
+        std::min(512 / VT.getSizeInBits(), 512 / IndexVT.getSizeInBits());
     unsigned NumElts = VT.getVectorNumElements() * Factor;
 
     VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
@@ -33179,7 +33224,7 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
         N->isExpandingLoad());
     // Emit a blend.
     SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
-    return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
+    return DAG.getMergeValues({Select, NewLoad.getValue(1)}, dl);
   }
 
   assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
@@ -33191,10 +33236,10 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
          "Cannot lower masked load op.");
 
-  assert((ScalarVT.getSizeInBits() >= 32 ||
-          (Subtarget.hasBWI() &&
-              (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
-         "Unsupported masked load op.");
+  assert(
+      (ScalarVT.getSizeInBits() >= 32 ||
+       (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
+      "Unsupported masked load op.");
 
   // This operation is legal for targets with VLX, but without
   // VLX the vector should be widened to 512 bit
@@ -33239,14 +33284,14 @@ static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
          "Cannot lower masked store op.");
 
-  assert((ScalarVT.getSizeInBits() >= 32 ||
-          (Subtarget.hasBWI() &&
-              (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
-          "Unsupported masked store op.");
+  assert(
+      (ScalarVT.getSizeInBits() >= 32 ||
+       (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
+      "Unsupported masked store op.");
 
   // This operation is legal for targets with VLX, but without
   // VLX the vector should be widened to 512 bit
-  unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
+  unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
   MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
 
   // Mask element has to be i1.
@@ -33288,8 +33333,8 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
   if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
       !IndexVT.is512BitVector()) {
     // Determine how much we need to widen by to get a 512-bit type.
-    unsigned Factor = std::min(512/VT.getSizeInBits(),
-                               512/IndexVT.getSizeInBits());
+    unsigned Factor =
+        std::min(512 / VT.getSizeInBits(), 512 / IndexVT.getSizeInBits());
 
     unsigned NumElts = VT.getVectorNumElements() * Factor;
 
@@ -33306,8 +33351,8 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
   if (PassThru.isUndef())
     PassThru = getZeroVector(VT, Subtarget, DAG, dl);
 
-  SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
-                    N->getScale() };
+  SDValue Ops[] = {N->getChain(),   PassThru, Mask,
+                   N->getBasePtr(), Index,    N->getScale()};
   SDValue NewGather = DAG.getMemIntrinsicNode(
       X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
       N->getMemOperand());
@@ -33505,7 +33550,7 @@ SDValue X86TargetLowering::visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL,
 /// Provide custom lowering hooks for some operations.
 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
-  // clang-format off
+    // clang-format off
   default: llvm_unreachable("Should not custom lower this!");
   case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
@@ -33661,14 +33706,14 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::ADDRSPACECAST:      return LowerADDRSPACECAST(Op, DAG);
   case X86ISD::CVTPS2PH:        return LowerCVTPS2PH(Op, DAG);
   case ISD::PREFETCH:           return LowerPREFETCH(Op, Subtarget, DAG);
-  // clang-format on
+    // clang-format on
   }
 }
 
 /// Replace a node with an illegal result type with a new node built out of
 /// custom code.
 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
-                                           SmallVectorImpl<SDValue>&Results,
+                                           SmallVectorImpl<SDValue> &Results,
                                            SelectionDAG &DAG) const {
   SDLoc dl(N);
   unsigned Opc = N->getOpcode();
@@ -33794,8 +33839,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
 
     // Widen the result by padding with undef.
-    Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
-                      DAG.getUNDEF(VT));
+    Res =
+        DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res, DAG.getUNDEF(VT));
     Results.push_back(Res);
     Results.push_back(Ovf);
     return;
@@ -33812,11 +33857,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
            "Unexpected type action!");
     unsigned NumConcat = 128 / InVT.getSizeInBits();
 
-    EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
-                                    InVT.getVectorElementType(),
-                                    NumConcat * InVT.getVectorNumElements());
-    EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
-                                  VT.getVectorElementType(),
+    EVT InWideVT =
+        EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(),
+                         NumConcat * InVT.getVectorNumElements());
+    EVT WideVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
                                   NumConcat * VT.getVectorNumElements());
 
     SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
@@ -33880,7 +33924,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
       return;
     }
 
-    SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
+    SDValue V = LowerWin64_i128OP(SDValue(N, 0), DAG);
     Results.push_back(V);
     return;
   }
@@ -33958,9 +34002,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
 
       Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
       Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
-      SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
-                                         { 0,  1,  2,  3, 16, 17, 18, 19,
-                                          -1, -1, -1, -1, -1, -1, -1, -1 });
+      SDValue Res = DAG.getVectorShuffle(
+          MVT::v16i8, dl, Lo, Hi,
+          {0, 1, 2, 3, 16, 17, 18, 19, -1, -1, -1, -1, -1, -1, -1, -1});
       Results.push_back(Res);
       return;
     }
@@ -33992,7 +34036,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     SDValue In = N->getOperand(0);
     EVT InVT = In.getValueType();
     if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
-        (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
+        (InVT == MVT::v4i16 || InVT == MVT::v4i8)) {
       assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
              "Unexpected type action!");
       assert(Opc == ISD::SIGN_EXTEND && "Unexpected opcode");
@@ -34008,11 +34052,11 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
 
       // Create an unpackl and unpackh to interleave the sign bits then bitcast
       // to v2i64.
-      SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
-                                        {0, 4, 1, 5});
+      SDValue Lo =
+          DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits, {0, 4, 1, 5});
       Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
-      SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
-                                        {2, 6, 3, 7});
+      SDValue Hi =
+          DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits, {2, 6, 3, 7});
       Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
 
       SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
@@ -34199,7 +34243,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
       return;
     }
 
-
     if (VT == MVT::v2i32) {
       assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
              "Strict unsigned conversion requires AVX512");
@@ -34284,9 +34327,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
       }
 
       SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
-      SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
-                                DAG.getConstantFP(0.0, dl, VecInVT), Src,
-                                ZeroIdx);
+      SDValue Res =
+          DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
+                      DAG.getConstantFP(0.0, dl, VecInVT), Src, ZeroIdx);
       SDValue Chain;
       if (IsStrict) {
         SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
@@ -34373,8 +34416,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     EVT SrcVT = Src.getValueType();
     if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
       if (IsStrict) {
-        unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
-                                : X86ISD::STRICT_CVTUI2P;
+        unsigned Opc =
+            IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
         SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
                                   {N->getOperand(0), Src});
         Results.push_back(Res);
@@ -34388,7 +34431,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
         Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
       SDValue Zero = DAG.getConstant(0, dl, SrcVT);
-      SDValue One  = DAG.getConstant(1, dl, SrcVT);
+      SDValue One = DAG.getConstant(1, dl, SrcVT);
       SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
                                  DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
                                  DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
@@ -34454,9 +34497,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     if (IsStrict) {
       SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
                                 {N->getOperand(0), Or, VBias});
-      SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
-                                {MVT::v4f32, MVT::Other},
-                                {Sub.getValue(1), Sub});
+      SDValue Res =
+          DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {MVT::v4f32, MVT::Other},
+                      {Sub.getValue(1), Sub});
       Results.push_back(Res);
       Results.push_back(Res.getValue(1));
     } else {
@@ -34537,8 +34580,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
   case ISD::INTRINSIC_W_CHAIN: {
     unsigned IntNo = N->getConstantOperandVal(1);
     switch (IntNo) {
-    default : llvm_unreachable("Do not know how to custom type "
-                               "legalize this intrinsic operation!");
+    default:
+      llvm_unreachable("Do not know how to custom type "
+                       "legalize this intrinsic operation!");
     case Intrinsic::x86_rdtsc:
       return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
                                      Results);
@@ -34551,7 +34595,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
       return;
     case Intrinsic::x86_rdpru:
       expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
-        Results);
+                                  Results);
       return;
     case Intrinsic::x86_xgetbv:
       expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
@@ -34608,12 +34652,12 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     }
 
     SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
-                                        Regs64bit ? X86::RAX : X86::EAX,
-                                        HalfT, Result.getValue(1));
+                                        Regs64bit ? X86::RAX : X86::EAX, HalfT,
+                                        Result.getValue(1));
     SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
-                                        Regs64bit ? X86::RDX : X86::EDX,
-                                        HalfT, cpOutL.getValue(2));
-    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
+                                        Regs64bit ? X86::RDX : X86::EDX, HalfT,
+                                        cpOutL.getValue(2));
+    SDValue OpsF[] = {cpOutL.getValue(0), cpOutH.getValue(0)};
 
     SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
                                         MVT::i32, cpOutH.getValue(2));
@@ -34655,7 +34699,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
         // Then extract the lower 64-bits.
         MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
         SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
-        SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
+        SDValue Ops[] = {Node->getChain(), Node->getBasePtr()};
         SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
                                              MVT::i64, Node->getMemOperand());
         if (Subtarget.hasSSE2()) {
@@ -34679,10 +34723,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
         // First load this into an 80-bit X87 register. This will put the whole
         // integer into the significand.
         SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
-        SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
-        SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
-                                                 dl, Tys, Ops, MVT::i64,
-                                                 Node->getMemOperand());
+        SDValue Ops[] = {Node->getChain(), Node->getBasePtr()};
+        SDValue Result = DAG.getMemIntrinsicNode(
+            X86ISD::FILD, dl, Tys, Ops, MVT::i64, Node->getMemOperand());
         SDValue Chain = Result.getValue(1);
 
         // Now store the X87 register to a stack temporary and convert to i64.
@@ -34693,7 +34736,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
         int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
         MachinePointerInfo MPI =
             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
-        SDValue StoreOps[] = { Chain, Result, StackPtr };
+        SDValue StoreOps[] = {Chain, Result, StackPtr};
         Chain = DAG.getMemIntrinsicNode(
             X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
             MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
@@ -34751,8 +34794,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
       assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
              "Unexpected type action!");
       EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
-      SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
-                                N->getOperand(0));
+      SDValue Res =
+          DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, N->getOperand(0));
       Res = DAG.getBitcast(WideVT, Res);
       Results.push_back(Res);
       return;
@@ -34774,8 +34817,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
       SDValue Mask = Gather->getMask();
       assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
       SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
-                                     Gather->getPassThru(),
-                                     DAG.getUNDEF(VT));
+                                     Gather->getPassThru(), DAG.getUNDEF(VT));
       if (!Subtarget.hasVLX()) {
         // We need to widen the mask, but the instruction will only use 2
         // of its elements. So we can use undef.
@@ -34783,8 +34825,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
                            DAG.getUNDEF(MVT::v2i1));
         Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
       }
-      SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
-                        Gather->getBasePtr(), Index, Gather->getScale() };
+      SDValue Ops[] = {Gather->getChain(),   PassThru, Mask,
+                       Gather->getBasePtr(), Index,    Gather->getScale()};
       SDValue Res = DAG.getMemIntrinsicNode(
           X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
           Gather->getMemoryVT(), Gather->getMemOperand());
@@ -34829,7 +34871,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     return;
   }
   case ISD::ADDRSPACECAST: {
-    SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
+    SDValue V = LowerADDRSPACECAST(SDValue(N, 0), DAG);
     Results.push_back(V);
     return;
   }
@@ -34860,471 +34902,474 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
 
 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch ((X86ISD::NodeType)Opcode) {
-  case X86ISD::FIRST_NUMBER:       break;
-#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
-  NODE_NAME_CASE(BSF)
-  NODE_NAME_CASE(BSR)
-  NODE_NAME_CASE(FSHL)
-  NODE_NAME_CASE(FSHR)
-  NODE_NAME_CASE(FAND)
-  NODE_NAME_CASE(FANDN)
-  NODE_NAME_CASE(FOR)
-  NODE_NAME_CASE(FXOR)
-  NODE_NAME_CASE(FILD)
-  NODE_NAME_CASE(FIST)
-  NODE_NAME_CASE(FP_TO_INT_IN_MEM)
-  NODE_NAME_CASE(FLD)
-  NODE_NAME_CASE(FST)
-  NODE_NAME_CASE(CALL)
-  NODE_NAME_CASE(CALL_RVMARKER)
-  NODE_NAME_CASE(IMP_CALL)
-  NODE_NAME_CASE(BT)
-  NODE_NAME_CASE(CMP)
-  NODE_NAME_CASE(FCMP)
-  NODE_NAME_CASE(STRICT_FCMP)
-  NODE_NAME_CASE(STRICT_FCMPS)
-  NODE_NAME_CASE(COMI)
-  NODE_NAME_CASE(UCOMI)
-  NODE_NAME_CASE(COMX)
-  NODE_NAME_CASE(UCOMX)
-  NODE_NAME_CASE(CMPM)
-  NODE_NAME_CASE(CMPMM)
-  NODE_NAME_CASE(STRICT_CMPM)
-  NODE_NAME_CASE(CMPMM_SAE)
-  NODE_NAME_CASE(SETCC)
-  NODE_NAME_CASE(SETCC_CARRY)
-  NODE_NAME_CASE(FSETCC)
-  NODE_NAME_CASE(FSETCCM)
-  NODE_NAME_CASE(FSETCCM_SAE)
-  NODE_NAME_CASE(CMOV)
-  NODE_NAME_CASE(BRCOND)
-  NODE_NAME_CASE(RET_GLUE)
-  NODE_NAME_CASE(IRET)
-  NODE_NAME_CASE(REP_STOS)
-  NODE_NAME_CASE(REP_MOVS)
-  NODE_NAME_CASE(GlobalBaseReg)
-  NODE_NAME_CASE(Wrapper)
-  NODE_NAME_CASE(WrapperRIP)
-  NODE_NAME_CASE(MOVQ2DQ)
-  NODE_NAME_CASE(MOVDQ2Q)
-  NODE_NAME_CASE(MMX_MOVD2W)
-  NODE_NAME_CASE(MMX_MOVW2D)
-  NODE_NAME_CASE(PEXTRB)
-  NODE_NAME_CASE(PEXTRW)
-  NODE_NAME_CASE(INSERTPS)
-  NODE_NAME_CASE(PINSRB)
-  NODE_NAME_CASE(PINSRW)
-  NODE_NAME_CASE(PSHUFB)
-  NODE_NAME_CASE(ANDNP)
-  NODE_NAME_CASE(BLENDI)
-  NODE_NAME_CASE(BLENDV)
-  NODE_NAME_CASE(HADD)
-  NODE_NAME_CASE(HSUB)
-  NODE_NAME_CASE(FHADD)
-  NODE_NAME_CASE(FHSUB)
-  NODE_NAME_CASE(CONFLICT)
-  NODE_NAME_CASE(FMAX)
-  NODE_NAME_CASE(FMAXS)
-  NODE_NAME_CASE(FMAX_SAE)
-  NODE_NAME_CASE(FMAXS_SAE)
-  NODE_NAME_CASE(STRICT_FMAX)
-  NODE_NAME_CASE(FMIN)
-  NODE_NAME_CASE(FMINS)
-  NODE_NAME_CASE(FMIN_SAE)
-  NODE_NAME_CASE(FMINS_SAE)
-  NODE_NAME_CASE(STRICT_FMIN)
-  NODE_NAME_CASE(FMAXC)
-  NODE_NAME_CASE(FMINC)
-  NODE_NAME_CASE(FRSQRT)
-  NODE_NAME_CASE(FRCP)
-  NODE_NAME_CASE(EXTRQI)
-  NODE_NAME_CASE(INSERTQI)
-  NODE_NAME_CASE(TLSADDR)
-  NODE_NAME_CASE(TLSBASEADDR)
-  NODE_NAME_CASE(TLSCALL)
-  NODE_NAME_CASE(TLSDESC)
-  NODE_NAME_CASE(EH_SJLJ_SETJMP)
-  NODE_NAME_CASE(EH_SJLJ_LONGJMP)
-  NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
-  NODE_NAME_CASE(EH_RETURN)
-  NODE_NAME_CASE(TC_RETURN)
-  NODE_NAME_CASE(FNSTCW16m)
-  NODE_NAME_CASE(FLDCW16m)
-  NODE_NAME_CASE(FNSTENVm)
-  NODE_NAME_CASE(FLDENVm)
-  NODE_NAME_CASE(LCMPXCHG_DAG)
-  NODE_NAME_CASE(LCMPXCHG8_DAG)
-  NODE_NAME_CASE(LCMPXCHG16_DAG)
-  NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
-  NODE_NAME_CASE(LADD)
-  NODE_NAME_CASE(LSUB)
-  NODE_NAME_CASE(LOR)
-  NODE_NAME_CASE(LXOR)
-  NODE_NAME_CASE(LAND)
-  NODE_NAME_CASE(LBTS)
-  NODE_NAME_CASE(LBTC)
-  NODE_NAME_CASE(LBTR)
-  NODE_NAME_CASE(LBTS_RM)
-  NODE_NAME_CASE(LBTC_RM)
-  NODE_NAME_CASE(LBTR_RM)
-  NODE_NAME_CASE(AADD)
-  NODE_NAME_CASE(AOR)
-  NODE_NAME_CASE(AXOR)
-  NODE_NAME_CASE(AAND)
-  NODE_NAME_CASE(VZEXT_MOVL)
-  NODE_NAME_CASE(VZEXT_LOAD)
-  NODE_NAME_CASE(VEXTRACT_STORE)
-  NODE_NAME_CASE(VTRUNC)
-  NODE_NAME_CASE(VTRUNCS)
-  NODE_NAME_CASE(VTRUNCUS)
-  NODE_NAME_CASE(VMTRUNC)
-  NODE_NAME_CASE(VMTRUNCS)
-  NODE_NAME_CASE(VMTRUNCUS)
-  NODE_NAME_CASE(VTRUNCSTORES)
-  NODE_NAME_CASE(VTRUNCSTOREUS)
-  NODE_NAME_CASE(VMTRUNCSTORES)
-  NODE_NAME_CASE(VMTRUNCSTOREUS)
-  NODE_NAME_CASE(VFPEXT)
-  NODE_NAME_CASE(STRICT_VFPEXT)
-  NODE_NAME_CASE(VFPEXT_SAE)
-  NODE_NAME_CASE(VFPEXTS)
-  NODE_NAME_CASE(VFPEXTS_SAE)
-  NODE_NAME_CASE(VFPROUND)
-  NODE_NAME_CASE(VFPROUND2)
-  NODE_NAME_CASE(VFPROUND2_RND)
-  NODE_NAME_CASE(STRICT_VFPROUND)
-  NODE_NAME_CASE(VMFPROUND)
-  NODE_NAME_CASE(VFPROUND_RND)
-  NODE_NAME_CASE(VFPROUNDS)
-  NODE_NAME_CASE(VFPROUNDS_RND)
-  NODE_NAME_CASE(VSHLDQ)
-  NODE_NAME_CASE(VSRLDQ)
-  NODE_NAME_CASE(VSHL)
-  NODE_NAME_CASE(VSRL)
-  NODE_NAME_CASE(VSRA)
-  NODE_NAME_CASE(VSHLI)
-  NODE_NAME_CASE(VSRLI)
-  NODE_NAME_CASE(VSRAI)
-  NODE_NAME_CASE(VSHLV)
-  NODE_NAME_CASE(VSRLV)
-  NODE_NAME_CASE(VSRAV)
-  NODE_NAME_CASE(VROTLI)
-  NODE_NAME_CASE(VROTRI)
-  NODE_NAME_CASE(VPPERM)
-  NODE_NAME_CASE(CMPP)
-  NODE_NAME_CASE(STRICT_CMPP)
-  NODE_NAME_CASE(PCMPEQ)
-  NODE_NAME_CASE(PCMPGT)
-  NODE_NAME_CASE(PHMINPOS)
-  NODE_NAME_CASE(ADD)
-  NODE_NAME_CASE(SUB)
-  NODE_NAME_CASE(ADC)
-  NODE_NAME_CASE(SBB)
-  NODE_NAME_CASE(SMUL)
-  NODE_NAME_CASE(UMUL)
-  NODE_NAME_CASE(OR)
-  NODE_NAME_CASE(XOR)
-  NODE_NAME_CASE(AND)
-  NODE_NAME_CASE(BEXTR)
-  NODE_NAME_CASE(BEXTRI)
-  NODE_NAME_CASE(BZHI)
-  NODE_NAME_CASE(PDEP)
-  NODE_NAME_CASE(PEXT)
-  NODE_NAME_CASE(MUL_IMM)
-  NODE_NAME_CASE(MOVMSK)
-  NODE_NAME_CASE(PTEST)
-  NODE_NAME_CASE(TESTP)
-  NODE_NAME_CASE(KORTEST)
-  NODE_NAME_CASE(KTEST)
-  NODE_NAME_CASE(KADD)
-  NODE_NAME_CASE(KSHIFTL)
-  NODE_NAME_CASE(KSHIFTR)
-  NODE_NAME_CASE(PACKSS)
-  NODE_NAME_CASE(PACKUS)
-  NODE_NAME_CASE(PALIGNR)
-  NODE_NAME_CASE(VALIGN)
-  NODE_NAME_CASE(VSHLD)
-  NODE_NAME_CASE(VSHRD)
-  NODE_NAME_CASE(VSHLDV)
-  NODE_NAME_CASE(VSHRDV)
-  NODE_NAME_CASE(PSHUFD)
-  NODE_NAME_CASE(PSHUFHW)
-  NODE_NAME_CASE(PSHUFLW)
-  NODE_NAME_CASE(SHUFP)
-  NODE_NAME_CASE(SHUF128)
-  NODE_NAME_CASE(MOVLHPS)
-  NODE_NAME_CASE(MOVHLPS)
-  NODE_NAME_CASE(MOVDDUP)
-  NODE_NAME_CASE(MOVSHDUP)
-  NODE_NAME_CASE(MOVSLDUP)
-  NODE_NAME_CASE(MOVSD)
-  NODE_NAME_CASE(MOVSS)
-  NODE_NAME_CASE(MOVSH)
-  NODE_NAME_CASE(UNPCKL)
-  NODE_NAME_CASE(UNPCKH)
-  NODE_NAME_CASE(VBROADCAST)
-  NODE_NAME_CASE(VBROADCAST_LOAD)
-  NODE_NAME_CASE(VBROADCASTM)
-  NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
-  NODE_NAME_CASE(VPERMILPV)
-  NODE_NAME_CASE(VPERMILPI)
-  NODE_NAME_CASE(VPERM2X128)
-  NODE_NAME_CASE(VPERMV)
-  NODE_NAME_CASE(VPERMV3)
-  NODE_NAME_CASE(VPERMI)
-  NODE_NAME_CASE(VPTERNLOG)
-  NODE_NAME_CASE(FP_TO_SINT_SAT)
-  NODE_NAME_CASE(FP_TO_UINT_SAT)
-  NODE_NAME_CASE(VFIXUPIMM)
-  NODE_NAME_CASE(VFIXUPIMM_SAE)
-  NODE_NAME_CASE(VFIXUPIMMS)
-  NODE_NAME_CASE(VFIXUPIMMS_SAE)
-  NODE_NAME_CASE(VRANGE)
-  NODE_NAME_CASE(VRANGE_SAE)
-  NODE_NAME_CASE(VRANGES)
-  NODE_NAME_CASE(VRANGES_SAE)
-  NODE_NAME_CASE(PMULUDQ)
-  NODE_NAME_CASE(PMULDQ)
-  NODE_NAME_CASE(PSADBW)
-  NODE_NAME_CASE(DBPSADBW)
-  NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
-  NODE_NAME_CASE(VAARG_64)
-  NODE_NAME_CASE(VAARG_X32)
-  NODE_NAME_CASE(DYN_ALLOCA)
-  NODE_NAME_CASE(MFENCE)
-  NODE_NAME_CASE(SEG_ALLOCA)
-  NODE_NAME_CASE(PROBED_ALLOCA)
-  NODE_NAME_CASE(RDRAND)
-  NODE_NAME_CASE(RDSEED)
-  NODE_NAME_CASE(RDPKRU)
-  NODE_NAME_CASE(WRPKRU)
-  NODE_NAME_CASE(VPMADDUBSW)
-  NODE_NAME_CASE(VPMADDWD)
-  NODE_NAME_CASE(VPSHA)
-  NODE_NAME_CASE(VPSHL)
-  NODE_NAME_CASE(VPCOM)
-  NODE_NAME_CASE(VPCOMU)
-  NODE_NAME_CASE(VPERMIL2)
-  NODE_NAME_CASE(FMSUB)
-  NODE_NAME_CASE(STRICT_FMSUB)
-  NODE_NAME_CASE(FNMADD)
-  NODE_NAME_CASE(STRICT_FNMADD)
-  NODE_NAME_CASE(FNMSUB)
-  NODE_NAME_CASE(STRICT_FNMSUB)
-  NODE_NAME_CASE(FMADDSUB)
-  NODE_NAME_CASE(FMSUBADD)
-  NODE_NAME_CASE(FMADD_RND)
-  NODE_NAME_CASE(FNMADD_RND)
-  NODE_NAME_CASE(FMSUB_RND)
-  NODE_NAME_CASE(FNMSUB_RND)
-  NODE_NAME_CASE(FMADDSUB_RND)
-  NODE_NAME_CASE(FMSUBADD_RND)
-  NODE_NAME_CASE(VFMADDC)
-  NODE_NAME_CASE(VFMADDC_RND)
-  NODE_NAME_CASE(VFCMADDC)
-  NODE_NAME_CASE(VFCMADDC_RND)
-  NODE_NAME_CASE(VFMULC)
-  NODE_NAME_CASE(VFMULC_RND)
-  NODE_NAME_CASE(VFCMULC)
-  NODE_NAME_CASE(VFCMULC_RND)
-  NODE_NAME_CASE(VFMULCSH)
-  NODE_NAME_CASE(VFMULCSH_RND)
-  NODE_NAME_CASE(VFCMULCSH)
-  NODE_NAME_CASE(VFCMULCSH_RND)
-  NODE_NAME_CASE(VFMADDCSH)
-  NODE_NAME_CASE(VFMADDCSH_RND)
-  NODE_NAME_CASE(VFCMADDCSH)
-  NODE_NAME_CASE(VFCMADDCSH_RND)
-  NODE_NAME_CASE(VPMADD52H)
-  NODE_NAME_CASE(VPMADD52L)
-  NODE_NAME_CASE(VRNDSCALE)
-  NODE_NAME_CASE(STRICT_VRNDSCALE)
-  NODE_NAME_CASE(VRNDSCALE_SAE)
-  NODE_NAME_CASE(VRNDSCALES)
-  NODE_NAME_CASE(VRNDSCALES_SAE)
-  NODE_NAME_CASE(VREDUCE)
-  NODE_NAME_CASE(VREDUCE_SAE)
-  NODE_NAME_CASE(VREDUCES)
-  NODE_NAME_CASE(VREDUCES_SAE)
-  NODE_NAME_CASE(VGETMANT)
-  NODE_NAME_CASE(VGETMANT_SAE)
-  NODE_NAME_CASE(VGETMANTS)
-  NODE_NAME_CASE(VGETMANTS_SAE)
-  NODE_NAME_CASE(PCMPESTR)
-  NODE_NAME_CASE(PCMPISTR)
-  NODE_NAME_CASE(XTEST)
-  NODE_NAME_CASE(COMPRESS)
-  NODE_NAME_CASE(EXPAND)
-  NODE_NAME_CASE(SELECTS)
-  NODE_NAME_CASE(ADDSUB)
-  NODE_NAME_CASE(RCP14)
-  NODE_NAME_CASE(RCP14S)
-  NODE_NAME_CASE(RSQRT14)
-  NODE_NAME_CASE(RSQRT14S)
-  NODE_NAME_CASE(FADD_RND)
-  NODE_NAME_CASE(FADDS)
-  NODE_NAME_CASE(FADDS_RND)
-  NODE_NAME_CASE(FSUB_RND)
-  NODE_NAME_CASE(FSUBS)
-  NODE_NAME_CASE(FSUBS_RND)
-  NODE_NAME_CASE(FMUL_RND)
-  NODE_NAME_CASE(FMULS)
-  NODE_NAME_CASE(FMULS_RND)
-  NODE_NAME_CASE(FDIV_RND)
-  NODE_NAME_CASE(FDIVS)
-  NODE_NAME_CASE(FDIVS_RND)
-  NODE_NAME_CASE(FSQRT_RND)
-  NODE_NAME_CASE(FSQRTS)
-  NODE_NAME_CASE(FSQRTS_RND)
-  NODE_NAME_CASE(FGETEXP)
-  NODE_NAME_CASE(FGETEXP_SAE)
-  NODE_NAME_CASE(FGETEXPS)
-  NODE_NAME_CASE(FGETEXPS_SAE)
-  NODE_NAME_CASE(SCALEF)
-  NODE_NAME_CASE(SCALEF_RND)
-  NODE_NAME_CASE(SCALEFS)
-  NODE_NAME_CASE(SCALEFS_RND)
-  NODE_NAME_CASE(MULHRS)
-  NODE_NAME_CASE(SINT_TO_FP_RND)
-  NODE_NAME_CASE(UINT_TO_FP_RND)
-  NODE_NAME_CASE(CVTTP2SI)
-  NODE_NAME_CASE(CVTTP2UI)
-  NODE_NAME_CASE(STRICT_CVTTP2SI)
-  NODE_NAME_CASE(STRICT_CVTTP2UI)
-  NODE_NAME_CASE(MCVTTP2SI)
-  NODE_NAME_CASE(MCVTTP2UI)
-  NODE_NAME_CASE(CVTTP2SI_SAE)
-  NODE_NAME_CASE(CVTTP2UI_SAE)
-  NODE_NAME_CASE(CVTTS2SI)
-  NODE_NAME_CASE(CVTTS2UI)
-  NODE_NAME_CASE(CVTTS2SI_SAE)
-  NODE_NAME_CASE(CVTTS2UI_SAE)
-  NODE_NAME_CASE(CVTSI2P)
-  NODE_NAME_CASE(CVTUI2P)
-  NODE_NAME_CASE(STRICT_CVTSI2P)
-  NODE_NAME_CASE(STRICT_CVTUI2P)
-  NODE_NAME_CASE(MCVTSI2P)
-  NODE_NAME_CASE(MCVTUI2P)
-  NODE_NAME_CASE(VFPCLASS)
-  NODE_NAME_CASE(VFPCLASSS)
-  NODE_NAME_CASE(MULTISHIFT)
-  NODE_NAME_CASE(SCALAR_SINT_TO_FP)
-  NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
-  NODE_NAME_CASE(SCALAR_UINT_TO_FP)
-  NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
-  NODE_NAME_CASE(CVTPS2PH)
-  NODE_NAME_CASE(STRICT_CVTPS2PH)
-  NODE_NAME_CASE(CVTPS2PH_SAE)
-  NODE_NAME_CASE(MCVTPS2PH)
-  NODE_NAME_CASE(MCVTPS2PH_SAE)
-  NODE_NAME_CASE(CVTPH2PS)
-  NODE_NAME_CASE(STRICT_CVTPH2PS)
-  NODE_NAME_CASE(CVTPH2PS_SAE)
-  NODE_NAME_CASE(CVTP2SI)
-  NODE_NAME_CASE(CVTP2UI)
-  NODE_NAME_CASE(MCVTP2SI)
-  NODE_NAME_CASE(MCVTP2UI)
-  NODE_NAME_CASE(CVTP2SI_RND)
-  NODE_NAME_CASE(CVTP2UI_RND)
-  NODE_NAME_CASE(CVTS2SI)
-  NODE_NAME_CASE(CVTS2UI)
-  NODE_NAME_CASE(CVTS2SI_RND)
-  NODE_NAME_CASE(CVTS2UI_RND)
-  NODE_NAME_CASE(CVTNEPS2BF16)
-  NODE_NAME_CASE(MCVTNEPS2BF16)
-  NODE_NAME_CASE(DPBF16PS)
-  NODE_NAME_CASE(DPFP16PS)
-  NODE_NAME_CASE(MPSADBW)
-  NODE_NAME_CASE(LWPINS)
-  NODE_NAME_CASE(MGATHER)
-  NODE_NAME_CASE(MSCATTER)
-  NODE_NAME_CASE(VPDPBUSD)
-  NODE_NAME_CASE(VPDPBUSDS)
-  NODE_NAME_CASE(VPDPWSSD)
-  NODE_NAME_CASE(VPDPWSSDS)
-  NODE_NAME_CASE(VPSHUFBITQMB)
-  NODE_NAME_CASE(GF2P8MULB)
-  NODE_NAME_CASE(GF2P8AFFINEQB)
-  NODE_NAME_CASE(GF2P8AFFINEINVQB)
-  NODE_NAME_CASE(NT_CALL)
-  NODE_NAME_CASE(NT_BRIND)
-  NODE_NAME_CASE(UMWAIT)
-  NODE_NAME_CASE(TPAUSE)
-  NODE_NAME_CASE(ENQCMD)
-  NODE_NAME_CASE(ENQCMDS)
-  NODE_NAME_CASE(VP2INTERSECT)
-  NODE_NAME_CASE(VPDPBSUD)
-  NODE_NAME_CASE(VPDPBSUDS)
-  NODE_NAME_CASE(VPDPBUUD)
-  NODE_NAME_CASE(VPDPBUUDS)
-  NODE_NAME_CASE(VPDPBSSD)
-  NODE_NAME_CASE(VPDPBSSDS)
-  NODE_NAME_CASE(VPDPWSUD)
-  NODE_NAME_CASE(VPDPWSUDS)
-  NODE_NAME_CASE(VPDPWUSD)
-  NODE_NAME_CASE(VPDPWUSDS)
-  NODE_NAME_CASE(VPDPWUUD)
-  NODE_NAME_CASE(VPDPWUUDS)
-  NODE_NAME_CASE(VMINMAX)
-  NODE_NAME_CASE(VMINMAX_SAE)
-  NODE_NAME_CASE(VMINMAXS)
-  NODE_NAME_CASE(VMINMAXS_SAE)
-  NODE_NAME_CASE(CVTP2IBS)
-  NODE_NAME_CASE(CVTP2IUBS)
-  NODE_NAME_CASE(CVTP2IBS_RND)
-  NODE_NAME_CASE(CVTP2IUBS_RND)
-  NODE_NAME_CASE(CVTTP2IBS)
-  NODE_NAME_CASE(CVTTP2IUBS)
-  NODE_NAME_CASE(CVTTP2IBS_SAE)
-  NODE_NAME_CASE(CVTTP2IUBS_SAE)
-  NODE_NAME_CASE(VCVT2PH2BF8)
-  NODE_NAME_CASE(VCVT2PH2BF8S)
-  NODE_NAME_CASE(VCVT2PH2HF8)
-  NODE_NAME_CASE(VCVT2PH2HF8S)
-  NODE_NAME_CASE(VCVTBIASPH2BF8)
-  NODE_NAME_CASE(VCVTBIASPH2BF8S)
-  NODE_NAME_CASE(VCVTBIASPH2HF8)
-  NODE_NAME_CASE(VCVTBIASPH2HF8S)
-  NODE_NAME_CASE(VCVTPH2BF8)
-  NODE_NAME_CASE(VCVTPH2BF8S)
-  NODE_NAME_CASE(VCVTPH2HF8)
-  NODE_NAME_CASE(VCVTPH2HF8S)
-  NODE_NAME_CASE(VMCVTBIASPH2BF8)
-  NODE_NAME_CASE(VMCVTBIASPH2BF8S)
-  NODE_NAME_CASE(VMCVTBIASPH2HF8)
-  NODE_NAME_CASE(VMCVTBIASPH2HF8S)
-  NODE_NAME_CASE(VMCVTPH2BF8)
-  NODE_NAME_CASE(VMCVTPH2BF8S)
-  NODE_NAME_CASE(VMCVTPH2HF8)
-  NODE_NAME_CASE(VMCVTPH2HF8S)
-  NODE_NAME_CASE(VCVTHF82PH)
-  NODE_NAME_CASE(AESENC128KL)
-  NODE_NAME_CASE(AESDEC128KL)
-  NODE_NAME_CASE(AESENC256KL)
-  NODE_NAME_CASE(AESDEC256KL)
-  NODE_NAME_CASE(AESENCWIDE128KL)
-  NODE_NAME_CASE(AESDECWIDE128KL)
-  NODE_NAME_CASE(AESENCWIDE256KL)
-  NODE_NAME_CASE(AESDECWIDE256KL)
-  NODE_NAME_CASE(CMPCCXADD)
-  NODE_NAME_CASE(TESTUI)
-  NODE_NAME_CASE(FP80_ADD)
-  NODE_NAME_CASE(STRICT_FP80_ADD)
-  NODE_NAME_CASE(CCMP)
-  NODE_NAME_CASE(CTEST)
-  NODE_NAME_CASE(CLOAD)
-  NODE_NAME_CASE(CSTORE)
-  NODE_NAME_CASE(CVTTS2SIS)
-  NODE_NAME_CASE(CVTTS2UIS)
-  NODE_NAME_CASE(CVTTS2SIS_SAE)
-  NODE_NAME_CASE(CVTTS2UIS_SAE)
-  NODE_NAME_CASE(CVTTP2SIS)
-  NODE_NAME_CASE(MCVTTP2SIS)
-  NODE_NAME_CASE(CVTTP2UIS_SAE)
-  NODE_NAME_CASE(CVTTP2SIS_SAE)
-  NODE_NAME_CASE(CVTTP2UIS)
-  NODE_NAME_CASE(MCVTTP2UIS)
-  NODE_NAME_CASE(POP_FROM_X87_REG)
+  case X86ISD::FIRST_NUMBER:
+    break;
+#define NODE_NAME_CASE(NODE)                                                   \
+  case X86ISD::NODE:                                                           \
+    return "X86ISD::" #NODE;
+    NODE_NAME_CASE(BSF)
+    NODE_NAME_CASE(BSR)
+    NODE_NAME_CASE(FSHL)
+    NODE_NAME_CASE(FSHR)
+    NODE_NAME_CASE(FAND)
+    NODE_NAME_CASE(FANDN)
+    NODE_NAME_CASE(FOR)
+    NODE_NAME_CASE(FXOR)
+    NODE_NAME_CASE(FILD)
+    NODE_NAME_CASE(FIST)
+    NODE_NAME_CASE(FP_TO_INT_IN_MEM)
+    NODE_NAME_CASE(FLD)
+    NODE_NAME_CASE(FST)
+    NODE_NAME_CASE(CALL)
+    NODE_NAME_CASE(CALL_RVMARKER)
+    NODE_NAME_CASE(IMP_CALL)
+    NODE_NAME_CASE(BT)
+    NODE_NAME_CASE(CMP)
+    NODE_NAME_CASE(FCMP)
+    NODE_NAME_CASE(STRICT_FCMP)
+    NODE_NAME_CASE(STRICT_FCMPS)
+    NODE_NAME_CASE(COMI)
+    NODE_NAME_CASE(UCOMI)
+    NODE_NAME_CASE(COMX)
+    NODE_NAME_CASE(UCOMX)
+    NODE_NAME_CASE(CMPM)
+    NODE_NAME_CASE(CMPMM)
+    NODE_NAME_CASE(STRICT_CMPM)
+    NODE_NAME_CASE(CMPMM_SAE)
+    NODE_NAME_CASE(SETCC)
+    NODE_NAME_CASE(SETCC_CARRY)
+    NODE_NAME_CASE(FSETCC)
+    NODE_NAME_CASE(FSETCCM)
+    NODE_NAME_CASE(FSETCCM_SAE)
+    NODE_NAME_CASE(CMOV)
+    NODE_NAME_CASE(BRCOND)
+    NODE_NAME_CASE(RET_GLUE)
+    NODE_NAME_CASE(IRET)
+    NODE_NAME_CASE(REP_STOS)
+    NODE_NAME_CASE(REP_MOVS)
+    NODE_NAME_CASE(GlobalBaseReg)
+    NODE_NAME_CASE(Wrapper)
+    NODE_NAME_CASE(WrapperRIP)
+    NODE_NAME_CASE(MOVQ2DQ)
+    NODE_NAME_CASE(MOVDQ2Q)
+    NODE_NAME_CASE(MMX_MOVD2W)
+    NODE_NAME_CASE(MMX_MOVW2D)
+    NODE_NAME_CASE(PEXTRB)
+    NODE_NAME_CASE(PEXTRW)
+    NODE_NAME_CASE(INSERTPS)
+    NODE_NAME_CASE(PINSRB)
+    NODE_NAME_CASE(PINSRW)
+    NODE_NAME_CASE(PSHUFB)
+    NODE_NAME_CASE(ANDNP)
+    NODE_NAME_CASE(BLENDI)
+    NODE_NAME_CASE(BLENDV)
+    NODE_NAME_CASE(HADD)
+    NODE_NAME_CASE(HSUB)
+    NODE_NAME_CASE(FHADD)
+    NODE_NAME_CASE(FHSUB)
+    NODE_NAME_CASE(CONFLICT)
+    NODE_NAME_CASE(FMAX)
+    NODE_NAME_CASE(FMAXS)
+    NODE_NAME_CASE(FMAX_SAE)
+    NODE_NAME_CASE(FMAXS_SAE)
+    NODE_NAME_CASE(STRICT_FMAX)
+    NODE_NAME_CASE(FMIN)
+    NODE_NAME_CASE(FMINS)
+    NODE_NAME_CASE(FMIN_SAE)
+    NODE_NAME_CASE(FMINS_SAE)
+    NODE_NAME_CASE(STRICT_FMIN)
+    NODE_NAME_CASE(FMAXC)
+    NODE_NAME_CASE(FMINC)
+    NODE_NAME_CASE(FRSQRT)
+    NODE_NAME_CASE(FRCP)
+    NODE_NAME_CASE(EXTRQI)
+    NODE_NAME_CASE(INSERTQI)
+    NODE_NAME_CASE(TLSADDR)
+    NODE_NAME_CASE(TLSBASEADDR)
+    NODE_NAME_CASE(TLSCALL)
+    NODE_NAME_CASE(TLSDESC)
+    NODE_NAME_CASE(EH_SJLJ_SETJMP)
+    NODE_NAME_CASE(EH_SJLJ_LONGJMP)
+    NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
+    NODE_NAME_CASE(EH_RETURN)
+    NODE_NAME_CASE(TC_RETURN)
+    NODE_NAME_CASE(FNSTCW16m)
+    NODE_NAME_CASE(FLDCW16m)
+    NODE_NAME_CASE(FNSTENVm)
+    NODE_NAME_CASE(FLDENVm)
+    NODE_NAME_CASE(LCMPXCHG_DAG)
+    NODE_NAME_CASE(LCMPXCHG8_DAG)
+    NODE_NAME_CASE(LCMPXCHG16_DAG)
+    NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
+    NODE_NAME_CASE(LADD)
+    NODE_NAME_CASE(LSUB)
+    NODE_NAME_CASE(LOR)
+    NODE_NAME_CASE(LXOR)
+    NODE_NAME_CASE(LAND)
+    NODE_NAME_CASE(LBTS)
+    NODE_NAME_CASE(LBTC)
+    NODE_NAME_CASE(LBTR)
+    NODE_NAME_CASE(LBTS_RM)
+    NODE_NAME_CASE(LBTC_RM)
+    NODE_NAME_CASE(LBTR_RM)
+    NODE_NAME_CASE(AADD)
+    NODE_NAME_CASE(AOR)
+    NODE_NAME_CASE(AXOR)
+    NODE_NAME_CASE(AAND)
+    NODE_NAME_CASE(VZEXT_MOVL)
+    NODE_NAME_CASE(VZEXT_LOAD)
+    NODE_NAME_CASE(VEXTRACT_STORE)
+    NODE_NAME_CASE(VTRUNC)
+    NODE_NAME_CASE(VTRUNCS)
+    NODE_NAME_CASE(VTRUNCUS)
+    NODE_NAME_CASE(VMTRUNC)
+    NODE_NAME_CASE(VMTRUNCS)
+    NODE_NAME_CASE(VMTRUNCUS)
+    NODE_NAME_CASE(VTRUNCSTORES)
+    NODE_NAME_CASE(VTRUNCSTOREUS)
+    NODE_NAME_CASE(VMTRUNCSTORES)
+    NODE_NAME_CASE(VMTRUNCSTOREUS)
+    NODE_NAME_CASE(VFPEXT)
+    NODE_NAME_CASE(STRICT_VFPEXT)
+    NODE_NAME_CASE(VFPEXT_SAE)
+    NODE_NAME_CASE(VFPEXTS)
+    NODE_NAME_CASE(VFPEXTS_SAE)
+    NODE_NAME_CASE(VFPROUND)
+    NODE_NAME_CASE(VFPROUND2)
+    NODE_NAME_CASE(VFPROUND2_RND)
+    NODE_NAME_CASE(STRICT_VFPROUND)
+    NODE_NAME_CASE(VMFPROUND)
+    NODE_NAME_CASE(VFPROUND_RND)
+    NODE_NAME_CASE(VFPROUNDS)
+    NODE_NAME_CASE(VFPROUNDS_RND)
+    NODE_NAME_CASE(VSHLDQ)
+    NODE_NAME_CASE(VSRLDQ)
+    NODE_NAME_CASE(VSHL)
+    NODE_NAME_CASE(VSRL)
+    NODE_NAME_CASE(VSRA)
+    NODE_NAME_CASE(VSHLI)
+    NODE_NAME_CASE(VSRLI)
+    NODE_NAME_CASE(VSRAI)
+    NODE_NAME_CASE(VSHLV)
+    NODE_NAME_CASE(VSRLV)
+    NODE_NAME_CASE(VSRAV)
+    NODE_NAME_CASE(VROTLI)
+    NODE_NAME_CASE(VROTRI)
+    NODE_NAME_CASE(VPPERM)
+    NODE_NAME_CASE(CMPP)
+    NODE_NAME_CASE(STRICT_CMPP)
+    NODE_NAME_CASE(PCMPEQ)
+    NODE_NAME_CASE(PCMPGT)
+    NODE_NAME_CASE(PHMINPOS)
+    NODE_NAME_CASE(ADD)
+    NODE_NAME_CASE(SUB)
+    NODE_NAME_CASE(ADC)
+    NODE_NAME_CASE(SBB)
+    NODE_NAME_CASE(SMUL)
+    NODE_NAME_CASE(UMUL)
+    NODE_NAME_CASE(OR)
+    NODE_NAME_CASE(XOR)
+    NODE_NAME_CASE(AND)
+    NODE_NAME_CASE(BEXTR)
+    NODE_NAME_CASE(BEXTRI)
+    NODE_NAME_CASE(BZHI)
+    NODE_NAME_CASE(PDEP)
+    NODE_NAME_CASE(PEXT)
+    NODE_NAME_CASE(MUL_IMM)
+    NODE_NAME_CASE(MOVMSK)
+    NODE_NAME_CASE(PTEST)
+    NODE_NAME_CASE(TESTP)
+    NODE_NAME_CASE(KORTEST)
+    NODE_NAME_CASE(KTEST)
+    NODE_NAME_CASE(KADD)
+    NODE_NAME_CASE(KSHIFTL)
+    NODE_NAME_CASE(KSHIFTR)
+    NODE_NAME_CASE(PACKSS)
+    NODE_NAME_CASE(PACKUS)
+    NODE_NAME_CASE(PALIGNR)
+    NODE_NAME_CASE(VALIGN)
+    NODE_NAME_CASE(VSHLD)
+    NODE_NAME_CASE(VSHRD)
+    NODE_NAME_CASE(VSHLDV)
+    NODE_NAME_CASE(VSHRDV)
+    NODE_NAME_CASE(PSHUFD)
+    NODE_NAME_CASE(PSHUFHW)
+    NODE_NAME_CASE(PSHUFLW)
+    NODE_NAME_CASE(SHUFP)
+    NODE_NAME_CASE(SHUF128)
+    NODE_NAME_CASE(MOVLHPS)
+    NODE_NAME_CASE(MOVHLPS)
+    NODE_NAME_CASE(MOVDDUP)
+    NODE_NAME_CASE(MOVSHDUP)
+    NODE_NAME_CASE(MOVSLDUP)
+    NODE_NAME_CASE(MOVSD)
+    NODE_NAME_CASE(MOVSS)
+    NODE_NAME_CASE(MOVSH)
+    NODE_NAME_CASE(UNPCKL)
+    NODE_NAME_CASE(UNPCKH)
+    NODE_NAME_CASE(VBROADCAST)
+    NODE_NAME_CASE(VBROADCAST_LOAD)
+    NODE_NAME_CASE(VBROADCASTM)
+    NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
+    NODE_NAME_CASE(VPERMILPV)
+    NODE_NAME_CASE(VPERMILPI)
+    NODE_NAME_CASE(VPERM2X128)
+    NODE_NAME_CASE(VPERMV)
+    NODE_NAME_CASE(VPERMV3)
+    NODE_NAME_CASE(VPERMI)
+    NODE_NAME_CASE(VPTERNLOG)
+    NODE_NAME_CASE(FP_TO_SINT_SAT)
+    NODE_NAME_CASE(FP_TO_UINT_SAT)
+    NODE_NAME_CASE(VFIXUPIMM)
+    NODE_NAME_CASE(VFIXUPIMM_SAE)
+    NODE_NAME_CASE(VFIXUPIMMS)
+    NODE_NAME_CASE(VFIXUPIMMS_SAE)
+    NODE_NAME_CASE(VRANGE)
+    NODE_NAME_CASE(VRANGE_SAE)
+    NODE_NAME_CASE(VRANGES)
+    NODE_NAME_CASE(VRANGES_SAE)
+    NODE_NAME_CASE(PMULUDQ)
+    NODE_NAME_CASE(PMULDQ)
+    NODE_NAME_CASE(PSADBW)
+    NODE_NAME_CASE(DBPSADBW)
+    NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
+    NODE_NAME_CASE(VAARG_64)
+    NODE_NAME_CASE(VAARG_X32)
+    NODE_NAME_CASE(DYN_ALLOCA)
+    NODE_NAME_CASE(MFENCE)
+    NODE_NAME_CASE(SEG_ALLOCA)
+    NODE_NAME_CASE(PROBED_ALLOCA)
+    NODE_NAME_CASE(RDRAND)
+    NODE_NAME_CASE(RDSEED)
+    NODE_NAME_CASE(RDPKRU)
+    NODE_NAME_CASE(WRPKRU)
+    NODE_NAME_CASE(VPMADDUBSW)
+    NODE_NAME_CASE(VPMADDWD)
+    NODE_NAME_CASE(VPSHA)
+    NODE_NAME_CASE(VPSHL)
+    NODE_NAME_CASE(VPCOM)
+    NODE_NAME_CASE(VPCOMU)
+    NODE_NAME_CASE(VPERMIL2)
+    NODE_NAME_CASE(FMSUB)
+    NODE_NAME_CASE(STRICT_FMSUB)
+    NODE_NAME_CASE(FNMADD)
+    NODE_NAME_CASE(STRICT_FNMADD)
+    NODE_NAME_CASE(FNMSUB)
+    NODE_NAME_CASE(STRICT_FNMSUB)
+    NODE_NAME_CASE(FMADDSUB)
+    NODE_NAME_CASE(FMSUBADD)
+    NODE_NAME_CASE(FMADD_RND)
+    NODE_NAME_CASE(FNMADD_RND)
+    NODE_NAME_CASE(FMSUB_RND)
+    NODE_NAME_CASE(FNMSUB_RND)
+    NODE_NAME_CASE(FMADDSUB_RND)
+    NODE_NAME_CASE(FMSUBADD_RND)
+    NODE_NAME_CASE(VFMADDC)
+    NODE_NAME_CASE(VFMADDC_RND)
+    NODE_NAME_CASE(VFCMADDC)
+    NODE_NAME_CASE(VFCMADDC_RND)
+    NODE_NAME_CASE(VFMULC)
+    NODE_NAME_CASE(VFMULC_RND)
+    NODE_NAME_CASE(VFCMULC)
+    NODE_NAME_CASE(VFCMULC_RND)
+    NODE_NAME_CASE(VFMULCSH)
+    NODE_NAME_CASE(VFMULCSH_RND)
+    NODE_NAME_CASE(VFCMULCSH)
+    NODE_NAME_CASE(VFCMULCSH_RND)
+    NODE_NAME_CASE(VFMADDCSH)
+    NODE_NAME_CASE(VFMADDCSH_RND)
+    NODE_NAME_CASE(VFCMADDCSH)
+    NODE_NAME_CASE(VFCMADDCSH_RND)
+    NODE_NAME_CASE(VPMADD52H)
+    NODE_NAME_CASE(VPMADD52L)
+    NODE_NAME_CASE(VRNDSCALE)
+    NODE_NAME_CASE(STRICT_VRNDSCALE)
+    NODE_NAME_CASE(VRNDSCALE_SAE)
+    NODE_NAME_CASE(VRNDSCALES)
+    NODE_NAME_CASE(VRNDSCALES_SAE)
+    NODE_NAME_CASE(VREDUCE)
+    NODE_NAME_CASE(VREDUCE_SAE)
+    NODE_NAME_CASE(VREDUCES)
+    NODE_NAME_CASE(VREDUCES_SAE)
+    NODE_NAME_CASE(VGETMANT)
+    NODE_NAME_CASE(VGETMANT_SAE)
+    NODE_NAME_CASE(VGETMANTS)
+    NODE_NAME_CASE(VGETMANTS_SAE)
+    NODE_NAME_CASE(PCMPESTR)
+    NODE_NAME_CASE(PCMPISTR)
+    NODE_NAME_CASE(XTEST)
+    NODE_NAME_CASE(COMPRESS)
+    NODE_NAME_CASE(EXPAND)
+    NODE_NAME_CASE(SELECTS)
+    NODE_NAME_CASE(ADDSUB)
+    NODE_NAME_CASE(RCP14)
+    NODE_NAME_CASE(RCP14S)
+    NODE_NAME_CASE(RSQRT14)
+    NODE_NAME_CASE(RSQRT14S)
+    NODE_NAME_CASE(FADD_RND)
+    NODE_NAME_CASE(FADDS)
+    NODE_NAME_CASE(FADDS_RND)
+    NODE_NAME_CASE(FSUB_RND)
+    NODE_NAME_CASE(FSUBS)
+    NODE_NAME_CASE(FSUBS_RND)
+    NODE_NAME_CASE(FMUL_RND)
+    NODE_NAME_CASE(FMULS)
+    NODE_NAME_CASE(FMULS_RND)
+    NODE_NAME_CASE(FDIV_RND)
+    NODE_NAME_CASE(FDIVS)
+    NODE_NAME_CASE(FDIVS_RND)
+    NODE_NAME_CASE(FSQRT_RND)
+    NODE_NAME_CASE(FSQRTS)
+    NODE_NAME_CASE(FSQRTS_RND)
+    NODE_NAME_CASE(FGETEXP)
+    NODE_NAME_CASE(FGETEXP_SAE)
+    NODE_NAME_CASE(FGETEXPS)
+    NODE_NAME_CASE(FGETEXPS_SAE)
+    NODE_NAME_CASE(SCALEF)
+    NODE_NAME_CASE(SCALEF_RND)
+    NODE_NAME_CASE(SCALEFS)
+    NODE_NAME_CASE(SCALEFS_RND)
+    NODE_NAME_CASE(MULHRS)
+    NODE_NAME_CASE(SINT_TO_FP_RND)
+    NODE_NAME_CASE(UINT_TO_FP_RND)
+    NODE_NAME_CASE(CVTTP2SI)
+    NODE_NAME_CASE(CVTTP2UI)
+    NODE_NAME_CASE(STRICT_CVTTP2SI)
+    NODE_NAME_CASE(STRICT_CVTTP2UI)
+    NODE_NAME_CASE(MCVTTP2SI)
+    NODE_NAME_CASE(MCVTTP2UI)
+    NODE_NAME_CASE(CVTTP2SI_SAE)
+    NODE_NAME_CASE(CVTTP2UI_SAE)
+    NODE_NAME_CASE(CVTTS2SI)
+    NODE_NAME_CASE(CVTTS2UI)
+    NODE_NAME_CASE(CVTTS2SI_SAE)
+    NODE_NAME_CASE(CVTTS2UI_SAE)
+    NODE_NAME_CASE(CVTSI2P)
+    NODE_NAME_CASE(CVTUI2P)
+    NODE_NAME_CASE(STRICT_CVTSI2P)
+    NODE_NAME_CASE(STRICT_CVTUI2P)
+    NODE_NAME_CASE(MCVTSI2P)
+    NODE_NAME_CASE(MCVTUI2P)
+    NODE_NAME_CASE(VFPCLASS)
+    NODE_NAME_CASE(VFPCLASSS)
+    NODE_NAME_CASE(MULTISHIFT)
+    NODE_NAME_CASE(SCALAR_SINT_TO_FP)
+    NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
+    NODE_NAME_CASE(SCALAR_UINT_TO_FP)
+    NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
+    NODE_NAME_CASE(CVTPS2PH)
+    NODE_NAME_CASE(STRICT_CVTPS2PH)
+    NODE_NAME_CASE(CVTPS2PH_SAE)
+    NODE_NAME_CASE(MCVTPS2PH)
+    NODE_NAME_CASE(MCVTPS2PH_SAE)
+    NODE_NAME_CASE(CVTPH2PS)
+    NODE_NAME_CASE(STRICT_CVTPH2PS)
+    NODE_NAME_CASE(CVTPH2PS_SAE)
+    NODE_NAME_CASE(CVTP2SI)
+    NODE_NAME_CASE(CVTP2UI)
+    NODE_NAME_CASE(MCVTP2SI)
+    NODE_NAME_CASE(MCVTP2UI)
+    NODE_NAME_CASE(CVTP2SI_RND)
+    NODE_NAME_CASE(CVTP2UI_RND)
+    NODE_NAME_CASE(CVTS2SI)
+    NODE_NAME_CASE(CVTS2UI)
+    NODE_NAME_CASE(CVTS2SI_RND)
+    NODE_NAME_CASE(CVTS2UI_RND)
+    NODE_NAME_CASE(CVTNEPS2BF16)
+    NODE_NAME_CASE(MCVTNEPS2BF16)
+    NODE_NAME_CASE(DPBF16PS)
+    NODE_NAME_CASE(DPFP16PS)
+    NODE_NAME_CASE(MPSADBW)
+    NODE_NAME_CASE(LWPINS)
+    NODE_NAME_CASE(MGATHER)
+    NODE_NAME_CASE(MSCATTER)
+    NODE_NAME_CASE(VPDPBUSD)
+    NODE_NAME_CASE(VPDPBUSDS)
+    NODE_NAME_CASE(VPDPWSSD)
+    NODE_NAME_CASE(VPDPWSSDS)
+    NODE_NAME_CASE(VPSHUFBITQMB)
+    NODE_NAME_CASE(GF2P8MULB)
+    NODE_NAME_CASE(GF2P8AFFINEQB)
+    NODE_NAME_CASE(GF2P8AFFINEINVQB)
+    NODE_NAME_CASE(NT_CALL)
+    NODE_NAME_CASE(NT_BRIND)
+    NODE_NAME_CASE(UMWAIT)
+    NODE_NAME_CASE(TPAUSE)
+    NODE_NAME_CASE(ENQCMD)
+    NODE_NAME_CASE(ENQCMDS)
+    NODE_NAME_CASE(VP2INTERSECT)
+    NODE_NAME_CASE(VPDPBSUD)
+    NODE_NAME_CASE(VPDPBSUDS)
+    NODE_NAME_CASE(VPDPBUUD)
+    NODE_NAME_CASE(VPDPBUUDS)
+    NODE_NAME_CASE(VPDPBSSD)
+    NODE_NAME_CASE(VPDPBSSDS)
+    NODE_NAME_CASE(VPDPWSUD)
+    NODE_NAME_CASE(VPDPWSUDS)
+    NODE_NAME_CASE(VPDPWUSD)
+    NODE_NAME_CASE(VPDPWUSDS)
+    NODE_NAME_CASE(VPDPWUUD)
+    NODE_NAME_CASE(VPDPWUUDS)
+    NODE_NAME_CASE(VMINMAX)
+    NODE_NAME_CASE(VMINMAX_SAE)
+    NODE_NAME_CASE(VMINMAXS)
+    NODE_NAME_CASE(VMINMAXS_SAE)
+    NODE_NAME_CASE(CVTP2IBS)
+    NODE_NAME_CASE(CVTP2IUBS)
+    NODE_NAME_CASE(CVTP2IBS_RND)
+    NODE_NAME_CASE(CVTP2IUBS_RND)
+    NODE_NAME_CASE(CVTTP2IBS)
+    NODE_NAME_CASE(CVTTP2IUBS)
+    NODE_NAME_CASE(CVTTP2IBS_SAE)
+    NODE_NAME_CASE(CVTTP2IUBS_SAE)
+    NODE_NAME_CASE(VCVT2PH2BF8)
+    NODE_NAME_CASE(VCVT2PH2BF8S)
+    NODE_NAME_CASE(VCVT2PH2HF8)
+    NODE_NAME_CASE(VCVT2PH2HF8S)
+    NODE_NAME_CASE(VCVTBIASPH2BF8)
+    NODE_NAME_CASE(VCVTBIASPH2BF8S)
+    NODE_NAME_CASE(VCVTBIASPH2HF8)
+    NODE_NAME_CASE(VCVTBIASPH2HF8S)
+    NODE_NAME_CASE(VCVTPH2BF8)
+    NODE_NAME_CASE(VCVTPH2BF8S)
+    NODE_NAME_CASE(VCVTPH2HF8)
+    NODE_NAME_CASE(VCVTPH2HF8S)
+    NODE_NAME_CASE(VMCVTBIASPH2BF8)
+    NODE_NAME_CASE(VMCVTBIASPH2BF8S)
+    NODE_NAME_CASE(VMCVTBIASPH2HF8)
+    NODE_NAME_CASE(VMCVTBIASPH2HF8S)
+    NODE_NAME_CASE(VMCVTPH2BF8)
+    NODE_NAME_CASE(VMCVTPH2BF8S)
+    NODE_NAME_CASE(VMCVTPH2HF8)
+    NODE_NAME_CASE(VMCVTPH2HF8S)
+    NODE_NAME_CASE(VCVTHF82PH)
+    NODE_NAME_CASE(AESENC128KL)
+    NODE_NAME_CASE(AESDEC128KL)
+    NODE_NAME_CASE(AESENC256KL)
+    NODE_NAME_CASE(AESDEC256KL)
+    NODE_NAME_CASE(AESENCWIDE128KL)
+    NODE_NAME_CASE(AESDECWIDE128KL)
+    NODE_NAME_CASE(AESENCWIDE256KL)
+    NODE_NAME_CASE(AESDECWIDE256KL)
+    NODE_NAME_CASE(CMPCCXADD)
+    NODE_NAME_CASE(TESTUI)
+    NODE_NAME_CASE(FP80_ADD)
+    NODE_NAME_CASE(STRICT_FP80_ADD)
+    NODE_NAME_CASE(CCMP)
+    NODE_NAME_CASE(CTEST)
+    NODE_NAME_CASE(CLOAD)
+    NODE_NAME_CASE(CSTORE)
+    NODE_NAME_CASE(CVTTS2SIS)
+    NODE_NAME_CASE(CVTTS2UIS)
+    NODE_NAME_CASE(CVTTS2SIS_SAE)
+    NODE_NAME_CASE(CVTTS2UIS_SAE)
+    NODE_NAME_CASE(CVTTP2SIS)
+    NODE_NAME_CASE(MCVTTP2SIS)
+    NODE_NAME_CASE(CVTTP2UIS_SAE)
+    NODE_NAME_CASE(CVTTP2SIS_SAE)
+    NODE_NAME_CASE(CVTTP2UIS)
+    NODE_NAME_CASE(MCVTTP2UIS)
+    NODE_NAME_CASE(POP_FROM_X87_REG)
   }
   return nullptr;
 #undef NODE_NAME_CASE
@@ -35377,7 +35422,7 @@ bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
     if (AM.HasBaseReg)
       return false;
     break;
-  default:  // Other stuff never works.
+  default: // Other stuff never works.
     return false;
   }
 
@@ -35482,12 +35527,13 @@ bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
   if (Val.getOpcode() != ISD::LOAD)
     return false;
 
-  if (!VT1.isSimple() || !VT1.isInteger() ||
-      !VT2.isSimple() || !VT2.isInteger())
+  if (!VT1.isSimple() || !VT1.isInteger() || !VT2.isSimple() ||
+      !VT2.isInteger())
     return false;
 
   switch (VT1.getSimpleVT().SimpleTy) {
-  default: break;
+  default:
+    break;
   case MVT::i8:
   case MVT::i16:
   case MVT::i32:
@@ -35694,8 +35740,10 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
   // sinkMBB:
   //  DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
   BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
-      .addReg(mainDstReg).addMBB(mainMBB)
-      .addReg(fallDstReg).addMBB(fallMBB);
+      .addReg(mainDstReg)
+      .addMBB(mainMBB)
+      .addReg(fallDstReg)
+      .addMBB(fallMBB);
 
   MI.eraseFromParent();
   return sinkMBB;
@@ -35761,8 +35809,8 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
   unsigned TotalNumXMMRegs = 8;
   bool UseGPOffset = (ArgMode == 1);
   bool UseFPOffset = (ArgMode == 2);
-  unsigned MaxOffset = TotalNumIntRegs * 8 +
-                       (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
+  unsigned MaxOffset =
+      TotalNumIntRegs * 8 + (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
 
   /* Align ArgSize to a multiple of 8 */
   unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
@@ -35840,13 +35888,14 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
 
     // Check if there is enough room left to pull this argument.
     BuildMI(thisMBB, MIMD, TII->get(X86::CMP32ri))
-      .addReg(OffsetReg)
-      .addImm(MaxOffset + 8 - ArgSizeA8);
+        .addReg(OffsetReg)
+        .addImm(MaxOffset + 8 - ArgSizeA8);
 
     // Branch to "overflowMBB" if offset >= max
     // Fall through to "offsetMBB" otherwise
     BuildMI(thisMBB, MIMD, TII->get(X86::JCC_1))
-      .addMBB(overflowMBB).addImm(X86::COND_AE);
+        .addMBB(overflowMBB)
+        .addImm(X86::COND_AE);
   }
 
   // In offsetMBB, emit code to use the reg_save_area.
@@ -35888,8 +35937,8 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
     // Compute the offset for the next argument
     Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
     BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32ri), NextOffsetReg)
-      .addReg(OffsetReg)
-      .addImm(UseFPOffset ? 16 : 8);
+        .addReg(OffsetReg)
+        .addImm(UseFPOffset ? 16 : 8);
 
     // Store it back into the va_list.
     BuildMI(offsetMBB, MIMD, TII->get(X86::MOV32mr))
@@ -35902,8 +35951,7 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
         .setMemRefs(StoreOnlyMMO);
 
     // Jump to endMBB
-    BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1))
-      .addMBB(endMBB);
+    BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1)).addMBB(endMBB);
   }
 
   //
@@ -35944,7 +35992,7 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
         .addImm(~(uint64_t)(Alignment.value() - 1));
   } else {
     BuildMI(overflowMBB, MIMD, TII->get(TargetOpcode::COPY), OverflowDestReg)
-      .addReg(OverflowAddrReg);
+        .addReg(OverflowAddrReg);
   }
 
   // Compute the next overflow address after this argument.
@@ -35970,10 +36018,11 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
 
   // If we branched, emit the PHI to the front of endMBB.
   if (offsetMBB) {
-    BuildMI(*endMBB, endMBB->begin(), MIMD,
-            TII->get(X86::PHI), DestReg)
-      .addReg(OffsetDestReg).addMBB(offsetMBB)
-      .addReg(OverflowDestReg).addMBB(overflowMBB);
+    BuildMI(*endMBB, endMBB->begin(), MIMD, TII->get(X86::PHI), DestReg)
+        .addReg(OffsetDestReg)
+        .addMBB(offsetMBB)
+        .addReg(OverflowDestReg)
+        .addMBB(overflowMBB);
   }
 
   // Erase the pseudo instruction
@@ -35988,8 +36037,8 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
 // kill marker, and set it if it should. Returns the correct kill
 // marker value.
 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
-                                     MachineBasicBlock* BB,
-                                     const TargetRegisterInfo* TRI) {
+                                     MachineBasicBlock *BB,
+                                     const TargetRegisterInfo *TRI) {
   if (isPhysRegUsedAfter(X86::EFLAGS, SelectItr))
     return false;
 
@@ -36456,11 +36505,21 @@ X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
   //
   //       + ---- <- ------------ <- ------------- <- ------------ +
   //       |                                                       |
-  // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
-  //                                                               |                                                               |
-  //                                                               + <- ----------- <- ------------ <- ----------- <- ------------ +
+  // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn
+  // probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
+  //                                                               | |
+  //                                                               + <-
+  //                                                               -----------
+  //                                                               <-
+  //                                                               ------------
+  //                                                               <-
+  //                                                               -----------
+  //                                                               <-
+  //                                                               ------------
+  //                                                               +
   //
-  // The property we want to enforce is to never have more than [page alloc] between two probes.
+  // The property we want to enforce is to never have more than [page alloc]
+  // between two probes.
 
   const unsigned XORMIOpc =
       TFI.Uses64BitFramePtr ? X86::XOR64mi32 : X86::XOR32mi;
@@ -36553,56 +36612,61 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
   // Add code to the main basic block to check if the stack limit has been hit,
   // and if so, jump to mallocMBB otherwise to bumpMBB.
   BuildMI(BB, MIMD, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
-  BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
-    .addReg(tmpSPVReg).addReg(sizeVReg);
-  BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
-    .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
-    .addReg(SPLimitVReg);
+  BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr : X86::SUB32rr), SPLimitVReg)
+      .addReg(tmpSPVReg)
+      .addReg(sizeVReg);
+  BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr : X86::CMP32mr))
+      .addReg(0)
+      .addImm(1)
+      .addReg(0)
+      .addImm(TlsOffset)
+      .addReg(TlsReg)
+      .addReg(SPLimitVReg);
   BuildMI(BB, MIMD, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
 
   // bumpMBB simply decreases the stack pointer, since we know the current
   // stacklet has enough space.
   BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), physSPReg)
-    .addReg(SPLimitVReg);
+      .addReg(SPLimitVReg);
   BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
-    .addReg(SPLimitVReg);
+      .addReg(SPLimitVReg);
   BuildMI(bumpMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
 
   // Calls into a routine in libgcc to allocate more space from the heap.
   const uint32_t *RegMask =
       Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
   if (IsLP64) {
-    BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI)
-      .addReg(sizeVReg);
+    BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI).addReg(sizeVReg);
     BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
-      .addExternalSymbol("__morestack_allocate_stack_space")
-      .addRegMask(RegMask)
-      .addReg(X86::RDI, RegState::Implicit)
-      .addReg(X86::RAX, RegState::ImplicitDefine);
+        .addExternalSymbol("__morestack_allocate_stack_space")
+        .addRegMask(RegMask)
+        .addReg(X86::RDI, RegState::Implicit)
+        .addReg(X86::RAX, RegState::ImplicitDefine);
   } else if (Is64Bit) {
-    BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI)
-      .addReg(sizeVReg);
+    BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI).addReg(sizeVReg);
     BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
-      .addExternalSymbol("__morestack_allocate_stack_space")
-      .addRegMask(RegMask)
-      .addReg(X86::EDI, RegState::Implicit)
-      .addReg(X86::EAX, RegState::ImplicitDefine);
+        .addExternalSymbol("__morestack_allocate_stack_space")
+        .addRegMask(RegMask)
+        .addReg(X86::EDI, RegState::Implicit)
+        .addReg(X86::EAX, RegState::ImplicitDefine);
   } else {
-    BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
-      .addImm(12);
+    BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg)
+        .addReg(physSPReg)
+        .addImm(12);
     BuildMI(mallocMBB, MIMD, TII->get(X86::PUSH32r)).addReg(sizeVReg);
     BuildMI(mallocMBB, MIMD, TII->get(X86::CALLpcrel32))
-      .addExternalSymbol("__morestack_allocate_stack_space")
-      .addRegMask(RegMask)
-      .addReg(X86::EAX, RegState::ImplicitDefine);
+        .addExternalSymbol("__morestack_allocate_stack_space")
+        .addRegMask(RegMask)
+        .addReg(X86::EAX, RegState::ImplicitDefine);
   }
 
   if (!Is64Bit)
-    BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
-      .addImm(16);
+    BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg)
+        .addReg(physSPReg)
+        .addImm(16);
 
   BuildMI(mallocMBB, MIMD, TII->get(TargetOpcode::COPY), mallocPtrVReg)
-    .addReg(IsLP64 ? X86::RAX : X86::EAX);
+      .addReg(IsLP64 ? X86::RAX : X86::EAX);
   BuildMI(mallocMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
 
   // Set up the CFG correctly.
@@ -36657,7 +36721,8 @@ X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
   RestoreMBB->setIsEHPad(true);
 
   auto RestoreMBBI = RestoreMBB->begin();
-  BuildMI(*RestoreMBB, RestoreMBBI, MIMD, TII.get(X86::JMP_4)).addMBB(TargetMBB);
+  BuildMI(*RestoreMBB, RestoreMBBI, MIMD, TII.get(X86::JMP_4))
+      .addMBB(TargetMBB);
   return BB;
 }
 
@@ -36679,9 +36744,10 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
   // FIXME: The 32-bit calls have non-standard calling conventions. Use a
   // proper register mask.
   const uint32_t *RegMask =
-      Subtarget.is64Bit() ?
-      Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
-      Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
+      Subtarget.is64Bit()
+          ? Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask()
+          : Subtarget.getRegisterInfo()->getCallPreservedMask(*F,
+                                                              CallingConv::C);
   if (Subtarget.is64Bit()) {
     MachineInstrBuilder MIB =
         BuildMI(*BB, MI, MIMD, TII->get(X86::MOV64rm), X86::RDI)
@@ -36937,8 +37003,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
   MemOpndSlot = CurOp;
 
   MVT PVT = getPointerTy(MF->getDataLayout());
-  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
-         "Invalid Pointer Size!");
+  assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
 
   // For v = setjmp(buf), we generate
   //
@@ -36986,19 +37051,19 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
     LabelReg = MRI.createVirtualRegister(PtrRC);
     if (Subtarget.is64Bit()) {
       MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA64r), LabelReg)
-              .addReg(X86::RIP)
-              .addImm(0)
-              .addReg(0)
-              .addMBB(restoreMBB)
-              .addReg(0);
+                .addReg(X86::RIP)
+                .addImm(0)
+                .addReg(0)
+                .addMBB(restoreMBB)
+                .addReg(0);
     } else {
-      const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
+      const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII);
       MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA32r), LabelReg)
-              .addReg(XII->getGlobalBaseReg(MF))
-              .addImm(0)
-              .addReg(0)
-              .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
-              .addReg(0);
+                .addReg(XII->getGlobalBaseReg(MF))
+                .addImm(0)
+                .addReg(0)
+                .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
+                .addReg(0);
     }
   } else
     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
@@ -37022,7 +37087,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
 
   // Setup
   MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::EH_SjLj_Setup))
-          .addMBB(restoreMBB);
+            .addMBB(restoreMBB);
 
   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   MIB.addRegMask(RegInfo->getNoPreservedMask());
@@ -37050,9 +37115,9 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
     Register FramePtr = RegInfo->getFrameRegister(*MF);
     Register BasePtr = RegInfo->getBaseRegister();
     unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
-    addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr),
-                 FramePtr, true, X86FI->getRestoreBasePointerOffset())
-      .setMIFlag(MachineInstr::FrameSetup);
+    addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr), FramePtr,
+                 true, X86FI->getRestoreBasePointerOffset())
+        .setMIFlag(MachineInstr::FrameSetup);
   }
   BuildMI(restoreMBB, MIMD, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
   BuildMI(restoreMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
@@ -37135,9 +37200,9 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
   if (PVT == MVT::i64) {
     Register TmpZReg = MRI.createVirtualRegister(PtrRC);
     BuildMI(checkSspMBB, MIMD, TII->get(X86::SUBREG_TO_REG), TmpZReg)
-      .addImm(0)
-      .addReg(ZReg)
-      .addImm(X86::sub_32bit);
+        .addImm(0)
+        .addReg(ZReg)
+        .addImm(X86::sub_32bit);
     ZReg = TmpZReg;
   }
 
@@ -37268,11 +37333,10 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
   SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands());
 
   MVT PVT = getPointerTy(MF->getDataLayout());
-  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
-         "Invalid Pointer Size!");
+  assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
 
   const TargetRegisterClass *RC =
-    (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
+      (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
   Register Tmp = MRI.createVirtualRegister(RC);
   // Since FP is only updated here but NOT referenced, it's treated as GPR.
   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
@@ -37654,7 +37718,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   const MIMetadata MIMD(MI);
 
   auto TMMImmToTMMReg = [](unsigned Imm) {
-    assert (Imm < 8 && "Illegal tmm index");
+    assert(Imm < 8 && "Illegal tmm index");
     return X86::TMM0 + Imm;
   };
   auto TMMImmToTMMPair = [](unsigned Imm) {
@@ -37794,29 +37858,30 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     // OR 0b11 into bit 10 and 11. 0b11 is the encoding for round toward zero.
     Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
     BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
-      .addReg(OldCW, RegState::Kill).addImm(0xC00);
+        .addReg(OldCW, RegState::Kill)
+        .addImm(0xC00);
 
     // Extract to 16 bits.
     Register NewCW16 =
         MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
     BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
-      .addReg(NewCW, RegState::Kill, X86::sub_16bit);
+        .addReg(NewCW, RegState::Kill, X86::sub_16bit);
 
     // Prepare memory for FLDCW.
     int NewCWFrameIdx =
         MF->getFrameInfo().CreateStackObject(2, Align(2), false);
     addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
                       NewCWFrameIdx)
-      .addReg(NewCW16, RegState::Kill);
+        .addReg(NewCW16, RegState::Kill);
 
     // Reload the modified control word now...
-    addFrameReference(BuildMI(*BB, MI, MIMD,
-                              TII->get(X86::FLDCW16m)), NewCWFrameIdx);
+    addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
+                      NewCWFrameIdx);
 
     // Get the X86 opcode to use.
     unsigned Opc;
     switch (MI.getOpcode()) {
-    // clang-format off
+      // clang-format off
     default: llvm_unreachable("illegal opcode!");
     case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
     case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
@@ -37827,7 +37892,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
     case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
     case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
-    // clang-format on
+      // clang-format on
     }
 
     X86AddressMode AM = getAddressFromInstr(&MI, 0);
@@ -38050,23 +38115,44 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   case X86::PTTMMULTF32PS: {
     unsigned Opc;
     switch (MI.getOpcode()) {
-    default: llvm_unreachable("illegal opcode!");
-    case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
-    case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
-    case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
-    case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
-    case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
-    case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
+    default:
+      llvm_unreachable("illegal opcode!");
+    case X86::PTDPBSSD:
+      Opc = X86::TDPBSSD;
+      break;
+    case X86::PTDPBSUD:
+      Opc = X86::TDPBSUD;
+      break;
+    case X86::PTDPBUSD:
+      Opc = X86::TDPBUSD;
+      break;
+    case X86::PTDPBUUD:
+      Opc = X86::TDPBUUD;
+      break;
+    case X86::PTDPBF16PS:
+      Opc = X86::TDPBF16PS;
+      break;
+    case X86::PTDPFP16PS:
+      Opc = X86::TDPFP16PS;
+      break;
     case X86::PTCMMIMFP16PS:
       Opc = X86::TCMMIMFP16PS;
       break;
     case X86::PTCMMRLFP16PS:
       Opc = X86::TCMMRLFP16PS;
       break;
-    case X86::PTDPBF8PS: Opc = X86::TDPBF8PS; break;
-    case X86::PTDPBHF8PS: Opc = X86::TDPBHF8PS; break;
-    case X86::PTDPHBF8PS: Opc = X86::TDPHBF8PS; break;
-    case X86::PTDPHF8PS: Opc = X86::TDPHF8PS; break;
+    case X86::PTDPBF8PS:
+      Opc = X86::TDPBF8PS;
+      break;
+    case X86::PTDPBHF8PS:
+      Opc = X86::TDPBHF8PS;
+      break;
+    case X86::PTDPHBF8PS:
+      Opc = X86::TDPHBF8PS;
+      break;
+    case X86::PTDPHF8PS:
+      Opc = X86::TDPHF8PS;
+      break;
     case X86::PTTDPBF16PS:
       Opc = X86::TTDPBF16PS;
       break;
@@ -38119,7 +38205,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   case X86::PTILESTORED: {
     unsigned Opc;
     switch (MI.getOpcode()) {
-    default: llvm_unreachable("illegal opcode!");
+    default:
+      llvm_unreachable("illegal opcode!");
 #define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
     case X86::PTILELOADD:
       Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD);
@@ -38305,11 +38392,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
 //                           X86 Optimization Hooks
 //===----------------------------------------------------------------------===//
 
-bool
-X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
-                                                const APInt &DemandedBits,
-                                                const APInt &DemandedElts,
-                                                TargetLoweringOpt &TLO) const {
+bool X86TargetLowering::targetShrinkDemandedConstant(
+    SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
+    TargetLoweringOpt &TLO) const {
   EVT VT = Op.getValueType();
   unsigned Opcode = Op.getOpcode();
   unsigned EltSize = VT.getScalarSizeInBits();
@@ -38494,16 +38579,15 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
   unsigned NumElts = DemandedElts.getBitWidth();
   unsigned Opc = Op.getOpcode();
   EVT VT = Op.getValueType();
-  assert((Opc >= ISD::BUILTIN_OP_END ||
-          Opc == ISD::INTRINSIC_WO_CHAIN ||
-          Opc == ISD::INTRINSIC_W_CHAIN ||
-          Opc == ISD::INTRINSIC_VOID) &&
+  assert((Opc >= ISD::BUILTIN_OP_END || Opc == ISD::INTRINSIC_WO_CHAIN ||
+          Opc == ISD::INTRINSIC_W_CHAIN || Opc == ISD::INTRINSIC_VOID) &&
          "Should use MaskedValueIsZero if you don't know whether Op"
          " is a target node!");
 
   Known.resetAll();
   switch (Opc) {
-  default: break;
+  default:
+    break;
   case X86ISD::MUL_IMM: {
     KnownBits Known2;
     Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
@@ -38734,7 +38818,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
     SDValue Op0 = Op.getOperand(0);
     SDValue Op1 = Op.getOperand(1);
 
-    if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
+    if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
       unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
       unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
 
@@ -38908,7 +38992,8 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
       unsigned NumElts = VT.getVectorNumElements();
       if (Mask.size() == NumElts) {
         SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
-        Known.Zero.setAllBits(); Known.One.setAllBits();
+        Known.Zero.setAllBits();
+        Known.One.setAllBits();
         for (unsigned i = 0; i != NumElts; ++i) {
           if (!DemandedElts[i])
             continue;
@@ -39053,16 +39138,18 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
   case X86ISD::ANDNP: {
     unsigned Tmp0 =
         DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
-    if (Tmp0 == 1) return 1; // Early out.
+    if (Tmp0 == 1)
+      return 1; // Early out.
     unsigned Tmp1 =
         DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
     return std::min(Tmp0, Tmp1);
   }
 
   case X86ISD::CMOV: {
-    unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
-    if (Tmp0 == 1) return 1;  // Early out.
-    unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
+    unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
+    if (Tmp0 == 1)
+      return 1; // Early out.
+    unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
     return std::min(Tmp0, Tmp1);
   }
   }
@@ -39438,7 +39525,6 @@ static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
         PermuteImm = (unsigned)ShiftAmt;
         return true;
       }
-
     }
   }
 
@@ -39498,7 +39584,8 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
 
   // Attempt to match against either an unary or binary PACKSS/PACKUS shuffle.
   if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
-      ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
+      ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) &&
+       Subtarget.hasInt256()) ||
       ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
     if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
                              Subtarget)) {
@@ -40057,9 +40144,9 @@ static SDValue combineX86ShuffleChain(
         SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
         SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
         return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
-                          CanonicalizeShuffleInput(RootVT, LHS),
-                          CanonicalizeShuffleInput(RootVT, RHS),
-                          DAG.getTargetConstant(PermMask, DL, MVT::i8));
+                           CanonicalizeShuffleInput(RootVT, LHS),
+                           CanonicalizeShuffleInput(RootVT, RHS),
+                           DAG.getTargetConstant(PermMask, DL, MVT::i8));
       }
     }
   }
@@ -40153,8 +40240,8 @@ static SDValue combineX86ShuffleChain(
     }
 
     if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
-                                 AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
-                                 PermuteImm) &&
+                                 AllowIntDomain, DAG, Subtarget, Shuffle,
+                                 ShuffleVT, PermuteImm) &&
         (!IsMaskedShuffle ||
          (NumRootElts == ShuffleVT.getVectorNumElements()))) {
       if (Depth == 0 && RootOpc == Shuffle)
@@ -41032,11 +41119,9 @@ static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef<SDValue> Ops,
 }
 
 namespace llvm {
-  namespace X86 {
-    enum {
-      MaxShuffleCombineDepth = 8
-    };
-  } // namespace X86
+namespace X86 {
+enum { MaxShuffleCombineDepth = 8 };
+} // namespace X86
 } // namespace llvm
 
 /// Fully generic combining of x86 shuffle instructions.
@@ -41440,7 +41525,8 @@ static SDValue combineX86ShufflesRecursively(
 
     // The Op itself may be of different VT, so we need to scale the mask.
     unsigned NumOpElts = Op.getValueType().getVectorNumElements();
-    APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
+    APInt OpScaledDemandedElts =
+        APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
 
     // Can this operand be simplified any further, given it's demanded elements?
     if (SDValue NewOp = TLI.SimplifyMultipleUseDemandedVectorElts(
@@ -42239,7 +42325,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
         ISD::isNormalLoad(Src.getNode())) {
       LoadSDNode *LN = cast<LoadSDNode>(Src);
       SDVTList Tys = DAG.getVTList(VT, MVT::Other);
-      SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+      SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
       SDValue BcastLd =
           DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
                                   LN->getMemoryVT(), LN->getMemOperand());
@@ -42271,7 +42357,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
         // Unless its volatile or atomic.
         if (LN->isSimple()) {
           SDVTList Tys = DAG.getVTList(VT, MVT::Other);
-          SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+          SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
           SDValue BcastLd = DAG.getMemIntrinsicNode(
               X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
               LN->getPointerInfo(), LN->getBaseAlign(),
@@ -42289,7 +42375,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
         LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
         if (LN->getMemoryVT().getSizeInBits() == 16) {
           SDVTList Tys = DAG.getVTList(VT, MVT::Other);
-          SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+          SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
           SDValue BcastLd =
               DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
                                       LN->getMemoryVT(), LN->getMemOperand());
@@ -42316,7 +42402,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
           SDVTList Tys = DAG.getVTList(VT, MVT::Other);
           SDValue Ptr = DAG.getMemBasePlusOffset(
               LN->getBasePtr(), TypeSize::getFixed(Offset), DL);
-          SDValue Ops[] = { LN->getChain(), Ptr };
+          SDValue Ops[] = {LN->getChain(), Ptr};
           SDValue BcastLd = DAG.getMemIntrinsicNode(
               X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
               LN->getPointerInfo().getWithOffset(Offset), LN->getBaseAlign(),
@@ -42334,7 +42420,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
       MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
       if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
         SDVTList Tys = DAG.getVTList(VT, MVT::Other);
-        SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+        SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
         SDValue BcastLd =
             DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
                                     LN->getMemoryVT(), LN->getMemOperand());
@@ -42368,20 +42454,6 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
   case X86ISD::VZEXT_MOVL: {
     SDValue N0 = N.getOperand(0);
 
-    // Fold (vzmovl (shift x, y)) -> (shift (vzmovl x), y)
-    // Zeroing out the upper elements means we're just shifting a zero value.
-    // TODO: Try harder to move vzmovl upward towards SCALAR_TO_VECTOR nodes.
-    // TODO: Move this to canonicalizeShuffleWithOp once we add zero handling.
-    if (N0.getOpcode() == X86ISD::VSHL || N0.getOpcode() == X86ISD::VSHLI ||
-        N0.getOpcode() == X86ISD::VSRL || N0.getOpcode() == X86ISD::VSRLI ||
-        N0.getOpcode() == X86ISD::VSRA || N0.getOpcode() == X86ISD::VSRAI) {
-      if (N0.hasOneUse())
-        return DAG.getNode(
-            N0.getOpcode(), DL, VT,
-            DAG.getNode(X86ISD::VZEXT_MOVL, DL, VT, N0.getOperand(0)),
-            N0.getOperand(1));
-    }
-
     // If this a vzmovl of a full vector load, replace it with a vzload, unless
     // the load is volatile.
     if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
@@ -42816,13 +42888,13 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
     if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
       auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
       if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
-        SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
-                                   MemIntr->getBasePtr(),
-                                   MemIntr->getMemOperand());
-        SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
-                           DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
-                                       Load),
-                           DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
+        SDValue Load =
+            DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
+                        MemIntr->getBasePtr(), MemIntr->getMemOperand());
+        SDValue Insert = DAG.getNode(
+            X86ISD::INSERTPS, DL, VT, Op0,
+            DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Load),
+            DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
         DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
         return Insert;
       }
@@ -42976,8 +43048,8 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
     if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
         (V.getOpcode() == X86ISD::PSHUFLW ||
          V.getOpcode() == X86ISD::PSHUFHW) &&
-        V.getOpcode() != N.getOpcode() &&
-        V.hasOneUse() && V.getOperand(0).hasOneUse()) {
+        V.getOpcode() != N.getOpcode() && V.hasOneUse() &&
+        V.getOperand(0).hasOneUse()) {
       SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
       if (D.getOpcode() == X86ISD::PSHUFD) {
         SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
@@ -43051,11 +43123,11 @@ static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
 /// operation. If true is returned then the operands of ADDSUB(SUBADD) operation
 /// are written to the parameters \p Opnd0 and \p Opnd1.
 ///
-/// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes
-/// so it is easier to generically match. We also insert dummy vector shuffle
-/// nodes for the operands which explicitly discard the lanes which are unused
-/// by this operation to try to flow through the rest of the combiner
-/// the fact that they're unused.
+/// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle
+/// nodes so it is easier to generically match. We also insert dummy vector
+/// shuffle nodes for the operands which explicitly discard the lanes which are
+/// unused by this operation to try to flow through the rest of the combiner the
+/// fact that they're unused.
 static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
                              SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
                              bool &IsSubAdd) {
@@ -43089,13 +43161,15 @@ static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
   // commute the FADD operands.
   SDValue LHS, RHS;
   if (V1.getOpcode() == ISD::FSUB) {
-    LHS = V1->getOperand(0); RHS = V1->getOperand(1);
+    LHS = V1->getOperand(0);
+    RHS = V1->getOperand(1);
     if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
         (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
       return false;
   } else {
     assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
-    LHS = V2->getOperand(0); RHS = V2->getOperand(1);
+    LHS = V2->getOperand(0);
+    RHS = V2->getOperand(1);
     if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
         (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
       return false;
@@ -43107,8 +43181,8 @@ static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
     return false;
 
   // It's a subadd if the vector in the even parity is an FADD.
-  IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
-                     : V2->getOpcode() == ISD::FADD;
+  IsSubAdd =
+      Op0Even ? V1->getOpcode() == ISD::FADD : V2->getOpcode() == ISD::FADD;
 
   Opnd0 = LHS;
   Opnd1 = RHS;
@@ -43446,7 +43520,8 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
 
     // TODO: Multiply by zero.
 
-    // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
+    // If RHS/LHS elements are known zero then we don't need the LHS/RHS
+    // equivalent.
     APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
     if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
                                    Depth + 1))
@@ -44206,7 +44281,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
   // For splats, unless we *only* demand the 0'th element,
   // stop attempts at simplification here, we aren't going to improve things,
   // this is better than any potential shuffle.
-  if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
+  if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/ false))
     return false;
 
   // Get target/faux shuffle mask.
@@ -44303,7 +44378,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
   EVT VT = Op.getValueType();
   unsigned BitWidth = OriginalDemandedBits.getBitWidth();
   unsigned Opc = Op.getOpcode();
-  switch(Opc) {
+  switch (Opc) {
   case X86ISD::VTRUNC: {
     KnownBits KnownOp;
     SDValue Src = Op.getOperand(0);
@@ -44311,8 +44386,10 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
 
     // Simplify the input, using demanded bit information.
     APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
-    APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
-    if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
+    APInt DemandedElts =
+        OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
+    if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO,
+                             Depth + 1))
       return true;
     break;
   }
@@ -44416,7 +44493,8 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
       }
     }
 
-    // If we are only demanding sign bits then we can use the shift source directly.
+    // If we are only demanding sign bits then we can use the shift source
+    // directly.
     unsigned NumSignBits =
         TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
     unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
@@ -44607,8 +44685,8 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
         return true;
 
       KnownBits KnownVec;
-      if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
-                               KnownVec, TLO, Depth + 1))
+      if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts, KnownVec,
+                               TLO, Depth + 1))
         return true;
 
       if (SDValue V = SimplifyMultipleUseDemandedBits(
@@ -45145,13 +45223,13 @@ static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
 
 // Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
 static unsigned getAltBitOpcode(unsigned Opcode) {
-  switch(Opcode) {
-  // clang-format off
+  switch (Opcode) {
+    // clang-format off
   case ISD::AND: return X86ISD::FAND;
   case ISD::OR: return X86ISD::FOR;
   case ISD::XOR: return X86ISD::FXOR;
   case X86ISD::ANDNP: return X86ISD::FANDN;
-  // clang-format on
+    // clang-format on
   }
   llvm_unreachable("Unknown bitwise opcode");
 }
@@ -45373,8 +45451,7 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
 // Convert a vXi1 constant build vector to the same width scalar integer.
 static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
   EVT SrcVT = Op.getValueType();
-  assert(SrcVT.getVectorElementType() == MVT::i1 &&
-         "Expected a vXi1 vector");
+  assert(SrcVT.getVectorElementType() == MVT::i1 && "Expected a vXi1 vector");
   assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
          "Expected a constant build vector");
 
@@ -45408,8 +45485,7 @@ static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
     return SDValue();
 
   // Look for logic ops.
-  if (Op.getOpcode() != ISD::AND &&
-      Op.getOpcode() != ISD::OR &&
+  if (Op.getOpcode() != ISD::AND && Op.getOpcode() != ISD::OR &&
       Op.getOpcode() != ISD::XOR)
     return SDValue();
 
@@ -45700,7 +45776,7 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
   // and the vbroadcast_load are both integer or both fp. In some cases this
   // will remove the bitcast entirely.
   if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
-       VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
+      VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
     auto *BCast = cast<MemIntrinsicSDNode>(N0);
     unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
     unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
@@ -45713,7 +45789,7 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
       LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
 
       SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
-      SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
+      SDValue Ops[] = {BCast->getChain(), BCast->getBasePtr()};
       SDValue ResNode =
           DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
                                   MemVT, BCast->getMemOperand());
@@ -45763,7 +45839,7 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
       bool LowUndef = true, AllUndefOrZero = true;
       for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
         SDValue Op = N0.getOperand(i);
-        LowUndef &= Op.isUndef() || (i >= e/2);
+        LowUndef &= Op.isUndef() || (i >= e / 2);
         AllUndefOrZero &= isNullConstantOrUndef(Op);
       }
       if (AllUndefOrZero) {
@@ -45805,8 +45881,8 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
 
   // Try to remove a bitcast of constant vXi1 vector. We have to legalize
   // most of these to scalar anyway.
-  if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
-      SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
+  if (Subtarget.hasAVX512() && VT.isScalarInteger() && SrcVT.isVector() &&
+      SrcVT.getVectorElementType() == MVT::i1 &&
       ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
     return combinevXi1ConstantToInteger(N0, DAG);
   }
@@ -45824,8 +45900,8 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
   // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
   // Turn it into a sign bit compare that produces a k-register. This avoids
   // a trip through a GPR.
-  if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
-      VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
+  if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() && VT.isVector() &&
+      VT.getVectorElementType() == MVT::i1 &&
       isPowerOf2_32(VT.getVectorNumElements())) {
     unsigned NumElts = VT.getVectorNumElements();
     SDValue Src = N0;
@@ -45879,12 +45955,12 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
   // transferring the SSE operand to integer register and back.
   unsigned FPOpcode;
   switch (N0.getOpcode()) {
-  // clang-format off
+    // clang-format off
   case ISD::AND: FPOpcode = X86ISD::FAND; break;
   case ISD::OR:  FPOpcode = X86ISD::FOR;  break;
   case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
   default: return SDValue();
-  // clang-format on
+    // clang-format on
   }
 
   // Check if we have a bitcast from another integer type as well.
@@ -46006,7 +46082,7 @@ static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
   // Actually build the DotProduct, split as 256/512 bits for
   // AVXVNNI/AVX512VNNI.
   auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
-                       ArrayRef<SDValue> Ops) {
+                      ArrayRef<SDValue> Ops) {
     MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
     return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
   };
@@ -46043,7 +46119,7 @@ static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
     return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
   };
   MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
-  return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
+  return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {SadOp0, SadOp1},
                           PSADBWBuilder);
 }
 
@@ -46122,7 +46198,8 @@ static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
                      DAG.getVectorIdxConstant(0, DL));
 }
 
-// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
+// Attempt to replace an all_of/any_of/parity style horizontal reduction with a
+// MOVMSK.
 static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
                                          const X86Subtarget &Subtarget) {
   // Bail without SSE2.
@@ -46387,9 +46464,9 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
   if (Stages > 3) {
     unsigned SadElems = SadVT.getVectorNumElements();
 
-    for(unsigned i = Stages - 3; i > 0; --i) {
+    for (unsigned i = Stages - 3; i > 0; --i) {
       SmallVector<int, 16> Mask(SadElems, -1);
-      for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
+      for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
         Mask[j] = MaskEnd + j;
 
       SDValue Shuffle =
@@ -46706,10 +46783,10 @@ static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
     SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
                                Vec.getOperand(0).getValueType().getScalarType(),
                                Vec.getOperand(0), Index);
-    SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
-                               Vec.getOperand(1), Index);
-    SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
-                               Vec.getOperand(2), Index);
+    SDValue Ext1 =
+        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Vec.getOperand(1), Index);
+    SDValue Ext2 =
+        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Vec.getOperand(2), Index);
     return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
   }
 
@@ -46989,8 +47066,9 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
       return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
     }
 
-    // Convert extract_element(bitcast(<X x i1>) -> bitcast(extract_subvector()).
-    // Improves lowering of bool masks on rust which splits them into byte array.
+    // Convert extract_element(bitcast(<X x i1>) ->
+    // bitcast(extract_subvector()). Improves lowering of bool masks on rust
+    // which splits them into byte array.
     if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
       SDValue Src = peekThroughBitcasts(InputVector);
       if (Src.getValueType().getScalarType() == MVT::i1 &&
@@ -47445,8 +47523,7 @@ static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
                                       TargetLowering::DAGCombinerInfo &DCI,
                                       const X86Subtarget &Subtarget) {
   SDValue Cond = N->getOperand(0);
-  if ((N->getOpcode() != ISD::VSELECT &&
-       N->getOpcode() != X86ISD::BLENDV) ||
+  if ((N->getOpcode() != ISD::VSELECT && N->getOpcode() != X86ISD::BLENDV) ||
       ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
     return SDValue();
 
@@ -47727,7 +47804,8 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
     // Check for x CC y ? x : y.
     if (DAG.isEqualTo(LHS, Op0) && DAG.isEqualTo(RHS, Op1)) {
       switch (CC) {
-      default: break;
+      default:
+        break;
       case ISD::SETULT:
         // Converting this to a min would handle NaNs incorrectly, and swapping
         // the operands would cause it to handle comparisons between positive
@@ -47792,10 +47870,11 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
         Opcode = X86ISD::FMAX;
         break;
       }
-    // Check for x CC y ? y : x -- a min/max with reversed arms.
+      // Check for x CC y ? y : x -- a min/max with reversed arms.
     } else if (DAG.isEqualTo(LHS, Op1) && DAG.isEqualTo(RHS, Op0)) {
       switch (CC) {
-      default: break;
+      default:
+        break;
       case ISD::SETOGE:
         // Converting this to a min would handle comparisons between positive
         // and negative zero incorrectly, and swapping the operands would
@@ -47999,13 +48078,13 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
           Cond1 == InnerSetCC.getOperand(1)) {
         ISD::CondCode NewCC;
         switch (CC == ISD::SETEQ ? InnerCC : CC) {
-        // clang-format off
+          // clang-format off
         case ISD::SETGT:  NewCC = ISD::SETGE; break;
         case ISD::SETLT:  NewCC = ISD::SETLE; break;
         case ISD::SETUGT: NewCC = ISD::SETUGE; break;
         case ISD::SETULT: NewCC = ISD::SETULE; break;
         default: NewCC = ISD::SETCC_INVALID; break;
-        // clang-format on
+          // clang-format on
         }
         if (NewCC != ISD::SETCC_INVALID) {
           Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
@@ -48178,9 +48257,9 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
     // 16-bit lacks a proper blendv.
     unsigned EltBitWidth = VT.getScalarSizeInBits();
     bool CanShiftBlend =
-        TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
-                                (Subtarget.hasAVX2() && EltBitWidth == 64) ||
-                                (Subtarget.hasXOP()));
+        TLI.isTypeLegal(VT) &&
+        ((Subtarget.hasAVX() && EltBitWidth == 32) ||
+         (Subtarget.hasAVX2() && EltBitWidth == 64) || (Subtarget.hasXOP()));
     if (CanShiftBlend &&
         ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
           return C->getAPIntValue().isPowerOf2();
@@ -48415,7 +48494,7 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
   SDValue Op2 = Cmp.getOperand(1);
 
   SDValue SetCC;
-  const ConstantSDNode* C = nullptr;
+  const ConstantSDNode *C = nullptr;
   bool needOppositeCond = (CC == X86::COND_E);
   bool checkAgainstTrue = false; // Is it a comparison against 1?
 
@@ -48436,8 +48515,7 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
   bool truncatedToBoolWithAnd = false;
   // Skip (zext $x), (trunc $x), or (and $x, 1) node.
   while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
-         SetCC.getOpcode() == ISD::TRUNCATE ||
-         SetCC.getOpcode() == ISD::AND) {
+         SetCC.getOpcode() == ISD::TRUNCATE || SetCC.getOpcode() == ISD::AND) {
     if (SetCC.getOpcode() == ISD::AND) {
       int OpIdx = -1;
       if (isOneConstant(SetCC.getOperand(0)))
@@ -48480,13 +48558,13 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
     if (!FVal) {
       SDValue Op = SetCC.getOperand(0);
       // Skip 'zext' or 'trunc' node.
-      if (Op.getOpcode() == ISD::ZERO_EXTEND ||
-          Op.getOpcode() == ISD::TRUNCATE)
+      if (Op.getOpcode() == ISD::ZERO_EXTEND || Op.getOpcode() == ISD::TRUNCATE)
         Op = Op.getOperand(0);
       // A special case for rdrand/rdseed, where 0 is set if false cond is
       // found.
       if ((Op.getOpcode() != X86ISD::RDRAND &&
-           Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
+           Op.getOpcode() != X86ISD::RDSEED) ||
+          Op.getResNo() != 0)
         return SDValue();
     }
     // Quit if false value is not the constant 0 or 1.
@@ -48531,7 +48609,8 @@ static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
 
   SDValue SetCC0, SetCC1;
   switch (Cond->getOpcode()) {
-  default: return false;
+  default:
+    return false;
   case ISD::AND:
   case X86ISD::AND:
     isAnd = true;
@@ -48596,8 +48675,7 @@ static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
         }
         // If this is a check of the z flag of an add with 1, switch to the
         // C flag.
-        if (CarryCC == X86::COND_E &&
-            CarryOp1.getOpcode() == X86ISD::ADD &&
+        if (CarryCC == X86::COND_E && CarryOp1.getOpcode() == X86ISD::ADD &&
             isOneConstant(CarryOp1.getOperand(1)))
           return CarryOp1;
       } else if (FoundAndLSB) {
@@ -49108,12 +49186,11 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
 
       // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.  This is efficient
       // for any integer data type, including i8/i16.
-      if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
+      if (FalseC->getAPIntValue() + 1 == TrueC->getAPIntValue()) {
         Cond = getSETCC(CC, Cond, DL, DAG);
 
         // Zero extend the condition if needed.
-        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
-                           FalseC->getValueType(0), Cond);
+        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
         Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                            SDValue(FalseC, 0));
         return Cond;
@@ -49129,24 +49206,25 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
         bool isFastMultiplier = false;
         if (Diff.ult(10)) {
           switch (Diff.getZExtValue()) {
-          default: break;
-          case 1:  // result = add base, cond
-          case 2:  // result = lea base(    , cond*2)
-          case 3:  // result = lea base(cond, cond*2)
-          case 4:  // result = lea base(    , cond*4)
-          case 5:  // result = lea base(cond, cond*4)
-          case 8:  // result = lea base(    , cond*8)
-          case 9:  // result = lea base(cond, cond*8)
+          default:
+            break;
+          case 1: // result = add base, cond
+          case 2: // result = lea base(    , cond*2)
+          case 3: // result = lea base(cond, cond*2)
+          case 4: // result = lea base(    , cond*4)
+          case 5: // result = lea base(cond, cond*4)
+          case 8: // result = lea base(    , cond*8)
+          case 9: // result = lea base(cond, cond*8)
             isFastMultiplier = true;
             break;
           }
         }
 
         if (isFastMultiplier) {
-          Cond = getSETCC(CC, Cond, DL ,DAG);
+          Cond = getSETCC(CC, Cond, DL, DAG);
           // Zero extend the condition if needed.
-          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
-                             Cond);
+          Cond =
+              DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
           // Scale the condition by the difference.
           if (Diff != 1)
             Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
@@ -49856,7 +49934,7 @@ static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
                                    const SDLoc &DL,
                                    const X86Subtarget &Subtarget) {
   assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
-           "SRL or SRA node is required here!");
+         "SRL or SRA node is required here!");
 
   if (!Subtarget.hasSSE2())
     return SDValue();
@@ -49934,8 +50012,7 @@ static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG,
 
   // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
   // since the result of setcc_c is all zero's or all ones.
-  if (VT.isInteger() && !VT.isVector() &&
-      N1C && N0.getOpcode() == ISD::AND &&
+  if (VT.isInteger() && !VT.isVector() && N1C && N0.getOpcode() == ISD::AND &&
       N0.getOperand(1).getOpcode() == ISD::Constant) {
     SDValue N00 = N0.getOperand(0);
     APInt Mask = N0.getConstantOperandAPInt(1);
@@ -50019,7 +50096,7 @@ static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
   if (SraConst.isNegative())
     return SDValue();
 
-  for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
+  for (MVT SVT : {MVT::i8, MVT::i16, MVT::i32}) {
     unsigned ShiftSize = SVT.getSizeInBits();
     // Only deal with (Size - ShlConst) being equal to 8, 16 or 32.
     if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
@@ -50353,8 +50430,8 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
 
   // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
   // truncate to create a larger truncate.
-  if (Subtarget.hasAVX512() &&
-      N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
+  if (Subtarget.hasAVX512() && N0.getOpcode() == ISD::TRUNCATE &&
+      N1.isUndef() && VT == MVT::v16i8 &&
       N0.getOperand(0).getValueType() == MVT::v8i32) {
     if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
         (!IsSigned &&
@@ -50701,7 +50778,7 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
 
     SDValue CMP00 = CMP0->getOperand(0);
     SDValue CMP01 = CMP0->getOperand(1);
-    EVT     VT    = CMP00.getValueType();
+    EVT VT = CMP00.getValueType();
 
     if (VT == MVT::f32 || VT == MVT::f64 ||
         (VT == MVT::f16 && Subtarget.hasFP16())) {
@@ -50727,8 +50804,10 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
       }
 
       if (!ExpectingFlags) {
-        enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
-        enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
+        enum X86::CondCode cc0 =
+            (enum X86::CondCode)N0.getConstantOperandVal(0);
+        enum X86::CondCode cc1 =
+            (enum X86::CondCode)N1.getConstantOperandVal(0);
 
         if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
           X86::CondCode tmp = cc0;
@@ -50736,7 +50815,7 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
           cc1 = tmp;
         }
 
-        if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
+        if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
             (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
           // FIXME: need symbolic constants for these magic numbers.
           // See X86ATTInstPrinter.cpp:printSSECC().
@@ -50746,7 +50825,8 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
                 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
                             DAG.getTargetConstant(x86cc, DL, MVT::i8));
             // Need to fill with zeros to ensure the bitcast will produce zeroes
-            // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
+            // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee
+            // that.
             SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
                                       DAG.getConstant(0, DL, MVT::v16i1),
                                       FSetCC, DAG.getVectorIdxConstant(0, DL));
@@ -50778,8 +50858,8 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
           SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
           SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
                                       DAG.getConstant(1, DL, IntVT));
-          SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
-                                              ANDed);
+          SDValue OneBitOfTruth =
+              DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
           return OneBitOfTruth;
         }
       }
@@ -50967,7 +51047,8 @@ static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL,
   assert(VT.isVector() && "Expected vector type");
   assert((N.getOpcode() == ISD::ANY_EXTEND ||
           N.getOpcode() == ISD::ZERO_EXTEND ||
-          N.getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
+          N.getOpcode() == ISD::SIGN_EXTEND) &&
+         "Invalid Node");
 
   SDValue Narrow = N.getOperand(0);
   EVT NarrowVT = Narrow.getValueType();
@@ -50977,26 +51058,27 @@ static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL,
   if (!Op)
     return SDValue();
   switch (N.getOpcode()) {
-  default: llvm_unreachable("Unexpected opcode");
+  default:
+    llvm_unreachable("Unexpected opcode");
   case ISD::ANY_EXTEND:
     return Op;
   case ISD::ZERO_EXTEND:
     return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
   case ISD::SIGN_EXTEND:
-    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
-                       Op, DAG.getValueType(NarrowVT));
+    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Op,
+                       DAG.getValueType(NarrowVT));
   }
 }
 
 static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
   unsigned FPOpcode;
   switch (Opcode) {
-  // clang-format off
+    // clang-format off
   default: llvm_unreachable("Unexpected input node for FP logic conversion");
   case ISD::AND: FPOpcode = X86ISD::FAND; break;
   case ISD::OR:  FPOpcode = X86ISD::FOR;  break;
   case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
-  // clang-format on
+    // clang-format on
   }
   return FPOpcode;
 }
@@ -51442,8 +51524,7 @@ static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
   SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
                               DAG.getConstant(0, dl, SubVecVT));
   Ops[0] = SubVec;
-  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
-                               Ops);
+  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, Ops);
   EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
   return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
 }
@@ -52615,7 +52696,8 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
 
         if (NotCond) {
           SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
-          R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
+          R = DAG.getNode(ISD::MUL, dl, VT, R,
+                          DAG.getConstant(Val + 1, dl, VT));
           R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
           return R;
         }
@@ -52757,7 +52839,7 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
     return SDValue();
 
   switch (VT.getSimpleVT().SimpleTy) {
-  // clang-format off
+    // clang-format off
   default: return SDValue();
   case MVT::v16i8:
   case MVT::v8i16:
@@ -52887,8 +52969,8 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
   // split across two registers. We can use a packusdw+perm to clamp to 0-65535
   // and concatenate at the same time. Then we can use a final vpmovuswb to
   // clip to 0-255.
-  if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
-      InVT == MVT::v16i32 && VT == MVT::v16i8) {
+  if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() && InVT == MVT::v16i32 &&
+      VT == MVT::v16i8) {
     if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
       // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
       SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
@@ -52904,11 +52986,12 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
   // FIXME: We could widen truncates to 512 to remove the VLX restriction.
   // If the result type is 256-bits or larger and we have disable 512-bit
   // registers, we should go ahead and use the pack instructions if possible.
-  bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
-                       (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
-                      (InVT.getSizeInBits() > 128) &&
-                      (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
-                      !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
+  bool PreferAVX512 =
+      ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
+       (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
+      (InVT.getSizeInBits() > 128) &&
+      (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
+      !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
 
   if (!PreferAVX512 && VT.getVectorNumElements() > 1 &&
       isPowerOf2_32(VT.getVectorNumElements()) &&
@@ -52921,8 +53004,8 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
         SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
                                              DAG, Subtarget);
         assert(Mid && "Failed to pack!");
-        SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
-                                           Subtarget);
+        SDValue V =
+            truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG, Subtarget);
         assert(V && "Failed to pack!");
         return V;
       } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
@@ -53244,10 +53327,9 @@ reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
     CastVT = VT.changeVectorElementType(EltVT);
   }
 
-  SDValue Load =
-      DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
-                  ML->getPointerInfo().getWithOffset(Offset),
-                  Alignment, ML->getMemOperand()->getFlags());
+  SDValue Load = DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
+                             ML->getPointerInfo().getWithOffset(Offset),
+                             Alignment, ML->getMemOperand()->getFlags());
 
   SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
 
@@ -53278,8 +53360,8 @@ combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
   if (LoadFirstElt && LoadLastElt) {
     SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
                                 ML->getMemOperand());
-    SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
-                                  ML->getPassThru());
+    SDValue Blend =
+        DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getPassThru());
     return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
   }
 
@@ -53301,8 +53383,8 @@ combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
       VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
       DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
       ML->getAddressingMode(), ML->getExtensionType());
-  SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
-                                ML->getPassThru());
+  SDValue Blend =
+      DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getPassThru());
 
   return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
 }
@@ -53382,8 +53464,8 @@ static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
 
   // Store that element at the appropriate offset from the base pointer.
   return DAG.getStore(MS->getChain(), DL, Extract, Addr,
-                      MS->getPointerInfo().getWithOffset(Offset),
-                      Alignment, MS->getMemOperand()->getFlags());
+                      MS->getPointerInfo().getWithOffset(Offset), Alignment,
+                      MS->getMemOperand()->getFlags());
 }
 
 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
@@ -53483,15 +53565,16 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
 
   // Turn vXi1 stores of constants into a scalar store.
   if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
-       VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
+       VT == MVT::v64i1) &&
+      VT == StVT && TLI.isTypeLegal(VT) &&
       ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
     // If its a v64i1 store without 64-bit support, we need two stores.
     if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
-      SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
-                                      StoredVal->ops().slice(0, 32));
+      SDValue Lo =
+          DAG.getBuildVector(MVT::v32i1, dl, StoredVal->ops().slice(0, 32));
       Lo = combinevXi1ConstantToInteger(Lo, DAG);
-      SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
-                                      StoredVal->ops().slice(32, 32));
+      SDValue Hi =
+          DAG.getBuildVector(MVT::v32i1, dl, StoredVal->ops().slice(32, 32));
       Hi = combinevXi1ConstantToInteger(Hi, DAG);
 
       SDValue Ptr0 = St->getBasePtr();
@@ -53591,9 +53674,9 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
       StoredVal.hasOneUse() &&
       TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
     bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
-    return EmitTruncSStore(IsSigned, St->getChain(),
-                           dl, StoredVal.getOperand(0), St->getBasePtr(),
-                           VT, St->getMemOperand(), DAG);
+    return EmitTruncSStore(IsSigned, St->getChain(), dl,
+                           StoredVal.getOperand(0), St->getBasePtr(), VT,
+                           St->getMemOperand(), DAG);
   }
 
   // Try to fold a extract_element(VTRUNC) pattern into a truncating store.
@@ -53632,14 +53715,14 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
   if (St->isTruncatingStore() && VT.isVector()) {
     if (TLI.isTruncStoreLegal(VT, StVT)) {
       if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
-        return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
-                               dl, Val, St->getBasePtr(),
-                               St->getMemoryVT(), St->getMemOperand(), DAG);
-      if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
-                                          DAG, dl))
+        return EmitTruncSStore(true /* Signed saturation */, St->getChain(), dl,
+                               Val, St->getBasePtr(), St->getMemoryVT(),
+                               St->getMemOperand(), DAG);
+      if (SDValue Val =
+              detectUSatPattern(St->getValue(), St->getMemoryVT(), DAG, dl))
         return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
-                               dl, Val, St->getBasePtr(),
-                               St->getMemoryVT(), St->getMemOperand(), DAG);
+                               dl, Val, St->getBasePtr(), St->getMemoryVT(),
+                               St->getMemOperand(), DAG);
     }
 
     return SDValue();
@@ -54228,23 +54311,14 @@ static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG,
 // cases.
 static SDValue combinei64TruncSrlConstant(SDValue N, EVT VT, SelectionDAG &DAG,
                                           const SDLoc &DL) {
-  assert(N.getOpcode() == ISD::SRL && "Unknown shift opcode");
-  std::optional<uint64_t> ValidSrlConst = DAG.getValidShiftAmount(N);
-  if (!ValidSrlConst)
-    return SDValue();
-  uint64_t SrlConstVal = *ValidSrlConst;
 
   SDValue Op = N.getOperand(0);
+  APInt OpConst = Op.getConstantOperandAPInt(1);
+  APInt SrlConst = N.getConstantOperandAPInt(1);
+  uint64_t SrlConstVal = SrlConst.getZExtValue();
   unsigned Opcode = Op.getOpcode();
-  assert(VT == MVT::i32 && Op.getValueType() == MVT::i64 &&
-         "Illegal truncation types");
 
-  if ((Opcode != ISD::ADD && Opcode != ISD::OR && Opcode != ISD::XOR) ||
-      !isa<ConstantSDNode>(Op.getOperand(1)))
-    return SDValue();
-  const APInt &OpConst = Op.getConstantOperandAPInt(1);
-
-  if (SrlConstVal <= 32 ||
+  if (SrlConst.ule(32) ||
       (Opcode == ISD::ADD && OpConst.countr_zero() < SrlConstVal))
     return SDValue();
 
@@ -54252,14 +54326,13 @@ static SDValue combinei64TruncSrlConstant(SDValue N, EVT VT, SelectionDAG &DAG,
       DAG.getNode(ISD::SRL, DL, MVT::i64, Op.getOperand(0), N.getOperand(1));
   SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, OpLhsSrl);
 
-  APInt NewOpConstVal = OpConst.lshr(SrlConstVal).trunc(VT.getSizeInBits());
+  APInt NewOpConstVal = OpConst.lshr(SrlConst).trunc(VT.getSizeInBits());
   SDValue NewOpConst = DAG.getConstant(NewOpConstVal, DL, VT);
   SDValue NewOpNode = DAG.getNode(Opcode, DL, VT, Trunc, NewOpConst);
+  EVT CleanUpVT = EVT::getIntegerVT(*DAG.getContext(), 64 - SrlConstVal);
 
-  if (Opcode == ISD::ADD) {
-    EVT CleanUpVT = EVT::getIntegerVT(*DAG.getContext(), 64 - SrlConstVal);
+  if (Opcode == ISD::ADD)
     return DAG.getZeroExtendInReg(NewOpNode, DL, CleanUpVT);
-  }
   return NewOpNode;
 }
 
@@ -54308,8 +54381,20 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
   if (!Src.hasOneUse())
     return SDValue();
 
-  if (VT == MVT::i32 && SrcVT == MVT::i64 && SrcOpcode == ISD::SRL)
-    return combinei64TruncSrlConstant(Src, VT, DAG, DL);
+  if (VT == MVT::i32 && SrcVT == MVT::i64 && SrcOpcode == ISD::SRL &&
+      isa<ConstantSDNode>(Src.getOperand(1))) {
+
+    unsigned SrcOpOpcode = Src.getOperand(0).getOpcode();
+    if ((SrcOpOpcode != ISD::ADD && SrcOpOpcode != ISD::OR &&
+         SrcOpOpcode != ISD::XOR) ||
+        !isa<ConstantSDNode>(Src.getOperand(0).getOperand(1)))
+      return SDValue();
+
+    if (SDValue R = combinei64TruncSrlConstant(Src, VT, DAG, DL))
+      return R;
+
+    return SDValue();
+  }
 
   if (!VT.isVector())
     return SDValue();
@@ -54430,8 +54515,7 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
 // (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
 //                 (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
 static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
-                               const X86Subtarget &Subtarget,
-                               const SDLoc &DL) {
+                               const X86Subtarget &Subtarget, const SDLoc &DL) {
   if (!VT.isVector() || !Subtarget.hasSSSE3())
     return SDValue();
 
@@ -54527,8 +54611,8 @@ static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
       std::swap(IdxN01, IdxN11);
     }
     // N0 indices be the even element. N1 indices must be the next odd element.
-    if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
-        IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
+    if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 || IdxN01 != 2 * i ||
+        IdxN11 != 2 * i + 1)
       return SDValue();
     SDValue N00In = N00Elt.getOperand(0);
     SDValue N01In = N01Elt.getOperand(0);
@@ -54539,8 +54623,8 @@ static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
       ZExtIn = N00In;
       SExtIn = N01In;
     }
-    if (ZExtIn != N00In || SExtIn != N01In ||
-        ZExtIn != N10In || SExtIn != N11In)
+    if (ZExtIn != N00In || SExtIn != N01In || ZExtIn != N10In ||
+        SExtIn != N11In)
       return SDValue();
   }
 
@@ -54560,14 +54644,13 @@ static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
     // Shrink by adding truncate nodes and let DAGCombine fold with the
     // sources.
     EVT InVT = Ops[0].getValueType();
-    assert(InVT.getScalarType() == MVT::i8 &&
-           "Unexpected scalar element type");
+    assert(InVT.getScalarType() == MVT::i8 && "Unexpected scalar element type");
     assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
     EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
                                  InVT.getVectorNumElements() / 2);
     return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
   };
-  return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
+  return SplitOpsAndApply(DAG, Subtarget, DL, VT, {ZExtIn, SExtIn},
                           PMADDBuilder);
 }
 
@@ -54715,7 +54798,7 @@ static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
                                 bool NegRes) {
   if (NegMul) {
     switch (Opcode) {
-    // clang-format off
+      // clang-format off
     default: llvm_unreachable("Unexpected opcode");
     case ISD::FMA:              Opcode = X86ISD::FNMADD;        break;
     case ISD::STRICT_FMA:       Opcode = X86ISD::STRICT_FNMADD; break;
@@ -54729,13 +54812,13 @@ static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
     case X86ISD::FNMSUB:        Opcode = X86ISD::FMSUB;         break;
     case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB;  break;
     case X86ISD::FNMSUB_RND:    Opcode = X86ISD::FMSUB_RND;     break;
-    // clang-format on
+      // clang-format on
     }
   }
 
   if (NegAcc) {
     switch (Opcode) {
-    // clang-format off
+      // clang-format off
     default: llvm_unreachable("Unexpected opcode");
     case ISD::FMA:              Opcode = X86ISD::FMSUB;         break;
     case ISD::STRICT_FMA:       Opcode = X86ISD::STRICT_FMSUB;  break;
@@ -54753,7 +54836,7 @@ static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
     case X86ISD::FMADDSUB_RND:  Opcode = X86ISD::FMSUBADD_RND;  break;
     case X86ISD::FMSUBADD:      Opcode = X86ISD::FMADDSUB;      break;
     case X86ISD::FMSUBADD_RND:  Opcode = X86ISD::FMADDSUB_RND;  break;
-    // clang-format on
+      // clang-format on
     }
   }
 
@@ -54770,7 +54853,7 @@ static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
     case X86ISD::FNMADD_RND:   Opcode = X86ISD::FMSUB_RND;    break;
     case X86ISD::FNMSUB:       Opcode = ISD::FMA;             break;
     case X86ISD::FNMSUB_RND:   Opcode = X86ISD::FMADD_RND;    break;
-    // clang-format on
+      // clang-format on
     }
   }
 
@@ -54906,19 +54989,18 @@ static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
   SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
   unsigned IntOpcode;
   switch (N->getOpcode()) {
-  // clang-format off
+    // clang-format off
   default: llvm_unreachable("Unexpected FP logic op");
   case X86ISD::FOR:   IntOpcode = ISD::OR; break;
   case X86ISD::FXOR:  IntOpcode = ISD::XOR; break;
   case X86ISD::FAND:  IntOpcode = ISD::AND; break;
   case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
-  // clang-format on
+    // clang-format on
   }
   SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
   return DAG.getBitcast(VT, IntOp);
 }
 
-
 /// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
 static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
   if (N->getOpcode() != ISD::XOR)
@@ -55266,13 +55348,18 @@ static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
   // into FMINC and FMAXC, which are Commutative operations.
   unsigned NewOp = 0;
   switch (N->getOpcode()) {
-    default: llvm_unreachable("unknown opcode");
-    case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
-    case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
+  default:
+    llvm_unreachable("unknown opcode");
+  case X86ISD::FMIN:
+    NewOp = X86ISD::FMINC;
+    break;
+  case X86ISD::FMAX:
+    NewOp = X86ISD::FMAXC;
+    break;
   }
 
-  return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
-                     N->getOperand(0), N->getOperand(1));
+  return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0), N->getOperand(0),
+                     N->getOperand(1));
 }
 
 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
@@ -55311,8 +55398,8 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
   if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
     return SDValue();
 
-  EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
-                                         VT);
+  EVT SetCCType =
+      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
 
   // There are 4 possibilities involving NaN inputs, and these are the required
   // outputs:
@@ -55362,8 +55449,8 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
     MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
     if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
       SDLoc dl(N);
-      SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
-                                    DAG.getBitcast(InVT, VZLoad));
+      SDValue Convert =
+          DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
       DCI.CombineTo(N, Convert);
       DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
       DCI.recursivelyDeleteUnusedNodes(LN);
@@ -55856,8 +55943,8 @@ static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
 
   // Only combine legal element types.
   EVT SVT = VT.getVectorElementType();
-  if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
-      SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
+  if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 && SVT != MVT::i64 &&
+      SVT != MVT::f32 && SVT != MVT::f64)
     return SDValue();
 
   // We don't have CMPP Instruction for vxf16
@@ -55897,16 +55984,15 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
   SDLoc DL(N);
 
   // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
-  if (!DCI.isBeforeLegalizeOps() &&
-      N0.getOpcode() == X86ISD::SETCC_CARRY) {
+  if (!DCI.isBeforeLegalizeOps() && N0.getOpcode() == X86ISD::SETCC_CARRY) {
     SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
-                                 N0->getOperand(1));
+                                N0->getOperand(1));
     bool ReplaceOtherUses = !N0.hasOneUse();
     DCI.CombineTo(N, Setcc);
     // Replace other uses with a truncate of the widened setcc_carry.
     if (ReplaceOtherUses) {
-      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
-                                  N0.getValueType(), Setcc);
+      SDValue Trunc =
+          DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), Setcc);
       DCI.CombineTo(N0.getNode(), Trunc);
     }
 
@@ -56199,13 +56285,13 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
   if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
       N0.getOpcode() == X86ISD::SETCC_CARRY) {
     SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
-                                 N0->getOperand(1));
+                                N0->getOperand(1));
     bool ReplaceOtherUses = !N0.hasOneUse();
     DCI.CombineTo(N, Setcc);
     // Replace other uses with a truncate of the widened setcc_carry.
     if (ReplaceOtherUses) {
-      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
-                                  N0.getValueType(), Setcc);
+      SDValue Trunc =
+          DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), Setcc);
       DCI.CombineTo(N0.getNode(), Trunc);
     }
 
@@ -56441,8 +56527,8 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
           if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
             SDValue BaseOp = LHS.getOperand(0);
             SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
-            SDValue SETCC1 = DAG.getSetCC(
-                DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
+            SDValue SETCC1 = DAG.getSetCC(DL, VT, BaseOp,
+                                          DAG.getConstant(-CInt, DL, OpVT), CC);
             return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
                                SETCC0, SETCC1);
           }
@@ -56802,19 +56888,25 @@ static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
   SDLoc DL(GorS);
 
   if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
-    SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
-                      Gather->getMask(), Base, Index, Scale } ;
-    return DAG.getMaskedGather(Gather->getVTList(),
-                               Gather->getMemoryVT(), DL, Ops,
-                               Gather->getMemOperand(),
+    SDValue Ops[] = {Gather->getChain(),
+                     Gather->getPassThru(),
+                     Gather->getMask(),
+                     Base,
+                     Index,
+                     Scale};
+    return DAG.getMaskedGather(Gather->getVTList(), Gather->getMemoryVT(), DL,
+                               Ops, Gather->getMemOperand(),
                                Gather->getIndexType(),
                                Gather->getExtensionType());
   }
   auto *Scatter = cast<MaskedScatterSDNode>(GorS);
-  SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
-                    Scatter->getMask(), Base, Index, Scale };
-  return DAG.getMaskedScatter(Scatter->getVTList(),
-                              Scatter->getMemoryVT(), DL,
+  SDValue Ops[] = {Scatter->getChain(),
+                   Scatter->getValue(),
+                   Scatter->getMask(),
+                   Base,
+                   Index,
+                   Scale};
+  return DAG.getMaskedScatter(Scatter->getVTList(), Scatter->getMemoryVT(), DL,
                               Ops, Scatter->getMemOperand(),
                               Scatter->getIndexType(),
                               Scatter->isTruncatingStore());
@@ -57045,8 +57137,8 @@ static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
       SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
     // The AND node needs bitcasts to/from an integer vector type around it.
     SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
-    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
-                                 MaskConst);
+    SDValue NewAnd =
+        DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0), MaskConst);
     SDValue Res = DAG.getBitcast(VT, NewAnd);
     if (IsStrict)
       return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
@@ -57232,8 +57324,8 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
       // use CVTSI2P.
       assert(InVT == MVT::v2i64 && "Unexpected VT!");
       SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
-      SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
-                                          { 0, 2, -1, -1 });
+      SDValue Shuf =
+          DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast, {0, 2, -1, -1});
       if (IsStrict)
         return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
                            {N->getOperand(0), Shuf});
@@ -57334,7 +57426,7 @@ static bool needCarryOrOverflowFlag(SDValue Flags) {
     }
 
     switch (CC) {
-    // clang-format off
+      // clang-format off
     default: break;
     case X86::COND_A: case X86::COND_AE:
     case X86::COND_B: case X86::COND_BE:
@@ -57342,7 +57434,7 @@ static bool needCarryOrOverflowFlag(SDValue Flags) {
     case X86::COND_G: case X86::COND_GE:
     case X86::COND_L: case X86::COND_LE:
       return true;
-    // clang-format on
+      // clang-format on
     }
   }
 
@@ -57478,11 +57570,12 @@ static SDValue combineCMP(SDNode *N, SelectionDAG &DAG,
 
   // After this the truncate and arithmetic op must have a single use.
   if (!Trunc.hasOneUse() || !Op.hasOneUse())
-      return SDValue();
+    return SDValue();
 
   unsigned NewOpc;
   switch (Op.getOpcode()) {
-  default: return SDValue();
+  default:
+    return SDValue();
   case ISD::AND:
     // Skip and with constant. We have special handling for and with immediate
     // during isel to generate test instructions.
@@ -57490,8 +57583,12 @@ static SDValue combineCMP(SDNode *N, SelectionDAG &DAG,
       return SDValue();
     NewOpc = X86ISD::AND;
     break;
-  case ISD::OR:  NewOpc = X86ISD::OR;  break;
-  case ISD::XOR: NewOpc = X86ISD::XOR; break;
+  case ISD::OR:
+    NewOpc = X86ISD::OR;
+    break;
+  case ISD::XOR:
+    NewOpc = X86ISD::XOR;
+    break;
   case ISD::ADD:
     // If the carry or overflow flag is used, we can't truncate.
     if (needCarryOrOverflowFlag(SDValue(N, 0)))
@@ -57651,9 +57748,8 @@ static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
-static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N,
-                            const SDLoc &DL, EVT VT,
-                            const X86Subtarget &Subtarget) {
+static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N, const SDLoc &DL,
+                            EVT VT, const X86Subtarget &Subtarget) {
   using namespace SDPatternMatch;
 
   // Example of pattern we try to detect:
@@ -57761,9 +57857,8 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N,
 // Attempt to turn this pattern into PMADDWD.
 // (add (mul (sext (build_vector)), (sext (build_vector))),
 //      (mul (sext (build_vector)), (sext (build_vector)))
-static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N,
-                              const SDLoc &DL, EVT VT,
-                              const X86Subtarget &Subtarget) {
+static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N, const SDLoc &DL,
+                              EVT VT, const X86Subtarget &Subtarget) {
   using namespace SDPatternMatch;
 
   if (!Subtarget.hasSSE2())
@@ -57859,7 +57954,7 @@ static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N,
   // If the output is narrower than an input, extract the low part of the input
   // vector.
   EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
-                               VT.getVectorNumElements() * 2);
+                                 VT.getVectorNumElements() * 2);
   if (OutVT16.bitsLT(In0.getValueType())) {
     In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
                       DAG.getVectorIdxConstant(0, DL));
@@ -57868,8 +57963,7 @@ static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N,
     In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
                       DAG.getVectorIdxConstant(0, DL));
   }
-  return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
-                          PMADDBuilder);
+  return SplitOpsAndApply(DAG, Subtarget, DL, VT, {In0, In1}, PMADDBuilder);
 }
 
 // ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
@@ -58750,8 +58844,8 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
         unsigned Imm1 = Ops[1].getConstantOperandVal(2);
         // TODO: Handle zero'd subvectors.
         if ((Imm0 & 0x88) == 0 && (Imm1 & 0x88) == 0) {
-          int Mask[4] = {(int)(Imm0 & 0x03), (int)((Imm0 >> 4) & 0x3), (int)(Imm1 & 0x03),
-                         (int)((Imm1 >> 4) & 0x3)};
+          int Mask[4] = {(int)(Imm0 & 0x03), (int)((Imm0 >> 4) & 0x3),
+                         (int)(Imm1 & 0x03), (int)((Imm1 >> 4) & 0x3)};
           MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
           SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
                                          Ops[0].getOperand(1), DAG, DL);
@@ -58937,8 +59031,7 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
             break;
         }
 
-        ISD::CondCode ICC =
-            Opcode == X86ISD::PCMPEQ ? ISD::SETEQ : ISD::SETGT;
+        ISD::CondCode ICC = Opcode == X86ISD::PCMPEQ ? ISD::SETEQ : ISD::SETGT;
         ISD::CondCode FCC =
             Opcode == X86ISD::PCMPEQ ? ISD::SETOEQ : ISD::SETOGT;
 
@@ -59240,7 +59333,8 @@ static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
     APInt Constant = APInt::getZero(VT.getSizeInBits());
     for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
       auto *C = dyn_cast<ConstantSDNode>(peekThroughBitcasts(Ops[I]));
-      if (!C) break;
+      if (!C)
+        break;
       Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
       if (I == (E - 1)) {
         EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
@@ -59313,9 +59407,9 @@ static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
           ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
           Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
               SubVecVT.getFixedSizeInBits())
-          return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
-                             getZeroVector(OpVT, Subtarget, DAG, dl),
-                             Ins.getOperand(1), N->getOperand(2));
+        return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
+                           getZeroVector(OpVT, Subtarget, DAG, dl),
+                           Ins.getOperand(1), N->getOperand(2));
     }
   }
 
@@ -59549,8 +59643,7 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDLoc DL(N);
 
-  if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
-      TLI.isTypeLegal(InVecVT) &&
+  if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && TLI.isTypeLegal(InVecVT) &&
       InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
     auto isConcatenatedNot = [](SDValue V) {
       V = peekThroughBitcasts(V);
@@ -60007,7 +60100,7 @@ static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
       LHS.getOperand(0).getValueType() == MVT::v4i32) {
     SDLoc dl(N);
     LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
-                               LHS.getOperand(0), { 0, -1, 1, -1 });
+                               LHS.getOperand(0), {0, -1, 1, -1});
     LHS = DAG.getBitcast(MVT::v2i64, LHS);
     return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
   }
@@ -60017,7 +60110,7 @@ static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
       RHS.getOperand(0).getValueType() == MVT::v4i32) {
     SDLoc dl(N);
     RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
-                               RHS.getOperand(0), { 0, -1, 1, -1 });
+                               RHS.getOperand(0), {0, -1, 1, -1});
     RHS = DAG.getBitcast(MVT::v2i64, RHS);
     return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
   }
@@ -60253,16 +60346,16 @@ static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
   // Widen to at least 8 input elements.
   if (NumElts < 8) {
     unsigned NumConcats = 8 / NumElts;
-    SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
-                                : DAG.getConstant(0, dl, IntVT);
+    SDValue Fill =
+        NumElts == 4 ? DAG.getUNDEF(IntVT) : DAG.getConstant(0, dl, IntVT);
     SmallVector<SDValue, 4> Ops(NumConcats, Fill);
     Ops[0] = Src;
     Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
   }
 
   // Destination is vXf32 with at least 4 elements.
-  EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
-                               std::max(4U, NumElts));
+  EVT CvtVT =
+      EVT::getVectorVT(*DAG.getContext(), MVT::f32, std::max(4U, NumElts));
   SDValue Cvt, Chain;
   if (IsStrict) {
     Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
@@ -60532,7 +60625,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
   switch (N->getOpcode()) {
-  // clang-format off
+    // clang-format off
   default: break;
   case ISD::SCALAR_TO_VECTOR:
     return combineSCALAR_TO_VECTOR(N, DAG, Subtarget);
@@ -60881,7 +60974,8 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
 
   bool Commute = false;
   switch (Op.getOpcode()) {
-  default: return false;
+  default:
+    return false;
   case ISD::SIGN_EXTEND:
   case ISD::ZERO_EXTEND:
   case ISD::ANY_EXTEND:
@@ -60921,8 +61015,7 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
         ((Commute && !isa<ConstantSDNode>(N1)) ||
          (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
       return false;
-    if (IsFoldableAtomicRMW(N0, Op) ||
-        (Commute && IsFoldableAtomicRMW(N1, Op)))
+    if (IsFoldableAtomicRMW(N0, Op) || (Commute && IsFoldableAtomicRMW(N1, Op)))
       return false;
   }
   }
@@ -60984,7 +61077,8 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
   SplitString(AsmStr, AsmPieces, ";\n");
 
   switch (AsmPieces.size()) {
-  default: return false;
+  default:
+    return false;
   case 1:
     // FIXME: this should verify that we are targeting a 486 or better.  If not,
     // we will turn this bswap into something that will be lowered to logical
@@ -61031,9 +61125,9 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
 
     if (CI->getType()->isIntegerTy(64)) {
       InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
-      if (Constraints.size() >= 2 &&
-          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
-          Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
+      if (Constraints.size() >= 2 && Constraints[0].Codes.size() == 1 &&
+          Constraints[0].Codes[0] == "A" && Constraints[1].Codes.size() == 1 &&
+          Constraints[1].Codes[0] == "0") {
         // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
         if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
             matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
@@ -61120,8 +61214,7 @@ X86TargetLowering::getConstraintType(StringRef Constraint) const {
     default:
       break;
     }
-  }
-  else if (Constraint.size() == 2) {
+  } else if (Constraint.size() == 2) {
     switch (Constraint[0]) {
     default:
       break;
@@ -61310,8 +61403,7 @@ X86TargetLowering::getSingleConstraintMatchWeight(
 /// Try to replace an X constraint, which matches anything, with another that
 /// has more specific requirements based on the type of the corresponding
 /// operand.
-const char *X86TargetLowering::
-LowerXConstraint(EVT ConstraintVT) const {
+const char *X86TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
   // FP X constraints get lowered to SSE1/2 registers if available, otherwise
   // 'f' like normal targets.
   if (ConstraintVT.isFloatingPoint()) {
@@ -61357,7 +61449,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
   SDValue Result;
   char ConstraintLetter = Constraint[0];
   switch (ConstraintLetter) {
-  default: break;
+  default:
+    break;
   case 'I':
     if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
       if (C->getZExtValue() <= 31) {
@@ -61431,8 +61524,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
         Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
         break;
       }
-    // FIXME gcc accepts some relocatable values here too, but only in certain
-    // memory models; it's complicated.
+      // FIXME gcc accepts some relocatable values here too, but only in certain
+      // memory models; it's complicated.
     }
     return;
   }
@@ -61475,8 +61568,8 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
     if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
       bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
       BooleanContent BCont = getBooleanContents(MVT::i64);
-      ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
-                                    : ISD::SIGN_EXTEND;
+      ISD::NodeType ExtOpc =
+          IsBool ? getExtendForContent(BCont) : ISD::SIGN_EXTEND;
       int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
                                                   : CST->getSExtValue();
       Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
@@ -61555,7 +61648,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
   if (Constraint.size() == 1) {
     // GCC Constraint Letters
     switch (Constraint[0]) {
-    default: break;
+    default:
+      break;
     // 'A' means [ER]AX + [ER]DX.
     case 'A':
       if (Subtarget.is64Bit())
@@ -61583,7 +61677,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
           return std::make_pair(0U, &X86::VK64RegClass);
       }
       break;
-    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
+    case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
       if (Subtarget.is64Bit()) {
         if (VT == MVT::i8 || VT == MVT::i1)
           return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
@@ -61605,7 +61699,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
       }
       [[fallthrough]];
       // 32-bit fallthrough
-    case 'Q':   // Q_REGS
+    case 'Q': // Q_REGS
       if (VT == MVT::i8 || VT == MVT::i1)
         return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
       if (VT == MVT::i16)
@@ -61616,8 +61710,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
       if (VT != MVT::f80 && !VT.isVector())
         return std::make_pair(0U, &X86::GR64_ABCDRegClass);
       break;
-    case 'r':   // GENERAL_REGS
-    case 'l':   // INDEX_REGS
+    case 'r': // GENERAL_REGS
+    case 'l': // INDEX_REGS
       if (VT == MVT::i8 || VT == MVT::i1)
         return std::make_pair(0U, useEGPRInlineAsm(Subtarget)
                                       ? &X86::GR8RegClass
@@ -61636,7 +61730,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                       ? &X86::GR64RegClass
                                       : &X86::GR64_NOREX2RegClass);
       break;
-    case 'R':   // LEGACY_REGS
+    case 'R': // LEGACY_REGS
       if (VT == MVT::i8 || VT == MVT::i1)
         return std::make_pair(0U, &X86::GR8_NOREXRegClass);
       if (VT == MVT::i16)
@@ -61647,7 +61741,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
       if (VT != MVT::f80 && !VT.isVector())
         return std::make_pair(0U, &X86::GR64_NOREXRegClass);
       break;
-    case 'f':  // FP Stack registers.
+    case 'f': // FP Stack registers.
       // If SSE is enabled for this VT, use f80 to ensure the isel moves the
       // value to the correct fpstack register class.
       if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
@@ -61657,16 +61751,19 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
       if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
         return std::make_pair(0U, &X86::RFP80RegClass);
       break;
-    case 'y':   // MMX_REGS if MMX allowed.
-      if (!Subtarget.hasMMX()) break;
+    case 'y': // MMX_REGS if MMX allowed.
+      if (!Subtarget.hasMMX())
+        break;
       return std::make_pair(0U, &X86::VR64RegClass);
     case 'v':
-    case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
-      if (!Subtarget.hasSSE1()) break;
+    case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
+      if (!Subtarget.hasSSE1())
+        break;
       bool VConstraint = (Constraint[0] == 'v');
 
       switch (VT.SimpleTy) {
-      default: break;
+      default:
+        break;
       // Scalar SSE types.
       case MVT::f16:
         if (VConstraint && Subtarget.hasFP16())
@@ -61754,7 +61851,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
       case MVT::v16f32:
       case MVT::v16i32:
       case MVT::v8i64:
-        if (!Subtarget.hasAVX512()) break;
+        if (!Subtarget.hasAVX512())
+          break;
         if (VConstraint)
           return std::make_pair(0U, &X86::VR512RegClass);
         return std::make_pair(0U, &X86::VR512_0_15RegClass);
@@ -61770,12 +61868,15 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
     case '2':
       return getRegForInlineAsmConstraint(TRI, "x", VT);
     case 'm':
-      if (!Subtarget.hasMMX()) break;
+      if (!Subtarget.hasMMX())
+        break;
       return std::make_pair(0U, &X86::VR64RegClass);
     case 'z':
-      if (!Subtarget.hasSSE1()) break;
+      if (!Subtarget.hasSSE1())
+        break;
       switch (VT.SimpleTy) {
-      default: break;
+      default:
+        break;
       // Scalar SSE types.
       case MVT::f16:
         if (!Subtarget.hasFP16())
@@ -61890,14 +61991,15 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
 
   // Use the default implementation in TargetLowering to convert the register
   // constraint into a member of a register class.
-  std::pair<Register, const TargetRegisterClass*> Res;
+  std::pair<Register, const TargetRegisterClass *> Res;
   Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
 
   // Not found as a standard register?
   if (!Res.second) {
     // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
     // to/from f80.
-    if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
+    if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 ||
+        VT == MVT::f80) {
       // Map st(0) -> st(7) -> ST0
       if (Constraint.size() == 7 && Constraint[0] == '{' &&
           tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
@@ -61955,7 +62057,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
   // turn into {ax},{dx}.
   // MVT::Other is used to specify clobber names.
   if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
-    return Res;   // Correct type already, nothing to do.
+    return Res; // Correct type already, nothing to do.
 
   // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should
   // return "eax". This should even work for things like getting 64bit integer
@@ -61967,7 +62069,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
   // Therefore, use a helper method.
   if (isGRClass(*Class)) {
     unsigned Size = VT.getSizeInBits();
-    if (Size == 1) Size = 8;
+    if (Size == 1)
+      Size = 8;
     if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
       return std::make_pair(0, nullptr);
     Register DestReg = getX86SubSuperRegister(Res.first, Size);
@@ -61975,9 +62078,11 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
       bool is64Bit = Subtarget.is64Bit();
       const TargetRegisterClass *RC =
           Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
-        : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
-        : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
-        : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
+          : Size == 16
+              ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
+          : Size == 32
+              ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
+              : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
       if (Size == 64 && !is64Bit) {
         // Model GCC's behavior here and select a fixed pair of 32-bit
         // registers.
@@ -62229,8 +62334,7 @@ X86TargetLowering::getStackProbeSymbolName(const MachineFunction &MF) const {
   return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
 }
 
-unsigned
-X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const {
+unsigned X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const {
   // The default stack probe size is 4096 if the function has no stackprobesize
   // attribute.
   return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
diff --git a/llvm/test/CodeGen/X86/ctlz-gfni.ll b/llvm/test/CodeGen/X86/ctlz-gfni.ll
new file mode 100644
index 0000000000000..d942f5ad3506f
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ctlz-gfni.ll
@@ -0,0 +1,15 @@
+; RUN: llc -mtriple=x86_64 -mattr=+gfni,+ssse3 < %s | FileCheck %s
+
+define <16 x i8> @test_ctlz_gfni(<16 x i8> %x) {
+; CHECK-LABEL: @test_ctlz_gfni
+; CHECK: gf2p8affineqb
+; CHECK: paddb
+; CHECK: pandn
+; CHECK: gf2p8affineqb
+; CHECK: ret
+  %r = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %x, i1 false)
+  ret <16 x i8> %r
+}
+
+declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1)
+
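For reviewers who want a scalar mental model of the sequence checked above (gf2p8affineqb / paddb / pandn / gf2p8affineqb): the lowering appears to build on ctlz(x) == cttz(bitreverse(x)), with the trailing-zero count obtained by isolating the lowest set bit and then converting that power of two back to an index. The sketch below is illustrative only; it mirrors the structure of the vector sequence, not the exact GF2P8AFFINEQB matrices or the vectorization.

#include <stdint.h>

// Reverse the bit order of one byte; in the vector sequence this is the
// role of the first gf2p8affineqb.
static uint8_t bitrev8(uint8_t x) {
  x = (uint8_t)(((x & 0xF0) >> 4) | ((x & 0x0F) << 4));
  x = (uint8_t)(((x & 0xCC) >> 2) | ((x & 0x33) << 2));
  x = (uint8_t)(((x & 0xAA) >> 1) | ((x & 0x55) << 1));
  return x;
}

// ctlz(x) == cttz(bitrev8(x)). The paddb/pandn pair corresponds to
// isolating the lowest set bit (y & ~(y - 1)); the final affine
// transform then maps that power of two to its index, and its imm8
// addend makes a zero byte produce 8 (llvm.ctlz with is-zero-poison
// false returns the bit width for zero).
static uint8_t ctlz8(uint8_t x) {
  uint8_t y = bitrev8(x);
  uint8_t lsb = (uint8_t)(y & ~(y - 1)); // 0 iff x == 0
  if (lsb == 0)
    return 8;
  uint8_t n = 0;
  while (!(lsb & 1)) {
    lsb >>= 1;
    ++n;
  }
  return n;
}

For example, ctlz8(0x1c) returns 3 and ctlz8(0) returns 8, matching llvm.ctlz.v16i8 lane-wise in the tests above and below.
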
diff --git a/llvm/test/CodeGen/X86/gfni-lzcnt.ll b/llvm/test/CodeGen/X86/gfni-lzcnt.ll
index 8e48950c32cd8..f4dd1d1b77ea9 100644
--- a/llvm/test/CodeGen/X86/gfni-lzcnt.ll
+++ b/llvm/test/CodeGen/X86/gfni-lzcnt.ll
@@ -8,40 +8,44 @@
 define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
 ; GFNISSE-LABEL: testv16i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; GFNISSE-NEXT:    movdqa %xmm1, %xmm2
-; GFNISSE-NEXT:    pshufb %xmm0, %xmm2
-; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; GFNISSE-NEXT:    pxor %xmm3, %xmm3
-; GFNISSE-NEXT:    pcmpeqb %xmm0, %xmm3
-; GFNISSE-NEXT:    pand %xmm2, %xmm3
-; GFNISSE-NEXT:    pshufb %xmm0, %xmm1
-; GFNISSE-NEXT:    paddb %xmm3, %xmm1
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm0
+; GFNISSE-NEXT:    pcmpeqd %xmm1, %xmm1
+; GFNISSE-NEXT:    paddb %xmm0, %xmm1
+; GFNISSE-NEXT:    pandn %xmm0, %xmm1
+; GFNISSE-NEXT:    gf2p8affineqb $8, %xmm2, %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
 ; GFNISSE-NEXT:    retq
 ;
-; GFNIAVX1OR2-LABEL: testv16i8:
-; GFNIAVX1OR2:       # %bb.0:
-; GFNIAVX1OR2-NEXT:    vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; GFNIAVX1OR2-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
-; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; GFNIAVX1OR2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; GFNIAVX1OR2-NEXT:    vpcmpeqb %xmm3, %xmm0, %xmm3
-; GFNIAVX1OR2-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; GFNIAVX1OR2-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
-; GFNIAVX1OR2-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
-; GFNIAVX1OR2-NEXT:    retq
+; GFNIAVX1-LABEL: testv16i8:
+; GFNIAVX1:       # %bb.0:
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm1 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
+; GFNIAVX1-NEXT:    # xmm1 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm2
+; GFNIAVX1-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; GFNIAVX1-NEXT:    vgf2p8affineqb $8, %xmm1, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    retq
+;
+; GFNIAVX2-LABEL: testv16i8:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0
+; GFNIAVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; GFNIAVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm2
+; GFNIAVX2-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; GFNIAVX2-NEXT:    vgf2p8affineqb $8, %xmm1, %xmm0, %xmm0
+; GFNIAVX2-NEXT:    retq
 ;
 ; GFNIAVX512-LABEL: testv16i8:
 ; GFNIAVX512:       # %bb.0:
-; GFNIAVX512-NEXT:    vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; GFNIAVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
-; GFNIAVX512-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
-; GFNIAVX512-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; GFNIAVX512-NEXT:    vpcmpeqb %xmm3, %xmm0, %xmm3
-; GFNIAVX512-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; GFNIAVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
-; GFNIAVX512-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
+; GFNIAVX512-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
+; GFNIAVX512-NEXT:    vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0
+; GFNIAVX512-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; GFNIAVX512-NEXT:    vpaddb %xmm2, %xmm0, %xmm2
+; GFNIAVX512-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; GFNIAVX512-NEXT:    vgf2p8affineqb $8, %xmm1, %xmm0, %xmm0
 ; GFNIAVX512-NEXT:    retq
   %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %in, i1 0)
   ret <16 x i8> %out
@@ -50,40 +54,44 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
 define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
 ; GFNISSE-LABEL: testv16i8u:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; GFNISSE-NEXT:    movdqa %xmm1, %xmm2
-; GFNISSE-NEXT:    pshufb %xmm0, %xmm2
-; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; GFNISSE-NEXT:    pxor %xmm3, %xmm3
-; GFNISSE-NEXT:    pcmpeqb %xmm0, %xmm3
-; GFNISSE-NEXT:    pand %xmm2, %xmm3
-; GFNISSE-NEXT:    pshufb %xmm0, %xmm1
-; GFNISSE-NEXT:    paddb %xmm3, %xmm1
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm0
+; GFNISSE-NEXT:    pcmpeqd %xmm1, %xmm1
+; GFNISSE-NEXT:    paddb %xmm0, %xmm1
+; GFNISSE-NEXT:    pandn %xmm0, %xmm1
+; GFNISSE-NEXT:    gf2p8affineqb $8, %xmm2, %xmm1
 ; GFNISSE-NEXT:    movdqa %xmm1, %xmm0
 ; GFNISSE-NEXT:    retq
 ;
-; GFNIAVX1OR2-LABEL: testv16i8u:
-; GFNIAVX1OR2:       # %bb.0:
-; GFNIAVX1OR2-NEXT:    vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; GFNIAVX1OR2-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
-; GFNIAVX1OR2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; GFNIAVX1OR2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; GFNIAVX1OR2-NEXT:    vpcmpeqb %xmm3, %xmm0, %xmm3
-; GFNIAVX1OR2-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; GFNIAVX1OR2-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
-; GFNIAVX1OR2-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
-; GFNIAVX1OR2-NEXT:    retq
+; GFNIAVX1-LABEL: testv16i8u:
+; GFNIAVX1:       # %bb.0:
+; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm1 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
+; GFNIAVX1-NEXT:    # xmm1 = mem[0,0]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm2
+; GFNIAVX1-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; GFNIAVX1-NEXT:    vgf2p8affineqb $8, %xmm1, %xmm0, %xmm0
+; GFNIAVX1-NEXT:    retq
+;
+; GFNIAVX2-LABEL: testv16i8u:
+; GFNIAVX2:       # %bb.0:
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0
+; GFNIAVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; GFNIAVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm2
+; GFNIAVX2-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; GFNIAVX2-NEXT:    vgf2p8affineqb $8, %xmm1, %xmm0, %xmm0
+; GFNIAVX2-NEXT:    retq
 ;
 ; GFNIAVX512-LABEL: testv16i8u:
 ; GFNIAVX512:       # %bb.0:
-; GFNIAVX512-NEXT:    vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; GFNIAVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm2
-; GFNIAVX512-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
-; GFNIAVX512-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; GFNIAVX512-NEXT:    vpcmpeqb %xmm3, %xmm0, %xmm3
-; GFNIAVX512-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; GFNIAVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
-; GFNIAVX512-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
+; GFNIAVX512-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
+; GFNIAVX512-NEXT:    vgf2p8affineqb $0, %xmm1, %xmm0, %xmm0
+; GFNIAVX512-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; GFNIAVX512-NEXT:    vpaddb %xmm2, %xmm0, %xmm2
+; GFNIAVX512-NEXT:    vpandn %xmm0, %xmm2, %xmm0
+; GFNIAVX512-NEXT:    vgf2p8affineqb $8, %xmm1, %xmm0, %xmm0
 ; GFNIAVX512-NEXT:    retq
   %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %in, i1 -1)
   ret <16 x i8> %out
@@ -92,73 +100,52 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
 define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
 ; GFNISSE-LABEL: testv32i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    movq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; GFNISSE-NEXT:    movdqa %xmm2, %xmm3
-; GFNISSE-NEXT:    pshufb %xmm0, %xmm3
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm0
-; GFNISSE-NEXT:    pxor %xmm5, %xmm5
-; GFNISSE-NEXT:    movdqa %xmm2, %xmm6
-; GFNISSE-NEXT:    pshufb %xmm0, %xmm6
-; GFNISSE-NEXT:    pcmpeqb %xmm5, %xmm0
-; GFNISSE-NEXT:    pand %xmm3, %xmm0
-; GFNISSE-NEXT:    paddb %xmm6, %xmm0
-; GFNISSE-NEXT:    movdqa %xmm2, %xmm3
-; GFNISSE-NEXT:    pshufb %xmm1, %xmm3
+; GFNISSE-NEXT:    pcmpeqd %xmm2, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm0, %xmm3
+; GFNISSE-NEXT:    paddb %xmm2, %xmm3
+; GFNISSE-NEXT:    pandn %xmm0, %xmm3
+; GFNISSE-NEXT:    gf2p8affineqb $8, %xmm4, %xmm3
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm1
-; GFNISSE-NEXT:    pcmpeqb %xmm1, %xmm5
-; GFNISSE-NEXT:    pand %xmm3, %xmm5
-; GFNISSE-NEXT:    pshufb %xmm1, %xmm2
-; GFNISSE-NEXT:    paddb %xmm5, %xmm2
+; GFNISSE-NEXT:    paddb %xmm1, %xmm2
+; GFNISSE-NEXT:    pandn %xmm1, %xmm2
+; GFNISSE-NEXT:    gf2p8affineqb $8, %xmm4, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm3, %xmm0
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm1
 ; GFNISSE-NEXT:    retq
 ;
 ; GFNIAVX1-LABEL: testv32i8:
 ; GFNIAVX1:       # %bb.0:
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; GFNIAVX1-NEXT:    vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; GFNIAVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm3
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
-; GFNIAVX1-NEXT:    # xmm4 = mem[0,0]
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
-; GFNIAVX1-NEXT:    vpcmpeqb %xmm5, %xmm1, %xmm6
-; GFNIAVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
-; GFNIAVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
-; GFNIAVX1-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
-; GFNIAVX1-NEXT:    vpshufb %xmm0, %xmm2, %xmm3
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpcmpeqb %xmm5, %xmm0, %xmm4
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
-; GFNIAVX1-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
-; GFNIAVX1-NEXT:    vpaddb %xmm0, %xmm3, %xmm0
-; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; GFNIAVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
+; GFNIAVX1-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm3
+; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; GFNIAVX1-NEXT:    vandnps %ymm0, %ymm2, %ymm0
+; GFNIAVX1-NEXT:    vgf2p8affineqb $8, %ymm1, %ymm0, %ymm0
 ; GFNIAVX1-NEXT:    retq
 ;
 ; GFNIAVX2-LABEL: testv32i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; GFNIAVX2-NEXT:    # ymm1 = mem[0,1,0,1]
-; GFNIAVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; GFNIAVX2-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm3
-; GFNIAVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
-; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm2
+; GFNIAVX2-NEXT:    vpandn %ymm0, %ymm2, %ymm0
+; GFNIAVX2-NEXT:    vgf2p8affineqb $8, %ymm1, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    retq
 ;
 ; GFNIAVX512-LABEL: testv32i8:
 ; GFNIAVX512:       # %bb.0:
-; GFNIAVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; GFNIAVX512-NEXT:    # ymm1 = mem[0,1,0,1]
-; GFNIAVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
-; GFNIAVX512-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
-; GFNIAVX512-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; GFNIAVX512-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm3
-; GFNIAVX512-NEXT:    vpand %ymm3, %ymm2, %ymm2
-; GFNIAVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
-; GFNIAVX512-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
+; GFNIAVX512-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
+; GFNIAVX512-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
+; GFNIAVX512-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; GFNIAVX512-NEXT:    vpaddb %ymm2, %ymm0, %ymm2
+; GFNIAVX512-NEXT:    vpandn %ymm0, %ymm2, %ymm0
+; GFNIAVX512-NEXT:    vgf2p8affineqb $8, %ymm1, %ymm0, %ymm0
 ; GFNIAVX512-NEXT:    retq
   %out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %in, i1 0)
   ret <32 x i8> %out
@@ -167,73 +154,52 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
 define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
 ; GFNISSE-LABEL: testv32i8u:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    movq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; GFNISSE-NEXT:    movdqa %xmm2, %xmm3
-; GFNISSE-NEXT:    pshufb %xmm0, %xmm3
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm0
-; GFNISSE-NEXT:    pxor %xmm5, %xmm5
-; GFNISSE-NEXT:    movdqa %xmm2, %xmm6
-; GFNISSE-NEXT:    pshufb %xmm0, %xmm6
-; GFNISSE-NEXT:    pcmpeqb %xmm5, %xmm0
-; GFNISSE-NEXT:    pand %xmm3, %xmm0
-; GFNISSE-NEXT:    paddb %xmm6, %xmm0
-; GFNISSE-NEXT:    movdqa %xmm2, %xmm3
-; GFNISSE-NEXT:    pshufb %xmm1, %xmm3
+; GFNISSE-NEXT:    pcmpeqd %xmm2, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm0, %xmm3
+; GFNISSE-NEXT:    paddb %xmm2, %xmm3
+; GFNISSE-NEXT:    pandn %xmm0, %xmm3
+; GFNISSE-NEXT:    gf2p8affineqb $8, %xmm4, %xmm3
 ; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm1
-; GFNISSE-NEXT:    pcmpeqb %xmm1, %xmm5
-; GFNISSE-NEXT:    pand %xmm3, %xmm5
-; GFNISSE-NEXT:    pshufb %xmm1, %xmm2
-; GFNISSE-NEXT:    paddb %xmm5, %xmm2
+; GFNISSE-NEXT:    paddb %xmm1, %xmm2
+; GFNISSE-NEXT:    pandn %xmm1, %xmm2
+; GFNISSE-NEXT:    gf2p8affineqb $8, %xmm4, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm3, %xmm0
 ; GFNISSE-NEXT:    movdqa %xmm2, %xmm1
 ; GFNISSE-NEXT:    retq
 ;
 ; GFNIAVX1-LABEL: testv32i8u:
 ; GFNIAVX1:       # %bb.0:
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; GFNIAVX1-NEXT:    vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; GFNIAVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm3
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
-; GFNIAVX1-NEXT:    # xmm4 = mem[0,0]
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
-; GFNIAVX1-NEXT:    vpcmpeqb %xmm5, %xmm1, %xmm6
-; GFNIAVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
-; GFNIAVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
-; GFNIAVX1-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
-; GFNIAVX1-NEXT:    vpshufb %xmm0, %xmm2, %xmm3
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm4, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpcmpeqb %xmm5, %xmm0, %xmm4
-; GFNIAVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
-; GFNIAVX1-NEXT:    vpshufb %xmm0, %xmm2, %xmm0
-; GFNIAVX1-NEXT:    vpaddb %xmm0, %xmm3, %xmm0
-; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; GFNIAVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
+; GFNIAVX1-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
+; GFNIAVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm3
+; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; GFNIAVX1-NEXT:    vandnps %ymm0, %ymm2, %ymm0
+; GFNIAVX1-NEXT:    vgf2p8affineqb $8, %ymm1, %ymm0, %ymm0
 ; GFNIAVX1-NEXT:    retq
 ;
 ; GFNIAVX2-LABEL: testv32i8u:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; GFNIAVX2-NEXT:    # ymm1 = mem[0,1,0,1]
-; GFNIAVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; GFNIAVX2-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm3
-; GFNIAVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
-; GFNIAVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
-; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; GFNIAVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm2
+; GFNIAVX2-NEXT:    vpandn %ymm0, %ymm2, %ymm0
+; GFNIAVX2-NEXT:    vgf2p8affineqb $8, %ymm1, %ymm0, %ymm0
 ; GFNIAVX2-NEXT:    retq
 ;
 ; GFNIAVX512-LABEL: testv32i8u:
 ; GFNIAVX512:       # %bb.0:
-; GFNIAVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; GFNIAVX512-NEXT:    # ymm1 = mem[0,1,0,1]
-; GFNIAVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm2
-; GFNIAVX512-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
-; GFNIAVX512-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; GFNIAVX512-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm3
-; GFNIAVX512-NEXT:    vpand %ymm3, %ymm2, %ymm2
-; GFNIAVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
-; GFNIAVX512-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
+; GFNIAVX512-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
+; GFNIAVX512-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
+; GFNIAVX512-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; GFNIAVX512-NEXT:    vpaddb %ymm2, %ymm0, %ymm2
+; GFNIAVX512-NEXT:    vpandn %ymm0, %ymm2, %ymm0
+; GFNIAVX512-NEXT:    vgf2p8affineqb $8, %ymm1, %ymm0, %ymm0
 ; GFNIAVX512-NEXT:    retq
   %out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %in, i1 -1)
   ret <32 x i8> %out
@@ -242,130 +208,88 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
 define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
 ; GFNISSE-LABEL: testv64i8:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    movdqa %xmm3, %xmm4
-; GFNISSE-NEXT:    movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; GFNISSE-NEXT:    movdqa %xmm3, %xmm7
-; GFNISSE-NEXT:    pshufb %xmm0, %xmm7
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm6 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm6, %xmm0
-; GFNISSE-NEXT:    pxor %xmm5, %xmm5
-; GFNISSE-NEXT:    movdqa %xmm3, %xmm8
-; GFNISSE-NEXT:    pshufb %xmm0, %xmm8
-; GFNISSE-NEXT:    pcmpeqb %xmm5, %xmm0
-; GFNISSE-NEXT:    pand %xmm7, %xmm0
-; GFNISSE-NEXT:    paddb %xmm8, %xmm0
-; GFNISSE-NEXT:    movdqa %xmm3, %xmm7
-; GFNISSE-NEXT:    pshufb %xmm1, %xmm7
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm6, %xmm1
-; GFNISSE-NEXT:    movdqa %xmm3, %xmm8
-; GFNISSE-NEXT:    pshufb %xmm1, %xmm8
-; GFNISSE-NEXT:    pcmpeqb %xmm5, %xmm1
-; GFNISSE-NEXT:    pand %xmm7, %xmm1
-; GFNISSE-NEXT:    paddb %xmm8, %xmm1
-; GFNISSE-NEXT:    movdqa %xmm3, %xmm7
-; GFNISSE-NEXT:    pshufb %xmm2, %xmm7
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm6, %xmm2
-; GFNISSE-NEXT:    movdqa %xmm3, %xmm8
-; GFNISSE-NEXT:    pshufb %xmm2, %xmm8
-; GFNISSE-NEXT:    pcmpeqb %xmm5, %xmm2
-; GFNISSE-NEXT:    pand %xmm7, %xmm2
-; GFNISSE-NEXT:    paddb %xmm8, %xmm2
-; GFNISSE-NEXT:    movdqa %xmm3, %xmm7
-; GFNISSE-NEXT:    pshufb %xmm4, %xmm7
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm6, %xmm4
-; GFNISSE-NEXT:    pcmpeqb %xmm4, %xmm5
-; GFNISSE-NEXT:    pand %xmm7, %xmm5
-; GFNISSE-NEXT:    pshufb %xmm4, %xmm3
-; GFNISSE-NEXT:    paddb %xmm5, %xmm3
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm8 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm8, %xmm0
+; GFNISSE-NEXT:    pcmpeqd %xmm4, %xmm4
+; GFNISSE-NEXT:    movdqa %xmm0, %xmm5
+; GFNISSE-NEXT:    paddb %xmm4, %xmm5
+; GFNISSE-NEXT:    pandn %xmm0, %xmm5
+; GFNISSE-NEXT:    gf2p8affineqb $8, %xmm8, %xmm5
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm8, %xmm1
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm6
+; GFNISSE-NEXT:    paddb %xmm4, %xmm6
+; GFNISSE-NEXT:    pandn %xmm1, %xmm6
+; GFNISSE-NEXT:    gf2p8affineqb $8, %xmm8, %xmm6
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm8, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm7
+; GFNISSE-NEXT:    paddb %xmm4, %xmm7
+; GFNISSE-NEXT:    pandn %xmm2, %xmm7
+; GFNISSE-NEXT:    gf2p8affineqb $8, %xmm8, %xmm7
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm8, %xmm3
+; GFNISSE-NEXT:    paddb %xmm3, %xmm4
+; GFNISSE-NEXT:    pandn %xmm3, %xmm4
+; GFNISSE-NEXT:    gf2p8affineqb $8, %xmm8, %xmm4
+; GFNISSE-NEXT:    movdqa %xmm5, %xmm0
+; GFNISSE-NEXT:    movdqa %xmm6, %xmm1
+; GFNISSE-NEXT:    movdqa %xmm7, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm3
 ; GFNISSE-NEXT:    retq
 ;
 ; GFNIAVX1-LABEL: testv64i8:
 ; GFNIAVX1:       # %bb.0:
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; GFNIAVX1-NEXT:    vmovq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; GFNIAVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm4
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
-; GFNIAVX1-NEXT:    # xmm5 = mem[0,0]
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; GFNIAVX1-NEXT:    vpcmpeqb %xmm6, %xmm2, %xmm7
-; GFNIAVX1-NEXT:    vpand %xmm7, %xmm4, %xmm4
-; GFNIAVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm4, %xmm2
-; GFNIAVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm4
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpcmpeqb %xmm6, %xmm0, %xmm7
-; GFNIAVX1-NEXT:    vpand %xmm7, %xmm4, %xmm4
-; GFNIAVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; GFNIAVX1-NEXT:    vpaddb %xmm0, %xmm4, %xmm0
-; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; GFNIAVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm4
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpcmpeqb %xmm6, %xmm2, %xmm7
-; GFNIAVX1-NEXT:    vpand %xmm7, %xmm4, %xmm4
-; GFNIAVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm4, %xmm2
-; GFNIAVX1-NEXT:    vpshufb %xmm1, %xmm3, %xmm4
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpcmpeqb %xmm6, %xmm1, %xmm5
-; GFNIAVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
-; GFNIAVX1-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
-; GFNIAVX1-NEXT:    vpaddb %xmm1, %xmm4, %xmm1
-; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; GFNIAVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; GFNIAVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpaddb %xmm4, %xmm3, %xmm3
+; GFNIAVX1-NEXT:    vpaddb %xmm4, %xmm0, %xmm5
+; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; GFNIAVX1-NEXT:    vandnps %ymm0, %ymm3, %ymm0
+; GFNIAVX1-NEXT:    vgf2p8affineqb $8, %ymm2, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; GFNIAVX1-NEXT:    vpaddb %xmm4, %xmm3, %xmm3
+; GFNIAVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm4
+; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; GFNIAVX1-NEXT:    vandnps %ymm1, %ymm3, %ymm1
+; GFNIAVX1-NEXT:    vgf2p8affineqb $8, %ymm2, %ymm1, %ymm1
 ; GFNIAVX1-NEXT:    retq
 ;
 ; GFNIAVX2-LABEL: testv64i8:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; GFNIAVX2-NEXT:    # ymm2 = mem[0,1,0,1]
-; GFNIAVX2-NEXT:    vpshufb %ymm0, %ymm2, %ymm3
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpxor %xmm5, %xmm5, %xmm5
-; GFNIAVX2-NEXT:    vpcmpeqb %ymm5, %ymm0, %ymm6
-; GFNIAVX2-NEXT:    vpand %ymm6, %ymm3, %ymm3
-; GFNIAVX2-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
-; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm3, %ymm0
-; GFNIAVX2-NEXT:    vpshufb %ymm1, %ymm2, %ymm3
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vpcmpeqb %ymm5, %ymm1, %ymm4
-; GFNIAVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
-; GFNIAVX2-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
-; GFNIAVX2-NEXT:    vpaddb %ymm1, %ymm3, %ymm1
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
+; GFNIAVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm4
+; GFNIAVX2-NEXT:    vpandn %ymm0, %ymm4, %ymm0
+; GFNIAVX2-NEXT:    vgf2p8affineqb $8, %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vpaddb %ymm3, %ymm1, %ymm3
+; GFNIAVX2-NEXT:    vpandn %ymm1, %ymm3, %ymm1
+; GFNIAVX2-NEXT:    vgf2p8affineqb $8, %ymm2, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    retq
 ;
 ; GFNIAVX512VL-LABEL: testv64i8:
 ; GFNIAVX512VL:       # %bb.0:
-; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; GFNIAVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; GFNIAVX512VL-NEXT:    # ymm2 = mem[0,1,0,1]
-; GFNIAVX512VL-NEXT:    vpshufb %ymm1, %ymm2, %ymm3
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
-; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT:    vpxor %xmm5, %xmm5, %xmm5
-; GFNIAVX512VL-NEXT:    vpcmpeqb %ymm5, %ymm1, %ymm6
-; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm3, %ymm3
-; GFNIAVX512VL-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
-; GFNIAVX512VL-NEXT:    vpaddb %ymm1, %ymm3, %ymm1
-; GFNIAVX512VL-NEXT:    vpshufb %ymm0, %ymm2, %ymm3
-; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vpcmpeqb %ymm5, %ymm0, %ymm4
-; GFNIAVX512VL-NEXT:    vpand %ymm4, %ymm3, %ymm3
-; GFNIAVX512VL-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
-; GFNIAVX512VL-NEXT:    vpaddb %ymm0, %ymm3, %ymm0
-; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %zmm1, %zmm0, %zmm0
+; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; GFNIAVX512VL-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpaddb %ymm3, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpaddb %ymm3, %ymm0, %ymm3
+; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; GFNIAVX512VL-NEXT:    vpandnq %zmm0, %zmm2, %zmm0
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $8, %zmm1, %zmm0, %zmm0
 ; GFNIAVX512VL-NEXT:    retq
 ;
 ; GFNIAVX512BW-LABEL: testv64i8:
 ; GFNIAVX512BW:       # %bb.0:
-; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1
-; GFNIAVX512BW-NEXT:    vptestnmb %zmm1, %zmm1, %k1
-; GFNIAVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; GFNIAVX512BW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; GFNIAVX512BW-NEXT:    vpshufb %zmm0, %zmm2, %zmm0 {%k1} {z}
-; GFNIAVX512BW-NEXT:    vpshufb %zmm1, %zmm2, %zmm1
-; GFNIAVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
+; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vpternlogd {{.*#+}} zmm2 = -1
+; GFNIAVX512BW-NEXT:    vpaddb %zmm2, %zmm0, %zmm2
+; GFNIAVX512BW-NEXT:    vpandnq %zmm0, %zmm2, %zmm0
+; GFNIAVX512BW-NEXT:    vgf2p8affineqb $8, %zmm1, %zmm0, %zmm0
 ; GFNIAVX512BW-NEXT:    retq
   %out = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %in, i1 0)
   ret <64 x i8> %out
@@ -374,133 +298,92 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
 define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
 ; GFNISSE-LABEL: testv64i8u:
 ; GFNISSE:       # %bb.0:
-; GFNISSE-NEXT:    movdqa %xmm3, %xmm4
-; GFNISSE-NEXT:    movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; GFNISSE-NEXT:    movdqa %xmm3, %xmm7
-; GFNISSE-NEXT:    pshufb %xmm0, %xmm7
-; GFNISSE-NEXT:    movdqa {{.*#+}} xmm6 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm6, %xmm0
-; GFNISSE-NEXT:    pxor %xmm5, %xmm5
-; GFNISSE-NEXT:    movdqa %xmm3, %xmm8
-; GFNISSE-NEXT:    pshufb %xmm0, %xmm8
-; GFNISSE-NEXT:    pcmpeqb %xmm5, %xmm0
-; GFNISSE-NEXT:    pand %xmm7, %xmm0
-; GFNISSE-NEXT:    paddb %xmm8, %xmm0
-; GFNISSE-NEXT:    movdqa %xmm3, %xmm7
-; GFNISSE-NEXT:    pshufb %xmm1, %xmm7
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm6, %xmm1
-; GFNISSE-NEXT:    movdqa %xmm3, %xmm8
-; GFNISSE-NEXT:    pshufb %xmm1, %xmm8
-; GFNISSE-NEXT:    pcmpeqb %xmm5, %xmm1
-; GFNISSE-NEXT:    pand %xmm7, %xmm1
-; GFNISSE-NEXT:    paddb %xmm8, %xmm1
-; GFNISSE-NEXT:    movdqa %xmm3, %xmm7
-; GFNISSE-NEXT:    pshufb %xmm2, %xmm7
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm6, %xmm2
-; GFNISSE-NEXT:    movdqa %xmm3, %xmm8
-; GFNISSE-NEXT:    pshufb %xmm2, %xmm8
-; GFNISSE-NEXT:    pcmpeqb %xmm5, %xmm2
-; GFNISSE-NEXT:    pand %xmm7, %xmm2
-; GFNISSE-NEXT:    paddb %xmm8, %xmm2
-; GFNISSE-NEXT:    movdqa %xmm3, %xmm7
-; GFNISSE-NEXT:    pshufb %xmm4, %xmm7
-; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm6, %xmm4
-; GFNISSE-NEXT:    pcmpeqb %xmm4, %xmm5
-; GFNISSE-NEXT:    pand %xmm7, %xmm5
-; GFNISSE-NEXT:    pshufb %xmm4, %xmm3
-; GFNISSE-NEXT:    paddb %xmm5, %xmm3
+; GFNISSE-NEXT:    movdqa {{.*#+}} xmm8 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm8, %xmm0
+; GFNISSE-NEXT:    pcmpeqd %xmm4, %xmm4
+; GFNISSE-NEXT:    movdqa %xmm0, %xmm5
+; GFNISSE-NEXT:    paddb %xmm4, %xmm5
+; GFNISSE-NEXT:    pandn %xmm0, %xmm5
+; GFNISSE-NEXT:    gf2p8affineqb $8, %xmm8, %xmm5
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm8, %xmm1
+; GFNISSE-NEXT:    movdqa %xmm1, %xmm6
+; GFNISSE-NEXT:    paddb %xmm4, %xmm6
+; GFNISSE-NEXT:    pandn %xmm1, %xmm6
+; GFNISSE-NEXT:    gf2p8affineqb $8, %xmm8, %xmm6
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm8, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm2, %xmm7
+; GFNISSE-NEXT:    paddb %xmm4, %xmm7
+; GFNISSE-NEXT:    pandn %xmm2, %xmm7
+; GFNISSE-NEXT:    gf2p8affineqb $8, %xmm8, %xmm7
+; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm8, %xmm3
+; GFNISSE-NEXT:    paddb %xmm3, %xmm4
+; GFNISSE-NEXT:    pandn %xmm3, %xmm4
+; GFNISSE-NEXT:    gf2p8affineqb $8, %xmm8, %xmm4
+; GFNISSE-NEXT:    movdqa %xmm5, %xmm0
+; GFNISSE-NEXT:    movdqa %xmm6, %xmm1
+; GFNISSE-NEXT:    movdqa %xmm7, %xmm2
+; GFNISSE-NEXT:    movdqa %xmm4, %xmm3
 ; GFNISSE-NEXT:    retq
 ;
 ; GFNIAVX1-LABEL: testv64i8u:
 ; GFNIAVX1:       # %bb.0:
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; GFNIAVX1-NEXT:    vmovq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; GFNIAVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm4
-; GFNIAVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
-; GFNIAVX1-NEXT:    # xmm5 = mem[0,0]
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; GFNIAVX1-NEXT:    vpcmpeqb %xmm6, %xmm2, %xmm7
-; GFNIAVX1-NEXT:    vpand %xmm7, %xmm4, %xmm4
-; GFNIAVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm4, %xmm2
-; GFNIAVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm4
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm0, %xmm0
-; GFNIAVX1-NEXT:    vpcmpeqb %xmm6, %xmm0, %xmm7
-; GFNIAVX1-NEXT:    vpand %xmm7, %xmm4, %xmm4
-; GFNIAVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; GFNIAVX1-NEXT:    vpaddb %xmm0, %xmm4, %xmm0
-; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; GFNIAVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm4
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm2, %xmm2
-; GFNIAVX1-NEXT:    vpcmpeqb %xmm6, %xmm2, %xmm7
-; GFNIAVX1-NEXT:    vpand %xmm7, %xmm4, %xmm4
-; GFNIAVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; GFNIAVX1-NEXT:    vpaddb %xmm2, %xmm4, %xmm2
-; GFNIAVX1-NEXT:    vpshufb %xmm1, %xmm3, %xmm4
-; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %xmm5, %xmm1, %xmm1
-; GFNIAVX1-NEXT:    vpcmpeqb %xmm6, %xmm1, %xmm5
-; GFNIAVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
-; GFNIAVX1-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
-; GFNIAVX1-NEXT:    vpaddb %xmm1, %xmm4, %xmm1
-; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; GFNIAVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; GFNIAVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
+; GFNIAVX1-NEXT:    vpaddb %xmm4, %xmm3, %xmm3
+; GFNIAVX1-NEXT:    vpaddb %xmm4, %xmm0, %xmm5
+; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; GFNIAVX1-NEXT:    vandnps %ymm0, %ymm3, %ymm0
+; GFNIAVX1-NEXT:    vgf2p8affineqb $8, %ymm2, %ymm0, %ymm0
+; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
+; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; GFNIAVX1-NEXT:    vpaddb %xmm4, %xmm3, %xmm3
+; GFNIAVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm4
+; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; GFNIAVX1-NEXT:    vandnps %ymm1, %ymm3, %ymm1
+; GFNIAVX1-NEXT:    vgf2p8affineqb $8, %ymm2, %ymm1, %ymm1
 ; GFNIAVX1-NEXT:    retq
 ;
 ; GFNIAVX2-LABEL: testv64i8u:
 ; GFNIAVX2:       # %bb.0:
-; GFNIAVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; GFNIAVX2-NEXT:    # ymm2 = mem[0,1,0,1]
-; GFNIAVX2-NEXT:    vpshufb %ymm0, %ymm2, %ymm3
-; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm0, %ymm0
-; GFNIAVX2-NEXT:    vpxor %xmm5, %xmm5, %xmm5
-; GFNIAVX2-NEXT:    vpcmpeqb %ymm5, %ymm0, %ymm6
-; GFNIAVX2-NEXT:    vpand %ymm6, %ymm3, %ymm3
-; GFNIAVX2-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
-; GFNIAVX2-NEXT:    vpaddb %ymm0, %ymm3, %ymm0
-; GFNIAVX2-NEXT:    vpshufb %ymm1, %ymm2, %ymm3
-; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm1, %ymm1
-; GFNIAVX2-NEXT:    vpcmpeqb %ymm5, %ymm1, %ymm4
-; GFNIAVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
-; GFNIAVX2-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
-; GFNIAVX2-NEXT:    vpaddb %ymm1, %ymm3, %ymm1
+; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
+; GFNIAVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm4
+; GFNIAVX2-NEXT:    vpandn %ymm0, %ymm4, %ymm0
+; GFNIAVX2-NEXT:    vgf2p8affineqb $8, %ymm2, %ymm0, %ymm0
+; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
+; GFNIAVX2-NEXT:    vpaddb %ymm3, %ymm1, %ymm3
+; GFNIAVX2-NEXT:    vpandn %ymm1, %ymm3, %ymm1
+; GFNIAVX2-NEXT:    vgf2p8affineqb $8, %ymm2, %ymm1, %ymm1
 ; GFNIAVX2-NEXT:    retq
 ;
 ; GFNIAVX512VL-LABEL: testv64i8u:
 ; GFNIAVX512VL:       # %bb.0:
-; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; GFNIAVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; GFNIAVX512VL-NEXT:    # ymm2 = mem[0,1,0,1]
-; GFNIAVX512VL-NEXT:    vpshufb %ymm1, %ymm2, %ymm3
-; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16,0,0,0,0,128,64,32,16]
-; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm1, %ymm1
-; GFNIAVX512VL-NEXT:    vpxor %xmm5, %xmm5, %xmm5
-; GFNIAVX512VL-NEXT:    vpcmpeqb %ymm5, %ymm1, %ymm6
-; GFNIAVX512VL-NEXT:    vpand %ymm6, %ymm3, %ymm3
-; GFNIAVX512VL-NEXT:    vpshufb %ymm1, %ymm2, %ymm1
-; GFNIAVX512VL-NEXT:    vpaddb %ymm1, %ymm3, %ymm1
-; GFNIAVX512VL-NEXT:    vpshufb %ymm0, %ymm2, %ymm3
-; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %ymm4, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vpcmpeqb %ymm5, %ymm0, %ymm4
-; GFNIAVX512VL-NEXT:    vpand %ymm4, %ymm3, %ymm3
-; GFNIAVX512VL-NEXT:    vpshufb %ymm0, %ymm2, %ymm0
-; GFNIAVX512VL-NEXT:    vpaddb %ymm0, %ymm3, %ymm0
-; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; GFNIAVX512VL-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $0, %zmm1, %zmm0, %zmm0
+; GFNIAVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; GFNIAVX512VL-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
+; GFNIAVX512VL-NEXT:    vpaddb %ymm3, %ymm2, %ymm2
+; GFNIAVX512VL-NEXT:    vpaddb %ymm3, %ymm0, %ymm3
+; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; GFNIAVX512VL-NEXT:    vpandnq %zmm0, %zmm2, %zmm0
+; GFNIAVX512VL-NEXT:    vgf2p8affineqb $8, %zmm1, %zmm0, %zmm0
 ; GFNIAVX512VL-NEXT:    retq
 ;
 ; GFNIAVX512BW-LABEL: testv64i8u:
 ; GFNIAVX512BW:       # %bb.0:
-; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1
-; GFNIAVX512BW-NEXT:    vptestnmb %zmm1, %zmm1, %k1
-; GFNIAVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; GFNIAVX512BW-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; GFNIAVX512BW-NEXT:    vpshufb %zmm0, %zmm2, %zmm0 {%k1} {z}
-; GFNIAVX512BW-NEXT:    vpshufb %zmm1, %zmm2, %zmm1
-; GFNIAVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
+; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT:    vpternlogd {{.*#+}} zmm2 = -1
+; GFNIAVX512BW-NEXT:    vpaddb %zmm2, %zmm0, %zmm2
+; GFNIAVX512BW-NEXT:    vpandnq %zmm0, %zmm2, %zmm0
+; GFNIAVX512BW-NEXT:    vgf2p8affineqb $8, %zmm1, %zmm0, %zmm0
 ; GFNIAVX512BW-NEXT:    retq
   %out = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %in, i1 -1)
   ret <64 x i8> %out
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GFNIAVX: {{.*}}
+; GFNIAVX1OR2: {{.*}}

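For readers skimming the regenerated checks in the first patch: both testv64i8 and testv64i8u call llvm.ctlz.v64i8 and count leading zeros independently in each byte; the only difference is the trailing i1 flag, where i1 0 requires a defined result of 8 for a zero byte and i1 -1 lets the backend treat a zero byte as poison. The sketch below is a scalar reference model written for illustration only (it is not code from the patch, and the helper names ctlz8/cttz8/bitreverse8 are local to the sketch). It also checks the identity ctlz(x) == cttz(bitreverse(x)), which is the usual reason a byte bit-reversal primitive such as GFNI's gf2p8affineqb is attractive for vXi8 ctlz; whether the affine matrices chosen by this patch implement exactly that reduction is not something the diff alone shows.

// Illustrative scalar model (not taken from the patch) of what the vXi8
// ctlz tests above compute, one byte at a time.
#include <cassert>
#include <cstdint>

// Leading zero count of a byte; 8 when X == 0, matching the
// llvm.ctlz "is_zero_poison == false" (i1 0) form used by testv64i8.
static unsigned ctlz8(uint8_t X) {
  unsigned Count = 0;
  for (int Bit = 7; Bit >= 0 && !(X & (1u << Bit)); --Bit)
    ++Count;
  return Count;
}

// Trailing zero count of a byte; 8 when X == 0.
static unsigned cttz8(uint8_t X) {
  unsigned Count = 0;
  while (Count < 8 && !(X & (1u << Count)))
    ++Count;
  return Count;
}

// Reverse the bit order of a byte.
static uint8_t bitreverse8(uint8_t X) {
  uint8_t R = 0;
  for (int I = 0; I < 8; ++I)
    if (X & (1u << I))
      R |= uint8_t(1u << (7 - I));
  return R;
}

int main() {
  // ctlz(x) == cttz(bitreverse(x)) holds for every byte value, including 0.
  for (unsigned V = 0; V < 256; ++V)
    assert(ctlz8(uint8_t(V)) == cttz8(bitreverse8(uint8_t(V))));
  return 0;
}
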
>From 37707f26421f949558202458f05458da9ad31232 Mon Sep 17 00:00:00 2001
From: william <we3223 at gmail.com>
Date: Thu, 29 May 2025 11:11:04 +0800
Subject: [PATCH 2/2] Fix clang-format style issues

---
 llvm/lib/Target/X86/X86ISelLowering.cpp |  1 -
 llvm/test/CodeGen/X86/ctlz-gfni.ll      | 11 ++++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7918a8e72adf6..6215b253374ab 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -29040,7 +29040,6 @@ static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
   SDLoc dl(Op);
   unsigned Opc = Op.getOpcode();
 
-  
   if (VT.isVector() && VT.getScalarType() == MVT::i8 && Subtarget.hasGFNI())
     return LowerVectorCTLZ_GFNI(Op, DAG, Subtarget);
 
diff --git a/llvm/test/CodeGen/X86/ctlz-gfni.ll b/llvm/test/CodeGen/X86/ctlz-gfni.ll
index d942f5ad3506f..40007d87a6f7c 100644
--- a/llvm/test/CodeGen/X86/ctlz-gfni.ll
+++ b/llvm/test/CodeGen/X86/ctlz-gfni.ll
@@ -2,11 +2,12 @@
 
 define <16 x i8> @test_ctlz_gfni(<16 x i8> %x) {
 ; CHECK-LABEL: @test_ctlz_gfni
-; CHECK: gf2p8affineqb
-; CHECK: paddb
-; CHECK: pandn
-; CHECK: gf2p8affineqb
-; CHECK: ret
+; CHECK:       vgf2p8affineqb
+; CHECK:       paddb
+; CHECK:       pandn
+; CHECK:       vgf2p8affineqb
+; CHECK:       ret
+
   %r = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %x, i1 false)
   ret <16 x i8> %r
 }


