[llvm] [AArch64] Add support for custom MOVI and MVN (PR #148698)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 14 11:47:09 PDT 2025
https://github.com/aabhinavg1 updated https://github.com/llvm/llvm-project/pull/148698
>From 2d11ccc7fb2c16d86f911e15ea539d3ed95293a9 Mon Sep 17 00:00:00 2001
From: aabhinavg1 <tiwariabhinavak at gmail.com>
Date: Mon, 14 Jul 2025 23:59:41 +0530
Subject: [PATCH] [AArch64] Add support for custom MOVI and MVN
- Implemented custom pattern matching for MOVI and MVNI vector instructions.
- Added tests to verify MOVI.16b, MOVI.4s, and MVNI.4s, with and without shift.
- Ensured correct codegen for specific immediate constants using AArch64 ISel lowering.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 996 ++++++++++--------
llvm/test/CodeGen/AArch64/movi-custom.ll | 30 +
2 files changed, 567 insertions(+), 459 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/movi-custom.ll
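[Editor's note: the new test file movi-custom.ll is listed in the diffstat but its contents are not included in the quoted hunks. As a point of reference only, below is a minimal, hypothetical sketch of the kind of IR such a test might exercise; the function names and CHECK lines are illustrative assumptions based on the standard AArch64 MOVI/MVNI immediate encodings for these splat constants, not verbatim contents of this PR.]

; Hypothetical sketch, not the actual llvm/test/CodeGen/AArch64/movi-custom.ll.
; RUN: llc -mtriple=aarch64-none-elf -o - %s | FileCheck %s

; Byte splat of 42: conventionally materialized with a byte-wide MOVI.
; CHECK-LABEL: movi_16b:
; CHECK: movi v0.16b, #42
define <16 x i8> @movi_16b() {
  ret <16 x i8> <i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42,
                 i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42>
}

; 32-bit splat of 0x0000AB00 (43776): MOVI with an LSL #8 shifted immediate.
; CHECK-LABEL: movi_4s_shift:
; CHECK: movi v0.4s, #171, lsl #8
define <4 x i32> @movi_4s_shift() {
  ret <4 x i32> <i32 43776, i32 43776, i32 43776, i32 43776>
}

; 32-bit splat of 0xFFFFFFEF (-17 = ~0x10): MVNI with no shift.
; CHECK-LABEL: mvni_4s:
; CHECK: mvni v0.4s, #16
define <4 x i32> @mvni_4s() {
  ret <4 x i32> <i32 -17, i32 -17, i32 -17, i32 -17>
}

; 32-bit splat of 0xFFFFEFFF (-4097 = ~(0x10 << 8)): MVNI with LSL #8.
; CHECK-LABEL: mvni_4s_shift:
; CHECK: mvni v0.4s, #16, lsl #8
define <4 x i32> @mvni_4s_shift() {
  ret <4 x i32> <i32 -4097, i32 -4097, i32 -4097, i32 -4097>
}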
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 55601e6327e98..0a99d3980c9fb 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -119,20 +119,20 @@ cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
cl::init(false));
static cl::opt<bool>
-EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
- cl::desc("Enable AArch64 logical imm instruction "
- "optimization"),
- cl::init(true));
+ EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
+ cl::desc("Enable AArch64 logical imm instruction "
+ "optimization"),
+ cl::init(true));
// Temporary option added for the purpose of testing functionality added
// to DAGCombiner.cpp in D92230. It is expected that this can be removed
// in future when both implementations will be based off MGATHER rather
// than the GLD1 nodes added for the SVE gather load intrinsics.
static cl::opt<bool>
-EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
- cl::desc("Combine extends of AArch64 masked "
- "gather intrinsics"),
- cl::init(true));
+ EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
+ cl::desc("Combine extends of AArch64 masked "
+ "gather intrinsics"),
+ cl::init(true));
static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
cl::desc("Combine ext and trunc to TBL"),
@@ -840,53 +840,53 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
// promote v4f16 to v4f32 when that is known to be safe.
auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
- setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
- setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
- setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
- setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
- setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
- setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
- setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
- setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
+ setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
+ setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
+ setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
+ setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
+ setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
+ setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
+ setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
+ setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
- setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
+ setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
setOperationPromotedToType(ISD::FCANONICALIZE, V4Narrow, MVT::v4f32);
- setOperationPromotedToType(ISD::SETCC, V4Narrow, MVT::v4f32);
+ setOperationPromotedToType(ISD::SETCC, V4Narrow, MVT::v4f32);
- setOperationAction(ISD::FABS, V4Narrow, Legal);
- setOperationAction(ISD::FNEG, V4Narrow, Legal);
- setOperationAction(ISD::FMA, V4Narrow, Expand);
- setOperationAction(ISD::BR_CC, V4Narrow, Expand);
- setOperationAction(ISD::SELECT, V4Narrow, Expand);
- setOperationAction(ISD::SELECT_CC, V4Narrow, Expand);
- setOperationAction(ISD::FCOPYSIGN, V4Narrow, Custom);
- setOperationAction(ISD::FSQRT, V4Narrow, Expand);
+ setOperationAction(ISD::FABS, V4Narrow, Legal);
+ setOperationAction(ISD::FNEG, V4Narrow, Legal);
+ setOperationAction(ISD::FMA, V4Narrow, Expand);
+ setOperationAction(ISD::BR_CC, V4Narrow, Expand);
+ setOperationAction(ISD::SELECT, V4Narrow, Expand);
+ setOperationAction(ISD::SELECT_CC, V4Narrow, Expand);
+ setOperationAction(ISD::FCOPYSIGN, V4Narrow, Custom);
+ setOperationAction(ISD::FSQRT, V4Narrow, Expand);
auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
setOperationPromotedToType(ISD::FCANONICALIZE, V8Narrow, MVT::v8f32);
- setOperationPromotedToType(ISD::SETCC, V8Narrow, MVT::v8f32);
-
- setOperationAction(ISD::FABS, V8Narrow, Legal);
- setOperationAction(ISD::FADD, V8Narrow, Legal);
- setOperationAction(ISD::FCEIL, V8Narrow, Legal);
- setOperationAction(ISD::FCOPYSIGN, V8Narrow, Custom);
- setOperationAction(ISD::FDIV, V8Narrow, Legal);
- setOperationAction(ISD::FFLOOR, V8Narrow, Legal);
- setOperationAction(ISD::FMA, V8Narrow, Expand);
- setOperationAction(ISD::FMUL, V8Narrow, Legal);
- setOperationAction(ISD::FNEARBYINT, V8Narrow, Legal);
- setOperationAction(ISD::FNEG, V8Narrow, Legal);
- setOperationAction(ISD::FROUND, V8Narrow, Legal);
- setOperationAction(ISD::FROUNDEVEN, V8Narrow, Legal);
- setOperationAction(ISD::FRINT, V8Narrow, Legal);
- setOperationAction(ISD::FSQRT, V8Narrow, Expand);
- setOperationAction(ISD::FSUB, V8Narrow, Legal);
- setOperationAction(ISD::FTRUNC, V8Narrow, Legal);
- setOperationAction(ISD::BR_CC, V8Narrow, Expand);
- setOperationAction(ISD::SELECT, V8Narrow, Expand);
- setOperationAction(ISD::SELECT_CC, V8Narrow, Expand);
- setOperationAction(ISD::FP_EXTEND, V8Narrow, Expand);
+ setOperationPromotedToType(ISD::SETCC, V8Narrow, MVT::v8f32);
+
+ setOperationAction(ISD::FABS, V8Narrow, Legal);
+ setOperationAction(ISD::FADD, V8Narrow, Legal);
+ setOperationAction(ISD::FCEIL, V8Narrow, Legal);
+ setOperationAction(ISD::FCOPYSIGN, V8Narrow, Custom);
+ setOperationAction(ISD::FDIV, V8Narrow, Legal);
+ setOperationAction(ISD::FFLOOR, V8Narrow, Legal);
+ setOperationAction(ISD::FMA, V8Narrow, Expand);
+ setOperationAction(ISD::FMUL, V8Narrow, Legal);
+ setOperationAction(ISD::FNEARBYINT, V8Narrow, Legal);
+ setOperationAction(ISD::FNEG, V8Narrow, Legal);
+ setOperationAction(ISD::FROUND, V8Narrow, Legal);
+ setOperationAction(ISD::FROUNDEVEN, V8Narrow, Legal);
+ setOperationAction(ISD::FRINT, V8Narrow, Legal);
+ setOperationAction(ISD::FSQRT, V8Narrow, Expand);
+ setOperationAction(ISD::FSUB, V8Narrow, Legal);
+ setOperationAction(ISD::FTRUNC, V8Narrow, Legal);
+ setOperationAction(ISD::BR_CC, V8Narrow, Expand);
+ setOperationAction(ISD::SELECT, V8Narrow, Expand);
+ setOperationAction(ISD::SELECT_CC, V8Narrow, Expand);
+ setOperationAction(ISD::FP_EXTEND, V8Narrow, Expand);
};
if (!Subtarget->hasFullFP16()) {
@@ -1298,8 +1298,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
}
- setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
- setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
+ setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
+ setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
setOperationAction(ISD::BITREVERSE, MVT::v8i8, Legal);
setOperationAction(ISD::BITREVERSE, MVT::v16i8, Legal);
setOperationAction(ISD::BITREVERSE, MVT::v2i32, Custom);
@@ -1322,8 +1322,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::MUL, MVT::v1i64, Custom);
// Saturates
- for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64,
- MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
+ for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64, MVT::v16i8,
+ MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
setOperationAction(ISD::SADDSAT, VT, Legal);
setOperationAction(ISD::UADDSAT, VT, Legal);
setOperationAction(ISD::SSUBSAT, VT, Legal);
@@ -1341,8 +1341,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
}
// Vector reductions
- for (MVT VT : { MVT::v4f16, MVT::v2f32,
- MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
+ for (MVT VT :
+ {MVT::v4f16, MVT::v2f32, MVT::v8f16, MVT::v4f32, MVT::v2f64}) {
if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
setOperationAction(ISD::VECREDUCE_FMAX, VT, Legal);
setOperationAction(ISD::VECREDUCE_FMIN, VT, Legal);
@@ -1355,8 +1355,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
if (Subtarget->hasFullFP16())
setOperationAction(ISD::VECREDUCE_FADD, MVT::v2f16, Custom);
- for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
- MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
+ for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
+ MVT::v4i32}) {
setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
@@ -1432,18 +1432,18 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BITCAST, MVT::v2i16, Custom);
setOperationAction(ISD::BITCAST, MVT::v4i8, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
// ADDP custom lowering
- for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
+ for (MVT VT : {MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64})
setOperationAction(ISD::ADD, VT, Custom);
// FADDP custom lowering
- for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
+ for (MVT VT : {MVT::v16f16, MVT::v8f32, MVT::v4f64})
setOperationAction(ISD::FADD, VT, Custom);
if (Subtarget->hasDotProd()) {
@@ -1601,8 +1601,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BITCAST, VT, Custom);
for (auto VT :
- { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
- MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
+ {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
+ MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16})
setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal);
for (auto VT :
@@ -1629,10 +1629,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
}
// NEON doesn't support masked loads/stores, but SME and SVE do.
- for (auto VT :
- {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
- MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
- MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
+ for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
+ MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
+ MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
setOperationAction(ISD::MLOAD, VT, Custom);
setOperationAction(ISD::MSTORE, VT, Custom);
}
@@ -1858,8 +1857,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECREDUCE_UMIN, MVT::v2i64, Custom);
// Int operations with no NEON support.
- for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
- MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
+ for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
+ MVT::v4i32, MVT::v2i64}) {
setOperationAction(ISD::BITREVERSE, VT, Custom);
setOperationAction(ISD::CTTZ, VT, Custom);
setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
@@ -2075,8 +2074,7 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT) {
// F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
// NEON types.
- if (VT.isFloatingPoint() &&
- VT.getVectorElementType() != MVT::bf16 &&
+ if (VT.isFloatingPoint() && VT.getVectorElementType() != MVT::bf16 &&
(VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
for (unsigned Opcode :
{ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM,
@@ -2484,8 +2482,8 @@ static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
if (NewImm == 0 || NewImm == OrigMask) {
New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
TLO.DAG.getConstant(NewImm, DL, VT));
- // Otherwise, create a machine node so that target independent DAG combine
- // doesn't undo this optimization.
+ // Otherwise, create a machine node so that target independent DAG combine
+ // doesn't undo this optimization.
} else {
Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
@@ -2600,6 +2598,36 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode(
APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
break;
}
+ case AArch64ISD::MOVIedit: {
+ if (Op.getNumOperands() < 2 || !isa<ConstantSDNode>(Op.getOperand(0)) ||
+ !isa<ConstantSDNode>(Op.getOperand(1))) {
+ break; // Or assert, or return
+ }
+ uint64_t Val = Op->getConstantOperandVal(0);
+ uint64_t Mask = Op->getConstantOperandVal(1);
+ Known = KnownBits::makeConstant(APInt(Known.getBitWidth(), Val | Mask));
+ break;
+ }
+ case AArch64ISD::MOVImsl: {
+ uint64_t Val = Op->getConstantOperandVal(0);
+ uint64_t Shift = Op->getConstantOperandVal(1);
+ Known = KnownBits::makeConstant(
+ APInt(Known.getBitWidth(), Val * (1ULL << Shift)));
+ break;
+ }
+ case AArch64ISD::MVNIshift: {
+ uint64_t Val = Op->getConstantOperandVal(0);
+ uint64_t Shift = Op->getConstantOperandVal(1);
+ Known = KnownBits::makeConstant(~APInt(Known.getBitWidth(), Val << Shift));
+ break;
+ }
+ case AArch64ISD::MVNImsl: {
+ uint64_t Val = Op->getConstantOperandVal(0);
+ uint64_t Shift = Op->getConstantOperandVal(1);
+ Known = KnownBits::makeConstant(
+ ~APInt(Known.getBitWidth(), Val * (1ULL << Shift)));
+ break;
+ }
case AArch64ISD::LOADgot:
case AArch64ISD::ADDlow: {
if (!Subtarget->isTargetILP32())
@@ -2617,7 +2645,8 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode(
Intrinsic::ID IntID =
static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
switch (IntID) {
- default: return;
+ default:
+ return;
case Intrinsic::aarch64_ldaxr:
case Intrinsic::aarch64_ldxr: {
unsigned BitWidth = Known.getBitWidth();
@@ -2639,7 +2668,7 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode(
MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
unsigned BitWidth = Known.getBitWidth();
if (VT == MVT::v8i8 || VT == MVT::v16i8) {
- unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
+ unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
assert(BitWidth >= Bound && "Unexpected width!");
APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - Bound);
Known.Zero |= Mask;
@@ -2826,8 +2855,9 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
return EndBB;
}
-MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
- MachineInstr &MI, MachineBasicBlock *BB) const {
+MachineBasicBlock *
+AArch64TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
assert(!isAsynchronousEHPersonality(classifyEHPersonality(
BB->getParent()->getFunction().getPersonalityFn())) &&
"SEH does not use catchret!");
@@ -3620,11 +3650,11 @@ static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
SDValue O1 = Val->getOperand(1);
bool CanNegateL;
bool MustBeFirstL;
- if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
+ if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth + 1))
return false;
bool CanNegateR;
bool MustBeFirstR;
- if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
+ if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth + 1))
return false;
if (MustBeFirstL && MustBeFirstR)
@@ -3661,8 +3691,8 @@ static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
/// \p Negate is true if we want this sub-tree being negated just by changing
/// SETCC conditions.
static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
- AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
- AArch64CC::CondCode Predicate) {
+ AArch64CC::CondCode &OutCC, bool Negate,
+ SDValue CCOp, AArch64CC::CondCode Predicate) {
// We're at a tree leaf, produce a conditional comparison operation.
unsigned Opcode = Val->getOpcode();
if (Opcode == ISD::SETCC) {
@@ -4049,10 +4079,9 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
} else {
SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
- Overflow =
- DAG.getNode(AArch64ISD::SUBS, DL, VTs,
- DAG.getConstant(0, DL, MVT::i64),
- UpperBits).getValue(1);
+ Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs,
+ DAG.getConstant(0, DL, MVT::i64), UpperBits)
+ .getValue(1);
}
break;
}
@@ -4265,10 +4294,10 @@ static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
}
// built the mask value encoding the expected behavior.
- unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
- (!IsData << 3) | // IsDataCache bit
- (Locality << 1) | // Cache level bits
- (unsigned)IsStream; // Stream bit
+ unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
+ (!IsData << 3) | // IsDataCache bit
+ (Locality << 1) | // Cache level bits
+ (unsigned)IsStream; // Stream bit
return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
DAG.getTargetConstant(PrfOp, DL, MVT::i32),
Op.getOperand(1));
@@ -4702,7 +4731,8 @@ AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
SDValue MinC = DAG.getConstant(
APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
- SDValue Min2 = SrcVal2 ? DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
+ SDValue Min2 = SrcVal2 ? DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt2, MinC)
+ : SDValue();
SDValue MaxC = DAG.getConstant(
APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
@@ -4711,7 +4741,8 @@ AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
SDValue MinC = DAG.getConstant(
APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
- Sat2 = SrcVal2 ? DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
+ Sat2 = SrcVal2 ? DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt2, MinC)
+ : SDValue();
}
if (SrcVal2)
@@ -4771,8 +4802,8 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
} else {
- SDValue MinC = DAG.getConstant(
- APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT);
+ SDValue MinC =
+ DAG.getConstant(APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT);
Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
}
@@ -4919,7 +4950,7 @@ SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
}
SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
- SelectionDAG &DAG) const {
+ SelectionDAG &DAG) const {
if (Op.getValueType().isVector())
return LowerVectorINT_TO_FP(Op, DAG);
@@ -5077,8 +5108,8 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
Entry.IsZExt = false;
Args.push_back(Entry);
- RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
- : RTLIB::SINCOS_STRET_F32;
+ RTLIB::Libcall LC =
+ ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
const char *LibcallName = getLibcallName(LC);
SDValue Callee =
DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
@@ -5221,8 +5252,8 @@ static bool isAddSubSExt(SDValue N, SelectionDAG &DAG) {
if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
SDValue N0 = N.getOperand(0);
SDValue N1 = N.getOperand(1);
- return N0->hasOneUse() && N1->hasOneUse() &&
- isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
+ return N0->hasOneUse() && N1->hasOneUse() && isSignExtended(N0, DAG) &&
+ isSignExtended(N1, DAG);
}
return false;
}
@@ -5232,8 +5263,8 @@ static bool isAddSubZExt(SDValue N, SelectionDAG &DAG) {
if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
SDValue N0 = N.getOperand(0);
SDValue N1 = N.getOperand(1);
- return N0->hasOneUse() && N1->hasOneUse() &&
- isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
+ return N0->hasOneUse() && N1->hasOneUse() && isZeroExtended(N0, DAG) &&
+ isZeroExtended(N1, DAG);
}
return false;
}
@@ -5613,8 +5644,8 @@ SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
TargetLowering::CallLoweringInfo CLI(DAG);
ArgListTy Args;
CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
- CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2,
- RetTy, Callee, std::move(Args));
+ CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2, RetTy,
+ Callee, std::move(Args));
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
@@ -5844,12 +5875,14 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
}
}
-SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue
+AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
+ SelectionDAG &DAG) const {
unsigned IntNo = Op.getConstantOperandVal(0);
SDLoc DL(Op);
switch (IntNo) {
- default: return SDValue(); // Don't custom lower most intrinsics.
+ default:
+ return SDValue(); // Don't custom lower most intrinsics.
case Intrinsic::thread_pointer: {
EVT PtrVT = getPointerTy(DAG.getDataLayout());
return DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
@@ -6678,9 +6711,8 @@ SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
}
// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
-static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
- EVT VT, EVT MemVT,
- SelectionDAG &DAG) {
+static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST, EVT VT,
+ EVT MemVT, SelectionDAG &DAG) {
assert(VT.isVector() && "VT should be a vector type");
assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
@@ -6694,19 +6726,19 @@ static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
// str s0, [x0]
SDValue Undef = DAG.getUNDEF(MVT::i16);
- SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
- {Undef, Undef, Undef, Undef});
+ SDValue UndefVec =
+ DAG.getBuildVector(MVT::v4i16, DL, {Undef, Undef, Undef, Undef});
- SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
- Value, UndefVec);
+ SDValue TruncExt =
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Value, UndefVec);
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
Trunc, DAG.getConstant(0, DL, MVT::i64));
- return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
- ST->getBasePtr(), ST->getMemOperand());
+ return DAG.getStore(ST->getChain(), DL, ExtractTrunc, ST->getBasePtr(),
+ ST->getMemOperand());
}
static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
@@ -6744,11 +6776,10 @@ static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
// Custom lowering for any store, vector or scalar and/or default or with
// a truncate operations. Currently only custom lower truncate operation
// from vector v4i16 to v4i8 or volatile stores of i128.
-SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue AArch64TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
SDLoc Dl(Op);
StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
- assert (StoreNode && "Can only custom lower store nodes");
+ assert(StoreNode && "Can only custom lower store nodes");
SDValue Value = StoreNode->getValue();
@@ -6808,8 +6839,8 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
SDValue Base = StoreNode->getBasePtr();
EVT PtrVT = Base.getValueType();
for (unsigned i = 0; i < 8; i++) {
- SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
- Value, DAG.getConstant(i, Dl, MVT::i32));
+ SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64, Value,
+ DAG.getConstant(i, Dl, MVT::i32));
SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
DAG.getConstant(i * 8, Dl, PtrVT));
Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
@@ -6853,8 +6884,7 @@ SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
return Result;
}
-SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue AArch64TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
assert(LoadNode && "Expected custom lowering of a load node");
@@ -7019,9 +7049,8 @@ SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
Op.getOperand(0));
// Generate SUBS & CSEL.
- SDValue Cmp =
- DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
- Op.getOperand(0), DAG.getConstant(0, DL, VT));
+ SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
+ Op.getOperand(0), DAG.getConstant(0, DL, VT));
return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
Cmp.getValue(1));
@@ -7742,7 +7771,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
SmallVector<ISD::OutputArg, 4> Outs;
GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
DAG.getTargetLoweringInfo(), MF.getDataLayout());
- if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
+ if (any_of(Outs,
+ [](ISD::OutputArg &Out) { return Out.VT.isScalableVector(); }))
FuncInfo->setIsSVECC(true);
// Assign locations to all of the incoming arguments.
@@ -7803,10 +7833,10 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
int Size = Ins[i].Flags.getByValSize();
unsigned NumRegs = (Size + 7) / 8;
- // FIXME: This works on big-endian for composite byvals, which are the common
- // case. It should also work for fundamental types too.
+ // FIXME: This works on big-endian for composite byvals, which are the
+ // common case. It should also work for fundamental types too.
unsigned FrameIdx =
- MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
+ MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
InVals.push_back(FrameIdxN);
@@ -7904,7 +7934,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
unsigned ArgOffset = VA.getLocMemOffset();
unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
? VA.getLocVT().getSizeInBits()
- : VA.getValVT().getSizeInBits()) / 8;
+ : VA.getValVT().getSizeInBits()) /
+ 8;
uint32_t BEAlign = 0;
if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
@@ -7967,8 +7998,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
}
if (VA.getLocInfo() == CCValAssign::Indirect) {
- assert((VA.getValVT().isScalableVT() ||
- Subtarget->isWindowsArm64EC()) &&
+ assert((VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
"Indirect arguments should be scalable on most subtargets");
uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
@@ -8044,12 +8074,11 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
// Ensure that the SMSTART happens after the CopyWithChain such that its
// chain result is used.
- for (unsigned I=0; I<InVals.size(); ++I) {
+ for (unsigned I = 0; I < InVals.size(); ++I) {
Register Reg = MF.getRegInfo().createVirtualRegister(
getRegClassFor(InVals[I].getValueType().getSimpleVT()));
Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
- InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg,
- InVals[I].getValueType());
+ InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg, InVals[I].getValueType());
}
}
@@ -8081,7 +8110,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
RegParmTypes.push_back(MVT::f128);
// Compute the set of forwarded registers. The rest are scratch.
SmallVectorImpl<ForwardedRegister> &Forwards =
- FuncInfo->getForwardedMustTailRegParms();
+ FuncInfo->getForwardedMustTailRegParms();
CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
CC_AArch64_AAPCS);
@@ -8234,7 +8263,8 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
if (GPRSaveSize & 15)
// The extra size here, if triggered, will always be 8.
- MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
+ MFI.CreateFixedObject(16 - (GPRSaveSize & 15),
+ -(int)alignTo(GPRSaveSize, 16), false);
} else
GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
@@ -8437,9 +8467,9 @@ static void analyzeCallOperands(const AArch64TargetLowering &TLI,
if (!UseVarArgCC) {
// Get type of the original argument.
- EVT ActualVT =
- TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
- /*AllowUnknown*/ true);
+ EVT ActualVT = TLI.getValueType(DAG.getDataLayout(),
+ CLI.Args[Outs[i].OrigArgIndex].Ty,
+ /*AllowUnknown*/ true);
MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
// If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
@@ -8536,7 +8566,8 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
}
}
- if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
+ if (canGuaranteeTCO(CalleeCC,
+ getTargetMachine().Options.GuaranteedTailCallOpt))
return CCMatch;
// Externally-defined functions with weak linkage should not be
@@ -8592,10 +8623,11 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
- // When we are musttail, additional checks have been done and we can safely ignore this check
- // At least two cases here: if caller is fastcc then we can't have any
- // memory arguments (we'd be expected to clean up the stack afterwards). If
- // caller is C then we could potentially use its argument area.
+ // When we are musttail, additional checks have been done and we can safely
+ // ignore this check At least two cases here: if caller is fastcc then we
+ // can't have any memory arguments (we'd be expected to clean up the stack
+ // afterwards). If caller is C then we could potentially use its argument
+ // area.
// FIXME: for now we take the most conservative of these in both cases:
// disallow all variadic memory operands.
@@ -9000,10 +9032,10 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
TPIDR2ObjAddr);
OptimizationRemarkEmitter ORE(&MF.getFunction());
ORE.emit([&]() {
- auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
- CLI.CB)
- : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
- &MF.getFunction());
+ auto R = CLI.CB
+ ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA", CLI.CB)
+ : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
+ &MF.getFunction());
return DescribeCallsite(R) << " sets up a lazy save for ZA";
});
} else if (RequiresSaveAllZA) {
@@ -9024,10 +9056,10 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
OptimizationRemarkEmitter ORE(&MF.getFunction());
ORE.emit([&]() {
- auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
- CLI.CB)
- : OptimizationRemarkAnalysis("sme", "SMETransition",
- &MF.getFunction());
+ auto R = CLI.CB
+ ? OptimizationRemarkAnalysis("sme", "SMETransition", CLI.CB)
+ : OptimizationRemarkAnalysis("sme", "SMETransition",
+ &MF.getFunction());
DescribeCallsite(R) << " requires a streaming mode transition";
return R;
});
@@ -9077,7 +9109,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
for (const auto &F : Forwards) {
SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
- RegsToPass.emplace_back(F.PReg, Val);
+ RegsToPass.emplace_back(F.PReg, Val);
}
}
@@ -9346,8 +9378,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into the appropriate regs.
for (auto &RegToPass : RegsToPass) {
- Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
- RegToPass.second, InGlue);
+ Chain =
+ DAG.getCopyToReg(Chain, DL, RegToPass.first, RegToPass.second, InGlue);
InGlue = Chain.getValue(1);
}
@@ -9450,8 +9482,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Add argument registers to the end of the list so that they are known live
// into the call.
for (auto &RegToPass : RegsToPass)
- Ops.push_back(DAG.getRegister(RegToPass.first,
- RegToPass.second.getValueType()));
+ Ops.push_back(
+ DAG.getRegister(RegToPass.first, RegToPass.second.getValueType()));
// Add a register mask operand representing the call-preserved registers.
const uint32_t *Mask;
@@ -9591,8 +9623,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
Register Reg = MF.getRegInfo().createVirtualRegister(
getRegClassFor(InVals[I].getValueType().getSimpleVT()));
SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
- InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
- InVals[I].getValueType());
+ InVals[I] = DAG.getCopyFromReg(X, DL, Reg, InVals[I].getValueType());
}
}
@@ -9733,7 +9764,7 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
Glue = Chain.getValue(1);
RetOps.push_back(
- DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
+ DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
}
const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
@@ -9796,7 +9827,7 @@ SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
N->getOffset(), Flag);
}
-SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
+SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode *N, EVT Ty,
SelectionDAG &DAG,
unsigned Flag) const {
return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
@@ -10030,8 +10061,7 @@ SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
HiVar,
DAG.getTargetConstant(0, DL, MVT::i32)),
0);
- return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
- LoVar,
+ return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr, LoVar,
DAG.getTargetConstant(0, DL, MVT::i32)),
0);
}
@@ -10243,8 +10273,8 @@ AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
// The pointer to the thread's TLS data area is at the TLS Index scaled by 8
// offset into the TLSArray.
TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
- SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
- DAG.getConstant(3, DL, PtrVT));
+ SDValue Slot =
+ DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex, DAG.getConstant(3, DL, PtrVT));
SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
MachinePointerInfo());
@@ -11271,8 +11301,8 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(
// (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
// (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
// Both require less instructions than compare and conditional select.
- if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
- RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
+ if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal && RHSC &&
+ RHSC->isZero() && CFVal && CFVal->isZero() &&
LHS.getValueType() == RHS.getValueType()) {
EVT VT = LHS.getValueType();
SDValue Shift =
@@ -11386,7 +11416,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(
else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
FVal = LHS;
} else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
- assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
+ assert(CTVal && CFVal && "Expected constant operands for CSNEG.");
// Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
// avoid materializing C.
AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
@@ -11533,8 +11563,7 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
if (Ty == MVT::aarch64svcount) {
TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
- SDValue Sel =
- DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
+ SDValue Sel = DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
return DAG.getNode(ISD::BITCAST, DL, Ty, Sel);
}
@@ -11618,8 +11647,7 @@ SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
return getAddr(JT, DAG);
}
-SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
// Jump table entries as PC relative offsets. No additional tweaking
// is necessary here. Just get the address of the jump table.
SDLoc DL(Op);
@@ -11706,7 +11734,7 @@ SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
}
SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
- SelectionDAG &DAG) const {
+ SelectionDAG &DAG) const {
BlockAddressSDNode *BAN = cast<BlockAddressSDNode>(Op);
const BlockAddress *BA = BAN->getBlockAddress();
@@ -11741,7 +11769,7 @@ SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
}
SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
- SelectionDAG &DAG) const {
+ SelectionDAG &DAG) const {
AArch64FunctionInfo *FuncInfo =
DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
@@ -11884,9 +11912,9 @@ SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
SDLoc DL(Op);
unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
unsigned VaListSize =
- (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
- ? PtrSize
- : Subtarget->isTargetILP32() ? 20 : 32;
+ (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows()) ? PtrSize
+ : Subtarget->isTargetILP32() ? 20
+ : 32;
const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
@@ -11960,7 +11988,7 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
SDValue NarrowFP =
DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
- SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
+ SDValue Ops[] = {NarrowFP, WideFP.getValue(1)};
// Merge the rounded value with the chain output of the load.
return DAG.getMergeValues(Ops, DL);
}
@@ -12003,8 +12031,9 @@ SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
-Register AArch64TargetLowering::
-getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
+Register
+AArch64TargetLowering::getRegisterByName(const char *RegName, LLT VT,
+ const MachineFunction &MF) const {
Register Reg = MatchRegisterName(RegName);
if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
@@ -12126,7 +12155,8 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
}
LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
- << " imm value: "; Imm.dump(););
+ << " imm value: ";
+ Imm.dump(););
return IsLegal;
}
@@ -12196,8 +12226,8 @@ SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
// Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
// AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
for (int i = ExtraSteps; i > 0; --i) {
- SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
- Flags);
+ SDValue Step =
+ DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate, Flags);
Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
}
@@ -12225,8 +12255,8 @@ SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
// Newton reciprocal iteration: E * (2 - X * E)
// AArch64 reciprocal iteration instruction: (2 - M * N)
for (int i = ExtraSteps; i > 0; --i) {
- SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
- Estimate, Flags);
+ SDValue Step =
+ DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand, Estimate, Flags);
Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
}
@@ -12278,9 +12308,8 @@ const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
if (ConstraintVT.isFloatingPoint())
return "w";
- if (ConstraintVT.isVector() &&
- (ConstraintVT.getSizeInBits() == 64 ||
- ConstraintVT.getSizeInBits() == 128))
+ if (ConstraintVT.isVector() && (ConstraintVT.getSizeInBits() == 64 ||
+ ConstraintVT.getSizeInBits() == 128))
return "w";
return "r";
@@ -12928,10 +12957,10 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
int WindowScale;
ShuffleSourceInfo(SDValue Vec)
- : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
+ : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
- bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
+ bool operator==(SDValue OtherVec) { return Vec == OtherVec; }
};
// First gather all vectors used as an immediate source for this BUILD_VECTOR
@@ -13109,8 +13138,9 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
if (!SrcVT.is64BitVector()) {
LLVM_DEBUG(
- dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
- "for SVE vectors.");
+ dbgs()
+ << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
+ "for SVE vectors.");
return SDValue();
}
@@ -13161,8 +13191,8 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
// trunc. So only std::min(SrcBits, DestBits) actually get defined in this
// segment.
EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
- int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
- VT.getScalarSizeInBits());
+ int BitsDefined =
+ std::min(OrigEltTy.getScalarSizeInBits(), VT.getScalarSizeInBits());
int LanesDefined = BitsDefined / BitsPerShuffleLane;
// This source is expected to fill ResMultiplier lanes of the final shuffle,
@@ -13181,7 +13211,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
return SDValue();
}
- SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
+ SDValue ShuffleOps[] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
for (unsigned i = 0; i < Sources.size(); ++i)
ShuffleOps[i] = Sources[i].ShuffleVec;
@@ -13234,7 +13264,8 @@ static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
// v4i32s. This is really a truncate, which we can construct out of (legal)
// concats and truncate nodes.
-static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) {
+static SDValue ReconstructTruncateFromBuildVector(SDValue V,
+ SelectionDAG &DAG) {
if (V.getValueType() != MVT::v16i8)
return SDValue();
assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
@@ -13452,8 +13483,8 @@ static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
return true;
}
-static bool isINSMask(ArrayRef<int> M, int NumInputElements,
- bool &DstIsLeft, int &Anomaly) {
+static bool isINSMask(ArrayRef<int> M, int NumInputElements, bool &DstIsLeft,
+ int &Anomaly) {
if (M.size() != static_cast<size_t>(NumInputElements))
return false;
@@ -13541,9 +13572,9 @@ static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
/// the specified operations to build the shuffle. ID is the perfect-shuffle
-//ID, V1 and V2 are the original shuffle inputs. PFEntry is the Perfect shuffle
-//table entry and LHS/RHS are the immediate inputs for this stage of the
-//shuffle.
+// ID, V1 and V2 are the original shuffle inputs. PFEntry is the Perfect shuffle
+// table entry and LHS/RHS are the immediate inputs for this stage of the
+// shuffle.
static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2,
unsigned PFEntry, SDValue LHS,
SDValue RHS, SelectionDAG &DAG,
@@ -14011,9 +14042,9 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
for (unsigned LaneSize : {64U, 32U, 16U}) {
unsigned Lane = 0;
if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
- unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
- : LaneSize == 32 ? AArch64ISD::DUPLANE32
- : AArch64ISD::DUPLANE16;
+ unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
+ : LaneSize == 32 ? AArch64ISD::DUPLANE32
+ : AArch64ISD::DUPLANE16;
// Cast V1 to an integer vector with required lane size
MVT NewEltTy = MVT::getIntegerVT(LaneSize);
unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
@@ -14230,7 +14261,6 @@ SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
}
-
static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
APInt &UndefBits) {
EVT VT = BVN->getValueType(0);
@@ -14255,7 +14285,7 @@ static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
// Try 64-bit splatted SIMD immediate.
static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
- const APInt &Bits) {
+ const APInt &Bits) {
if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
EVT VT = Op.getValueType();
@@ -14292,16 +14322,13 @@ static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
Shift = 0;
- }
- else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
+ } else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
Shift = 8;
- }
- else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
+ } else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
Shift = 16;
- }
- else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
+ } else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
Shift = 24;
}
@@ -14345,8 +14372,7 @@ static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
Shift = 0;
- }
- else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
+ } else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
Shift = 8;
}
@@ -14385,8 +14411,7 @@ static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
Shift = 264;
- }
- else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
+ } else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
Shift = 272;
}
@@ -14437,9 +14462,8 @@ static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
- }
- else if (isWide &&
- (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
+ } else if (isWide &&
+ (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
MovTy = MVT::v2f64;
}
@@ -14653,16 +14677,15 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
if (resolveBuildVector(BVN, DefBits, UndefBits)) {
SDValue NewOp;
- if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
- DefBits, &LHS)) ||
- (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
- DefBits, &LHS)))
+ if ((NewOp =
+ tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG, DefBits, &LHS)) ||
+ (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG, DefBits, &LHS)))
return NewOp;
- if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
- UndefBits, &LHS)) ||
- (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
- UndefBits, &LHS)))
+ if ((NewOp =
+ tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG, UndefBits, &LHS)) ||
+ (NewOp =
+ tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG, UndefBits, &LHS)))
return NewOp;
}
@@ -14672,12 +14695,11 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
// Normalize the operands of BUILD_VECTOR. The value of constant operands will
// be truncated to fit element width.
-static SDValue NormalizeBuildVector(SDValue Op,
- SelectionDAG &DAG) {
+static SDValue NormalizeBuildVector(SDValue Op, SelectionDAG &DAG) {
assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
SDLoc DL(Op);
EVT VT = Op.getValueType();
- EVT EltTy= VT.getVectorElementType();
+ EVT EltTy = VT.getVectorElementType();
if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
return Op;
@@ -15052,8 +15074,9 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
if (VT.getVectorElementType().isFloatingPoint()) {
SmallVector<SDValue, 8> Ops;
EVT EltTy = VT.getVectorElementType();
- assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
- EltTy == MVT::f64) && "Unsupported floating-point vector type");
+ assert((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
+ EltTy == MVT::f64) &&
+ "Unsupported floating-point vector type");
LLVM_DEBUG(
dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
"BITCASTS, and try again\n");
@@ -15685,8 +15708,9 @@ static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
APInt SplatBits, SplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
- if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
- HasAnyUndefs, ElementBits) ||
+ if (!BVN ||
+ !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
+ ElementBits) ||
SplatBitSize > ElementBits)
return false;
Cnt = SplatBits.getSExtValue();
@@ -15804,10 +15828,10 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
DAG.getConstant(Cnt, DL, MVT::i32));
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
- MVT::i32),
- Op.getOperand(0), Op.getOperand(1));
+ return DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL, MVT::i32),
+ Op.getOperand(0), Op.getOperand(1));
case ISD::SRA:
case ISD::SRL:
if (VT.isScalableVector() &&
@@ -15844,10 +15868,9 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
// negate the shift amount
SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
Op.getOperand(1));
- SDValue NegShiftLeft =
- DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
- NegShift);
+ SDValue NegShiftLeft = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Opc, DL, MVT::i32),
+ Op.getOperand(0), NegShift);
return NegShiftLeft;
}
@@ -16181,8 +16204,8 @@ AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
- SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
- PtrVT, 0);
+ SDValue Callee =
+ DAG.getTargetExternalSymbol(Subtarget->getChkStkName(), PtrVT, 0);
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
@@ -16509,7 +16532,7 @@ bool AArch64TargetLowering::shouldReduceLoadWidth(
// The shift can be combined if it matches the size of the value being
// loaded (and so reducing the width would make it not match).
uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
- uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
+ uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits() / 8;
if (ShiftAmount == Log2_32(LoadBytes))
return false;
}
@@ -16570,10 +16593,10 @@ bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
const DataLayout &DL = F->getDataLayout();
Type *Ty = User->getOperand(0)->getType();
- return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
- isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
- (Options.AllowFPOpFusion == FPOpFusion::Fast ||
- Options.UnsafeFPMath));
+ return !(
+ isFMAFasterThanFMulAndFAdd(*F, Ty) &&
+ isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
+ (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath));
}
// All 32-bit GPR operations implicitly zero the high-half of the corresponding
@@ -16632,7 +16655,7 @@ bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
case Instruction::GetElementPtr: {
gep_type_iterator GTI = gep_type_begin(Instr);
auto &DL = Ext->getDataLayout();
- std::advance(GTI, U.getOperandNo()-1);
+ std::advance(GTI, U.getOperandNo() - 1);
Type *IdxTy = GTI.getIndexedType();
// This extension will end up with a shift because of the scaling factor.
// 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
@@ -17752,8 +17775,9 @@ bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
- const AddrMode &AMode, Type *Ty,
- unsigned AS, Instruction *I) const {
+ const AddrMode &AMode,
+ Type *Ty, unsigned AS,
+ Instruction *I) const {
// AArch64 has five basic addressing modes:
// reg
// reg + 9-bit signed offset
@@ -17890,9 +17914,8 @@ AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
// LR is a callee-save register, but we must treat it as clobbered by any call
// site. Hence we include LR in the scratch registers, which are in turn added
// as implicit-defs for stackmaps and patchpoints.
- static const MCPhysReg ScratchRegs[] = {
- AArch64::X16, AArch64::X17, AArch64::LR, 0
- };
+ static const MCPhysReg ScratchRegs[] = {AArch64::X16, AArch64::X17,
+ AArch64::LR, 0};
return ScratchRegs;
}
@@ -17901,9 +17924,8 @@ ArrayRef<MCPhysReg> AArch64TargetLowering::getRoundingControlRegisters() const {
return RCRegs;
}
-bool
-AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
- CombineLevel Level) const {
+bool AArch64TargetLowering::isDesirableToCommuteWithShift(
+ const SDNode *N, CombineLevel Level) const {
assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
N->getOpcode() == ISD::SRL) &&
"Expected shift op");
@@ -18579,7 +18601,7 @@ AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
}
static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
- switch(getIntrinsicID(S.getNode())) {
+ switch (getIntrinsicID(S.getNode())) {
default:
break;
case Intrinsic::aarch64_sve_cntb:
@@ -18875,11 +18897,10 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
// Allow the scaling to be folded into the `cnt` instruction by preventing
// the scaling to be obscured here. This makes it easier to pattern match.
- if (IsSVECntIntrinsic(N0) ||
- (N0->getOpcode() == ISD::TRUNCATE &&
- (IsSVECntIntrinsic(N0->getOperand(0)))))
- if (ConstValue.sge(1) && ConstValue.sle(16))
- return SDValue();
+ if (IsSVECntIntrinsic(N0) || (N0->getOpcode() == ISD::TRUNCATE &&
+ (IsSVECntIntrinsic(N0->getOperand(0)))))
+ if (ConstValue.sge(1) && ConstValue.sle(16))
+ return SDValue();
// Multiplication of a power of two plus/minus one can be done more
// cheaply as shift+add/sub. For now, this is true unilaterally. If
@@ -18897,8 +18918,7 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
if (TrailingZeroes) {
// Conservatively do not lower to shift+add+shift if the mul might be
// folded into smul or umul.
- if (N0->hasOneUse() && (isSignExtended(N0, DAG) ||
- isZeroExtended(N0, DAG)))
+ if (N0->hasOneUse() && (isSignExtended(N0, DAG) || isZeroExtended(N0, DAG)))
return SDValue();
// Conservatively do not lower to shift+add+shift if the mul might be
// folded into madd or msub.
@@ -19201,8 +19221,8 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
// to use the new Chain.
DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
- unsigned Opcode =
- (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
+ unsigned Opcode = (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF
+ : AArch64ISD::UITOF;
return DAG.getNode(Opcode, SDLoc(N), VT, Load);
}
@@ -19569,8 +19589,8 @@ static SDValue performSVEAndCombine(SDNode *N,
Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
- SDValue And = DAG.getNode(ISD::AND, DL,
- UnpkOp->getValueType(0), UnpkOp, Dup);
+ SDValue And =
+ DAG.getNode(ISD::AND, DL, UnpkOp->getValueType(0), UnpkOp, Dup);
return DAG.getNode(Opc, DL, N->getValueType(0), And);
}
@@ -19680,7 +19700,7 @@ static SDValue performANDCombine(SDNode *N,
if (SDValue R = performANDORCSELCombine(N, DAG))
return R;
- if (SDValue R = performANDSETCCCombine(N,DCI))
+ if (SDValue R = performANDSETCCCombine(N, DCI))
return R;
if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
@@ -19955,8 +19975,7 @@ performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
// with the strict_fadd, but we also need uses of the chain output of the
// original strict_fadd to use the chain output of the new strict_fadd as
// otherwise it may not be deleted.
- SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
- {VT, MVT::Other},
+ SDValue Ret = DAG.getNode(N0->getOpcode(), DL, {VT, MVT::Other},
{N0->getOperand(0), Extract1, Extract2});
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
@@ -20489,11 +20508,11 @@ static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
}
// Returns true if Op is setcc or zext of setcc.
-static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
+static bool isSetCCOrZExtSetCC(const SDValue &Op, SetCCInfoAndKind &Info) {
if (isSetCC(Op, Info))
return true;
return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
- isSetCC(Op->getOperand(0), Info));
+ isSetCC(Op->getOperand(0), Info));
}
// The folding we want to perform is:
@@ -20931,9 +20950,9 @@ static SDValue performBuildVectorCombine(SDNode *N,
SDValue HalfToSingle =
DAG.getNode(ISD::FP_EXTEND, DL, MVT::v4f32, SrcVec);
SDValue SubvectorIdx = Elt0->getOperand(0)->getOperand(1);
- SDValue Extract = DAG.getNode(
- ISD::EXTRACT_SUBVECTOR, DL, VT.changeVectorElementType(MVT::f32),
- HalfToSingle, SubvectorIdx);
+ SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
+ VT.changeVectorElementType(MVT::f32),
+ HalfToSingle, SubvectorIdx);
return DAG.getNode(ISD::FP_EXTEND, DL, VT, Extract);
}
}
@@ -20970,7 +20989,8 @@ static SDValue performBuildVectorCombine(SDNode *N,
if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
return SDValue();
- SDValue SubvectorIdx = DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);
+ SDValue SubvectorIdx =
+ DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);
SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
@@ -21263,8 +21283,10 @@ static bool isLoadOrMultipleLoads(SDValue B, SmallVector<LoadSDNode *> &Loads) {
// are lowered. Note that this only comes up because we do not always visit
// operands before uses. After that is fixed this can be removed and in the
// meantime this is fairly specific to the lowering we expect from IR.
- // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45
- // t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43
+ // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44,
+ // t45
+ // t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42,
+ // t43
// t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8
// t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64
// t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64
@@ -21765,7 +21787,7 @@ static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
case Intrinsic::aarch64_sve_cmphs_wide:
case Intrinsic::aarch64_sve_cmphi_wide:
case Intrinsic::aarch64_sve_cmplo_wide:
- case Intrinsic::aarch64_sve_cmpls_wide: {
+ case Intrinsic::aarch64_sve_cmpls_wide: {
if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
uint64_t ImmVal = CN->getZExtValue();
if (ImmVal <= 127)
@@ -22632,9 +22654,8 @@ static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
uint64_t BaseOffset = 0;
const MachinePointerInfo &PtrInfo = St.getPointerInfo();
- SDValue NewST1 =
- DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
- OrigAlignment, St.getMemOperand()->getFlags());
+ SDValue NewST1 = DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
+ OrigAlignment, St.getMemOperand()->getFlags());
// As this in ISel, we will not merge this add which may degrade results.
if (BasePtr->getOpcode() == ISD::ADD &&
@@ -22699,10 +22720,10 @@ static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
ContainerVT = getSVEContainerType(ContainerVT);
SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
- SDValue Ops[] = { N->getOperand(0), // Chain
- N->getOperand(2), // Pg
- N->getOperand(3), // Base
- DAG.getValueType(VT) };
+ SDValue Ops[] = {N->getOperand(0), // Chain
+ N->getOperand(2), // Pg
+ N->getOperand(3), // Base
+ DAG.getValueType(VT)};
SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
SDValue LoadChain = SDValue(Load.getNode(), 1);
@@ -22710,7 +22731,7 @@ static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
if (ContainerVT.isInteger() && (VT != ContainerVT))
Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
- return DAG.getMergeValues({ Load, LoadChain }, DL);
+ return DAG.getMergeValues({Load, LoadChain}, DL);
}
static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
@@ -22724,16 +22745,16 @@ static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
auto *MINode = cast<MemIntrinsicSDNode>(N);
SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
- SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
- MINode->getOperand(3), DAG.getUNDEF(PtrTy),
- MINode->getOperand(2), PassThru,
- MINode->getMemoryVT(), MINode->getMemOperand(),
- ISD::UNINDEXED, ISD::NON_EXTLOAD, false);
+ SDValue L =
+ DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(), MINode->getOperand(3),
+ DAG.getUNDEF(PtrTy), MINode->getOperand(2), PassThru,
+ MINode->getMemoryVT(), MINode->getMemOperand(),
+ ISD::UNINDEXED, ISD::NON_EXTLOAD, false);
- if (VT.isFloatingPoint()) {
- SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
- return DAG.getMergeValues(Ops, DL);
- }
+ if (VT.isFloatingPoint()) {
+ SDValue Ops[] = {DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1)};
+ return DAG.getMergeValues(Ops, DL);
+ }
return L;
}
@@ -22776,12 +22797,11 @@ static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
else
SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
- SDValue Ops[] = { N->getOperand(0), // Chain
- SrcNew,
- N->getOperand(4), // Base
- N->getOperand(3), // Pg
- InputVT
- };
+ SDValue Ops[] = {N->getOperand(0), // Chain
+ SrcNew,
+ N->getOperand(4), // Base
+ N->getOperand(3), // Pg
+ InputVT};
return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
}
@@ -22933,7 +22953,7 @@ static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
}
// Check that all vector element locations were inserted to.
if (IndexNotInserted.any())
- return SDValue();
+ return SDValue();
return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
}
@@ -23537,19 +23557,19 @@ static SDValue performPostLD1Combine(SDNode *N,
continue;
SmallVector<SDValue, 8> Ops;
- Ops.push_back(LD->getOperand(0)); // Chain
+ Ops.push_back(LD->getOperand(0)); // Chain
if (IsLaneOp) {
- Ops.push_back(Vector); // The vector to be inserted
- Ops.push_back(Lane); // The lane to be inserted in the vector
+ Ops.push_back(Vector); // The vector to be inserted
+ Ops.push_back(Lane); // The lane to be inserted in the vector
}
Ops.push_back(Addr);
Ops.push_back(Inc);
- EVT Tys[3] = { VT, MVT::i64, MVT::Other };
+ EVT Tys[3] = {VT, MVT::i64, MVT::Other};
SDVTList SDTys = DAG.getVTList(Tys);
- unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
- SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
- MemVT,
+ unsigned NewOp =
+ IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
+ SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops, MemVT,
LoadSDN->getMemOperand());
// Update the uses.
@@ -23558,8 +23578,8 @@ static SDValue performPostLD1Combine(SDNode *N,
SDValue(UpdN.getNode(), 2) // Chain
};
DCI.CombineTo(LD, NewResults);
- DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
- DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
+ DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
+ DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
break;
}
@@ -24368,49 +24388,110 @@ static SDValue performNEONPostLDSTCombine(SDNode *N,
unsigned NumVecs = 0;
unsigned IntNo = N->getConstantOperandVal(1);
switch (IntNo) {
- default: llvm_unreachable("unexpected intrinsic for Neon base update");
- case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
- NumVecs = 2; break;
- case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
- NumVecs = 3; break;
- case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
- NumVecs = 4; break;
- case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
- NumVecs = 2; IsStore = true; break;
- case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
- NumVecs = 3; IsStore = true; break;
- case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
- NumVecs = 4; IsStore = true; break;
- case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
- NumVecs = 2; break;
- case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
- NumVecs = 3; break;
- case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
- NumVecs = 4; break;
- case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
- NumVecs = 2; IsStore = true; break;
- case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
- NumVecs = 3; IsStore = true; break;
- case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
- NumVecs = 4; IsStore = true; break;
- case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
- NumVecs = 2; IsDupOp = true; break;
- case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
- NumVecs = 3; IsDupOp = true; break;
- case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
- NumVecs = 4; IsDupOp = true; break;
- case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
- NumVecs = 2; IsLaneOp = true; break;
- case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
- NumVecs = 3; IsLaneOp = true; break;
- case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
- NumVecs = 4; IsLaneOp = true; break;
- case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
- NumVecs = 2; IsStore = true; IsLaneOp = true; break;
- case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
- NumVecs = 3; IsStore = true; IsLaneOp = true; break;
- case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
- NumVecs = 4; IsStore = true; IsLaneOp = true; break;
+ default:
+ llvm_unreachable("unexpected intrinsic for Neon base update");
+ case Intrinsic::aarch64_neon_ld2:
+ NewOpc = AArch64ISD::LD2post;
+ NumVecs = 2;
+ break;
+ case Intrinsic::aarch64_neon_ld3:
+ NewOpc = AArch64ISD::LD3post;
+ NumVecs = 3;
+ break;
+ case Intrinsic::aarch64_neon_ld4:
+ NewOpc = AArch64ISD::LD4post;
+ NumVecs = 4;
+ break;
+ case Intrinsic::aarch64_neon_st2:
+ NewOpc = AArch64ISD::ST2post;
+ NumVecs = 2;
+ IsStore = true;
+ break;
+ case Intrinsic::aarch64_neon_st3:
+ NewOpc = AArch64ISD::ST3post;
+ NumVecs = 3;
+ IsStore = true;
+ break;
+ case Intrinsic::aarch64_neon_st4:
+ NewOpc = AArch64ISD::ST4post;
+ NumVecs = 4;
+ IsStore = true;
+ break;
+ case Intrinsic::aarch64_neon_ld1x2:
+ NewOpc = AArch64ISD::LD1x2post;
+ NumVecs = 2;
+ break;
+ case Intrinsic::aarch64_neon_ld1x3:
+ NewOpc = AArch64ISD::LD1x3post;
+ NumVecs = 3;
+ break;
+ case Intrinsic::aarch64_neon_ld1x4:
+ NewOpc = AArch64ISD::LD1x4post;
+ NumVecs = 4;
+ break;
+ case Intrinsic::aarch64_neon_st1x2:
+ NewOpc = AArch64ISD::ST1x2post;
+ NumVecs = 2;
+ IsStore = true;
+ break;
+ case Intrinsic::aarch64_neon_st1x3:
+ NewOpc = AArch64ISD::ST1x3post;
+ NumVecs = 3;
+ IsStore = true;
+ break;
+ case Intrinsic::aarch64_neon_st1x4:
+ NewOpc = AArch64ISD::ST1x4post;
+ NumVecs = 4;
+ IsStore = true;
+ break;
+ case Intrinsic::aarch64_neon_ld2r:
+ NewOpc = AArch64ISD::LD2DUPpost;
+ NumVecs = 2;
+ IsDupOp = true;
+ break;
+ case Intrinsic::aarch64_neon_ld3r:
+ NewOpc = AArch64ISD::LD3DUPpost;
+ NumVecs = 3;
+ IsDupOp = true;
+ break;
+ case Intrinsic::aarch64_neon_ld4r:
+ NewOpc = AArch64ISD::LD4DUPpost;
+ NumVecs = 4;
+ IsDupOp = true;
+ break;
+ case Intrinsic::aarch64_neon_ld2lane:
+ NewOpc = AArch64ISD::LD2LANEpost;
+ NumVecs = 2;
+ IsLaneOp = true;
+ break;
+ case Intrinsic::aarch64_neon_ld3lane:
+ NewOpc = AArch64ISD::LD3LANEpost;
+ NumVecs = 3;
+ IsLaneOp = true;
+ break;
+ case Intrinsic::aarch64_neon_ld4lane:
+ NewOpc = AArch64ISD::LD4LANEpost;
+ NumVecs = 4;
+ IsLaneOp = true;
+ break;
+ case Intrinsic::aarch64_neon_st2lane:
+ NewOpc = AArch64ISD::ST2LANEpost;
+ NumVecs = 2;
+ IsStore = true;
+ IsLaneOp = true;
+ break;
+ case Intrinsic::aarch64_neon_st3lane:
+ NewOpc = AArch64ISD::ST3LANEpost;
+ NumVecs = 3;
+ IsStore = true;
+ IsLaneOp = true;
+ break;
+ case Intrinsic::aarch64_neon_st4lane:
+ NewOpc = AArch64ISD::ST4LANEpost;
+ NumVecs = 4;
+ IsStore = true;
+ IsLaneOp = true;
+ break;
}
EVT VecTy;
@@ -24445,14 +24526,14 @@ static SDValue performNEONPostLDSTCombine(SDNode *N,
unsigned n;
for (n = 0; n < NumResultVecs; ++n)
Tys[n] = VecTy;
- Tys[n++] = MVT::i64; // Type of write back register
- Tys[n] = MVT::Other; // Type of the chain
+ Tys[n++] = MVT::i64; // Type of write back register
+ Tys[n] = MVT::Other; // Type of the chain
SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
- SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
- MemInt->getMemoryVT(),
- MemInt->getMemOperand());
+ SDValue UpdN =
+ DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
+ MemInt->getMemoryVT(), MemInt->getMemOperand());
// Update the uses.
std::vector<SDValue> NewResults;
@@ -24470,16 +24551,16 @@ static SDValue performNEONPostLDSTCombine(SDNode *N,
// Checks to see if the value is the prescribed width and returns information
// about its extension mode.
-static
-bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
+static bool checkValueWidth(SDValue V, unsigned width,
+ ISD::LoadExtType &ExtType) {
ExtType = ISD::NON_EXTLOAD;
- switch(V.getNode()->getOpcode()) {
+ switch (V.getNode()->getOpcode()) {
default:
return false;
case ISD::LOAD: {
LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
- if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
- || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
+ if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8) ||
+ (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
ExtType = LoadNode->getExtensionType();
return true;
}
@@ -24487,8 +24568,8 @@ bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
}
case ISD::AssertSext: {
VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
- if ((TypeNode->getVT() == MVT::i8 && width == 8)
- || (TypeNode->getVT() == MVT::i16 && width == 16)) {
+ if ((TypeNode->getVT() == MVT::i8 && width == 8) ||
+ (TypeNode->getVT() == MVT::i16 && width == 16)) {
ExtType = ISD::SEXTLOAD;
return true;
}
@@ -24496,8 +24577,8 @@ bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
}
case ISD::AssertZext: {
VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
- if ((TypeNode->getVT() == MVT::i8 && width == 8)
- || (TypeNode->getVT() == MVT::i16 && width == 16)) {
+ if ((TypeNode->getVT() == MVT::i8 && width == 8) ||
+ (TypeNode->getVT() == MVT::i16 && width == 16)) {
ExtType = ISD::ZEXTLOAD;
return true;
}
@@ -24588,9 +24669,9 @@ static bool isEquivalentMaskless(unsigned CC, unsigned width,
// the whole range we can just adjust the input and avoid writing equations
// for sign extended inputs.
if (ExtType == ISD::SEXTLOAD)
- AddConstant -= (1 << (width-1));
+ AddConstant -= (1 << (width - 1));
- switch(CC) {
+ switch (CC) {
case AArch64CC::LE:
case AArch64CC::GT:
if ((AddConstant == 0) ||
@@ -24601,22 +24682,20 @@ static bool isEquivalentMaskless(unsigned CC, unsigned width,
break;
case AArch64CC::LT:
case AArch64CC::GE:
- if ((AddConstant == 0) ||
- (AddConstant >= 0 && CompConstant <= 0) ||
+ if ((AddConstant == 0) || (AddConstant >= 0 && CompConstant <= 0) ||
(AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
return true;
break;
case AArch64CC::HI:
case AArch64CC::LS:
if ((AddConstant >= 0 && CompConstant < 0) ||
- (AddConstant <= 0 && CompConstant >= -1 &&
- CompConstant < AddConstant + MaxUInt))
+ (AddConstant <= 0 && CompConstant >= -1 &&
+ CompConstant < AddConstant + MaxUInt))
return true;
- break;
+ break;
case AArch64CC::PL:
case AArch64CC::MI:
- if ((AddConstant == 0) ||
- (AddConstant > 0 && CompConstant <= 0) ||
+ if ((AddConstant == 0) || (AddConstant > 0 && CompConstant <= 0) ||
(AddConstant < 0 && CompConstant <= AddConstant))
return true;
break;
@@ -24697,11 +24776,10 @@ static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode,
return DAG.getNode(N->getOpcode(), N, N->getVTList(), Ops);
}
-static
-SDValue performCONDCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- SelectionDAG &DAG, unsigned CCIndex,
- unsigned CmpIndex) {
+static SDValue performCONDCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG, unsigned CCIndex,
+ unsigned CmpIndex) {
unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
unsigned CondOpcode = SubsNode->getOpcode();
@@ -24756,19 +24834,20 @@ SDValue performCONDCombine(SDNode *N,
if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
!checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
- !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
+ !checkValueWidth(AddInputValue1, MaskBits, ExtType))
return SDValue();
- if(!isEquivalentMaskless(CC, MaskBits, ExtType,
- cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
- cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
+ if (!isEquivalentMaskless(
+ CC, MaskBits, ExtType,
+ cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
+ cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
return SDValue();
// The AND is not necessary, remove it.
- SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
- SubsNode->getValueType(1));
- SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
+ SDVTList VTs =
+ DAG.getVTList(SubsNode->getValueType(0), SubsNode->getValueType(1));
+ SDValue Ops[] = {AddValue, SubsNode->getOperand(1)};
SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
@@ -25147,7 +25226,7 @@ static SDValue performCSELCombine(SDNode *N,
// CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
// CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
if (SDValue Folded = foldCSELofCTTZ(N, DAG))
- return Folded;
+ return Folded;
// CSEL a, b, cc, SUBS(x, y) -> CSEL a, b, swapped(cc), SUBS(y, x)
// if SUB(y, x) already exists and we can produce a swapped predicate for cc.
@@ -25303,8 +25382,8 @@ static SDValue performSETCCCombine(SDNode *N,
if (FromVT.isFixedLengthVector() &&
FromVT.getVectorElementType() == MVT::i1) {
bool IsNull = isNullConstant(RHS);
- LHS = DAG.getNode(IsNull ? ISD::VECREDUCE_OR : ISD::VECREDUCE_AND,
- DL, MVT::i1, LHS->getOperand(0));
+ LHS = DAG.getNode(IsNull ? ISD::VECREDUCE_OR : ISD::VECREDUCE_AND, DL,
+ MVT::i1, LHS->getOperand(0));
LHS = DAG.getNode(IsNull ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, ToVT,
LHS);
return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
@@ -25659,8 +25738,7 @@ static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
N0.getOperand(0), N0.getOperand(1),
cast<CondCodeSDNode>(N0.getOperand(2))->get());
- return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
- IfTrue, IfFalse);
+ return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC, IfTrue, IfFalse);
}
/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
@@ -25717,17 +25795,15 @@ static SDValue performSelectCombine(SDNode *N,
// First perform a vector comparison, where lane 0 is the one we're interested
// in.
SDLoc DL(N0);
- SDValue LHS =
- DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
- SDValue RHS =
- DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
+ SDValue LHS = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
+ SDValue RHS = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
// Now duplicate the comparison mask we want across all other lanes.
SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
- Mask = DAG.getNode(ISD::BITCAST, DL,
- ResVT.changeVectorElementTypeToInteger(), Mask);
+ Mask = DAG.getNode(ISD::BITCAST, DL, ResVT.changeVectorElementTypeToInteger(),
+ Mask);
return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
}
@@ -25850,8 +25926,7 @@ static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
const GlobalValue *GV = GN->getGlobal();
Type *T = GV->getValueType();
- if (!T->isSized() ||
- Offset > GV->getDataLayout().getTypeAllocSize(T))
+ if (!T->isSized() || Offset > GV->getDataLayout().getTypeAllocSize(T))
return SDValue();
SDLoc DL(GN);
@@ -26147,8 +26222,8 @@ performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
// Sign extend of an unsigned unpack -> signed unpack
if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
- unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
- : AArch64ISD::SUNPKLO;
+ unsigned SOpc =
+ Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI : AArch64ISD::SUNPKLO;
// Push the sign extend to the operand of the unpack
// This is necessary where, for example, the operand of the unpack
@@ -27453,12 +27528,12 @@ static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
SDLoc DL(V.getNode());
auto [VLo, VHi] = DAG.SplitScalar(V, DL, MVT::i64, MVT::i64);
if (DAG.getDataLayout().isBigEndian())
- std::swap (VLo, VHi);
+ std::swap(VLo, VHi);
SDValue RegClass =
DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, DL, MVT::i32);
SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, DL, MVT::i32);
SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, DL, MVT::i32);
- const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
+ const SDValue Ops[] = {RegClass, VLo, SubReg0, VHi, SubReg1};
return SDValue(
DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops), 0);
}
@@ -27477,8 +27552,8 @@ static void ReplaceCMP_SWAP_128Results(SDNode *N,
SDValue Ops[] = {
createGPRPairNode(DAG, N->getOperand(2)), // Compare value
createGPRPairNode(DAG, N->getOperand(3)), // Store value
- N->getOperand(1), // Ptr
- N->getOperand(0), // Chain in
+ N->getOperand(1), // Ptr
+ N->getOperand(0), // Chain in
};
unsigned Opcode;
@@ -27797,8 +27872,8 @@ void AArch64TargetLowering::ReplaceNodeResults(
if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
LoadNode->getMemoryVT() != MVT::i128) {
- // Non-volatile or atomic loads are optimized later in AArch64's load/store
- // optimizer.
+ // Non-volatile or atomic loads are optimized later in AArch64's
+ // load/store optimizer.
return;
}
@@ -27850,8 +27925,8 @@ void AArch64TargetLowering::ReplaceNodeResults(
"custom lowering for unexpected type");
SDLoc DL(N);
auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
- auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
- N->getOperand(1), Op2, N->getOperand(3));
+ auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32, N->getOperand(1),
+ Op2, N->getOperand(3));
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
return;
}
@@ -27860,8 +27935,8 @@ void AArch64TargetLowering::ReplaceNodeResults(
"custom lowering for unexpected type");
SDLoc DL(N);
auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
- auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
- N->getOperand(1), Op2, N->getOperand(3));
+ auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32, N->getOperand(1),
+ Op2, N->getOperand(3));
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
return;
}
@@ -27869,8 +27944,8 @@ void AArch64TargetLowering::ReplaceNodeResults(
assert((VT == MVT::i8 || VT == MVT::i16) &&
"custom lowering for unexpected type");
SDLoc DL(N);
- auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
- N->getOperand(1), N->getOperand(2));
+ auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32, N->getOperand(1),
+ N->getOperand(2));
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
return;
}
@@ -27878,8 +27953,8 @@ void AArch64TargetLowering::ReplaceNodeResults(
assert((VT == MVT::i8 || VT == MVT::i16) &&
"custom lowering for unexpected type");
SDLoc DL(N);
- auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
- N->getOperand(1), N->getOperand(2));
+ auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32, N->getOperand(1),
+ N->getOperand(2));
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
return;
}
@@ -28222,7 +28297,7 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
return Builder.CreateBitCast(Or, ValueTy);
}
- Type *Tys[] = { Addr->getType() };
+ Type *Tys[] = {Addr->getType()};
Intrinsic::ID Int =
IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
@@ -28267,11 +28342,12 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
Intrinsic::ID Int =
IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
- Type *Tys[] = { Addr->getType() };
+ Type *Tys[] = {Addr->getType()};
Function *Stxr = Intrinsic::getOrInsertDeclaration(M, Int, Tys);
const DataLayout &DL = M->getDataLayout();
- IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
+ IntegerType *IntValTy =
+ Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
Val = Builder.CreateBitCast(Val, IntValTy);
CallInst *CI = Builder.CreateCall(
@@ -28401,7 +28477,7 @@ bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
// may be beneficial to sink in other cases, but we would have to check that
// the cmp would not get folded into the br to form a cbz for these to be
// beneficial.
- ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
+ ConstantInt *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
if (!Mask)
return false;
return Mask->getValue().isPowerOf2();
@@ -28462,9 +28538,9 @@ void AArch64TargetLowering::insertCopiesSplitCSR(
// fine for CXX_FAST_TLS since the C++-style TLS access functions should be
// nounwind. If we want to generalize this later, we may need to emit
// CFI pseudo-instructions.
- assert(Entry->getParent()->getFunction().hasFnAttribute(
- Attribute::NoUnwind) &&
- "Function should be nounwind in insertCopiesSplitCSR!");
+ assert(
+ Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
+ "Function should be nounwind in insertCopiesSplitCSR!");
Entry->addLiveIn(*I);
BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
.addReg(*I);
@@ -29202,9 +29278,9 @@ SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
SmallVector<SDValue, 4> Operands = {Pg};
for (const SDValue &V : Op->op_values()) {
- assert((!V.getValueType().isVector() ||
- V.getValueType().isScalableVector()) &&
- "Only scalable vectors are supported!");
+ assert(
+ (!V.getValueType().isVector() || V.getValueType().isScalableVector()) &&
+ "Only scalable vectors are supported!");
Operands.push_back(V);
}
@@ -29246,8 +29322,9 @@ SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
return convertFromScalableVector(DAG, VT, ScalableRes);
}
-SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
- SelectionDAG &DAG) const {
+SDValue
+AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
+ SelectionDAG &DAG) const {
SDLoc DL(ScalarOp);
SDValue AccOp = ScalarOp.getOperand(0);
SDValue VecOp = ScalarOp.getOperand(1);
@@ -29268,14 +29345,15 @@ SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
DAG.getUNDEF(ContainerVT), AccOp, Zero);
// Perform reduction.
- SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
- Pg, AccOp, VecOp);
+ SDValue Rdx =
+ DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT, Pg, AccOp, VecOp);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
}
-SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
- SelectionDAG &DAG) const {
+SDValue
+AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
+ SelectionDAG &DAG) const {
SDLoc DL(ReduceOp);
SDValue Op = ReduceOp.getOperand(0);
EVT OpVT = Op.getValueType();
@@ -29346,16 +29424,16 @@ SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
}
// UADDV always returns an i64 result.
- EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
- SrcVT.getVectorElementType();
+ EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64
+ : SrcVT.getVectorElementType();
EVT RdxVT = SrcVT;
if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
RdxVT = getPackedSVEVectorVT(ResVT);
SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
- SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
- Rdx, DAG.getConstant(0, DL, MVT::i64));
+ SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx,
+ DAG.getConstant(0, DL, MVT::i64));
// The VEC_REDUCE nodes expect an element size result.
if (ResVT != ScalarOp.getValueType())
@@ -29364,9 +29442,8 @@ SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
return Res;
}
-SDValue
-AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(
+ SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc DL(Op);
@@ -29383,8 +29460,7 @@ AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
Mask = DAG.getNode(ISD::TRUNCATE, DL,
MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
- auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
- Mask, Op1, Op2);
+ auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT, Mask, Op1, Op2);
return convertFromScalableVector(DAG, VT, ScalableRes);
}
@@ -29483,16 +29559,16 @@ AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
SDValue Pg = getPredicateForVector(DAG, DL, VT);
EVT SrcVT = Val.getValueType();
EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
- EVT ExtendVT = ContainerVT.changeVectorElementType(
- SrcVT.getVectorElementType());
+ EVT ExtendVT =
+ ContainerVT.changeVectorElementType(SrcVT.getVectorElementType());
Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);
Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
Val = getSVESafeBitCast(ExtendVT, Val, DAG);
- Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
- Pg, Val, DAG.getUNDEF(ContainerVT));
+ Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT, Pg,
+ Val, DAG.getUNDEF(ContainerVT));
return convertFromScalableVector(DAG, VT, Val);
}
@@ -29507,8 +29583,8 @@ AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
SDValue Val = Op.getOperand(0);
EVT SrcVT = Val.getValueType();
EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
- EVT RoundVT = ContainerSrcVT.changeVectorElementType(
- VT.getVectorElementType());
+ EVT RoundVT =
+ ContainerSrcVT.changeVectorElementType(VT.getVectorElementType());
SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);
Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
@@ -29822,7 +29898,7 @@ AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
if (VT.bitsGT(SrcVT)) {
EVT CvtVT = ContainerDstVT.changeVectorElementType(
- ContainerSrcVT.getVectorElementType());
+ ContainerSrcVT.getVectorElementType());
SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
@@ -30145,8 +30221,8 @@ SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
SDLoc DL(Op);
EVT InVT = Op.getValueType();
- assert(VT.isScalableVector() && isTypeLegal(VT) &&
- InVT.isScalableVector() && isTypeLegal(InVT) &&
+ assert(VT.isScalableVector() && isTypeLegal(VT) && InVT.isScalableVector() &&
+ isTypeLegal(InVT) &&
"Only expect to cast between legal scalable vector types!");
assert(VT.getVectorElementType() != MVT::i1 &&
InVT.getVectorElementType() != MVT::i1 &&
@@ -30285,9 +30361,12 @@ bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
}
bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
- return Op.getOpcode() == AArch64ISD::DUP ||
- Op.getOpcode() == AArch64ISD::MOVI ||
- (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ unsigned Opc = Op.getOpcode();
+ return Opc == AArch64ISD::DUP || Opc == AArch64ISD::MOVI ||
+ Opc == AArch64ISD::MOVIshift || Opc == AArch64ISD::MOVIedit ||
+ Opc == AArch64ISD::MOVImsl || Opc == AArch64ISD::MVNIshift ||
+ Opc == AArch64ISD::MVNImsl ||
+ (Opc == ISD::EXTRACT_SUBVECTOR &&
Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
TargetLowering::isTargetCanonicalConstantNode(Op);
}
@@ -30399,7 +30478,6 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR(
Intrinsic::aarch64_neon_vcmla_rot180,
Intrinsic::aarch64_neon_vcmla_rot270};
-
return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
{Accumulator, InputA, InputB});
}
diff --git a/llvm/test/CodeGen/AArch64/movi-custom.ll b/llvm/test/CodeGen/AArch64/movi-custom.ll
new file mode 100644
index 0000000000000..14c8fd5c994d9
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/movi-custom.ll
@@ -0,0 +1,30 @@
+; RUN: llc -mtriple=aarch64-linux-gnu < %s -o - | FileCheck %s
+
+; Test 1: AArch64ISD::MOVI (per-byte immediate)
+define <16 x i8> @test_movi_edit() {
+; CHECK-LABEL: test_movi_edit:
+; CHECK: movi v0.16b, #63
+ ret <16 x i8> <i8 63, i8 63, i8 63, i8 63, i8 63, i8 63, i8 63, i8 63,
+ i8 63, i8 63, i8 63, i8 63, i8 63, i8 63, i8 63, i8 63>
+}
+
+; Test 2: AArch64ISD::MOVIshift (32-bit immediate, LSL #0)
+define <4 x i32> @test_movi_msl() {
+; CHECK-LABEL: test_movi_msl:
+; CHECK: movi v0.4s, #64
+ ret <4 x i32> <i32 64, i32 64, i32 64, i32 64>
+}
+
+; Test 3: splat of 0xffff00ff; the 64-bit byte-mask form (AArch64ISD::MOVIedit) is selected rather than MVNI
+define <4 x i32> @test_mvni_shift() {
+; CHECK-LABEL: test_mvni_shift:
+; CHECK: movi v0.2d, #0xffff00ffffff00ff
+ ret <4 x i32> <i32 -65281, i32 -65281, i32 -65281, i32 -65281>
+}
+
+; Test 4: AArch64ISD::MVNIshift (32-bit inverted immediate, LSL #0)
+define <4 x i32> @test_mvnimsl() {
+; CHECK-LABEL: test_mvnimsl:
+; CHECK: mvni v0.4s, #64
+ ret <4 x i32> <i32 -65, i32 -65, i32 -65, i32 -65>
+}
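For reference, two more splat patterns that exercise the shifted and msl immediate forms now recognized by isTargetCanonicalConstantNode could look like the sketch below. This is illustrative only and not part of the patch; the function names are made up, and the expected instructions assume the usual modified-immediate selection order (64-bit byte mask first, then the LSL forms, then the MSL forms).

; Hypothetical extra coverage (not in movi-custom.ll).
; A v8i16 splat of 0x0100 should select a shifted MOVI (AArch64ISD::MOVIshift):
;   movi v0.8h, #1, lsl #8
define <8 x i16> @sketch_movi_shift_16() {
  ret <8 x i16> <i16 256, i16 256, i16 256, i16 256,
                 i16 256, i16 256, i16 256, i16 256>
}

; A v4i32 splat of 0x0001ffff matches the shifting-ones encoding (AArch64ISD::MOVImsl):
;   movi v0.4s, #1, msl #16
define <4 x i32> @sketch_movi_msl_32() {
  ret <4 x i32> <i32 131071, i32 131071, i32 131071, i32 131071>
}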