[llvm] [AArch64] Optimized generated assembly for bool to svbool_t conversions (PR #83001)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 28 03:58:56 PST 2024
https://github.com/Lukacma updated https://github.com/llvm/llvm-project/pull/83001
>From 3c4270c28d42ac798c3674bc51f16ca30ae0320d Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Mon, 26 Feb 2024 13:03:52 +0000
Subject: [PATCH 1/4] [AArch64] Optimized generated assembly for bool to
svbool_t conversions
The original lowering generated an `AND(WHILELO, SPLAT 1)` pattern when only `WHILELO` was necessary.
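For context, a minimal C sketch of the kind of source this affects (not part of the patch; it assumes the ACLE `svdup_n_b16` intrinsic and a compiler targeting `+sve`, and that this lowers to the i1-splat-plus-`convert.to.svbool` IR exercised by the new tests, which previously produced a `WHILELO` followed by an `AND` with a splat of 1 and now produces a single `WHILELO`):

```c
#include <arm_sve.h>
#include <stdbool.h>

// Splat a scalar bool across the .h lanes of an SVE predicate.  The frontend
// is expected to emit an i1 splat followed by
// llvm.aarch64.sve.convert.to.svbool.nxv8i1 -- the pattern targeted here.
// (Sketch for illustration only, not taken from the PR.)
svbool_t bool_to_pred_h(bool x) {
  return svdup_n_b16(x);
}
```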
---
.../Target/AArch64/AArch64ISelLowering.cpp | 1 +
.../AArch64/sve-intrinsics-reinterpret.ll | 42 ++++++++++++++++++-
2 files changed, 42 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a3b7e3128ac1a4..dba3a787734721 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -276,6 +276,7 @@ static bool isZeroingInactiveLanes(SDValue Op) {
if (ISD::isConstantSplatVectorAllOnes(Op.getNode()))
return true;
return false;
+ case ISD::SPLAT_VECTOR:
case AArch64ISD::PTRUE:
case AArch64ISD::SETCC_MERGE_ZERO:
return true;
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-reinterpret.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-reinterpret.ll
index 82bf756f822898..c7c102f5d567d9 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-reinterpret.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-reinterpret.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme < %s | FileCheck %s
@@ -150,6 +150,46 @@ define <vscale x 16 x i1> @chained_reinterpret() {
ret <vscale x 16 x i1> %out
}
+define <vscale x 16 x i1> @reinterpret_scalar_bool_h(i1 %x){
+; CHECK-LABEL: reinterpret_scalar_bool_h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: sbfx x8, x0, #0, #1
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ret
+ %.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %x, i64 0
+ %.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %out = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %.splat)
+ ret <vscale x 16 x i1> %out
+}
+
+define <vscale x 16 x i1> @reinterpret_scalar_bool_s(i1 %x){
+; CHECK-LABEL: reinterpret_scalar_bool_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: sbfx x8, x0, #0, #1
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ret
+ %.splatinsert = insertelement <vscale x 4 x i1> poison, i1 %x, i64 0
+ %.splat = shufflevector <vscale x 4 x i1> %.splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %out = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %.splat)
+ ret <vscale x 16 x i1> %out
+}
+
+define <vscale x 16 x i1> @reinterpret_scalar_bool_q(i1 %x){
+; CHECK-LABEL: reinterpret_scalar_bool_q:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: sbfx x8, x0, #0, #1
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: ret
+ %.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %x, i64 0
+ %.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %out = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %.splat)
+ ret <vscale x 16 x i1> %out
+}
+
+
declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 immarg)
declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 immarg)
declare <vscale x 8 x i1> @llvm.aarch64.sve.cmpgt.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
>From 8724b327c830ec9caa6ab75f6a21f1da49fcb641 Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Wed, 28 Feb 2024 11:11:19 +0000
Subject: [PATCH 2/4] Cleaned up the default behaviour of isZeroingInactiveLanes
 and taught it to look through bitcasts
---
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index dba3a787734721..5b1be2705d3740 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -269,13 +269,14 @@ static bool isMergePassthruOpcode(unsigned Opc) {
// Returns true if inactive lanes are known to be zeroed by construction.
static bool isZeroingInactiveLanes(SDValue Op) {
+ // Skip bitcast nodes
+ while (Op->getOpcode() == ISD::BITCAST)
+ Op = Op->getOperand(0);
+
switch (Op.getOpcode()) {
default:
- // We guarantee i1 splat_vectors to zero the other lanes by
- // implementing it with ptrue and possibly a punpklo for nxv1i1.
- if (ISD::isConstantSplatVectorAllOnes(Op.getNode()))
- return true;
return false;
+ // We guarantee i1 splat_vectors to zero the other lanes
case ISD::SPLAT_VECTOR:
case AArch64ISD::PTRUE:
case AArch64ISD::SETCC_MERGE_ZERO:
>From 7d47f2b74bddee18b12aafa124c7a78e36643f38 Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Wed, 28 Feb 2024 11:50:18 +0000
Subject: [PATCH 3/4] test
---
.../Target/AArch64/AArch64ISelLowering.cpp | 1343 +++++++++--------
1 file changed, 715 insertions(+), 628 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 5b1be2705d3740..90f68a77867db5 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -120,20 +120,20 @@ cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
cl::init(false));
static cl::opt<bool>
-EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
- cl::desc("Enable AArch64 logical imm instruction "
- "optimization"),
- cl::init(true));
+ EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
+ cl::desc("Enable AArch64 logical imm instruction "
+ "optimization"),
+ cl::init(true));
// Temporary option added for the purpose of testing functionality added
// to DAGCombiner.cpp in D92230. It is expected that this can be removed
// in future when both implementations will be based off MGATHER rather
// than the GLD1 nodes added for the SVE gather load intrinsics.
static cl::opt<bool>
-EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
- cl::desc("Combine extends of AArch64 masked "
- "gather intrinsics"),
- cl::init(true));
+ EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
+ cl::desc("Combine extends of AArch64 masked "
+ "gather intrinsics"),
+ cl::init(true));
static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
cl::desc("Combine ext and trunc to TBL"),
@@ -272,7 +272,7 @@ static bool isZeroingInactiveLanes(SDValue Op) {
// Skip bitcast nodes
while (Op->getOpcode() == ISD::BITCAST)
Op = Op->getOperand(0);
-
+
switch (Op.getOpcode()) {
default:
return false;
@@ -681,35 +681,59 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
else
setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);
- for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
- ISD::FCOS, ISD::FSIN, ISD::FSINCOS,
- ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
- ISD::FLOG, ISD::FLOG2, ISD::FLOG10,
- ISD::STRICT_FREM,
- ISD::STRICT_FPOW, ISD::STRICT_FPOWI, ISD::STRICT_FCOS,
- ISD::STRICT_FSIN, ISD::STRICT_FEXP, ISD::STRICT_FEXP2,
- ISD::STRICT_FLOG, ISD::STRICT_FLOG2, ISD::STRICT_FLOG10}) {
+ for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
+ ISD::FCOS, ISD::FSIN, ISD::FSINCOS,
+ ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
+ ISD::FLOG, ISD::FLOG2, ISD::FLOG10,
+ ISD::STRICT_FREM, ISD::STRICT_FPOW, ISD::STRICT_FPOWI,
+ ISD::STRICT_FCOS, ISD::STRICT_FSIN, ISD::STRICT_FEXP,
+ ISD::STRICT_FEXP2, ISD::STRICT_FLOG, ISD::STRICT_FLOG2,
+ ISD::STRICT_FLOG10}) {
setOperationAction(Op, MVT::f16, Promote);
setOperationAction(Op, MVT::v4f16, Expand);
setOperationAction(Op, MVT::v8f16, Expand);
}
if (!Subtarget->hasFullFP16()) {
- for (auto Op :
- {ISD::SETCC, ISD::SELECT_CC,
- ISD::BR_CC, ISD::FADD, ISD::FSUB,
- ISD::FMUL, ISD::FDIV, ISD::FMA,
- ISD::FNEG, ISD::FABS, ISD::FCEIL,
- ISD::FSQRT, ISD::FFLOOR, ISD::FNEARBYINT,
- ISD::FRINT, ISD::FROUND, ISD::FROUNDEVEN,
- ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM,
- ISD::FMINIMUM, ISD::FMAXIMUM, ISD::STRICT_FADD,
- ISD::STRICT_FSUB, ISD::STRICT_FMUL, ISD::STRICT_FDIV,
- ISD::STRICT_FMA, ISD::STRICT_FCEIL, ISD::STRICT_FFLOOR,
- ISD::STRICT_FSQRT, ISD::STRICT_FRINT, ISD::STRICT_FNEARBYINT,
- ISD::STRICT_FROUND, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN,
- ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, ISD::STRICT_FMINIMUM,
- ISD::STRICT_FMAXIMUM})
+ for (auto Op : {ISD::SETCC,
+ ISD::SELECT_CC,
+ ISD::BR_CC,
+ ISD::FADD,
+ ISD::FSUB,
+ ISD::FMUL,
+ ISD::FDIV,
+ ISD::FMA,
+ ISD::FNEG,
+ ISD::FABS,
+ ISD::FCEIL,
+ ISD::FSQRT,
+ ISD::FFLOOR,
+ ISD::FNEARBYINT,
+ ISD::FRINT,
+ ISD::FROUND,
+ ISD::FROUNDEVEN,
+ ISD::FTRUNC,
+ ISD::FMINNUM,
+ ISD::FMAXNUM,
+ ISD::FMINIMUM,
+ ISD::FMAXIMUM,
+ ISD::STRICT_FADD,
+ ISD::STRICT_FSUB,
+ ISD::STRICT_FMUL,
+ ISD::STRICT_FDIV,
+ ISD::STRICT_FMA,
+ ISD::STRICT_FCEIL,
+ ISD::STRICT_FFLOOR,
+ ISD::STRICT_FSQRT,
+ ISD::STRICT_FRINT,
+ ISD::STRICT_FNEARBYINT,
+ ISD::STRICT_FROUND,
+ ISD::STRICT_FTRUNC,
+ ISD::STRICT_FROUNDEVEN,
+ ISD::STRICT_FMINNUM,
+ ISD::STRICT_FMAXNUM,
+ ISD::STRICT_FMINIMUM,
+ ISD::STRICT_FMAXIMUM})
setOperationAction(Op, MVT::f16, Promote);
// Round-to-integer need custom lowering for fp16, as Promote doesn't work
@@ -725,44 +749,44 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
setOperationPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);
- setOperationAction(ISD::FABS, MVT::v4f16, Expand);
- setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
- setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
- setOperationAction(ISD::FROUNDEVEN, MVT::v4f16, Expand);
- setOperationAction(ISD::FMA, MVT::v4f16, Expand);
- setOperationAction(ISD::SETCC, MVT::v4f16, Custom);
- setOperationAction(ISD::BR_CC, MVT::v4f16, Expand);
- setOperationAction(ISD::SELECT, MVT::v4f16, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand);
- setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand);
- setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand);
- setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand);
- setOperationAction(ISD::FCEIL, MVT::v4f16, Expand);
- setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
- setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
- setOperationAction(ISD::FSQRT, MVT::v4f16, Expand);
-
- setOperationAction(ISD::FABS, MVT::v8f16, Expand);
- setOperationAction(ISD::FADD, MVT::v8f16, Expand);
- setOperationAction(ISD::FCEIL, MVT::v8f16, Expand);
- setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand);
- setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
- setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand);
- setOperationAction(ISD::FMA, MVT::v8f16, Expand);
- setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
- setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
- setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
- setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
- setOperationAction(ISD::FROUNDEVEN, MVT::v8f16, Expand);
- setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
- setOperationAction(ISD::FSQRT, MVT::v8f16, Expand);
- setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
- setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand);
- setOperationAction(ISD::SETCC, MVT::v8f16, Expand);
- setOperationAction(ISD::BR_CC, MVT::v8f16, Expand);
- setOperationAction(ISD::SELECT, MVT::v8f16, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand);
- setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand);
+ setOperationAction(ISD::FABS, MVT::v4f16, Expand);
+ setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
+ setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
+ setOperationAction(ISD::FROUNDEVEN, MVT::v4f16, Expand);
+ setOperationAction(ISD::FMA, MVT::v4f16, Expand);
+ setOperationAction(ISD::SETCC, MVT::v4f16, Custom);
+ setOperationAction(ISD::BR_CC, MVT::v4f16, Expand);
+ setOperationAction(ISD::SELECT, MVT::v4f16, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand);
+ setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand);
+ setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand);
+ setOperationAction(ISD::FCEIL, MVT::v4f16, Expand);
+ setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
+ setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
+ setOperationAction(ISD::FSQRT, MVT::v4f16, Expand);
+
+ setOperationAction(ISD::FABS, MVT::v8f16, Expand);
+ setOperationAction(ISD::FADD, MVT::v8f16, Expand);
+ setOperationAction(ISD::FCEIL, MVT::v8f16, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand);
+ setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
+ setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand);
+ setOperationAction(ISD::FMA, MVT::v8f16, Expand);
+ setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
+ setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
+ setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
+ setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
+ setOperationAction(ISD::FROUNDEVEN, MVT::v8f16, Expand);
+ setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
+ setOperationAction(ISD::FSQRT, MVT::v8f16, Expand);
+ setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
+ setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand);
+ setOperationAction(ISD::SETCC, MVT::v8f16, Expand);
+ setOperationAction(ISD::BR_CC, MVT::v8f16, Expand);
+ setOperationAction(ISD::SELECT, MVT::v8f16, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand);
+ setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand);
}
// AArch64 has implementations of a lot of rounding-like FP operations.
@@ -1078,21 +1102,45 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
if (Subtarget->hasNEON()) {
// FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
// silliness like this:
- for (auto Op :
- {ISD::SELECT, ISD::SELECT_CC,
- ISD::BR_CC, ISD::FADD, ISD::FSUB,
- ISD::FMUL, ISD::FDIV, ISD::FMA,
- ISD::FNEG, ISD::FABS, ISD::FCEIL,
- ISD::FSQRT, ISD::FFLOOR, ISD::FNEARBYINT,
- ISD::FRINT, ISD::FROUND, ISD::FROUNDEVEN,
- ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM,
- ISD::FMINIMUM, ISD::FMAXIMUM, ISD::STRICT_FADD,
- ISD::STRICT_FSUB, ISD::STRICT_FMUL, ISD::STRICT_FDIV,
- ISD::STRICT_FMA, ISD::STRICT_FCEIL, ISD::STRICT_FFLOOR,
- ISD::STRICT_FSQRT, ISD::STRICT_FRINT, ISD::STRICT_FNEARBYINT,
- ISD::STRICT_FROUND, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN,
- ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, ISD::STRICT_FMINIMUM,
- ISD::STRICT_FMAXIMUM})
+ for (auto Op : {ISD::SELECT,
+ ISD::SELECT_CC,
+ ISD::BR_CC,
+ ISD::FADD,
+ ISD::FSUB,
+ ISD::FMUL,
+ ISD::FDIV,
+ ISD::FMA,
+ ISD::FNEG,
+ ISD::FABS,
+ ISD::FCEIL,
+ ISD::FSQRT,
+ ISD::FFLOOR,
+ ISD::FNEARBYINT,
+ ISD::FRINT,
+ ISD::FROUND,
+ ISD::FROUNDEVEN,
+ ISD::FTRUNC,
+ ISD::FMINNUM,
+ ISD::FMAXNUM,
+ ISD::FMINIMUM,
+ ISD::FMAXIMUM,
+ ISD::STRICT_FADD,
+ ISD::STRICT_FSUB,
+ ISD::STRICT_FMUL,
+ ISD::STRICT_FDIV,
+ ISD::STRICT_FMA,
+ ISD::STRICT_FCEIL,
+ ISD::STRICT_FFLOOR,
+ ISD::STRICT_FSQRT,
+ ISD::STRICT_FRINT,
+ ISD::STRICT_FNEARBYINT,
+ ISD::STRICT_FROUND,
+ ISD::STRICT_FTRUNC,
+ ISD::STRICT_FROUNDEVEN,
+ ISD::STRICT_FMINNUM,
+ ISD::STRICT_FMAXNUM,
+ ISD::STRICT_FMINIMUM,
+ ISD::STRICT_FMAXIMUM})
setOperationAction(Op, MVT::v1f64, Expand);
for (auto Op :
@@ -1140,8 +1188,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
}
- setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
- setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
+ setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
+ setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
setOperationAction(ISD::BITREVERSE, MVT::v8i8, Legal);
setOperationAction(ISD::BITREVERSE, MVT::v16i8, Legal);
setOperationAction(ISD::BITREVERSE, MVT::v2i32, Custom);
@@ -1164,8 +1212,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::MUL, MVT::v1i64, Custom);
// Saturates
- for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
- MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
+ for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
+ MVT::v4i32, MVT::v2i64}) {
setOperationAction(ISD::SADDSAT, VT, Legal);
setOperationAction(ISD::UADDSAT, VT, Legal);
setOperationAction(ISD::SSUBSAT, VT, Legal);
@@ -1183,8 +1231,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
}
// Vector reductions
- for (MVT VT : { MVT::v4f16, MVT::v2f32,
- MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
+ for (MVT VT :
+ {MVT::v4f16, MVT::v2f32, MVT::v8f16, MVT::v4f32, MVT::v2f64}) {
if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
setOperationAction(ISD::VECREDUCE_FMAX, VT, Legal);
setOperationAction(ISD::VECREDUCE_FMIN, VT, Legal);
@@ -1194,8 +1242,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECREDUCE_FADD, VT, Legal);
}
}
- for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
- MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
+ for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
+ MVT::v4i32}) {
setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
@@ -1262,18 +1310,18 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BITCAST, MVT::v2i16, Custom);
setOperationAction(ISD::BITCAST, MVT::v4i8, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
// ADDP custom lowering
- for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
+ for (MVT VT : {MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64})
setOperationAction(ISD::ADD, VT, Custom);
// FADDP custom lowering
- for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
+ for (MVT VT : {MVT::v16f16, MVT::v8f32, MVT::v4f64})
setOperationAction(ISD::FADD, VT, Custom);
}
@@ -1378,8 +1426,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BITCAST, VT, Custom);
for (auto VT :
- { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
- MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
+ {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
+ MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16})
setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal);
for (auto VT :
@@ -1565,11 +1613,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::v4f64, MVT::v4f32, Custom);
for (MVT VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
MVT::v4i32, MVT::v1i64, MVT::v2i64})
- addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ true);
+ addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/true);
for (MVT VT :
{MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
- addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ true);
+ addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/true);
}
// NOTE: Currently this has to happen after computeRegisterProperties rather
@@ -1577,10 +1625,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
if (Subtarget->useSVEForFixedLengthVectors()) {
for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
if (useSVEForFixedLengthVectorVT(VT))
- addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ false);
+ addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/false);
for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
if (useSVEForFixedLengthVectorVT(VT))
- addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/ false);
+ addTypeForFixedLengthSVE(VT, /*StreamingSVE=*/false);
// 64bit results can mean a bigger than NEON input.
for (auto VT : {MVT::v8i8, MVT::v4i16})
@@ -1616,8 +1664,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECREDUCE_UMIN, MVT::v2i64, Custom);
// Int operations with no NEON support.
- for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
- MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
+ for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
+ MVT::v4i32, MVT::v2i64}) {
setOperationAction(ISD::BITREVERSE, VT, Custom);
setOperationAction(ISD::CTTZ, VT, Custom);
setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
@@ -1627,7 +1675,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::MULHU, VT, Custom);
}
-
// Use SVE for vectors with more than 2 elements.
for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
@@ -1768,8 +1815,7 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT) {
// F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
// NEON types.
- if (VT.isFloatingPoint() &&
- VT.getVectorElementType() != MVT::bf16 &&
+ if (VT.isFloatingPoint() && VT.getVectorElementType() != MVT::bf16 &&
(VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
for (unsigned Opcode :
{ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM,
@@ -2093,8 +2139,8 @@ static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
if (NewImm == 0 || NewImm == OrigMask) {
New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
TLO.DAG.getConstant(NewImm, DL, VT));
- // Otherwise, create a machine node so that target independent DAG combine
- // doesn't undo this optimization.
+ // Otherwise, create a machine node so that target independent DAG combine
+ // doesn't undo this optimization.
} else {
Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
@@ -2224,7 +2270,8 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode(
Intrinsic::ID IntID =
static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
switch (IntID) {
- default: return;
+ default:
+ return;
case Intrinsic::aarch64_ldaxr:
case Intrinsic::aarch64_ldxr: {
unsigned BitWidth = Known.getBitWidth();
@@ -2246,7 +2293,7 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode(
MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
unsigned BitWidth = Known.getBitWidth();
if (VT == MVT::v8i8 || VT == MVT::v16i8) {
- unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
+ unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
assert(BitWidth >= Bound && "Unexpected width!");
APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - Bound);
Known.Zero |= Mask;
@@ -2284,26 +2331,26 @@ unsigned AArch64TargetLowering::ComputeNumSignBitsForTargetNode(
unsigned VTBits = VT.getScalarSizeInBits();
unsigned Opcode = Op.getOpcode();
switch (Opcode) {
- case AArch64ISD::CMEQ:
- case AArch64ISD::CMGE:
- case AArch64ISD::CMGT:
- case AArch64ISD::CMHI:
- case AArch64ISD::CMHS:
- case AArch64ISD::FCMEQ:
- case AArch64ISD::FCMGE:
- case AArch64ISD::FCMGT:
- case AArch64ISD::CMEQz:
- case AArch64ISD::CMGEz:
- case AArch64ISD::CMGTz:
- case AArch64ISD::CMLEz:
- case AArch64ISD::CMLTz:
- case AArch64ISD::FCMEQz:
- case AArch64ISD::FCMGEz:
- case AArch64ISD::FCMGTz:
- case AArch64ISD::FCMLEz:
- case AArch64ISD::FCMLTz:
- // Compares return either 0 or all-ones
- return VTBits;
+ case AArch64ISD::CMEQ:
+ case AArch64ISD::CMGE:
+ case AArch64ISD::CMGT:
+ case AArch64ISD::CMHI:
+ case AArch64ISD::CMHS:
+ case AArch64ISD::FCMEQ:
+ case AArch64ISD::FCMGE:
+ case AArch64ISD::FCMGT:
+ case AArch64ISD::CMEQz:
+ case AArch64ISD::CMGEz:
+ case AArch64ISD::CMGTz:
+ case AArch64ISD::CMLEz:
+ case AArch64ISD::CMLTz:
+ case AArch64ISD::FCMEQz:
+ case AArch64ISD::FCMGEz:
+ case AArch64ISD::FCMGTz:
+ case AArch64ISD::FCMLEz:
+ case AArch64ISD::FCMLTz:
+ // Compares return either 0 or all-ones
+ return VTBits;
}
return 1;
@@ -2758,8 +2805,9 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
return EndBB;
}
-MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
- MachineInstr &MI, MachineBasicBlock *BB) const {
+MachineBasicBlock *
+AArch64TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
assert(!isAsynchronousEHPersonality(classifyEHPersonality(
BB->getParent()->getFunction().getPersonalityFn())) &&
"SEH does not use catchret!");
@@ -2832,10 +2880,11 @@ MachineBasicBlock *AArch64TargetLowering::EmitZTInstr(MachineInstr &MI,
return BB;
}
-MachineBasicBlock *
-AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
- MachineInstr &MI,
- MachineBasicBlock *BB, bool HasTile) const {
+MachineBasicBlock *AArch64TargetLowering::EmitZAInstr(unsigned Opc,
+ unsigned BaseReg,
+ MachineInstr &MI,
+ MachineBasicBlock *BB,
+ bool HasTile) const {
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
unsigned StartIdx = 0;
@@ -3230,10 +3279,9 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
// Similarly, (CMP (and X, Y), 0) can be implemented with a TST
// (a.k.a. ANDS) except that the flags are only guaranteed to work for one
// of the signed comparisons.
- const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
- DAG.getVTList(VT, MVT_CC),
- LHS.getOperand(0),
- LHS.getOperand(1));
+ const SDValue ANDSNode =
+ DAG.getNode(AArch64ISD::ANDS, dl, DAG.getVTList(VT, MVT_CC),
+ LHS.getOperand(0), LHS.getOperand(1));
// Replace all users of (and X, Y) with newly generated (ands X, Y)
DAG.ReplaceAllUsesWith(LHS, ANDSNode);
return ANDSNode.getValue(1);
@@ -3369,11 +3417,11 @@ static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
SDValue O1 = Val->getOperand(1);
bool CanNegateL;
bool MustBeFirstL;
- if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
+ if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth + 1))
return false;
bool CanNegateR;
bool MustBeFirstR;
- if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
+ if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth + 1))
return false;
if (MustBeFirstL && MustBeFirstR)
@@ -3410,8 +3458,8 @@ static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
/// \p Negate is true if we want this sub-tree being negated just by changing
/// SETCC conditions.
static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
- AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
- AArch64CC::CondCode Predicate) {
+ AArch64CC::CondCode &OutCC, bool Negate,
+ SDValue CCOp, AArch64CC::CondCode Predicate) {
// We're at a tree leaf, produce a conditional comparison operation.
unsigned Opcode = Val->getOpcode();
if (Opcode == ISD::SETCC) {
@@ -3605,8 +3653,7 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
case ISD::SETGT:
if ((VT == MVT::i32 && C != INT32_MAX &&
isLegalArithImmed((uint32_t)(C + 1))) ||
- (VT == MVT::i64 && C != INT64_MAX &&
- isLegalArithImmed(C + 1ULL))) {
+ (VT == MVT::i64 && C != INT64_MAX && isLegalArithImmed(C + 1ULL))) {
CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
RHS = DAG.getConstant(C, dl, VT);
@@ -3676,9 +3723,9 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
SDValue SExt =
DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
DAG.getValueType(MVT::i16));
- Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
- RHS.getValueType()),
- CC, dl, DAG);
+ Cmp = emitComparison(
+ SExt, DAG.getConstant(ValueofRHS, dl, RHS.getValueType()), CC, dl,
+ DAG);
AArch64CC = changeIntCCToAArch64CC(CC);
}
}
@@ -3770,10 +3817,9 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
} else {
SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
- Overflow =
- DAG.getNode(AArch64ISD::SUBS, DL, VTs,
- DAG.getConstant(0, DL, MVT::i64),
- UpperBits).getValue(1);
+ Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs,
+ DAG.getConstant(0, DL, MVT::i64), UpperBits)
+ .getValue(1);
}
break;
}
@@ -3956,8 +4002,8 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
// too. This will allow it to be selected to a single instruction:
// CSINC Wd, WZR, WZR, invert(cond).
SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
- Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
- CCVal, Overflow);
+ Overflow =
+ DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal, CCVal, Overflow);
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
@@ -3986,10 +4032,10 @@ static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
}
// built the mask value encoding the expected behavior.
- unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
- (!IsData << 3) | // IsDataCache bit
- (Locality << 1) | // Cache level bits
- (unsigned)IsStream; // Stream bit
+ unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
+ (!IsData << 3) | // IsDataCache bit
+ (Locality << 1) | // Cache level bits
+ (unsigned)IsStream; // Stream bit
return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
DAG.getTargetConstant(PrfOp, DL, MVT::i32),
Op.getOperand(1));
@@ -4055,8 +4101,7 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
unsigned NumElts = InVT.getVectorNumElements();
// f16 conversions are promoted to f32 when full fp16 is not supported.
- if (InVT.getVectorElementType() == MVT::f16 &&
- !Subtarget->hasFullFP16()) {
+ if (InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) {
MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
SDLoc dl(Op);
if (IsStrict) {
@@ -4138,9 +4183,8 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
return DAG.getNode(Op.getOpcode(), dl, {Op.getValueType(), MVT::Other},
{Ext.getValue(1), Ext.getValue(0)});
}
- return DAG.getNode(
- Op.getOpcode(), dl, Op.getValueType(),
- DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
+ return DAG.getNode(Op.getOpcode(), dl, Op.getValueType(),
+ DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
}
if (SrcVal.getValueType() != MVT::f128) {
@@ -4268,8 +4312,8 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
} else {
- SDValue MinC = DAG.getConstant(
- APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT);
+ SDValue MinC =
+ DAG.getConstant(APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT);
Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
}
@@ -4314,8 +4358,7 @@ SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
InVT.getVectorNumElements());
if (IsStrict) {
- In = DAG.getNode(Opc, dl, {CastVT, MVT::Other},
- {Op.getOperand(0), In});
+ In = DAG.getNode(Opc, dl, {CastVT, MVT::Other}, {Op.getOperand(0), In});
return DAG.getNode(
ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
{In.getValue(1), In.getValue(0), DAG.getIntPtrConstant(0, dl)});
@@ -4337,9 +4380,9 @@ SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
// Use a scalar operation for conversions between single-element vectors of
// the same size.
if (VT.getVectorNumElements() == 1) {
- SDValue Extract = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
- In, DAG.getConstant(0, dl, MVT::i64));
+ SDValue Extract =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(), In,
+ DAG.getConstant(0, dl, MVT::i64));
EVT ScalarVT = VT.getScalarType();
if (IsStrict)
return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
@@ -4351,7 +4394,7 @@ SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
}
SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
- SelectionDAG &DAG) const {
+ SelectionDAG &DAG) const {
if (Op.getValueType().isVector())
return LowerVectorINT_TO_FP(Op, DAG);
@@ -4368,10 +4411,9 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
ISD::STRICT_FP_ROUND, dl, {MVT::f16, MVT::Other},
{Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
}
- return DAG.getNode(
- ISD::FP_ROUND, dl, MVT::f16,
- DAG.getNode(Op.getOpcode(), dl, MVT::f32, SrcVal),
- DAG.getIntPtrConstant(0, dl));
+ return DAG.getNode(ISD::FP_ROUND, dl, MVT::f16,
+ DAG.getNode(Op.getOpcode(), dl, MVT::f32, SrcVal),
+ DAG.getIntPtrConstant(0, dl));
}
// i128 conversions are libcalls.
@@ -4403,8 +4445,8 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
Entry.IsZExt = false;
Args.push_back(Entry);
- RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
- : RTLIB::SINCOS_STRET_F32;
+ RTLIB::Libcall LC =
+ ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
const char *LibcallName = getLibcallName(LC);
SDValue Callee =
DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
@@ -4472,12 +4514,13 @@ static EVT getExtensionTo64Bits(const EVT &OrigVT) {
MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
switch (OrigSimpleTy) {
- default: llvm_unreachable("Unexpected Vector Type");
+ default:
+ llvm_unreachable("Unexpected Vector Type");
case MVT::v2i8:
case MVT::v2i16:
- return MVT::v2i32;
+ return MVT::v2i32;
case MVT::v4i8:
- return MVT::v4i16;
+ return MVT::v4i16;
}
}
@@ -4587,8 +4630,8 @@ static bool isAddSubSExt(SDValue N, SelectionDAG &DAG) {
if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
SDValue N0 = N.getOperand(0);
SDValue N1 = N.getOperand(1);
- return N0->hasOneUse() && N1->hasOneUse() &&
- isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
+ return N0->hasOneUse() && N1->hasOneUse() && isSignExtended(N0, DAG) &&
+ isSignExtended(N1, DAG);
}
return false;
}
@@ -4598,8 +4641,8 @@ static bool isAddSubZExt(SDValue N, SelectionDAG &DAG) {
if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
SDValue N0 = N.getOperand(0);
SDValue N1 = N.getOperand(1);
- return N0->hasOneUse() && N1->hasOneUse() &&
- isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
+ return N0->hasOneUse() && N1->hasOneUse() && isZeroExtended(N0, DAG) &&
+ isZeroExtended(N1, DAG);
}
return false;
}
@@ -4694,8 +4737,7 @@ static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
else
ZextOperand = N1.getOperand(0);
if (DAG.SignBitIsZero(ZextOperand)) {
- SDValue NewSext =
- DAG.getSExtOrTrunc(ZextOperand, DL, N0.getValueType());
+ SDValue NewSext = DAG.getSExtOrTrunc(ZextOperand, DL, N0.getValueType());
if (IsN0ZExt)
N0 = NewSext;
else
@@ -4909,8 +4951,8 @@ SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
TargetLowering::CallLoweringInfo CLI(DAG);
ArgListTy Args;
CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
- CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2,
- RetTy, Callee, std::move(Args));
+ CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2, RetTy,
+ Callee, std::move(Args));
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
@@ -5076,12 +5118,14 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
}
}
-SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue
+AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
+ SelectionDAG &DAG) const {
unsigned IntNo = Op.getConstantOperandVal(0);
SDLoc dl(Op);
switch (IntNo) {
- default: return SDValue(); // Don't custom lower most intrinsics.
+ default:
+ return SDValue(); // Don't custom lower most intrinsics.
case Intrinsic::thread_pointer: {
EVT PtrVT = getPointerTy(DAG.getDataLayout());
return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
@@ -5089,8 +5133,8 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::aarch64_neon_abs: {
EVT Ty = Op.getValueType();
if (Ty == MVT::i64) {
- SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
- Op.getOperand(1));
+ SDValue Result =
+ DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Op.getOperand(1));
Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
} else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
@@ -5158,17 +5202,17 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(AArch64ISD::PMULL, dl, Op.getValueType(), LHS, RHS);
}
case Intrinsic::aarch64_neon_smax:
- return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
+ return DAG.getNode(ISD::SMAX, dl, Op.getValueType(), Op.getOperand(1),
+ Op.getOperand(2));
case Intrinsic::aarch64_neon_umax:
- return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
+ return DAG.getNode(ISD::UMAX, dl, Op.getValueType(), Op.getOperand(1),
+ Op.getOperand(2));
case Intrinsic::aarch64_neon_smin:
- return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
+ return DAG.getNode(ISD::SMIN, dl, Op.getValueType(), Op.getOperand(1),
+ Op.getOperand(2));
case Intrinsic::aarch64_neon_umin:
- return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
+ return DAG.getNode(ISD::UMIN, dl, Op.getValueType(), Op.getOperand(1),
+ Op.getOperand(2));
case Intrinsic::aarch64_neon_scalar_sqxtn:
case Intrinsic::aarch64_neon_scalar_sqxtun:
case Intrinsic::aarch64_neon_scalar_uqxtn: {
@@ -5233,8 +5277,8 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(ISD::VECTOR_REVERSE, dl, Op.getValueType(),
Op.getOperand(1));
case Intrinsic::aarch64_sve_tbl:
- return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),
- Op.getOperand(1), Op.getOperand(2));
+ return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(), Op.getOperand(1),
+ Op.getOperand(2));
case Intrinsic::aarch64_sve_trn1:
return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
@@ -5309,8 +5353,9 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(),
Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
case Intrinsic::aarch64_sve_frinti:
- return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(),
- Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
+ return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl,
+ Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
+ Op.getOperand(1));
case Intrinsic::aarch64_sve_frintx:
return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(),
Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
@@ -5318,8 +5363,9 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(),
Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
case Intrinsic::aarch64_sve_frintn:
- return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl, Op.getValueType(),
- Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
+ return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl,
+ Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
+ Op.getOperand(1));
case Intrinsic::aarch64_sve_frintz:
return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(),
Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
@@ -5332,13 +5378,11 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
Op.getOperand(1));
case Intrinsic::aarch64_sve_fcvtzu:
- return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, dl,
- Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
- Op.getOperand(1));
+ return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, dl, Op.getValueType(),
+ Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
case Intrinsic::aarch64_sve_fcvtzs:
- return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, dl,
- Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
- Op.getOperand(1));
+ return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, dl, Op.getValueType(),
+ Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
case Intrinsic::aarch64_sve_fsqrt:
return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(),
Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
@@ -5836,9 +5880,8 @@ SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
}
// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
-static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
- EVT VT, EVT MemVT,
- SelectionDAG &DAG) {
+static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST, EVT VT,
+ EVT MemVT, SelectionDAG &DAG) {
assert(VT.isVector() && "VT should be a vector type");
assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
@@ -5852,29 +5895,28 @@ static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
// str s0, [x0]
SDValue Undef = DAG.getUNDEF(MVT::i16);
- SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
- {Undef, Undef, Undef, Undef});
+ SDValue UndefVec =
+ DAG.getBuildVector(MVT::v4i16, DL, {Undef, Undef, Undef, Undef});
- SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
- Value, UndefVec);
+ SDValue TruncExt =
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Value, UndefVec);
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
Trunc, DAG.getConstant(0, DL, MVT::i64));
- return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
- ST->getBasePtr(), ST->getMemOperand());
+ return DAG.getStore(ST->getChain(), DL, ExtractTrunc, ST->getBasePtr(),
+ ST->getMemOperand());
}
// Custom lowering for any store, vector or scalar and/or default or with
// a truncate operations. Currently only custom lower truncate operation
// from vector v4i16 to v4i8 or volatile stores of i128.
-SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue AArch64TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
SDLoc Dl(Op);
StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
- assert (StoreNode && "Can only custom lower store nodes");
+ assert(StoreNode && "Can only custom lower store nodes");
SDValue Value = StoreNode->getValue();
@@ -5934,8 +5976,8 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
SDValue Base = StoreNode->getBasePtr();
EVT PtrVT = Base.getValueType();
for (unsigned i = 0; i < 8; i++) {
- SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
- Value, DAG.getConstant(i, Dl, MVT::i32));
+ SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64, Value,
+ DAG.getConstant(i, Dl, MVT::i32));
SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
DAG.getConstant(i * 8, Dl, PtrVT));
Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
@@ -5979,8 +6021,7 @@ SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
return Result;
}
-SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue AArch64TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
assert(LoadNode && "Expected custom lowering of a load node");
@@ -5993,9 +6034,9 @@ SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
for (unsigned i = 0; i < 8; i++) {
SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
DAG.getConstant(i * 8, DL, PtrVT));
- SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr,
- LoadNode->getPointerInfo(),
- LoadNode->getOriginalAlign());
+ SDValue Part =
+ DAG.getLoad(MVT::i64, DL, Chain, Ptr, LoadNode->getPointerInfo(),
+ LoadNode->getOriginalAlign());
Ops.push_back(Part);
Chain = SDValue(Part.getNode(), 1);
}
@@ -6043,9 +6084,8 @@ SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
Op.getOperand(0));
// Generate SUBS & CSEL.
- SDValue Cmp =
- DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
- Op.getOperand(0), DAG.getConstant(0, DL, VT));
+ SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
+ Op.getOperand(0), DAG.getConstant(0, DL, VT));
return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
Cmp.getValue(1));
@@ -6611,7 +6651,6 @@ AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
}
}
-
unsigned
AArch64TargetLowering::allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL,
SelectionDAG &DAG) const {
@@ -6667,7 +6706,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
SmallVector<ISD::OutputArg, 4> Outs;
GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
DAG.getTargetLoweringInfo(), MF.getDataLayout());
- if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
+ if (any_of(Outs,
+ [](ISD::OutputArg &Out) { return Out.VT.isScalableVector(); }))
FuncInfo->setIsSVECC(true);
// Assign locations to all of the incoming arguments.
@@ -6728,10 +6768,10 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
int Size = Ins[i].Flags.getByValSize();
unsigned NumRegs = (Size + 7) / 8;
- // FIXME: This works on big-endian for composite byvals, which are the common
- // case. It should also work for fundamental types too.
+ // FIXME: This works on big-endian for composite byvals, which are the
+ // common case. It should also work for fundamental types too.
unsigned FrameIdx =
- MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
+ MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
InVals.push_back(FrameIdxN);
@@ -6822,7 +6862,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
unsigned ArgOffset = VA.getLocMemOffset();
unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
? VA.getLocVT().getSizeInBits()
- : VA.getValVT().getSizeInBits()) / 8;
+ : VA.getValVT().getSizeInBits()) /
+ 8;
uint32_t BEAlign = 0;
if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
@@ -6885,8 +6926,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
}
if (VA.getLocInfo() == CCValAssign::Indirect) {
- assert((VA.getValVT().isScalableVT() ||
- Subtarget->isWindowsArm64EC()) &&
+ assert((VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
"Indirect arguments should be scalable on most subtargets");
uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
@@ -6965,12 +7005,11 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
// Ensure that the SMSTART happens after the CopyWithChain such that its
// chain result is used.
- for (unsigned I=0; I<InVals.size(); ++I) {
+ for (unsigned I = 0; I < InVals.size(); ++I) {
Register Reg = MF.getRegInfo().createVirtualRegister(
getRegClassFor(InVals[I].getValueType().getSimpleVT()));
Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
- InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg,
- InVals[I].getValueType());
+ InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg, InVals[I].getValueType());
}
}
@@ -6980,8 +7019,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
// The AAPCS variadic function ABI is identical to the non-variadic
// one. As a result there may be more arguments in registers and we should
// save them for future reference.
- // Win64 variadic functions also pass arguments in registers, but all float
- // arguments are passed in integer registers.
+ // Win64 variadic functions also pass arguments in registers, but all
+ // float arguments are passed in integer registers.
saveVarArgRegisters(CCInfo, DAG, DL, Chain);
}
@@ -6999,7 +7038,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
RegParmTypes.push_back(MVT::f128);
// Compute the set of forwarded registers. The rest are scratch.
SmallVectorImpl<ForwardedRegister> &Forwards =
- FuncInfo->getForwardedMustTailRegParms();
+ FuncInfo->getForwardedMustTailRegParms();
CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
CC_AArch64_AAPCS);
@@ -7073,7 +7112,8 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
MachineFrameInfo &MFI = MF.getFrameInfo();
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
auto PtrVT = getPointerTy(DAG.getDataLayout());
- bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
+ bool IsWin64 =
+ Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
SmallVector<SDValue, 8> MemOps;
@@ -7093,7 +7133,8 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
if (GPRSaveSize & 15)
// The extra size here, if triggered, will always be 8.
- MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
+ MFI.CreateFixedObject(16 - (GPRSaveSize & 15),
+ -(int)alignTo(GPRSaveSize, 16), false);
} else
GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
@@ -7281,9 +7322,9 @@ static void analyzeCallOperands(const AArch64TargetLowering &TLI,
if (!UseVarArgCC) {
// Get type of the original argument.
- EVT ActualVT =
- TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
- /*AllowUnknown*/ true);
+ EVT ActualVT = TLI.getValueType(DAG.getDataLayout(),
+ CLI.Args[Outs[i].OrigArgIndex].Ty,
+ /*AllowUnknown*/ true);
MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
// If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
@@ -7360,7 +7401,8 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
return false;
}
- if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
+ if (canGuaranteeTCO(CalleeCC,
+ getTargetMachine().Options.GuaranteedTailCallOpt))
return CCMatch;
// Externally-defined functions with weak linkage should not be
@@ -7416,10 +7458,11 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
- // When we are musttail, additional checks have been done and we can safely ignore this check
- // At least two cases here: if caller is fastcc then we can't have any
- // memory arguments (we'd be expected to clean up the stack afterwards). If
- // caller is C then we could potentially use its argument area.
+ // When we are musttail, additional checks have been done and we can safely
+ // ignore this check At least two cases here: if caller is fastcc then we
+ // can't have any memory arguments (we'd be expected to clean up the stack
+ // afterwards). If caller is C then we could potentially use its argument
+ // area.
// FIXME: for now we take the most conservative of these in both cases:
// disallow all variadic memory operands.
@@ -7520,9 +7563,11 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
MI.removeOperand(I);
}
-SDValue AArch64TargetLowering::changeStreamingMode(
- SelectionDAG &DAG, SDLoc DL, bool Enable,
- SDValue Chain, SDValue InGlue, SDValue PStateSM, bool Entry) const {
+SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
+ bool Enable, SDValue Chain,
+ SDValue InGlue,
+ SDValue PStateSM,
+ bool Entry) const {
MachineFunction &MF = DAG.getMachineFunction();
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
FuncInfo->setHasStreamingModeChanges(true);
@@ -7693,7 +7738,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
if (RequiresLazySave) {
unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj();
MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj);
- SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj,
+ SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
+ TPIDR2Obj,
DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
SDValue NumZaSaveSlicesAddr =
DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
@@ -7708,10 +7754,10 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
TPIDR2ObjAddr);
OptimizationRemarkEmitter ORE(&MF.getFunction());
ORE.emit([&]() {
- auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
- CLI.CB)
- : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
- &MF.getFunction());
+ auto R = CLI.CB
+ ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA", CLI.CB)
+ : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
+ &MF.getFunction());
return DescribeCallsite(R) << " sets up a lazy save for ZA";
});
}
@@ -7727,10 +7773,10 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
OptimizationRemarkEmitter ORE(&MF.getFunction());
ORE.emit([&]() {
- auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
- CLI.CB)
- : OptimizationRemarkAnalysis("sme", "SMETransition",
- &MF.getFunction());
+ auto R = CLI.CB
+ ? OptimizationRemarkAnalysis("sme", "SMETransition", CLI.CB)
+ : OptimizationRemarkAnalysis("sme", "SMETransition",
+ &MF.getFunction());
DescribeCallsite(R) << " requires a streaming mode transition";
return R;
});
@@ -7781,7 +7827,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
for (const auto &F : Forwards) {
SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
- RegsToPass.emplace_back(F.PReg, Val);
+ RegsToPass.emplace_back(F.PReg, Val);
}
}
@@ -8043,8 +8089,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into the appropriate regs.
for (auto &RegToPass : RegsToPass) {
- Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
- RegToPass.second, InGlue);
+ Chain =
+ DAG.getCopyToReg(Chain, DL, RegToPass.first, RegToPass.second, InGlue);
InGlue = Chain.getValue(1);
}
@@ -8097,8 +8143,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Add argument registers to the end of the list so that they are known live
// into the call.
for (auto &RegToPass : RegsToPass)
- Ops.push_back(DAG.getRegister(RegToPass.first,
- RegToPass.second.getValueType()));
+ Ops.push_back(
+ DAG.getRegister(RegToPass.first, RegToPass.second.getValueType()));
// Add a register mask operand representing the call-preserved registers.
const uint32_t *Mask;
@@ -8243,8 +8289,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
Register Reg = MF.getRegInfo().createVirtualRegister(
getRegClassFor(InVals[I].getValueType().getSimpleVT()));
SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
- InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
- InVals[I].getValueType());
+ InVals[I] = DAG.getCopyFromReg(X, DL, Reg, InVals[I].getValueType());
}
}
@@ -8365,7 +8410,7 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
Glue = Chain.getValue(1);
RetOps.push_back(
- DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
+ DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
}
const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
@@ -8428,7 +8473,7 @@ SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
N->getOffset(), Flag);
}
-SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
+SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode *N, EVT Ty,
SelectionDAG &DAG,
unsigned Flag) const {
return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
@@ -8643,8 +8688,7 @@ SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
HiVar,
DAG.getTargetConstant(0, DL, MVT::i32)),
0);
- return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
- LoVar,
+ return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr, LoVar,
DAG.getTargetConstant(0, DL, MVT::i32)),
0);
}
@@ -8851,8 +8895,8 @@ AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
// The pointer to the thread's TLS data area is at the TLS Index scaled by 8
// offset into the TLSArray.
TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
- SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
- DAG.getConstant(3, DL, PtrVT));
+ SDValue Slot =
+ DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex, DAG.getConstant(3, DL, PtrVT));
SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
MachinePointerInfo());
@@ -9562,8 +9606,8 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
// (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
// (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
// Both require less instructions than compare and conditional select.
- if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
- RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
+ if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal && RHSC &&
+ RHSC->isZero() && CFVal && CFVal->isZero() &&
LHS.getValueType() == RHS.getValueType()) {
EVT VT = LHS.getValueType();
SDValue Shift =
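As a side note on the transform the comment above describes: the sign bit of a signed value, broadcast with an arithmetic shift right, acts as the select mask. A minimal standalone sketch in plain C++ (scalar int32_t instead of SDNodes; assumes arithmetic right shift for signed values, which C++20 guarantees and AArch64 provides):

  #include <cassert>
  #include <cstdint>

  // max(x, 0): clear x when its sign bit is set   -> BIC x, (SRA x, 31)
  int32_t selGtZero(int32_t x) { return x & ~(x >> 31); }
  // min(x, 0): keep x only when its sign bit is set -> AND x, (SRA x, 31)
  int32_t selLtZero(int32_t x) { return x & (x >> 31); }

  int main() {
    for (int32_t x : {-5, -1, 0, 1, 42}) {
      assert(selGtZero(x) == (x > 0 ? x : 0));
      assert(selLtZero(x) == (x < 0 ? x : 0));
    }
  }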
@@ -9677,7 +9721,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
FVal = LHS;
} else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
- assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
+ assert(CTVal && CFVal && "Expected constant operands for CSNEG.");
// Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
// avoid materializing C.
AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
@@ -9801,8 +9845,7 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
if (Ty == MVT::aarch64svcount) {
TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
- SDValue Sel =
- DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
+ SDValue Sel = DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
return DAG.getNode(ISD::BITCAST, DL, Ty, Sel);
}
@@ -9885,8 +9928,7 @@ SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
return getAddr(JT, DAG);
}
-SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
// Jump table entries as PC relative offsets. No additional tweaking
// is necessary here. Just get the address of the jump table.
SDLoc DL(Op);
@@ -9922,7 +9964,7 @@ SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
}
SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
- SelectionDAG &DAG) const {
+ SelectionDAG &DAG) const {
BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op);
CodeModel::Model CM = getTargetMachine().getCodeModel();
if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
@@ -9935,7 +9977,7 @@ SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
}
SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
- SelectionDAG &DAG) const {
+ SelectionDAG &DAG) const {
AArch64FunctionInfo *FuncInfo =
DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
@@ -10077,9 +10119,9 @@ SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
SDLoc DL(Op);
unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
unsigned VaListSize =
- (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
- ? PtrSize
- : Subtarget->isTargetILP32() ? 20 : 32;
+ (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows()) ? PtrSize
+ : Subtarget->isTargetILP32() ? 20
+ : 32;
const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
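For context on the 20/32-byte constants: they are the size of the ILP32 and LP64 AAPCS64 va_list structure respectively, while Darwin and Windows use a single pointer. A rough sketch of the LP64 layout, with field names following the ABI document (illustrative only, not the type the backend manipulates):

  #include <cstdint>

  // AAPCS64 variadic state: a stacked-argument pointer, the tops of the
  // general-register and FP/SIMD-register save areas, and two offsets.
  struct aapcs64_va_list {
    void *stack;     // next stacked argument
    void *gr_top;    // end of the GP save area
    void *vr_top;    // end of the FP/SIMD save area
    int32_t gr_offs; // negative offset from gr_top
    int32_t vr_offs; // negative offset from vr_top
  };
  static_assert(sizeof(aapcs64_va_list) == 32, "LP64: 8+8+8+4+4");

  int main() {}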
@@ -10151,7 +10193,7 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
SDValue NarrowFP =
DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
- SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
+ SDValue Ops[] = {NarrowFP, WideFP.getValue(1)};
// Merge the rounded value with the chain output of the load.
return DAG.getMergeValues(Ops, DL);
}
@@ -10195,8 +10237,9 @@ SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
-Register AArch64TargetLowering::
-getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
+Register
+AArch64TargetLowering::getRegisterByName(const char *RegName, LLT VT,
+ const MachineFunction &MF) const {
Register Reg = MatchRegisterName(RegName);
if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
@@ -10207,8 +10250,8 @@ getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const
}
if (Reg)
return Reg;
- report_fatal_error(Twine("Invalid register name \""
- + StringRef(RegName) + "\"."));
+ report_fatal_error(
+ Twine("Invalid register name \"" + StringRef(RegName) + "\"."));
}
SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
@@ -10314,7 +10357,8 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
}
LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
- << " imm value: "; Imm.dump(););
+ << " imm value: ";
+ Imm.dump(););
return IsLegal;
}
@@ -10380,8 +10424,8 @@ SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
// Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
// AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
for (int i = ExtraSteps; i > 0; --i) {
- SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
- Flags);
+ SDValue Step =
+ DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate, Flags);
Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
}
@@ -10410,8 +10454,8 @@ SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
// Newton reciprocal iteration: E * (2 - X * E)
// AArch64 reciprocal iteration instruction: (2 - M * N)
for (int i = ExtraSteps; i > 0; --i) {
- SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
- Estimate, Flags);
+ SDValue Step =
+ DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand, Estimate, Flags);
Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
}
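Both of these loops are the Newton-Raphson refinements spelled out in the comments. A scalar sketch of the same recurrences (plain float arithmetic rather than the FRSQRTS/FRECPS node forms):

  #include <cmath>
  #include <cstdio>

  // One step towards 1/sqrt(X):  E' = E * 0.5 * (3 - X * E^2)
  float rsqrtStep(float X, float E) { return E * 0.5f * (3.0f - X * E * E); }
  // One step towards 1/X:        E' = E * (2 - X * E)
  float recipStep(float X, float E) { return E * (2.0f - X * E); }

  int main() {
    float X = 2.0f, rs = 0.7f, rc = 0.4f; // rough initial estimates
    for (int i = 0; i < 3; ++i) {
      rs = rsqrtStep(X, rs);
      rc = recipStep(X, rc);
    }
    std::printf("1/sqrt(2) ~ %f (libm: %f)\n", rs, 1.0f / std::sqrt(X));
    std::printf("1/2       ~ %f\n", rc);
  }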
@@ -10463,9 +10507,8 @@ const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
if (ConstraintVT.isFloatingPoint())
return "w";
- if (ConstraintVT.isVector() &&
- (ConstraintVT.getSizeInBits() == 64 ||
- ConstraintVT.getSizeInBits() == 128))
+ if (ConstraintVT.isVector() && (ConstraintVT.getSizeInBits() == 64 ||
+ ConstraintVT.getSizeInBits() == 128))
return "w";
return "r";
@@ -11069,10 +11112,10 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
int WindowScale;
ShuffleSourceInfo(SDValue Vec)
- : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
+ : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
- bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
+ bool operator==(SDValue OtherVec) { return Vec == OtherVec; }
};
// First gather all vectors used as an immediate source for this BUILD_VECTOR
@@ -11158,8 +11201,8 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
TBLMask.push_back(DAG.getConstant(Mask[i], dl, MVT::i32));
assert((Mask.size() == 8 || Mask.size() == 16) &&
"Expected a v8i8 or v16i8 Mask");
- TBLOperands.push_back(
- DAG.getBuildVector(Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, dl, TBLMask));
+ TBLOperands.push_back(DAG.getBuildVector(
+ Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, dl, TBLMask));
SDValue Shuffle =
DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
@@ -11250,14 +11293,15 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
if (!SrcVT.is64BitVector()) {
LLVM_DEBUG(
- dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
- "for SVE vectors.");
+ dbgs()
+ << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
+ "for SVE vectors.");
return SDValue();
}
- Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
- VEXTSrc2,
- DAG.getConstant(Imm, dl, MVT::i32));
+ Src.ShuffleVec =
+ DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1, VEXTSrc2,
+ DAG.getConstant(Imm, dl, MVT::i32));
Src.WindowBase = -Src.MinElt;
}
}
@@ -11301,8 +11345,8 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
// trunc. So only std::min(SrcBits, DestBits) actually get defined in this
// segment.
EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
- int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
- VT.getScalarSizeInBits());
+ int BitsDefined =
+ std::min(OrigEltTy.getScalarSizeInBits(), VT.getScalarSizeInBits());
int LanesDefined = BitsDefined / BitsPerShuffleLane;
// This source is expected to fill ResMultiplier lanes of the final shuffle,
@@ -11321,12 +11365,12 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
return SDValue();
}
- SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
+ SDValue ShuffleOps[] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
for (unsigned i = 0; i < Sources.size(); ++i)
ShuffleOps[i] = Sources[i].ShuffleVec;
- SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
- ShuffleOps[1], Mask);
+ SDValue Shuffle =
+ DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], ShuffleOps[1], Mask);
SDValue V;
if (DAG.getDataLayout().isBigEndian()) {
V = DAG.getNode(AArch64ISD::NVCAST, dl, VT, Shuffle);
@@ -11374,7 +11418,8 @@ static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
// v4i32s. This is really a truncate, which we can construct out of (legal)
// concats and truncate nodes.
-static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) {
+static SDValue ReconstructTruncateFromBuildVector(SDValue V,
+ SelectionDAG &DAG) {
if (V.getValueType() != MVT::v16i8)
return SDValue();
assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
@@ -11661,8 +11706,8 @@ static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
return true;
}
-static bool isINSMask(ArrayRef<int> M, int NumInputElements,
- bool &DstIsLeft, int &Anomaly) {
+static bool isINSMask(ArrayRef<int> M, int NumInputElements, bool &DstIsLeft,
+ int &Anomaly) {
if (M.size() != static_cast<size_t>(NumInputElements))
return false;
@@ -11750,11 +11795,11 @@ static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
/// the specified operations to build the shuffle. ID is the perfect-shuffle
-//ID, V1 and V2 are the original shuffle inputs. PFEntry is the Perfect shuffle
-//table entry and LHS/RHS are the immediate inputs for this stage of the
-//shuffle.
-static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1,
- SDValue V2, unsigned PFEntry, SDValue LHS,
+// ID, V1 and V2 are the original shuffle inputs. PFEntry is the Perfect shuffle
+// table entry and LHS/RHS are the immediate inputs for this stage of the
+// shuffle.
+static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2,
+ unsigned PFEntry, SDValue LHS,
SDValue RHS, SelectionDAG &DAG,
const SDLoc &dl) {
unsigned OpNum = (PFEntry >> 26) & 0x0F;
@@ -12256,9 +12301,9 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
for (unsigned LaneSize : {64U, 32U, 16U}) {
unsigned Lane = 0;
if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
- unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
- : LaneSize == 32 ? AArch64ISD::DUPLANE32
- : AArch64ISD::DUPLANE16;
+ unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
+ : LaneSize == 32 ? AArch64ISD::DUPLANE32
+ : AArch64ISD::DUPLANE16;
// Cast V1 to an integer vector with required lane size
MVT NewEltTy = MVT::getIntegerVT(LaneSize);
unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
@@ -12458,7 +12503,6 @@ SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
}
-
static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
APInt &UndefBits) {
EVT VT = BVN->getValueType(0);
@@ -12483,7 +12527,7 @@ static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
// Try 64-bit splatted SIMD immediate.
static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
- const APInt &Bits) {
+ const APInt &Bits) {
if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
EVT VT = Op.getValueType();
@@ -12493,8 +12537,8 @@ static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
SDLoc dl(Op);
- SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
- DAG.getConstant(Value, dl, MVT::i32));
+ SDValue Mov =
+ DAG.getNode(NewOp, dl, MovTy, DAG.getConstant(Value, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
}
@@ -12520,16 +12564,13 @@ static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
Shift = 0;
- }
- else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
+ } else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
Shift = 8;
- }
- else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
+ } else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
Shift = 16;
- }
- else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
+ } else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
Shift = 24;
}
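Types 1-4 handled above are the 32-bit "byte placed in one of the four byte lanes" immediates (LSL #0/#8/#16/#24). A small standalone check of the same property, using a hypothetical helper name rather than the AArch64_AM API:

  #include <cassert>
  #include <cstdint>

  // True if V is an 8-bit value sitting entirely in byte lane 0, 1, 2 or 3,
  // i.e. representable as an 8-bit immediate with LSL #0/#8/#16/#24.
  bool isShiftedByteImm(uint32_t V, uint8_t &Byte, unsigned &Shift) {
    for (Shift = 0; Shift < 32; Shift += 8)
      if ((V & ~(0xFFu << Shift)) == 0) {
        Byte = uint8_t(V >> Shift);
        return true;
      }
    return false;
  }

  int main() {
    uint8_t B; unsigned S;
    assert(isShiftedByteImm(0x0000AB00u, B, S) && B == 0xAB && S == 8);
    assert(!isShiftedByteImm(0x00012300u, B, S)); // spans two byte lanes
  }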
@@ -12544,9 +12585,9 @@ static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
DAG.getConstant(Value, dl, MVT::i32),
DAG.getConstant(Shift, dl, MVT::i32));
else
- Mov = DAG.getNode(NewOp, dl, MovTy,
- DAG.getConstant(Value, dl, MVT::i32),
- DAG.getConstant(Shift, dl, MVT::i32));
+ Mov =
+ DAG.getNode(NewOp, dl, MovTy, DAG.getConstant(Value, dl, MVT::i32),
+ DAG.getConstant(Shift, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
@@ -12573,8 +12614,7 @@ static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
Shift = 0;
- }
- else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
+ } else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
Shift = 8;
}
@@ -12589,9 +12629,9 @@ static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
DAG.getConstant(Value, dl, MVT::i32),
DAG.getConstant(Shift, dl, MVT::i32));
else
- Mov = DAG.getNode(NewOp, dl, MovTy,
- DAG.getConstant(Value, dl, MVT::i32),
- DAG.getConstant(Shift, dl, MVT::i32));
+ Mov =
+ DAG.getNode(NewOp, dl, MovTy, DAG.getConstant(Value, dl, MVT::i32),
+ DAG.getConstant(Shift, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
@@ -12613,17 +12653,16 @@ static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
Shift = 264;
- }
- else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
+ } else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
Shift = 272;
}
if (isAdvSIMDModImm) {
SDLoc dl(Op);
- SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
- DAG.getConstant(Value, dl, MVT::i32),
- DAG.getConstant(Shift, dl, MVT::i32));
+ SDValue Mov =
+ DAG.getNode(NewOp, dl, MovTy, DAG.getConstant(Value, dl, MVT::i32),
+ DAG.getConstant(Shift, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
}
@@ -12643,8 +12682,8 @@ static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
SDLoc dl(Op);
- SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
- DAG.getConstant(Value, dl, MVT::i32));
+ SDValue Mov =
+ DAG.getNode(NewOp, dl, MovTy, DAG.getConstant(Value, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
}
@@ -12665,17 +12704,16 @@ static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
- }
- else if (isWide &&
- (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
+ } else if (isWide &&
+ (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
MovTy = MVT::v2f64;
}
if (isAdvSIMDModImm) {
SDLoc dl(Op);
- SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
- DAG.getConstant(Value, dl, MVT::i32));
+ SDValue Mov =
+ DAG.getNode(NewOp, dl, MovTy, DAG.getConstant(Value, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
}
@@ -12887,16 +12925,15 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
if (resolveBuildVector(BVN, DefBits, UndefBits)) {
SDValue NewOp;
- if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
- DefBits, &LHS)) ||
- (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
- DefBits, &LHS)))
+ if ((NewOp =
+ tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG, DefBits, &LHS)) ||
+ (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG, DefBits, &LHS)))
return NewOp;
- if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
- UndefBits, &LHS)) ||
- (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
- UndefBits, &LHS)))
+ if ((NewOp =
+ tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG, UndefBits, &LHS)) ||
+ (NewOp =
+ tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG, UndefBits, &LHS)))
return NewOp;
}
@@ -12906,12 +12943,11 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
// Normalize the operands of BUILD_VECTOR. The value of constant operands will
// be truncated to fit element width.
-static SDValue NormalizeBuildVector(SDValue Op,
- SelectionDAG &DAG) {
+static SDValue NormalizeBuildVector(SDValue Op, SelectionDAG &DAG) {
assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
SDLoc dl(Op);
EVT VT = Op.getValueType();
- EVT EltTy= VT.getVectorElementType();
+ EVT EltTy = VT.getVectorElementType();
if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
return Op;
@@ -12923,8 +12959,7 @@ static SDValue NormalizeBuildVector(SDValue Op,
// (with operands cast to integers), then the only possibilities
// are constants and UNDEFs.
if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
- APInt LowBits(EltTy.getSizeInBits(),
- CstLane->getZExtValue());
+ APInt LowBits(EltTy.getSizeInBits(), CstLane->getZExtValue());
Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
} else if (Lane.getNode()->isUndef()) {
Lane = DAG.getUNDEF(MVT::i32);
@@ -13206,8 +13241,9 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
if (VT.getVectorElementType().isFloatingPoint()) {
SmallVector<SDValue, 8> Ops;
EVT EltTy = VT.getVectorElementType();
- assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
- EltTy == MVT::f64) && "Unsupported floating-point vector type");
+ assert((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
+ EltTy == MVT::f64) &&
+ "Unsupported floating-point vector type");
LLVM_DEBUG(
dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
"BITCASTS, and try again\n");
@@ -13812,8 +13848,9 @@ static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
APInt SplatBits, SplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
- if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
- HasAnyUndefs, ElementBits) ||
+ if (!BVN ||
+ !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
+ ElementBits) ||
SplatBitSize > ElementBits)
return false;
Cnt = SplatBits.getSExtValue();
@@ -13931,10 +13968,10 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
DAG.getConstant(Cnt, DL, MVT::i32));
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
- MVT::i32),
- Op.getOperand(0), Op.getOperand(1));
+ return DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL, MVT::i32),
+ Op.getOperand(0), Op.getOperand(1));
case ISD::SRA:
case ISD::SRL:
if (VT.isScalableVector() && Subtarget->hasSVE2orSME()) {
@@ -13969,10 +14006,9 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
// negate the shift amount
SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
Op.getOperand(1));
- SDValue NegShiftLeft =
- DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
- NegShift);
+ SDValue NegShiftLeft = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Opc, DL, MVT::i32),
+ Op.getOperand(0), NegShift);
return NegShiftLeft;
}
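The negation above relies on NEON's USHL/SSHL convention: a non-negative per-element shift amount shifts left, a negative one shifts right, so a variable-amount SRL/SRA becomes a shift-left by the negated amount. A scalar model of one unsigned lane (sketch only, ignoring out-of-range amounts):

  #include <cassert>
  #include <cstdint>

  // Left shift for s >= 0, logical right shift for s < 0 (the real
  // instruction reads the amount from the bottom byte of each lane).
  uint32_t ushlLane(uint32_t x, int8_t s) {
    return s >= 0 ? x << s : x >> uint8_t(-s);
  }

  int main() {
    assert(ushlLane(0xF0u, -3) == 0x1Eu); // x >> 3 expressed as ushl(x, -3)
    assert(ushlLane(0x01u, 4) == 0x10u);
  }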
@@ -14130,7 +14166,7 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
}
assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
- LHS.getValueType().getVectorElementType() != MVT::f128);
+ LHS.getValueType().getVectorElementType() != MVT::f128);
// Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
// clean. Some of them require two branches to implement.
@@ -14138,15 +14174,14 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
bool ShouldInvert;
changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
- bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
- SDValue Cmp =
- EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
+ bool NoNaNs =
+ getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
+ SDValue Cmp = EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
if (!Cmp.getNode())
return SDValue();
if (CC2 != AArch64CC::AL) {
- SDValue Cmp2 =
- EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
+ SDValue Cmp2 = EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
if (!Cmp2.getNode())
return SDValue();
@@ -14392,8 +14427,8 @@ AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
- SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
- PtrVT, 0);
+ SDValue Callee =
+ DAG.getTargetExternalSymbol(Subtarget->getChkStkName(), PtrVT, 0);
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
@@ -14761,7 +14796,7 @@ bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
// The shift can be combined if it matches the size of the value being
// loaded (and so reducing the width would make it not match).
uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
- uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
+ uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits() / 8;
if (ShiftAmount == Log2_32(LoadBytes))
return false;
}
@@ -14822,10 +14857,10 @@ bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
const DataLayout &DL = F->getParent()->getDataLayout();
Type *Ty = User->getOperand(0)->getType();
- return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
- isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
- (Options.AllowFPOpFusion == FPOpFusion::Fast ||
- Options.UnsafeFPMath));
+ return !(
+ isFMAFasterThanFMulAndFAdd(*F, Ty) &&
+ isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
+ (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath));
}
// All 32-bit GPR operations implicitly zero the high-half of the corresponding
@@ -14884,7 +14919,7 @@ bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
case Instruction::GetElementPtr: {
gep_type_iterator GTI = gep_type_begin(Instr);
auto &DL = Ext->getModule()->getDataLayout();
- std::advance(GTI, U.getOperandNo()-1);
+ std::advance(GTI, U.getOperandNo() - 1);
Type *IdxTy = GTI.getIndexedType();
// This extension will end up with a shift because of the scaling factor.
// 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
@@ -15231,7 +15266,8 @@ bool AArch64TargetLowering::shouldSinkOperands(
I->getParent() != IB->getParent())
return false;
- Ops.push_back(&MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
+ Ops.push_back(
+ &MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
Ops.push_back(&I->getOperandUse(0));
Ops.push_back(&I->getOperandUse(1));
@@ -16257,8 +16293,8 @@ bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
}
// Same encoding for add/sub, just flip the sign.
Immed = std::abs(Immed);
- bool IsLegal = ((Immed >> 12) == 0 ||
- ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
+ bool IsLegal =
+ ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
LLVM_DEBUG(dbgs() << "Is " << Immed
<< " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
return IsLegal;
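The predicate mirrors the ADD/SUB (immediate) encoding: an unsigned 12-bit value, optionally shifted left by 12. A standalone sketch of the same check with a few spot values (the INT64_MIN guard is added here only to keep std::abs well-defined):

  #include <cassert>
  #include <cstdint>
  #include <cstdlib>

  // Legal if the magnitude is imm12, or imm12 << 12.
  bool isLegalAddImm(int64_t Immed) {
    if (Immed == INT64_MIN)
      return false;
    Immed = std::abs(Immed);
    return (Immed >> 12) == 0 ||
           ((Immed & 0xfff) == 0 && (Immed >> 24) == 0);
  }

  int main() {
    assert(isLegalAddImm(4095));      // fits in 12 bits
    assert(isLegalAddImm(0x123000));  // a 12-bit value shifted by 12
    assert(!isLegalAddImm(4097));     // needs bits in both halves
    assert(!isLegalAddImm(1 << 24));  // too wide even when shifted
  }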
@@ -16302,8 +16338,9 @@ bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
- const AddrMode &AMode, Type *Ty,
- unsigned AS, Instruction *I) const {
+ const AddrMode &AMode,
+ Type *Ty, unsigned AS,
+ Instruction *I) const {
// AArch64 has five basic addressing modes:
// reg
// reg + 9-bit signed offset
@@ -16424,9 +16461,8 @@ AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
// LR is a callee-save register, but we must treat it as clobbered by any call
// site. Hence we include LR in the scratch registers, which are in turn added
// as implicit-defs for stackmaps and patchpoints.
- static const MCPhysReg ScratchRegs[] = {
- AArch64::X16, AArch64::X17, AArch64::LR, 0
- };
+ static const MCPhysReg ScratchRegs[] = {AArch64::X16, AArch64::X17,
+ AArch64::LR, 0};
return ScratchRegs;
}
@@ -16435,9 +16471,8 @@ ArrayRef<MCPhysReg> AArch64TargetLowering::getRoundingControlRegisters() const {
return RCRegs;
}
-bool
-AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
- CombineLevel Level) const {
+bool AArch64TargetLowering::isDesirableToCommuteWithShift(
+ const SDNode *N, CombineLevel Level) const {
assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
N->getOpcode() == ISD::SRL) &&
"Expected shift op");
@@ -16886,7 +16921,7 @@ AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
SmallVectorImpl<SDNode *> &Created) const {
AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
if (isIntDivCheap(N->getValueType(0), Attr))
- return SDValue(N,0); // Lower SDIV as SDIV
+ return SDValue(N, 0); // Lower SDIV as SDIV
EVT VT = N->getValueType(0);
@@ -16958,7 +16993,7 @@ AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
}
static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
- switch(getIntrinsicID(S.getNode())) {
+ switch (getIntrinsicID(S.getNode())) {
default:
break;
case Intrinsic::aarch64_sve_cntb:
@@ -17193,11 +17228,10 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
// Allow the scaling to be folded into the `cnt` instruction by preventing
// the scaling to be obscured here. This makes it easier to pattern match.
- if (IsSVECntIntrinsic(N0) ||
- (N0->getOpcode() == ISD::TRUNCATE &&
- (IsSVECntIntrinsic(N0->getOperand(0)))))
- if (ConstValue.sge(1) && ConstValue.sle(16))
- return SDValue();
+ if (IsSVECntIntrinsic(N0) || (N0->getOpcode() == ISD::TRUNCATE &&
+ (IsSVECntIntrinsic(N0->getOperand(0)))))
+ if (ConstValue.sge(1) && ConstValue.sle(16))
+ return SDValue();
// Multiplication of a power of two plus/minus one can be done more
// cheaply as shift+add/sub. For now, this is true unilaterally. If
@@ -17215,8 +17249,7 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
if (TrailingZeroes) {
// Conservatively do not lower to shift+add+shift if the mul might be
// folded into smul or umul.
- if (N0->hasOneUse() && (isSignExtended(N0, DAG) ||
- isZeroExtended(N0, DAG)))
+ if (N0->hasOneUse() && (isSignExtended(N0, DAG) || isZeroExtended(N0, DAG)))
return SDValue();
// Conservatively do not lower to shift+add+shift if the mul might be
// folded into madd or msub.
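For the "power of two plus/minus one" case mentioned a few lines up, the strength reduction is the familiar one, e.g. 9*x = (x << 3) + x (a single add with a shifted operand) and 7*x = (x << 3) - x. A quick scalar check:

  #include <cassert>
  #include <cstdint>

  int64_t mulBy9(int64_t x) { return (x << 3) + x; } // 9 = 2^3 + 1
  int64_t mulBy7(int64_t x) { return (x << 3) - x; } // 7 = 2^3 - 1

  int main() {
    for (int64_t x : {0, 1, 5, 1000}) {
      assert(mulBy9(x) == 9 * x);
      assert(mulBy7(x) == 7 * x);
    }
  }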
@@ -17393,8 +17426,8 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
// to use the new Chain.
DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
- unsigned Opcode =
- (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
+ unsigned Opcode = (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF
+ : AArch64ISD::UITOF;
return DAG.getNode(Opcode, SDLoc(N), VT, Load);
}
@@ -17507,7 +17540,8 @@ static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
BitVector UndefElements;
BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
- int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
+ int32_t C =
+ BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
if (C == -1 || C == 0 || C > FloatBits)
return SDValue();
@@ -17819,8 +17853,8 @@ static SDValue performSVEAndCombine(SDNode *N,
Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
- SDValue And = DAG.getNode(ISD::AND, DL,
- UnpkOp->getValueType(0), UnpkOp, Dup);
+ SDValue And =
+ DAG.getNode(ISD::AND, DL, UnpkOp->getValueType(0), UnpkOp, Dup);
return DAG.getNode(Opc, DL, N->getValueType(0), And);
}
@@ -17930,7 +17964,7 @@ static SDValue performANDCombine(SDNode *N,
if (SDValue R = performANDORCSELCombine(N, DAG))
return R;
- if (SDValue R = performANDSETCCCombine(N,DCI))
+ if (SDValue R = performANDSETCCCombine(N, DCI))
return R;
if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
@@ -18176,8 +18210,7 @@ performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
// with the strict_fadd, but we also need uses of the chain output of the
// original strict_fadd to use the chain output of the new strict_fadd as
// otherwise it may not be deleted.
- SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
- {VT, MVT::Other},
+ SDValue Ret = DAG.getNode(N0->getOpcode(), DL, {VT, MVT::Other},
{N0->getOperand(0), Extract1, Extract2});
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
@@ -18254,9 +18287,8 @@ static SDValue performConcatVectorsCombine(SDNode *N,
Ops.push_back(DAG.getUNDEF(MVT::f32));
else {
LoadSDNode *LD = cast<LoadSDNode>(V);
- SDValue NewLoad =
- DAG.getLoad(MVT::f32, dl, LD->getChain(), LD->getBasePtr(),
- LD->getMemOperand());
+ SDValue NewLoad = DAG.getLoad(MVT::f32, dl, LD->getChain(),
+ LD->getBasePtr(), LD->getMemOperand());
DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
Ops.push_back(NewLoad);
}
@@ -18670,11 +18702,11 @@ static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
}
// Returns true if Op is setcc or zext of setcc.
-static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
+static bool isSetCCOrZExtSetCC(const SDValue &Op, SetCCInfoAndKind &Info) {
if (isSetCC(Op, Info))
return true;
return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
- isSetCC(Op->getOperand(0), Info));
+ isSetCC(Op->getOperand(0), Info));
}
// The folding we want to perform is:
@@ -19097,7 +19129,8 @@ static SDValue performBuildVectorCombine(SDNode *N,
if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
return SDValue();
- SDValue SubvectorIdx = DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);
+ SDValue SubvectorIdx =
+ DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);
SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
@@ -19107,8 +19140,7 @@ static SDValue performBuildVectorCombine(SDNode *N,
return SDValue();
}
-static SDValue performTruncateCombine(SDNode *N,
- SelectionDAG &DAG) {
+static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
@@ -19359,8 +19391,10 @@ static bool isLoadOrMultipleLoads(SDValue B, SmallVector<LoadSDNode *> &Loads) {
// are lowered. Note that this only comes up because we do not always visit
// operands before uses. After that is fixed this can be removed and in the
// meantime this is fairly specific to the lowering we expect from IR.
- // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45
- // t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43
+ // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44,
+ // t45
+ // t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42,
+ // t43
// t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8
// t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64
// t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64
@@ -19759,8 +19793,7 @@ static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
SelectionDAG &DAG) {
SDLoc dl(N);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
- DAG.getNode(Opc, dl,
- N->getOperand(1).getSimpleValueType(),
+ DAG.getNode(Opc, dl, N->getOperand(1).getSimpleValueType(),
N->getOperand(1)),
DAG.getConstant(0, dl, MVT::i64));
}
@@ -19862,7 +19895,7 @@ static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
case Intrinsic::aarch64_sve_cmphs_wide:
case Intrinsic::aarch64_sve_cmphi_wide:
case Intrinsic::aarch64_sve_cmplo_wide:
- case Intrinsic::aarch64_sve_cmpls_wide: {
+ case Intrinsic::aarch64_sve_cmpls_wide: {
if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
uint64_t ImmVal = CN->getZExtValue();
if (ImmVal <= 127)
@@ -20428,9 +20461,8 @@ static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
uint64_t BaseOffset = 0;
const MachinePointerInfo &PtrInfo = St.getPointerInfo();
- SDValue NewST1 =
- DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
- OrigAlignment, St.getMemOperand()->getFlags());
+ SDValue NewST1 = DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
+ OrigAlignment, St.getMemOperand()->getFlags());
// As this in ISel, we will not merge this add which may degrade results.
if (BasePtr->getOpcode() == ISD::ADD &&
@@ -20495,10 +20527,10 @@ static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
ContainerVT = getSVEContainerType(ContainerVT);
SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
- SDValue Ops[] = { N->getOperand(0), // Chain
- N->getOperand(2), // Pg
- N->getOperand(3), // Base
- DAG.getValueType(VT) };
+ SDValue Ops[] = {N->getOperand(0), // Chain
+ N->getOperand(2), // Pg
+ N->getOperand(3), // Base
+ DAG.getValueType(VT)};
SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
SDValue LoadChain = SDValue(Load.getNode(), 1);
@@ -20506,7 +20538,7 @@ static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
if (ContainerVT.isInteger() && (VT != ContainerVT))
Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
- return DAG.getMergeValues({ Load, LoadChain }, DL);
+ return DAG.getMergeValues({Load, LoadChain}, DL);
}
static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
@@ -20520,16 +20552,16 @@ static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
auto *MINode = cast<MemIntrinsicSDNode>(N);
SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
- SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
- MINode->getOperand(3), DAG.getUNDEF(PtrTy),
- MINode->getOperand(2), PassThru,
- MINode->getMemoryVT(), MINode->getMemOperand(),
- ISD::UNINDEXED, ISD::NON_EXTLOAD, false);
+ SDValue L =
+ DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(), MINode->getOperand(3),
+ DAG.getUNDEF(PtrTy), MINode->getOperand(2), PassThru,
+ MINode->getMemoryVT(), MINode->getMemOperand(),
+ ISD::UNINDEXED, ISD::NON_EXTLOAD, false);
- if (VT.isFloatingPoint()) {
- SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
- return DAG.getMergeValues(Ops, DL);
- }
+ if (VT.isFloatingPoint()) {
+ SDValue Ops[] = {DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1)};
+ return DAG.getMergeValues(Ops, DL);
+ }
return L;
}
@@ -20572,12 +20604,11 @@ static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
else
SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
- SDValue Ops[] = { N->getOperand(0), // Chain
- SrcNew,
- N->getOperand(4), // Base
- N->getOperand(3), // Pg
- InputVT
- };
+ SDValue Ops[] = {N->getOperand(0), // Chain
+ SrcNew,
+ N->getOperand(4), // Base
+ N->getOperand(3), // Pg
+ InputVT};
return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
}
@@ -20729,7 +20760,7 @@ static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
}
// Check that all vector element locations were inserted to.
if (IndexNotInserted.any())
- return SDValue();
+ return SDValue();
return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
}
@@ -21250,11 +21281,12 @@ static SDValue performPostLD1Combine(SDNode *N,
SDValue Addr = LD->getOperand(1);
SDValue Vector = N->getOperand(0);
// Search for a use of the address operand that is an increment.
- for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
- Addr.getNode()->use_end(); UI != UE; ++UI) {
+ for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
+ UE = Addr.getNode()->use_end();
+ UI != UE; ++UI) {
SDNode *User = *UI;
- if (User->getOpcode() != ISD::ADD
- || UI.getUse().getResNo() != Addr.getResNo())
+ if (User->getOpcode() != ISD::ADD ||
+ UI.getUse().getResNo() != Addr.getResNo())
continue;
// If the increment is a constant, it must match the memory ref size.
@@ -21280,19 +21312,19 @@ static SDValue performPostLD1Combine(SDNode *N,
continue;
SmallVector<SDValue, 8> Ops;
- Ops.push_back(LD->getOperand(0)); // Chain
+ Ops.push_back(LD->getOperand(0)); // Chain
if (IsLaneOp) {
- Ops.push_back(Vector); // The vector to be inserted
- Ops.push_back(Lane); // The lane to be inserted in the vector
+ Ops.push_back(Vector); // The vector to be inserted
+ Ops.push_back(Lane); // The lane to be inserted in the vector
}
Ops.push_back(Addr);
Ops.push_back(Inc);
- EVT Tys[3] = { VT, MVT::i64, MVT::Other };
+ EVT Tys[3] = {VT, MVT::i64, MVT::Other};
SDVTList SDTys = DAG.getVTList(Tys);
- unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
- SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
- MemVT,
+ unsigned NewOp =
+ IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
+ SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops, MemVT,
LoadSDN->getMemOperand());
// Update the uses.
@@ -21301,8 +21333,8 @@ static SDValue performPostLD1Combine(SDNode *N,
SDValue(UpdN.getNode(), 2) // Chain
};
DCI.CombineTo(LD, NewResults);
- DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
- DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
+ DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
+ DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
break;
}
@@ -21964,7 +21996,8 @@ static SDValue performNEONPostLDSTCombine(SDNode *N,
// Search for a use of the address operand that is an increment.
for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
- UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
+ UE = Addr.getNode()->use_end();
+ UI != UE; ++UI) {
SDNode *User = *UI;
if (User->getOpcode() != ISD::ADD ||
UI.getUse().getResNo() != Addr.getResNo())
@@ -21989,49 +22022,110 @@ static SDValue performNEONPostLDSTCombine(SDNode *N,
unsigned NumVecs = 0;
unsigned IntNo = N->getConstantOperandVal(1);
switch (IntNo) {
- default: llvm_unreachable("unexpected intrinsic for Neon base update");
- case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
- NumVecs = 2; break;
- case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
- NumVecs = 3; break;
- case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
- NumVecs = 4; break;
- case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
- NumVecs = 2; IsStore = true; break;
- case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
- NumVecs = 3; IsStore = true; break;
- case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
- NumVecs = 4; IsStore = true; break;
- case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
- NumVecs = 2; break;
- case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
- NumVecs = 3; break;
- case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
- NumVecs = 4; break;
- case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
- NumVecs = 2; IsStore = true; break;
- case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
- NumVecs = 3; IsStore = true; break;
- case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
- NumVecs = 4; IsStore = true; break;
- case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
- NumVecs = 2; IsDupOp = true; break;
- case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
- NumVecs = 3; IsDupOp = true; break;
- case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
- NumVecs = 4; IsDupOp = true; break;
- case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
- NumVecs = 2; IsLaneOp = true; break;
- case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
- NumVecs = 3; IsLaneOp = true; break;
- case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
- NumVecs = 4; IsLaneOp = true; break;
- case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
- NumVecs = 2; IsStore = true; IsLaneOp = true; break;
- case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
- NumVecs = 3; IsStore = true; IsLaneOp = true; break;
- case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
- NumVecs = 4; IsStore = true; IsLaneOp = true; break;
+ default:
+ llvm_unreachable("unexpected intrinsic for Neon base update");
+ case Intrinsic::aarch64_neon_ld2:
+ NewOpc = AArch64ISD::LD2post;
+ NumVecs = 2;
+ break;
+ case Intrinsic::aarch64_neon_ld3:
+ NewOpc = AArch64ISD::LD3post;
+ NumVecs = 3;
+ break;
+ case Intrinsic::aarch64_neon_ld4:
+ NewOpc = AArch64ISD::LD4post;
+ NumVecs = 4;
+ break;
+ case Intrinsic::aarch64_neon_st2:
+ NewOpc = AArch64ISD::ST2post;
+ NumVecs = 2;
+ IsStore = true;
+ break;
+ case Intrinsic::aarch64_neon_st3:
+ NewOpc = AArch64ISD::ST3post;
+ NumVecs = 3;
+ IsStore = true;
+ break;
+ case Intrinsic::aarch64_neon_st4:
+ NewOpc = AArch64ISD::ST4post;
+ NumVecs = 4;
+ IsStore = true;
+ break;
+ case Intrinsic::aarch64_neon_ld1x2:
+ NewOpc = AArch64ISD::LD1x2post;
+ NumVecs = 2;
+ break;
+ case Intrinsic::aarch64_neon_ld1x3:
+ NewOpc = AArch64ISD::LD1x3post;
+ NumVecs = 3;
+ break;
+ case Intrinsic::aarch64_neon_ld1x4:
+ NewOpc = AArch64ISD::LD1x4post;
+ NumVecs = 4;
+ break;
+ case Intrinsic::aarch64_neon_st1x2:
+ NewOpc = AArch64ISD::ST1x2post;
+ NumVecs = 2;
+ IsStore = true;
+ break;
+ case Intrinsic::aarch64_neon_st1x3:
+ NewOpc = AArch64ISD::ST1x3post;
+ NumVecs = 3;
+ IsStore = true;
+ break;
+ case Intrinsic::aarch64_neon_st1x4:
+ NewOpc = AArch64ISD::ST1x4post;
+ NumVecs = 4;
+ IsStore = true;
+ break;
+ case Intrinsic::aarch64_neon_ld2r:
+ NewOpc = AArch64ISD::LD2DUPpost;
+ NumVecs = 2;
+ IsDupOp = true;
+ break;
+ case Intrinsic::aarch64_neon_ld3r:
+ NewOpc = AArch64ISD::LD3DUPpost;
+ NumVecs = 3;
+ IsDupOp = true;
+ break;
+ case Intrinsic::aarch64_neon_ld4r:
+ NewOpc = AArch64ISD::LD4DUPpost;
+ NumVecs = 4;
+ IsDupOp = true;
+ break;
+ case Intrinsic::aarch64_neon_ld2lane:
+ NewOpc = AArch64ISD::LD2LANEpost;
+ NumVecs = 2;
+ IsLaneOp = true;
+ break;
+ case Intrinsic::aarch64_neon_ld3lane:
+ NewOpc = AArch64ISD::LD3LANEpost;
+ NumVecs = 3;
+ IsLaneOp = true;
+ break;
+ case Intrinsic::aarch64_neon_ld4lane:
+ NewOpc = AArch64ISD::LD4LANEpost;
+ NumVecs = 4;
+ IsLaneOp = true;
+ break;
+ case Intrinsic::aarch64_neon_st2lane:
+ NewOpc = AArch64ISD::ST2LANEpost;
+ NumVecs = 2;
+ IsStore = true;
+ IsLaneOp = true;
+ break;
+ case Intrinsic::aarch64_neon_st3lane:
+ NewOpc = AArch64ISD::ST3LANEpost;
+ NumVecs = 3;
+ IsStore = true;
+ IsLaneOp = true;
+ break;
+ case Intrinsic::aarch64_neon_st4lane:
+ NewOpc = AArch64ISD::ST4LANEpost;
+ NumVecs = 4;
+ IsStore = true;
+ IsLaneOp = true;
+ break;
}
EVT VecTy;
@@ -22066,14 +22160,14 @@ static SDValue performNEONPostLDSTCombine(SDNode *N,
unsigned n;
for (n = 0; n < NumResultVecs; ++n)
Tys[n] = VecTy;
- Tys[n++] = MVT::i64; // Type of write back register
- Tys[n] = MVT::Other; // Type of the chain
+ Tys[n++] = MVT::i64; // Type of write back register
+ Tys[n] = MVT::Other; // Type of the chain
SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
- SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
- MemInt->getMemoryVT(),
- MemInt->getMemOperand());
+ SDValue UpdN =
+ DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
+ MemInt->getMemoryVT(), MemInt->getMemOperand());
// Update the uses.
std::vector<SDValue> NewResults;
@@ -22091,16 +22185,16 @@ static SDValue performNEONPostLDSTCombine(SDNode *N,
// Checks to see if the value is the prescribed width and returns information
// about its extension mode.
-static
-bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
+static bool checkValueWidth(SDValue V, unsigned width,
+ ISD::LoadExtType &ExtType) {
ExtType = ISD::NON_EXTLOAD;
- switch(V.getNode()->getOpcode()) {
+ switch (V.getNode()->getOpcode()) {
default:
return false;
case ISD::LOAD: {
LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
- if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
- || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
+ if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8) ||
+ (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
ExtType = LoadNode->getExtensionType();
return true;
}
@@ -22108,8 +22202,8 @@ bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
}
case ISD::AssertSext: {
VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
- if ((TypeNode->getVT() == MVT::i8 && width == 8)
- || (TypeNode->getVT() == MVT::i16 && width == 16)) {
+ if ((TypeNode->getVT() == MVT::i8 && width == 8) ||
+ (TypeNode->getVT() == MVT::i16 && width == 16)) {
ExtType = ISD::SEXTLOAD;
return true;
}
@@ -22117,8 +22211,8 @@ bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
}
case ISD::AssertZext: {
VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
- if ((TypeNode->getVT() == MVT::i8 && width == 8)
- || (TypeNode->getVT() == MVT::i16 && width == 16)) {
+ if ((TypeNode->getVT() == MVT::i8 && width == 8) ||
+ (TypeNode->getVT() == MVT::i16 && width == 16)) {
ExtType = ISD::ZEXTLOAD;
return true;
}
@@ -22209,9 +22303,9 @@ static bool isEquivalentMaskless(unsigned CC, unsigned width,
// the whole range we can just adjust the input and avoid writing equations
// for sign extended inputs.
if (ExtType == ISD::SEXTLOAD)
- AddConstant -= (1 << (width-1));
+ AddConstant -= (1 << (width - 1));
- switch(CC) {
+ switch (CC) {
case AArch64CC::LE:
case AArch64CC::GT:
if ((AddConstant == 0) ||
@@ -22222,22 +22316,20 @@ static bool isEquivalentMaskless(unsigned CC, unsigned width,
break;
case AArch64CC::LT:
case AArch64CC::GE:
- if ((AddConstant == 0) ||
- (AddConstant >= 0 && CompConstant <= 0) ||
+ if ((AddConstant == 0) || (AddConstant >= 0 && CompConstant <= 0) ||
(AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
return true;
break;
case AArch64CC::HI:
case AArch64CC::LS:
if ((AddConstant >= 0 && CompConstant < 0) ||
- (AddConstant <= 0 && CompConstant >= -1 &&
- CompConstant < AddConstant + MaxUInt))
+ (AddConstant <= 0 && CompConstant >= -1 &&
+ CompConstant < AddConstant + MaxUInt))
return true;
- break;
+ break;
case AArch64CC::PL:
case AArch64CC::MI:
- if ((AddConstant == 0) ||
- (AddConstant > 0 && CompConstant <= 0) ||
+ if ((AddConstant == 0) || (AddConstant > 0 && CompConstant <= 0) ||
(AddConstant < 0 && CompConstant <= AddConstant))
return true;
break;
@@ -22318,11 +22410,10 @@ static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode,
return DAG.getNode(N->getOpcode(), N, N->getVTList(), Ops);
}
-static
-SDValue performCONDCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- SelectionDAG &DAG, unsigned CCIndex,
- unsigned CmpIndex) {
+static SDValue performCONDCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG, unsigned CCIndex,
+ unsigned CmpIndex) {
unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
unsigned CondOpcode = SubsNode->getOpcode();
@@ -22376,19 +22467,20 @@ SDValue performCONDCombine(SDNode *N,
if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
!checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
- !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
+ !checkValueWidth(AddInputValue1, MaskBits, ExtType))
return SDValue();
- if(!isEquivalentMaskless(CC, MaskBits, ExtType,
- cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
- cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
+ if (!isEquivalentMaskless(
+ CC, MaskBits, ExtType,
+ cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
+ cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
return SDValue();
// The AND is not necessary, remove it.
- SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
- SubsNode->getValueType(1));
- SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
+ SDVTList VTs =
+ DAG.getVTList(SubsNode->getValueType(0), SubsNode->getValueType(1));
+ SDValue Ops[] = {AddValue, SubsNode->getOperand(1)};
SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
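The intuition for dropping the AND: once checkValueWidth has proved the compared value already fits in the masked width (e.g. it comes from a zero-extending i8 load), masking it changes neither the value nor any comparison against it; isEquivalentMaskless extends this to the add-plus-compare shapes. The simplest case, as a sanity check:

  #include <cassert>
  #include <cstdint>

  int main() {
    // A value known to come from a zero-extending 8-bit load.
    for (uint32_t v = 0; v < 256; ++v) {
      uint32_t masked = v & 0xffu;
      assert(masked == v);               // the AND is a no-op here...
      assert((masked < 42) == (v < 42)); // ...so any compare is unchanged
    }
  }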
@@ -22574,7 +22666,7 @@ static SDValue performCSELCombine(SDNode *N,
// CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
// CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
if (SDValue Folded = foldCSELofCTTZ(N, DAG))
- return Folded;
+ return Folded;
return performCONDCombine(N, DCI, DAG, 2, 3);
}
@@ -22702,8 +22794,8 @@ static SDValue performSETCCCombine(SDNode *N,
if (FromVT.isFixedLengthVector() &&
FromVT.getVectorElementType() == MVT::i1) {
bool IsNull = isNullConstant(RHS);
- LHS = DAG.getNode(IsNull ? ISD::VECREDUCE_OR : ISD::VECREDUCE_AND,
- DL, MVT::i1, LHS->getOperand(0));
+ LHS = DAG.getNode(IsNull ? ISD::VECREDUCE_OR : ISD::VECREDUCE_AND, DL,
+ MVT::i1, LHS->getOperand(0));
LHS = DAG.getNode(IsNull ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, ToVT,
LHS);
return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
@@ -23032,8 +23124,7 @@ static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
N0.getOperand(0), N0.getOperand(1),
cast<CondCodeSDNode>(N0.getOperand(2))->get());
- return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
- IfTrue, IfFalse);
+ return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC, IfTrue, IfFalse);
}
/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
@@ -23090,17 +23181,15 @@ static SDValue performSelectCombine(SDNode *N,
// First perform a vector comparison, where lane 0 is the one we're interested
// in.
SDLoc DL(N0);
- SDValue LHS =
- DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
- SDValue RHS =
- DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
+ SDValue LHS = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
+ SDValue RHS = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
// Now duplicate the comparison mask we want across all other lanes.
SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
- Mask = DAG.getNode(ISD::BITCAST, DL,
- ResVT.changeVectorElementTypeToInteger(), Mask);
+ Mask = DAG.getNode(ISD::BITCAST, DL, ResVT.changeVectorElementTypeToInteger(),
+ Mask);
return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
}
@@ -23495,8 +23584,8 @@ performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
// Sign extend of an unsigned unpack -> signed unpack
if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
- unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
- : AArch64ISD::SUNPKLO;
+ unsigned SOpc =
+ Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI : AArch64ISD::SUNPKLO;
// Push the sign extend to the operand of the unpack
// This is necessary where, for example, the operand of the unpack
@@ -24727,12 +24816,12 @@ static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
SDLoc dl(V.getNode());
auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i64, MVT::i64);
if (DAG.getDataLayout().isBigEndian())
- std::swap (VLo, VHi);
+ std::swap(VLo, VHi);
SDValue RegClass =
DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
- const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
+ const SDValue Ops[] = {RegClass, VLo, SubReg0, VHi, SubReg1};
return SDValue(
DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
}
@@ -24751,8 +24840,8 @@ static void ReplaceCMP_SWAP_128Results(SDNode *N,
SDValue Ops[] = {
createGPRPairNode(DAG, N->getOperand(2)), // Compare value
createGPRPairNode(DAG, N->getOperand(3)), // Store value
- N->getOperand(1), // Ptr
- N->getOperand(0), // Chain in
+ N->getOperand(1), // Ptr
+ N->getOperand(0), // Chain in
};
unsigned Opcode;
@@ -25062,8 +25151,8 @@ void AArch64TargetLowering::ReplaceNodeResults(
if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
LoadNode->getMemoryVT() != MVT::i128) {
- // Non-volatile or atomic loads are optimized later in AArch64's load/store
- // optimizer.
+ // Non-volatile or atomic loads are optimized later in AArch64's
+ // load/store optimizer.
return;
}
@@ -25112,30 +25201,30 @@ void AArch64TargetLowering::ReplaceNodeResults(
case Intrinsic::aarch64_sve_clasta_n: {
SDLoc DL(N);
auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
- auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
- N->getOperand(1), Op2, N->getOperand(3));
+ auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32, N->getOperand(1),
+ Op2, N->getOperand(3));
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
return;
}
case Intrinsic::aarch64_sve_clastb_n: {
SDLoc DL(N);
auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
- auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
- N->getOperand(1), Op2, N->getOperand(3));
+ auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32, N->getOperand(1),
+ Op2, N->getOperand(3));
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
return;
}
case Intrinsic::aarch64_sve_lasta: {
SDLoc DL(N);
- auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
- N->getOperand(1), N->getOperand(2));
+ auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32, N->getOperand(1),
+ N->getOperand(2));
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
return;
}
case Intrinsic::aarch64_sve_lastb: {
SDLoc DL(N);
- auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
- N->getOperand(1), N->getOperand(2));
+ auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32, N->getOperand(1),
+ N->getOperand(2));
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
return;
}
@@ -25426,7 +25515,7 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64");
}
- Type *Tys[] = { Addr->getType() };
+ Type *Tys[] = {Addr->getType()};
Intrinsic::ID Int =
IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);
@@ -25469,11 +25558,12 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
Intrinsic::ID Int =
IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
- Type *Tys[] = { Addr->getType() };
+ Type *Tys[] = {Addr->getType()};
Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
const DataLayout &DL = M->getDataLayout();
- IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
+ IntegerType *IntValTy =
+ Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
Val = Builder.CreateBitCast(Val, IntValTy);
CallInst *CI = Builder.CreateCall(
@@ -25587,7 +25677,7 @@ bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
// may be beneficial to sink in other cases, but we would have to check that
// the cmp would not get folded into the br to form a cbz for these to be
// beneficial.
- ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
+ ConstantInt *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
if (!Mask)
return false;
return Mask->getValue().isPowerOf2();
@@ -25648,9 +25738,9 @@ void AArch64TargetLowering::insertCopiesSplitCSR(
// fine for CXX_FAST_TLS since the C++-style TLS access functions should be
// nounwind. If we want to generalize this later, we may need to emit
// CFI pseudo-instructions.
- assert(Entry->getParent()->getFunction().hasFnAttribute(
- Attribute::NoUnwind) &&
- "Function should be nounwind in insertCopiesSplitCSR!");
+ assert(
+ Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
+ "Function should be nounwind in insertCopiesSplitCSR!");
Entry->addLiveIn(*I);
BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
.addReg(*I);
@@ -25754,9 +25844,7 @@ void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
}
// Unlike X86, we let frame lowering assign offsets to all catch objects.
-bool AArch64TargetLowering::needsFixedCatchObjects() const {
- return false;
-}
+bool AArch64TargetLowering::needsFixedCatchObjects() const { return false; }
bool AArch64TargetLowering::shouldLocalize(
const MachineInstr &MI, const TargetTransformInfo *TTI) const {
@@ -26340,9 +26428,9 @@ SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
SmallVector<SDValue, 4> Operands = {Pg};
for (const SDValue &V : Op->op_values()) {
- assert((!V.getValueType().isVector() ||
- V.getValueType().isScalableVector()) &&
- "Only scalable vectors are supported!");
+ assert(
+ (!V.getValueType().isVector() || V.getValueType().isScalableVector()) &&
+ "Only scalable vectors are supported!");
Operands.push_back(V);
}
@@ -26384,8 +26472,9 @@ SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
return convertFromScalableVector(DAG, VT, ScalableRes);
}
-SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
- SelectionDAG &DAG) const {
+SDValue
+AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
+ SelectionDAG &DAG) const {
SDLoc DL(ScalarOp);
SDValue AccOp = ScalarOp.getOperand(0);
SDValue VecOp = ScalarOp.getOperand(1);
@@ -26406,14 +26495,15 @@ SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
DAG.getUNDEF(ContainerVT), AccOp, Zero);
// Perform reduction.
- SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
- Pg, AccOp, VecOp);
+ SDValue Rdx =
+ DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT, Pg, AccOp, VecOp);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
}
-SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
- SelectionDAG &DAG) const {
+SDValue
+AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
+ SelectionDAG &DAG) const {
SDLoc DL(ReduceOp);
SDValue Op = ReduceOp.getOperand(0);
EVT OpVT = Op.getValueType();
@@ -26470,16 +26560,16 @@ SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
}
// UADDV always returns an i64 result.
- EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
- SrcVT.getVectorElementType();
+ EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64
+ : SrcVT.getVectorElementType();
EVT RdxVT = SrcVT;
if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
RdxVT = getPackedSVEVectorVT(ResVT);
SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
- SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
- Rdx, DAG.getConstant(0, DL, MVT::i64));
+ SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx,
+ DAG.getConstant(0, DL, MVT::i64));
// The VEC_REDUCE nodes expect an element size result.
if (ResVT != ScalarOp.getValueType())
@@ -26488,9 +26578,8 @@ SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
return Res;
}
-SDValue
-AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(
+ SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc DL(Op);
@@ -26507,8 +26596,7 @@ AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
Mask = DAG.getNode(ISD::TRUNCATE, DL,
MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
- auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
- Mask, Op1, Op2);
+ auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT, Mask, Op1, Op2);
return convertFromScalableVector(DAG, VT, ScalableRes);
}
@@ -26597,16 +26685,16 @@ AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
SDValue Pg = getPredicateForVector(DAG, DL, VT);
EVT SrcVT = Val.getValueType();
EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
- EVT ExtendVT = ContainerVT.changeVectorElementType(
- SrcVT.getVectorElementType());
+ EVT ExtendVT =
+ ContainerVT.changeVectorElementType(SrcVT.getVectorElementType());
Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);
Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
Val = getSVESafeBitCast(ExtendVT, Val, DAG);
- Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
- Pg, Val, DAG.getUNDEF(ContainerVT));
+ Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT, Pg,
+ Val, DAG.getUNDEF(ContainerVT));
return convertFromScalableVector(DAG, VT, Val);
}
@@ -26621,8 +26709,8 @@ AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
SDValue Val = Op.getOperand(0);
EVT SrcVT = Val.getValueType();
EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
- EVT RoundVT = ContainerSrcVT.changeVectorElementType(
- VT.getVectorElementType());
+ EVT RoundVT =
+ ContainerSrcVT.changeVectorElementType(VT.getVectorElementType());
SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);
Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
@@ -26725,7 +26813,7 @@ AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
if (VT.bitsGT(SrcVT)) {
EVT CvtVT = ContainerDstVT.changeVectorElementType(
- ContainerSrcVT.getVectorElementType());
+ ContainerSrcVT.getVectorElementType());
SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
@@ -26985,8 +27073,8 @@ SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
SDLoc DL(Op);
EVT InVT = Op.getValueType();
- assert(VT.isScalableVector() && isTypeLegal(VT) &&
- InVT.isScalableVector() && isTypeLegal(InVT) &&
+ assert(VT.isScalableVector() && isTypeLegal(VT) && InVT.isScalableVector() &&
+ isTypeLegal(InVT) &&
"Only expect to cast between legal scalable vector types!");
assert(VT.getVectorElementType() != MVT::i1 &&
InVT.getVectorElementType() != MVT::i1 &&
@@ -27199,7 +27287,6 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR(
Intrinsic::aarch64_neon_vcmla_rot180,
Intrinsic::aarch64_neon_vcmla_rot270};
-
return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
{Accumulator, InputA, InputB});
}
>From ec9659a158a5cc8c7299ef15f97b4564e518fb73 Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Wed, 28 Feb 2024 11:57:53 +0000
Subject: [PATCH 4/4] Removed seeing through bitcasts
---
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 4 ----
1 file changed, 4 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 90f68a77867db5..9ecd61ba7132bb 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -269,10 +269,6 @@ static bool isMergePassthruOpcode(unsigned Opc) {
// Returns true if inactive lanes are known to be zeroed by construction.
static bool isZeroingInactiveLanes(SDValue Op) {
- // Skip bitcasts nodes
- while (Op->getOpcode() == ISD::BITCAST)
- Op = Op->getOperand(0);
-
switch (Op.getOpcode()) {
default:
return false;
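
To make the effect of this last hunk concrete: with the bitcast-skipping loop gone, a node wrapped in an ISD::BITCAST is no longer unwrapped before the switch, so it falls into the `default: return false` path and is conservatively treated as not zeroing its inactive lanes. The short standalone C++ sketch below models only that decision; ToyOpcode, ToyNode and toyIsZeroingInactiveLanes are hypothetical stand-ins for illustration, not real LLVM types or APIs.

#include <cassert>

// Toy stand-ins, not real LLVM types or opcodes.
enum class ToyOpcode { Bitcast, KnownZeroingProducer, Other };

struct ToyNode {
  ToyOpcode Opcode;
  const ToyNode *Operand = nullptr; // a single operand is enough for the model
};

// Models the shape of the patched query: the node is inspected directly,
// with no loop peeling off Bitcast wrappers first.
static bool toyIsZeroingInactiveLanes(const ToyNode &Op) {
  switch (Op.Opcode) {
  default:
    return false; // conservative answer for anything unrecognised
  case ToyOpcode::KnownZeroingProducer:
    return true;
  }
}

int main() {
  ToyNode Producer{ToyOpcode::KnownZeroingProducer};
  ToyNode Wrapped{ToyOpcode::Bitcast, &Producer};

  // Asking about the producer itself still gives the optimistic answer.
  assert(toyIsZeroingInactiveLanes(Producer));
  // Once the producer is hidden behind a bitcast, the query hits the
  // default case and conservatively reports false.
  assert(!toyIsZeroingInactiveLanes(Wrapped));
  return 0;
}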