[llvm] a5f0525 - [AArch64][SelectionDAG] Enable new partial reduction lowering by default (#143565)

via llvm-commits llvm-commits at lists.llvm.org
Mon Jun 16 08:47:59 PDT 2025


Author: Nicholas Guy
Date: 2025-06-16T16:47:55+01:00
New Revision: a5f0525d4b3edba50706cb0e4b9a48f0691e2b4c

URL: https://github.com/llvm/llvm-project/commit/a5f0525d4b3edba50706cb0e4b9a48f0691e2b4c
DIFF: https://github.com/llvm/llvm-project/commit/a5f0525d4b3edba50706cb0e4b9a48f0691e2b4c.diff

LOG: [AArch64][SelectionDAG] Enable new partial reduction lowering by default (#143565)

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
    llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll
    llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
    llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7519ac5260a64..c86aed7b38c8c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -153,13 +153,6 @@ cl::opt<bool> EnableSVEGISel(
     cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
     cl::init(false));
 
-// FIXME : This is a temporary flag, and is used to help transition to
-// performing lowering the proper way using the new PARTIAL_REDUCE_MLA ISD
-// nodes.
-static cl::opt<bool> EnablePartialReduceNodes(
-    "aarch64-enable-partial-reduce-nodes", cl::init(false), cl::ReallyHidden,
-    cl::desc("Use the new method of lowering partial reductions."));
-
 /// Value type used for condition codes.
 static const MVT MVT_CC = MVT::i32;
 
@@ -1457,7 +1450,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
       setOperationAction(ISD::FADD, VT, Custom);
 
-    if (EnablePartialReduceNodes && Subtarget->hasDotProd()) {
+    if (Subtarget->hasDotProd()) {
       static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
                                         ISD::PARTIAL_REDUCE_UMLA};
 
@@ -1895,7 +1888,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   }
 
   // Handle partial reduction operations
-  if (EnablePartialReduceNodes && Subtarget->isSVEorStreamingSVEAvailable()) {
+  if (Subtarget->isSVEorStreamingSVEAvailable()) {
     // Mark known legal pairs as 'Legal' (these will expand to UDOT or SDOT).
     // Other pairs will default to 'Expand'.
     static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
@@ -1957,17 +1950,15 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::nxv2i64,
                          Custom);
 
-      if (EnablePartialReduceNodes) {
-        static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
-                                          ISD::PARTIAL_REDUCE_UMLA};
-        // Must be lowered to SVE instructions.
-        setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v4i32, Custom);
-        setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v8i16, Custom);
-        setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
-        setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v8i16, Custom);
-        setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Custom);
-        setPartialReduceMLAAction(MLAOps, MVT::v8i16, MVT::v16i8, Custom);
-      }
+      static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
+                                        ISD::PARTIAL_REDUCE_UMLA};
+      // Must be lowered to SVE instructions.
+      setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v4i32, Custom);
+      setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v8i16, Custom);
+      setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
+      setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v8i16, Custom);
+      setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Custom);
+      setPartialReduceMLAAction(MLAOps, MVT::v8i16, MVT::v16i8, Custom);
     }
   }
 
@@ -2165,16 +2156,6 @@ bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic(
   assert(I->getIntrinsicID() ==
              Intrinsic::experimental_vector_partial_reduce_add &&
          "Unexpected intrinsic!");
-  if (EnablePartialReduceNodes)
-    return true;
-
-  EVT VT = EVT::getEVT(I->getType());
-  auto Op1 = I->getOperand(1);
-  EVT Op1VT = EVT::getEVT(Op1->getType());
-  if (Op1VT.getVectorElementType() == VT.getVectorElementType() &&
-      (VT.getVectorElementCount() * 4 == Op1VT.getVectorElementCount() ||
-       VT.getVectorElementCount() * 2 == Op1VT.getVectorElementCount()))
-    return false;
   return true;
 }
 
@@ -2252,37 +2233,32 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
   bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
   bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
 
-  if (EnablePartialReduceNodes) {
-    static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
-                                      ISD::PARTIAL_REDUCE_UMLA};
-    unsigned NumElts = VT.getVectorNumElements();
-    if (VT.getVectorElementType() == MVT::i64) {
-      setPartialReduceMLAAction(MLAOps, VT,
-                                MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
-      setPartialReduceMLAAction(
-          MLAOps, VT, MVT::getVectorVT(MVT::i16, NumElts * 4), Custom);
-      setPartialReduceMLAAction(
-          MLAOps, VT, MVT::getVectorVT(MVT::i32, NumElts * 2), Custom);
-    } else if (VT.getVectorElementType() == MVT::i32) {
-      setPartialReduceMLAAction(MLAOps, VT,
+  static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
+                                    ISD::PARTIAL_REDUCE_UMLA};
+  unsigned NumElts = VT.getVectorNumElements();
+  if (VT.getVectorElementType() == MVT::i64) {
+    setPartialReduceMLAAction(MLAOps, VT,
+                              MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
+    setPartialReduceMLAAction(MLAOps, VT,
+                              MVT::getVectorVT(MVT::i16, NumElts * 4), Custom);
+    setPartialReduceMLAAction(MLAOps, VT,
+                              MVT::getVectorVT(MVT::i32, NumElts * 2), Custom);
+  } else if (VT.getVectorElementType() == MVT::i32) {
+    setPartialReduceMLAAction(MLAOps, VT,
+                              MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
+    setPartialReduceMLAAction(MLAOps, VT,
+                              MVT::getVectorVT(MVT::i16, NumElts * 2), Custom);
+  } else if (VT.getVectorElementType() == MVT::i16) {
+    setPartialReduceMLAAction(MLAOps, VT,
+                              MVT::getVectorVT(MVT::i8, NumElts * 2), Custom);
+  }
+  if (Subtarget->hasMatMulInt8()) {
+    if (VT.getVectorElementType() == MVT::i32)
+      setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, VT,
                                 MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
-      setPartialReduceMLAAction(
-          MLAOps, VT, MVT::getVectorVT(MVT::i16, NumElts * 2), Custom);
-    } else if (VT.getVectorElementType() == MVT::i16) {
-      setPartialReduceMLAAction(MLAOps, VT,
-                                MVT::getVectorVT(MVT::i8, NumElts * 2), Custom);
-    }
-
-    if (Subtarget->hasMatMulInt8()) {
-      if (VT.getVectorElementType() == MVT::i32)
-        setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, VT,
-                                  MVT::getVectorVT(MVT::i8, NumElts * 4),
-                                  Custom);
-      else if (VT.getVectorElementType() == MVT::i64)
-        setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, VT,
-                                  MVT::getVectorVT(MVT::i8, NumElts * 8),
-                                  Custom);
-    }
+    else if (VT.getVectorElementType() == MVT::i64)
+      setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, VT,
+                                MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
   }
 
   // Lower fixed length vector operations to scalable equivalents.

diff  --git a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
index 0c7b3c7d3c138..0ea80a075fae9 100644
--- a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
@@ -1,15 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-NOI8MM
-; RUN: llc -mtriple aarch64 -mattr=+neon < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOI8MM,CHECK-NODOT
-; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod,+i8mm < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-I8MM
-; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod,+i8mm -aarch64-enable-partial-reduce-nodes < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-NEWLOWERING-I8MM
+; RUN: llc -mtriple aarch64 -mattr=+neon < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-NODOT
+; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-DOT
+; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod,+i8mm < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-DOT-I8MM
 
 define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
-; CHECK-DOT-LABEL: udot:
-; CHECK-DOT:       // %bb.0:
-; CHECK-DOT-NEXT:    udot v0.4s, v2.16b, v1.16b
-; CHECK-DOT-NEXT:    ret
-;
 ; CHECK-NODOT-LABEL: udot:
 ; CHECK-NODOT:       // %bb.0:
 ; CHECK-NODOT-NEXT:    umull v3.8h, v2.8b, v1.8b
@@ -19,6 +13,16 @@ define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
 ; CHECK-NODOT-NEXT:    uaddw v0.4s, v0.4s, v1.4h
 ; CHECK-NODOT-NEXT:    uaddw2 v0.4s, v0.4s, v1.8h
 ; CHECK-NODOT-NEXT:    ret
+;
+; CHECK-DOT-LABEL: udot:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    udot v0.4s, v2.16b, v1.16b
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-DOT-I8MM-LABEL: udot:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    udot v0.4s, v2.16b, v1.16b
+; CHECK-DOT-I8MM-NEXT:    ret
   %u.wide = zext <16 x i8> %u to <16 x i32>
   %s.wide = zext <16 x i8> %s to <16 x i32>
   %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
@@ -27,22 +31,6 @@ define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
 }
 
 define <4 x i32> @udot_in_loop(ptr %p1, ptr %p2){
-; CHECK-DOT-LABEL: udot_in_loop:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-DOT-NEXT:    mov x8, xzr
-; CHECK-DOT-NEXT:  .LBB1_1: // %vector.body
-; CHECK-DOT-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-DOT-NEXT:    ldr q2, [x0, x8]
-; CHECK-DOT-NEXT:    ldr q3, [x1, x8]
-; CHECK-DOT-NEXT:    mov v0.16b, v1.16b
-; CHECK-DOT-NEXT:    add x8, x8, #16
-; CHECK-DOT-NEXT:    udot v1.4s, v2.16b, v3.16b
-; CHECK-DOT-NEXT:    cmp x8, #16
-; CHECK-DOT-NEXT:    b.ne .LBB1_1
-; CHECK-DOT-NEXT:  // %bb.2: // %end
-; CHECK-DOT-NEXT:    ret
-;
 ; CHECK-NODOT-LABEL: udot_in_loop:
 ; CHECK-NODOT:       // %bb.0: // %entry
 ; CHECK-NODOT-NEXT:    movi v1.2d, #0000000000000000
@@ -63,6 +51,38 @@ define <4 x i32> @udot_in_loop(ptr %p1, ptr %p2){
 ; CHECK-NODOT-NEXT:    b.ne .LBB1_1
 ; CHECK-NODOT-NEXT:  // %bb.2: // %end
 ; CHECK-NODOT-NEXT:    ret
+;
+; CHECK-DOT-LABEL: udot_in_loop:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-DOT-NEXT:    mov x8, xzr
+; CHECK-DOT-NEXT:  .LBB1_1: // %vector.body
+; CHECK-DOT-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-DOT-NEXT:    ldr q2, [x0, x8]
+; CHECK-DOT-NEXT:    ldr q3, [x1, x8]
+; CHECK-DOT-NEXT:    mov v0.16b, v1.16b
+; CHECK-DOT-NEXT:    add x8, x8, #16
+; CHECK-DOT-NEXT:    udot v1.4s, v2.16b, v3.16b
+; CHECK-DOT-NEXT:    cmp x8, #16
+; CHECK-DOT-NEXT:    b.ne .LBB1_1
+; CHECK-DOT-NEXT:  // %bb.2: // %end
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-DOT-I8MM-LABEL: udot_in_loop:
+; CHECK-DOT-I8MM:       // %bb.0: // %entry
+; CHECK-DOT-I8MM-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-DOT-I8MM-NEXT:    mov x8, xzr
+; CHECK-DOT-I8MM-NEXT:  .LBB1_1: // %vector.body
+; CHECK-DOT-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-DOT-I8MM-NEXT:    ldr q2, [x0, x8]
+; CHECK-DOT-I8MM-NEXT:    ldr q3, [x1, x8]
+; CHECK-DOT-I8MM-NEXT:    mov v0.16b, v1.16b
+; CHECK-DOT-I8MM-NEXT:    add x8, x8, #16
+; CHECK-DOT-I8MM-NEXT:    udot v1.4s, v2.16b, v3.16b
+; CHECK-DOT-I8MM-NEXT:    cmp x8, #16
+; CHECK-DOT-I8MM-NEXT:    b.ne .LBB1_1
+; CHECK-DOT-I8MM-NEXT:  // %bb.2: // %end
+; CHECK-DOT-I8MM-NEXT:    ret
 entry:
   br label %vector.body
 
@@ -86,11 +106,6 @@ end:
 }
 
 define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
-; CHECK-DOT-LABEL: udot_narrow:
-; CHECK-DOT:       // %bb.0:
-; CHECK-DOT-NEXT:    udot v0.2s, v2.8b, v1.8b
-; CHECK-DOT-NEXT:    ret
-;
 ; CHECK-NODOT-LABEL: udot_narrow:
 ; CHECK-NODOT:       // %bb.0:
 ; CHECK-NODOT-NEXT:    umull v1.8h, v2.8b, v1.8b
@@ -105,6 +120,16 @@ define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
 ; CHECK-NODOT-NEXT:    uaddw v0.4s, v0.4s, v1.4h
 ; CHECK-NODOT-NEXT:    add v0.2s, v2.2s, v0.2s
 ; CHECK-NODOT-NEXT:    ret
+;
+; CHECK-DOT-LABEL: udot_narrow:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    udot v0.2s, v2.8b, v1.8b
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-DOT-I8MM-LABEL: udot_narrow:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    udot v0.2s, v2.8b, v1.8b
+; CHECK-DOT-I8MM-NEXT:    ret
   %u.wide = zext <8 x i8> %u to <8 x i32>
   %s.wide = zext <8 x i8> %s to <8 x i32>
   %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide
@@ -113,11 +138,6 @@ define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
 }
 
 define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
-; CHECK-DOT-LABEL: sdot:
-; CHECK-DOT:       // %bb.0:
-; CHECK-DOT-NEXT:    sdot v0.4s, v2.16b, v1.16b
-; CHECK-DOT-NEXT:    ret
-;
 ; CHECK-NODOT-LABEL: sdot:
 ; CHECK-NODOT:       // %bb.0:
 ; CHECK-NODOT-NEXT:    smull v3.8h, v2.8b, v1.8b
@@ -127,6 +147,16 @@ define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
 ; CHECK-NODOT-NEXT:    saddw v0.4s, v0.4s, v1.4h
 ; CHECK-NODOT-NEXT:    saddw2 v0.4s, v0.4s, v1.8h
 ; CHECK-NODOT-NEXT:    ret
+;
+; CHECK-DOT-LABEL: sdot:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    sdot v0.4s, v2.16b, v1.16b
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-DOT-I8MM-LABEL: sdot:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    sdot v0.4s, v2.16b, v1.16b
+; CHECK-DOT-I8MM-NEXT:    ret
   %u.wide = sext <16 x i8> %u to <16 x i32>
   %s.wide = sext <16 x i8> %s to <16 x i32>
   %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
@@ -135,11 +165,6 @@ define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
 }
 
 define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
-; CHECK-DOT-LABEL: sdot_narrow:
-; CHECK-DOT:       // %bb.0:
-; CHECK-DOT-NEXT:    sdot v0.2s, v2.8b, v1.8b
-; CHECK-DOT-NEXT:    ret
-;
 ; CHECK-NODOT-LABEL: sdot_narrow:
 ; CHECK-NODOT:       // %bb.0:
 ; CHECK-NODOT-NEXT:    smull v1.8h, v2.8b, v1.8b
@@ -154,6 +179,16 @@ define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
 ; CHECK-NODOT-NEXT:    saddw v0.4s, v0.4s, v1.4h
 ; CHECK-NODOT-NEXT:    add v0.2s, v2.2s, v0.2s
 ; CHECK-NODOT-NEXT:    ret
+;
+; CHECK-DOT-LABEL: sdot_narrow:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    sdot v0.2s, v2.8b, v1.8b
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-DOT-I8MM-LABEL: sdot_narrow:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    sdot v0.2s, v2.8b, v1.8b
+; CHECK-DOT-I8MM-NEXT:    ret
   %u.wide = sext <8 x i8> %u to <8 x i32>
   %s.wide = sext <8 x i8> %s to <8 x i32>
   %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide
@@ -162,27 +197,34 @@ define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
 }
 
 define <4 x i32> @usdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
-; CHECK-NOI8MM-LABEL: usdot:
-; CHECK-NOI8MM:       // %bb.0:
-; CHECK-NOI8MM-NEXT:    ushll v3.8h, v1.8b, #0
-; CHECK-NOI8MM-NEXT:    sshll v4.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v1.8h, v1.16b, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v2.8h, v2.16b, #0
-; CHECK-NOI8MM-NEXT:    smlal v0.4s, v4.4h, v3.4h
-; CHECK-NOI8MM-NEXT:    smlal2 v0.4s, v4.8h, v3.8h
-; CHECK-NOI8MM-NEXT:    smlal v0.4s, v2.4h, v1.4h
-; CHECK-NOI8MM-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-NODOT-LABEL: usdot:
+; CHECK-NODOT:       // %bb.0:
+; CHECK-NODOT-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-NODOT-NEXT:    sshll v4.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-NODOT-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-NODOT-NEXT:    smlal v0.4s, v4.4h, v3.4h
+; CHECK-NODOT-NEXT:    smlal2 v0.4s, v4.8h, v3.8h
+; CHECK-NODOT-NEXT:    smlal v0.4s, v2.4h, v1.4h
+; CHECK-NODOT-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
+; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: usdot:
-; CHECK-I8MM:       // %bb.0:
-; CHECK-I8MM-NEXT:    usdot v0.4s, v1.16b, v2.16b
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: usdot:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-DOT-NEXT:    sshll v4.8h, v2.8b, #0
+; CHECK-DOT-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-DOT-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-DOT-NEXT:    smlal v0.4s, v4.4h, v3.4h
+; CHECK-DOT-NEXT:    smlal2 v0.4s, v4.8h, v3.8h
+; CHECK-DOT-NEXT:    smlal v0.4s, v2.4h, v1.4h
+; CHECK-DOT-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: usdot:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0:
-; CHECK-NEWLOWERING-I8MM-NEXT:    usdot v0.4s, v1.16b, v2.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: usdot:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    usdot v0.4s, v1.16b, v2.16b
+; CHECK-DOT-I8MM-NEXT:    ret
   %u.wide = zext <16 x i8> %u to <16 x i32>
   %s.wide = sext <16 x i8> %s to <16 x i32>
   %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
@@ -191,60 +233,67 @@ define <4 x i32> @usdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
 }
 
 define <4 x i32> @usdot_in_loop(ptr %p1, ptr %p2){
-; CHECK-NOI8MM-LABEL: usdot_in_loop:
-; CHECK-NOI8MM:       // %bb.0: // %entry
-; CHECK-NOI8MM-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NOI8MM-NEXT:    mov x8, xzr
-; CHECK-NOI8MM-NEXT:  .LBB6_1: // %vector.body
-; CHECK-NOI8MM-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NOI8MM-NEXT:    ldr q2, [x0, x8]
-; CHECK-NOI8MM-NEXT:    ldr q3, [x1, x8]
-; CHECK-NOI8MM-NEXT:    mov v0.16b, v1.16b
-; CHECK-NOI8MM-NEXT:    add x8, x8, #16
-; CHECK-NOI8MM-NEXT:    sshll v4.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT:    ushll v5.8h, v3.8b, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v2.8h, v2.16b, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v3.8h, v3.16b, #0
-; CHECK-NOI8MM-NEXT:    cmp x8, #16
-; CHECK-NOI8MM-NEXT:    smlal v1.4s, v4.4h, v5.4h
-; CHECK-NOI8MM-NEXT:    smlal2 v1.4s, v4.8h, v5.8h
-; CHECK-NOI8MM-NEXT:    smlal v1.4s, v2.4h, v3.4h
-; CHECK-NOI8MM-NEXT:    smlal2 v1.4s, v2.8h, v3.8h
-; CHECK-NOI8MM-NEXT:    b.ne .LBB6_1
-; CHECK-NOI8MM-NEXT:  // %bb.2: // %end
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-NODOT-LABEL: usdot_in_loop:
+; CHECK-NODOT:       // %bb.0: // %entry
+; CHECK-NODOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NODOT-NEXT:    mov x8, xzr
+; CHECK-NODOT-NEXT:  .LBB6_1: // %vector.body
+; CHECK-NODOT-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NODOT-NEXT:    ldr q2, [x0, x8]
+; CHECK-NODOT-NEXT:    ldr q3, [x1, x8]
+; CHECK-NODOT-NEXT:    mov v0.16b, v1.16b
+; CHECK-NODOT-NEXT:    add x8, x8, #16
+; CHECK-NODOT-NEXT:    sshll v4.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    ushll v5.8h, v3.8b, #0
+; CHECK-NODOT-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-NODOT-NEXT:    ushll2 v3.8h, v3.16b, #0
+; CHECK-NODOT-NEXT:    cmp x8, #16
+; CHECK-NODOT-NEXT:    smlal v1.4s, v4.4h, v5.4h
+; CHECK-NODOT-NEXT:    smlal2 v1.4s, v4.8h, v5.8h
+; CHECK-NODOT-NEXT:    smlal v1.4s, v2.4h, v3.4h
+; CHECK-NODOT-NEXT:    smlal2 v1.4s, v2.8h, v3.8h
+; CHECK-NODOT-NEXT:    b.ne .LBB6_1
+; CHECK-NODOT-NEXT:  // %bb.2: // %end
+; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: usdot_in_loop:
-; CHECK-I8MM:       // %bb.0: // %entry
-; CHECK-I8MM-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    mov x8, xzr
-; CHECK-I8MM-NEXT:  .LBB6_1: // %vector.body
-; CHECK-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-I8MM-NEXT:    ldr q2, [x0, x8]
-; CHECK-I8MM-NEXT:    ldr q3, [x1, x8]
-; CHECK-I8MM-NEXT:    mov v0.16b, v1.16b
-; CHECK-I8MM-NEXT:    add x8, x8, #16
-; CHECK-I8MM-NEXT:    usdot v1.4s, v3.16b, v2.16b
-; CHECK-I8MM-NEXT:    cmp x8, #16
-; CHECK-I8MM-NEXT:    b.ne .LBB6_1
-; CHECK-I8MM-NEXT:  // %bb.2: // %end
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: usdot_in_loop:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-DOT-NEXT:    mov x8, xzr
+; CHECK-DOT-NEXT:  .LBB6_1: // %vector.body
+; CHECK-DOT-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-DOT-NEXT:    ldr q2, [x0, x8]
+; CHECK-DOT-NEXT:    ldr q3, [x1, x8]
+; CHECK-DOT-NEXT:    mov v0.16b, v1.16b
+; CHECK-DOT-NEXT:    add x8, x8, #16
+; CHECK-DOT-NEXT:    sshll v4.8h, v2.8b, #0
+; CHECK-DOT-NEXT:    ushll v5.8h, v3.8b, #0
+; CHECK-DOT-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-DOT-NEXT:    ushll2 v3.8h, v3.16b, #0
+; CHECK-DOT-NEXT:    cmp x8, #16
+; CHECK-DOT-NEXT:    smlal v1.4s, v4.4h, v5.4h
+; CHECK-DOT-NEXT:    smlal2 v1.4s, v4.8h, v5.8h
+; CHECK-DOT-NEXT:    smlal v1.4s, v2.4h, v3.4h
+; CHECK-DOT-NEXT:    smlal2 v1.4s, v2.8h, v3.8h
+; CHECK-DOT-NEXT:    b.ne .LBB6_1
+; CHECK-DOT-NEXT:  // %bb.2: // %end
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: usdot_in_loop:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NEWLOWERING-I8MM-NEXT:    mov x8, xzr
-; CHECK-NEWLOWERING-I8MM-NEXT:  .LBB6_1: // %vector.body
-; CHECK-NEWLOWERING-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEWLOWERING-I8MM-NEXT:    ldr q2, [x0, x8]
-; CHECK-NEWLOWERING-I8MM-NEXT:    ldr q3, [x1, x8]
-; CHECK-NEWLOWERING-I8MM-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    add x8, x8, #16
-; CHECK-NEWLOWERING-I8MM-NEXT:    usdot v1.4s, v3.16b, v2.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    cmp x8, #16
-; CHECK-NEWLOWERING-I8MM-NEXT:    b.ne .LBB6_1
-; CHECK-NEWLOWERING-I8MM-NEXT:  // %bb.2: // %end
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: usdot_in_loop:
+; CHECK-DOT-I8MM:       // %bb.0: // %entry
+; CHECK-DOT-I8MM-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-DOT-I8MM-NEXT:    mov x8, xzr
+; CHECK-DOT-I8MM-NEXT:  .LBB6_1: // %vector.body
+; CHECK-DOT-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-DOT-I8MM-NEXT:    ldr q2, [x0, x8]
+; CHECK-DOT-I8MM-NEXT:    ldr q3, [x1, x8]
+; CHECK-DOT-I8MM-NEXT:    mov v0.16b, v1.16b
+; CHECK-DOT-I8MM-NEXT:    add x8, x8, #16
+; CHECK-DOT-I8MM-NEXT:    usdot v1.4s, v3.16b, v2.16b
+; CHECK-DOT-I8MM-NEXT:    cmp x8, #16
+; CHECK-DOT-I8MM-NEXT:    b.ne .LBB6_1
+; CHECK-DOT-I8MM-NEXT:  // %bb.2: // %end
+; CHECK-DOT-I8MM-NEXT:    ret
 entry:
   br label %vector.body
 
@@ -268,32 +317,44 @@ end:
 }
 
 define <2 x i32> @usdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{
-; CHECK-NOI8MM-LABEL: usdot_narrow:
-; CHECK-NOI8MM:       // %bb.0:
-; CHECK-NOI8MM-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-NOI8MM-NEXT:    sshll v2.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NOI8MM-NEXT:    smull v3.4s, v2.4h, v1.4h
-; CHECK-NOI8MM-NEXT:    smlal v0.4s, v2.4h, v1.4h
-; CHECK-NOI8MM-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
-; CHECK-NOI8MM-NEXT:    ext v5.16b, v2.16b, v2.16b, #8
-; CHECK-NOI8MM-NEXT:    smull2 v1.4s, v2.8h, v1.8h
-; CHECK-NOI8MM-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
-; CHECK-NOI8MM-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-NOI8MM-NEXT:    add v0.2s, v3.2s, v0.2s
-; CHECK-NOI8MM-NEXT:    smlal v0.4s, v5.4h, v4.4h
-; CHECK-NOI8MM-NEXT:    add v0.2s, v1.2s, v0.2s
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-NODOT-LABEL: usdot_narrow:
+; CHECK-NODOT:       // %bb.0:
+; CHECK-NODOT-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-NODOT-NEXT:    sshll v2.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NODOT-NEXT:    smull v3.4s, v2.4h, v1.4h
+; CHECK-NODOT-NEXT:    smlal v0.4s, v2.4h, v1.4h
+; CHECK-NODOT-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-NODOT-NEXT:    ext v5.16b, v2.16b, v2.16b, #8
+; CHECK-NODOT-NEXT:    smull2 v1.4s, v2.8h, v1.8h
+; CHECK-NODOT-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
+; CHECK-NODOT-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
+; CHECK-NODOT-NEXT:    add v0.2s, v3.2s, v0.2s
+; CHECK-NODOT-NEXT:    smlal v0.4s, v5.4h, v4.4h
+; CHECK-NODOT-NEXT:    add v0.2s, v1.2s, v0.2s
+; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: usdot_narrow:
-; CHECK-I8MM:       // %bb.0:
-; CHECK-I8MM-NEXT:    usdot v0.2s, v1.8b, v2.8b
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: usdot_narrow:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-DOT-NEXT:    sshll v2.8h, v2.8b, #0
+; CHECK-DOT-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-DOT-NEXT:    smull v3.4s, v2.4h, v1.4h
+; CHECK-DOT-NEXT:    smlal v0.4s, v2.4h, v1.4h
+; CHECK-DOT-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-DOT-NEXT:    ext v5.16b, v2.16b, v2.16b, #8
+; CHECK-DOT-NEXT:    smull2 v1.4s, v2.8h, v1.8h
+; CHECK-DOT-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
+; CHECK-DOT-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
+; CHECK-DOT-NEXT:    add v0.2s, v3.2s, v0.2s
+; CHECK-DOT-NEXT:    smlal v0.4s, v5.4h, v4.4h
+; CHECK-DOT-NEXT:    add v0.2s, v1.2s, v0.2s
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: usdot_narrow:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0:
-; CHECK-NEWLOWERING-I8MM-NEXT:    usdot v0.2s, v1.8b, v2.8b
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: usdot_narrow:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    usdot v0.2s, v1.8b, v2.8b
+; CHECK-DOT-I8MM-NEXT:    ret
   %u.wide = zext <8 x i8> %u to <8 x i32>
   %s.wide = sext <8 x i8> %s to <8 x i32>
   %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide
@@ -302,27 +363,34 @@ define <2 x i32> @usdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{
 }
 
 define <4 x i32> @sudot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) #0{
-; CHECK-NOI8MM-LABEL: sudot:
-; CHECK-NOI8MM:       // %bb.0:
-; CHECK-NOI8MM-NEXT:    sshll v3.8h, v1.8b, #0
-; CHECK-NOI8MM-NEXT:    ushll v4.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v1.8h, v1.16b, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v2.8h, v2.16b, #0
-; CHECK-NOI8MM-NEXT:    smlal v0.4s, v4.4h, v3.4h
-; CHECK-NOI8MM-NEXT:    smlal2 v0.4s, v4.8h, v3.8h
-; CHECK-NOI8MM-NEXT:    smlal v0.4s, v2.4h, v1.4h
-; CHECK-NOI8MM-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-NODOT-LABEL: sudot:
+; CHECK-NODOT:       // %bb.0:
+; CHECK-NODOT-NEXT:    sshll v3.8h, v1.8b, #0
+; CHECK-NODOT-NEXT:    ushll v4.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    sshll2 v1.8h, v1.16b, #0
+; CHECK-NODOT-NEXT:    ushll2 v2.8h, v2.16b, #0
+; CHECK-NODOT-NEXT:    smlal v0.4s, v4.4h, v3.4h
+; CHECK-NODOT-NEXT:    smlal2 v0.4s, v4.8h, v3.8h
+; CHECK-NODOT-NEXT:    smlal v0.4s, v2.4h, v1.4h
+; CHECK-NODOT-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
+; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: sudot:
-; CHECK-I8MM:       // %bb.0:
-; CHECK-I8MM-NEXT:    usdot v0.4s, v2.16b, v1.16b
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: sudot:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    sshll v3.8h, v1.8b, #0
+; CHECK-DOT-NEXT:    ushll v4.8h, v2.8b, #0
+; CHECK-DOT-NEXT:    sshll2 v1.8h, v1.16b, #0
+; CHECK-DOT-NEXT:    ushll2 v2.8h, v2.16b, #0
+; CHECK-DOT-NEXT:    smlal v0.4s, v4.4h, v3.4h
+; CHECK-DOT-NEXT:    smlal2 v0.4s, v4.8h, v3.8h
+; CHECK-DOT-NEXT:    smlal v0.4s, v2.4h, v1.4h
+; CHECK-DOT-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: sudot:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0:
-; CHECK-NEWLOWERING-I8MM-NEXT:    usdot v0.4s, v2.16b, v1.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: sudot:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    usdot v0.4s, v2.16b, v1.16b
+; CHECK-DOT-I8MM-NEXT:    ret
   %s.wide = sext <16 x i8> %u to <16 x i32>
   %u.wide = zext <16 x i8> %s to <16 x i32>
   %mult = mul nuw nsw <16 x i32> %u.wide, %s.wide
@@ -331,60 +399,67 @@ define <4 x i32> @sudot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) #0{
 }
 
 define <4 x i32> @sudot_in_loop(ptr %p1, ptr %p2){
-; CHECK-NOI8MM-LABEL: sudot_in_loop:
-; CHECK-NOI8MM:       // %bb.0: // %entry
-; CHECK-NOI8MM-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NOI8MM-NEXT:    mov x8, xzr
-; CHECK-NOI8MM-NEXT:  .LBB9_1: // %vector.body
-; CHECK-NOI8MM-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NOI8MM-NEXT:    ldr q2, [x0, x8]
-; CHECK-NOI8MM-NEXT:    ldr q3, [x1, x8]
-; CHECK-NOI8MM-NEXT:    mov v0.16b, v1.16b
-; CHECK-NOI8MM-NEXT:    add x8, x8, #16
-; CHECK-NOI8MM-NEXT:    ushll v4.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT:    sshll v5.8h, v3.8b, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v2.8h, v2.16b, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v3.8h, v3.16b, #0
-; CHECK-NOI8MM-NEXT:    cmp x8, #16
-; CHECK-NOI8MM-NEXT:    smlal v1.4s, v4.4h, v5.4h
-; CHECK-NOI8MM-NEXT:    smlal2 v1.4s, v4.8h, v5.8h
-; CHECK-NOI8MM-NEXT:    smlal v1.4s, v2.4h, v3.4h
-; CHECK-NOI8MM-NEXT:    smlal2 v1.4s, v2.8h, v3.8h
-; CHECK-NOI8MM-NEXT:    b.ne .LBB9_1
-; CHECK-NOI8MM-NEXT:  // %bb.2: // %end
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-NODOT-LABEL: sudot_in_loop:
+; CHECK-NODOT:       // %bb.0: // %entry
+; CHECK-NODOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NODOT-NEXT:    mov x8, xzr
+; CHECK-NODOT-NEXT:  .LBB9_1: // %vector.body
+; CHECK-NODOT-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NODOT-NEXT:    ldr q2, [x0, x8]
+; CHECK-NODOT-NEXT:    ldr q3, [x1, x8]
+; CHECK-NODOT-NEXT:    mov v0.16b, v1.16b
+; CHECK-NODOT-NEXT:    add x8, x8, #16
+; CHECK-NODOT-NEXT:    ushll v4.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    sshll v5.8h, v3.8b, #0
+; CHECK-NODOT-NEXT:    ushll2 v2.8h, v2.16b, #0
+; CHECK-NODOT-NEXT:    sshll2 v3.8h, v3.16b, #0
+; CHECK-NODOT-NEXT:    cmp x8, #16
+; CHECK-NODOT-NEXT:    smlal v1.4s, v4.4h, v5.4h
+; CHECK-NODOT-NEXT:    smlal2 v1.4s, v4.8h, v5.8h
+; CHECK-NODOT-NEXT:    smlal v1.4s, v2.4h, v3.4h
+; CHECK-NODOT-NEXT:    smlal2 v1.4s, v2.8h, v3.8h
+; CHECK-NODOT-NEXT:    b.ne .LBB9_1
+; CHECK-NODOT-NEXT:  // %bb.2: // %end
+; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: sudot_in_loop:
-; CHECK-I8MM:       // %bb.0: // %entry
-; CHECK-I8MM-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    mov x8, xzr
-; CHECK-I8MM-NEXT:  .LBB9_1: // %vector.body
-; CHECK-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-I8MM-NEXT:    ldr q2, [x0, x8]
-; CHECK-I8MM-NEXT:    ldr q3, [x1, x8]
-; CHECK-I8MM-NEXT:    mov v0.16b, v1.16b
-; CHECK-I8MM-NEXT:    add x8, x8, #16
-; CHECK-I8MM-NEXT:    usdot v1.4s, v2.16b, v3.16b
-; CHECK-I8MM-NEXT:    cmp x8, #16
-; CHECK-I8MM-NEXT:    b.ne .LBB9_1
-; CHECK-I8MM-NEXT:  // %bb.2: // %end
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: sudot_in_loop:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-DOT-NEXT:    mov x8, xzr
+; CHECK-DOT-NEXT:  .LBB9_1: // %vector.body
+; CHECK-DOT-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-DOT-NEXT:    ldr q2, [x0, x8]
+; CHECK-DOT-NEXT:    ldr q3, [x1, x8]
+; CHECK-DOT-NEXT:    mov v0.16b, v1.16b
+; CHECK-DOT-NEXT:    add x8, x8, #16
+; CHECK-DOT-NEXT:    ushll v4.8h, v2.8b, #0
+; CHECK-DOT-NEXT:    sshll v5.8h, v3.8b, #0
+; CHECK-DOT-NEXT:    ushll2 v2.8h, v2.16b, #0
+; CHECK-DOT-NEXT:    sshll2 v3.8h, v3.16b, #0
+; CHECK-DOT-NEXT:    cmp x8, #16
+; CHECK-DOT-NEXT:    smlal v1.4s, v4.4h, v5.4h
+; CHECK-DOT-NEXT:    smlal2 v1.4s, v4.8h, v5.8h
+; CHECK-DOT-NEXT:    smlal v1.4s, v2.4h, v3.4h
+; CHECK-DOT-NEXT:    smlal2 v1.4s, v2.8h, v3.8h
+; CHECK-DOT-NEXT:    b.ne .LBB9_1
+; CHECK-DOT-NEXT:  // %bb.2: // %end
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: sudot_in_loop:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NEWLOWERING-I8MM-NEXT:    mov x8, xzr
-; CHECK-NEWLOWERING-I8MM-NEXT:  .LBB9_1: // %vector.body
-; CHECK-NEWLOWERING-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEWLOWERING-I8MM-NEXT:    ldr q2, [x0, x8]
-; CHECK-NEWLOWERING-I8MM-NEXT:    ldr q3, [x1, x8]
-; CHECK-NEWLOWERING-I8MM-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    add x8, x8, #16
-; CHECK-NEWLOWERING-I8MM-NEXT:    usdot v1.4s, v2.16b, v3.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    cmp x8, #16
-; CHECK-NEWLOWERING-I8MM-NEXT:    b.ne .LBB9_1
-; CHECK-NEWLOWERING-I8MM-NEXT:  // %bb.2: // %end
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: sudot_in_loop:
+; CHECK-DOT-I8MM:       // %bb.0: // %entry
+; CHECK-DOT-I8MM-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-DOT-I8MM-NEXT:    mov x8, xzr
+; CHECK-DOT-I8MM-NEXT:  .LBB9_1: // %vector.body
+; CHECK-DOT-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-DOT-I8MM-NEXT:    ldr q2, [x0, x8]
+; CHECK-DOT-I8MM-NEXT:    ldr q3, [x1, x8]
+; CHECK-DOT-I8MM-NEXT:    mov v0.16b, v1.16b
+; CHECK-DOT-I8MM-NEXT:    add x8, x8, #16
+; CHECK-DOT-I8MM-NEXT:    usdot v1.4s, v2.16b, v3.16b
+; CHECK-DOT-I8MM-NEXT:    cmp x8, #16
+; CHECK-DOT-I8MM-NEXT:    b.ne .LBB9_1
+; CHECK-DOT-I8MM-NEXT:  // %bb.2: // %end
+; CHECK-DOT-I8MM-NEXT:    ret
 entry:
   br label %vector.body
 
@@ -408,32 +483,44 @@ end:
 }
 
 define <2 x i32> @sudot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{
-; CHECK-NOI8MM-LABEL: sudot_narrow:
-; CHECK-NOI8MM:       // %bb.0:
-; CHECK-NOI8MM-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-NOI8MM-NEXT:    ushll v2.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NOI8MM-NEXT:    smull v3.4s, v2.4h, v1.4h
-; CHECK-NOI8MM-NEXT:    smlal v0.4s, v2.4h, v1.4h
-; CHECK-NOI8MM-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
-; CHECK-NOI8MM-NEXT:    ext v5.16b, v2.16b, v2.16b, #8
-; CHECK-NOI8MM-NEXT:    smull2 v1.4s, v2.8h, v1.8h
-; CHECK-NOI8MM-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
-; CHECK-NOI8MM-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-NOI8MM-NEXT:    add v0.2s, v3.2s, v0.2s
-; CHECK-NOI8MM-NEXT:    smlal v0.4s, v5.4h, v4.4h
-; CHECK-NOI8MM-NEXT:    add v0.2s, v1.2s, v0.2s
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-NODOT-LABEL: sudot_narrow:
+; CHECK-NODOT:       // %bb.0:
+; CHECK-NODOT-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-NODOT-NEXT:    ushll v2.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NODOT-NEXT:    smull v3.4s, v2.4h, v1.4h
+; CHECK-NODOT-NEXT:    smlal v0.4s, v2.4h, v1.4h
+; CHECK-NODOT-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-NODOT-NEXT:    ext v5.16b, v2.16b, v2.16b, #8
+; CHECK-NODOT-NEXT:    smull2 v1.4s, v2.8h, v1.8h
+; CHECK-NODOT-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
+; CHECK-NODOT-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
+; CHECK-NODOT-NEXT:    add v0.2s, v3.2s, v0.2s
+; CHECK-NODOT-NEXT:    smlal v0.4s, v5.4h, v4.4h
+; CHECK-NODOT-NEXT:    add v0.2s, v1.2s, v0.2s
+; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: sudot_narrow:
-; CHECK-I8MM:       // %bb.0:
-; CHECK-I8MM-NEXT:    usdot v0.2s, v2.8b, v1.8b
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: sudot_narrow:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-DOT-NEXT:    ushll v2.8h, v2.8b, #0
+; CHECK-DOT-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-DOT-NEXT:    smull v3.4s, v2.4h, v1.4h
+; CHECK-DOT-NEXT:    smlal v0.4s, v2.4h, v1.4h
+; CHECK-DOT-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-DOT-NEXT:    ext v5.16b, v2.16b, v2.16b, #8
+; CHECK-DOT-NEXT:    smull2 v1.4s, v2.8h, v1.8h
+; CHECK-DOT-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
+; CHECK-DOT-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
+; CHECK-DOT-NEXT:    add v0.2s, v3.2s, v0.2s
+; CHECK-DOT-NEXT:    smlal v0.4s, v5.4h, v4.4h
+; CHECK-DOT-NEXT:    add v0.2s, v1.2s, v0.2s
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: sudot_narrow:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0:
-; CHECK-NEWLOWERING-I8MM-NEXT:    usdot v0.2s, v2.8b, v1.8b
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: sudot_narrow:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    usdot v0.2s, v2.8b, v1.8b
+; CHECK-DOT-I8MM-NEXT:    ret
   %u.wide = sext <8 x i8> %u to <8 x i32>
   %s.wide = zext <8 x i8> %s to <8 x i32>
   %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide
@@ -460,21 +547,21 @@ define <4 x i64> @udot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) {
 ; CHECK-NODOT-NEXT:    uaddw2 v0.2d, v0.2d, v4.4s
 ; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: udot_8to64:
-; CHECK-I8MM:       // %bb.0: // %entry
-; CHECK-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    udot v4.4s, v2.16b, v3.16b
-; CHECK-I8MM-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
-; CHECK-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: udot_8to64:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-DOT-NEXT:    udot v4.4s, v2.16b, v3.16b
+; CHECK-DOT-NEXT:    uaddw v0.2d, v0.2d, v4.2s
+; CHECK-DOT-NEXT:    uaddw2 v0.2d, v0.2d, v4.4s
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: udot_8to64:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-I8MM-NEXT:    udot v4.4s, v2.16b, v3.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    uaddw v0.2d, v0.2d, v4.2s
-; CHECK-NEWLOWERING-I8MM-NEXT:    uaddw2 v0.2d, v0.2d, v4.4s
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: udot_8to64:
+; CHECK-DOT-I8MM:       // %bb.0: // %entry
+; CHECK-DOT-I8MM-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-DOT-I8MM-NEXT:    udot v4.4s, v2.16b, v3.16b
+; CHECK-DOT-I8MM-NEXT:    uaddw v0.2d, v0.2d, v4.2s
+; CHECK-DOT-I8MM-NEXT:    uaddw2 v0.2d, v0.2d, v4.4s
+; CHECK-DOT-I8MM-NEXT:    ret
 entry:
   %a.wide = zext <16 x i8> %a to <16 x i64>
   %b.wide = zext <16 x i8> %b to <16 x i64>
@@ -503,21 +590,21 @@ define <4 x i64> @sdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){
 ; CHECK-NODOT-NEXT:    saddw2 v0.2d, v0.2d, v4.4s
 ; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: sdot_8to64:
-; CHECK-I8MM:       // %bb.0: // %entry
-; CHECK-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    sdot v4.4s, v2.16b, v3.16b
-; CHECK-I8MM-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
-; CHECK-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: sdot_8to64:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-DOT-NEXT:    sdot v4.4s, v2.16b, v3.16b
+; CHECK-DOT-NEXT:    saddw v0.2d, v0.2d, v4.2s
+; CHECK-DOT-NEXT:    saddw2 v0.2d, v0.2d, v4.4s
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: sdot_8to64:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-I8MM-NEXT:    sdot v4.4s, v2.16b, v3.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
-; CHECK-NEWLOWERING-I8MM-NEXT:    saddw2 v0.2d, v0.2d, v4.4s
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: sdot_8to64:
+; CHECK-DOT-I8MM:       // %bb.0: // %entry
+; CHECK-DOT-I8MM-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-DOT-I8MM-NEXT:    sdot v4.4s, v2.16b, v3.16b
+; CHECK-DOT-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
+; CHECK-DOT-I8MM-NEXT:    saddw2 v0.2d, v0.2d, v4.4s
+; CHECK-DOT-I8MM-NEXT:    ret
 entry:
   %a.wide = sext <16 x i8> %a to <16 x i64>
   %b.wide = sext <16 x i8> %b to <16 x i64>
@@ -528,45 +615,61 @@ entry:
 }
 
 define <4 x i64> @usdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){
-; CHECK-NOI8MM-LABEL: usdot_8to64:
-; CHECK-NOI8MM:       // %bb.0: // %entry
-; CHECK-NOI8MM-NEXT:    ushll v4.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v2.8h, v2.16b, #0
-; CHECK-NOI8MM-NEXT:    sshll v5.8h, v3.8b, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v3.8h, v3.16b, #0
-; CHECK-NOI8MM-NEXT:    ushll v6.4s, v4.4h, #0
-; CHECK-NOI8MM-NEXT:    ushll v7.4s, v2.4h, #0
-; CHECK-NOI8MM-NEXT:    sshll v16.4s, v5.4h, #0
-; CHECK-NOI8MM-NEXT:    sshll v17.4s, v3.4h, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v4.4s, v4.8h, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v2.4s, v2.8h, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v5.4s, v5.8h, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v3.4s, v3.8h, #0
-; CHECK-NOI8MM-NEXT:    smlal v0.2d, v6.2s, v16.2s
-; CHECK-NOI8MM-NEXT:    smlal v1.2d, v7.2s, v17.2s
-; CHECK-NOI8MM-NEXT:    smlal2 v0.2d, v6.4s, v16.4s
-; CHECK-NOI8MM-NEXT:    smlal2 v1.2d, v7.4s, v17.4s
-; CHECK-NOI8MM-NEXT:    smlal v0.2d, v4.2s, v5.2s
-; CHECK-NOI8MM-NEXT:    smlal v1.2d, v2.2s, v3.2s
-; CHECK-NOI8MM-NEXT:    smlal2 v0.2d, v4.4s, v5.4s
-; CHECK-NOI8MM-NEXT:    smlal2 v1.2d, v2.4s, v3.4s
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-NODOT-LABEL: usdot_8to64:
+; CHECK-NODOT:       // %bb.0: // %entry
+; CHECK-NODOT-NEXT:    ushll v4.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    ushll2 v2.8h, v2.16b, #0
+; CHECK-NODOT-NEXT:    sshll v5.8h, v3.8b, #0
+; CHECK-NODOT-NEXT:    sshll2 v3.8h, v3.16b, #0
+; CHECK-NODOT-NEXT:    ushll v6.4s, v4.4h, #0
+; CHECK-NODOT-NEXT:    ushll v7.4s, v2.4h, #0
+; CHECK-NODOT-NEXT:    sshll v16.4s, v5.4h, #0
+; CHECK-NODOT-NEXT:    sshll v17.4s, v3.4h, #0
+; CHECK-NODOT-NEXT:    ushll2 v4.4s, v4.8h, #0
+; CHECK-NODOT-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-NODOT-NEXT:    sshll2 v5.4s, v5.8h, #0
+; CHECK-NODOT-NEXT:    sshll2 v3.4s, v3.8h, #0
+; CHECK-NODOT-NEXT:    smlal v0.2d, v6.2s, v16.2s
+; CHECK-NODOT-NEXT:    smlal v1.2d, v7.2s, v17.2s
+; CHECK-NODOT-NEXT:    smlal2 v0.2d, v6.4s, v16.4s
+; CHECK-NODOT-NEXT:    smlal2 v1.2d, v7.4s, v17.4s
+; CHECK-NODOT-NEXT:    smlal v0.2d, v4.2s, v5.2s
+; CHECK-NODOT-NEXT:    smlal v1.2d, v2.2s, v3.2s
+; CHECK-NODOT-NEXT:    smlal2 v0.2d, v4.4s, v5.4s
+; CHECK-NODOT-NEXT:    smlal2 v1.2d, v2.4s, v3.4s
+; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: usdot_8to64:
-; CHECK-I8MM:       // %bb.0: // %entry
-; CHECK-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    usdot v4.4s, v2.16b, v3.16b
-; CHECK-I8MM-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
-; CHECK-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: usdot_8to64:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll v4.8h, v2.8b, #0
+; CHECK-DOT-NEXT:    ushll2 v2.8h, v2.16b, #0
+; CHECK-DOT-NEXT:    sshll v5.8h, v3.8b, #0
+; CHECK-DOT-NEXT:    sshll2 v3.8h, v3.16b, #0
+; CHECK-DOT-NEXT:    ushll v6.4s, v4.4h, #0
+; CHECK-DOT-NEXT:    ushll v7.4s, v2.4h, #0
+; CHECK-DOT-NEXT:    sshll v16.4s, v5.4h, #0
+; CHECK-DOT-NEXT:    sshll v17.4s, v3.4h, #0
+; CHECK-DOT-NEXT:    ushll2 v4.4s, v4.8h, #0
+; CHECK-DOT-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-DOT-NEXT:    sshll2 v5.4s, v5.8h, #0
+; CHECK-DOT-NEXT:    sshll2 v3.4s, v3.8h, #0
+; CHECK-DOT-NEXT:    smlal v0.2d, v6.2s, v16.2s
+; CHECK-DOT-NEXT:    smlal v1.2d, v7.2s, v17.2s
+; CHECK-DOT-NEXT:    smlal2 v0.2d, v6.4s, v16.4s
+; CHECK-DOT-NEXT:    smlal2 v1.2d, v7.4s, v17.4s
+; CHECK-DOT-NEXT:    smlal v0.2d, v4.2s, v5.2s
+; CHECK-DOT-NEXT:    smlal v1.2d, v2.2s, v3.2s
+; CHECK-DOT-NEXT:    smlal2 v0.2d, v4.4s, v5.4s
+; CHECK-DOT-NEXT:    smlal2 v1.2d, v2.4s, v3.4s
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: usdot_8to64:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-I8MM-NEXT:    usdot v4.4s, v2.16b, v3.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
-; CHECK-NEWLOWERING-I8MM-NEXT:    saddw2 v0.2d, v0.2d, v4.4s
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: usdot_8to64:
+; CHECK-DOT-I8MM:       // %bb.0: // %entry
+; CHECK-DOT-I8MM-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-DOT-I8MM-NEXT:    usdot v4.4s, v2.16b, v3.16b
+; CHECK-DOT-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
+; CHECK-DOT-I8MM-NEXT:    saddw2 v0.2d, v0.2d, v4.4s
+; CHECK-DOT-I8MM-NEXT:    ret
 entry:
   %a.wide = zext <16 x i8> %a to <16 x i64>
   %b.wide = sext <16 x i8> %b to <16 x i64>
@@ -577,45 +680,61 @@ entry:
 }
 
 define <4 x i64> @sudot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) {
-; CHECK-NOI8MM-LABEL: sudot_8to64:
-; CHECK-NOI8MM:       // %bb.0: // %entry
-; CHECK-NOI8MM-NEXT:    sshll v4.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v2.8h, v2.16b, #0
-; CHECK-NOI8MM-NEXT:    ushll v5.8h, v3.8b, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v3.8h, v3.16b, #0
-; CHECK-NOI8MM-NEXT:    sshll v6.4s, v4.4h, #0
-; CHECK-NOI8MM-NEXT:    sshll v7.4s, v2.4h, #0
-; CHECK-NOI8MM-NEXT:    ushll v16.4s, v5.4h, #0
-; CHECK-NOI8MM-NEXT:    ushll v17.4s, v3.4h, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v4.4s, v4.8h, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v2.4s, v2.8h, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v5.4s, v5.8h, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v3.4s, v3.8h, #0
-; CHECK-NOI8MM-NEXT:    smlal v0.2d, v6.2s, v16.2s
-; CHECK-NOI8MM-NEXT:    smlal v1.2d, v7.2s, v17.2s
-; CHECK-NOI8MM-NEXT:    smlal2 v0.2d, v6.4s, v16.4s
-; CHECK-NOI8MM-NEXT:    smlal2 v1.2d, v7.4s, v17.4s
-; CHECK-NOI8MM-NEXT:    smlal v0.2d, v4.2s, v5.2s
-; CHECK-NOI8MM-NEXT:    smlal v1.2d, v2.2s, v3.2s
-; CHECK-NOI8MM-NEXT:    smlal2 v0.2d, v4.4s, v5.4s
-; CHECK-NOI8MM-NEXT:    smlal2 v1.2d, v2.4s, v3.4s
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-NODOT-LABEL: sudot_8to64:
+; CHECK-NODOT:       // %bb.0: // %entry
+; CHECK-NODOT-NEXT:    sshll v4.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-NODOT-NEXT:    ushll v5.8h, v3.8b, #0
+; CHECK-NODOT-NEXT:    ushll2 v3.8h, v3.16b, #0
+; CHECK-NODOT-NEXT:    sshll v6.4s, v4.4h, #0
+; CHECK-NODOT-NEXT:    sshll v7.4s, v2.4h, #0
+; CHECK-NODOT-NEXT:    ushll v16.4s, v5.4h, #0
+; CHECK-NODOT-NEXT:    ushll v17.4s, v3.4h, #0
+; CHECK-NODOT-NEXT:    sshll2 v4.4s, v4.8h, #0
+; CHECK-NODOT-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-NODOT-NEXT:    ushll2 v5.4s, v5.8h, #0
+; CHECK-NODOT-NEXT:    ushll2 v3.4s, v3.8h, #0
+; CHECK-NODOT-NEXT:    smlal v0.2d, v6.2s, v16.2s
+; CHECK-NODOT-NEXT:    smlal v1.2d, v7.2s, v17.2s
+; CHECK-NODOT-NEXT:    smlal2 v0.2d, v6.4s, v16.4s
+; CHECK-NODOT-NEXT:    smlal2 v1.2d, v7.4s, v17.4s
+; CHECK-NODOT-NEXT:    smlal v0.2d, v4.2s, v5.2s
+; CHECK-NODOT-NEXT:    smlal v1.2d, v2.2s, v3.2s
+; CHECK-NODOT-NEXT:    smlal2 v0.2d, v4.4s, v5.4s
+; CHECK-NODOT-NEXT:    smlal2 v1.2d, v2.4s, v3.4s
+; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: sudot_8to64:
-; CHECK-I8MM:       // %bb.0: // %entry
-; CHECK-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    usdot v4.4s, v3.16b, v2.16b
-; CHECK-I8MM-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
-; CHECK-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: sudot_8to64:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    sshll v4.8h, v2.8b, #0
+; CHECK-DOT-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-DOT-NEXT:    ushll v5.8h, v3.8b, #0
+; CHECK-DOT-NEXT:    ushll2 v3.8h, v3.16b, #0
+; CHECK-DOT-NEXT:    sshll v6.4s, v4.4h, #0
+; CHECK-DOT-NEXT:    sshll v7.4s, v2.4h, #0
+; CHECK-DOT-NEXT:    ushll v16.4s, v5.4h, #0
+; CHECK-DOT-NEXT:    ushll v17.4s, v3.4h, #0
+; CHECK-DOT-NEXT:    sshll2 v4.4s, v4.8h, #0
+; CHECK-DOT-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-DOT-NEXT:    ushll2 v5.4s, v5.8h, #0
+; CHECK-DOT-NEXT:    ushll2 v3.4s, v3.8h, #0
+; CHECK-DOT-NEXT:    smlal v0.2d, v6.2s, v16.2s
+; CHECK-DOT-NEXT:    smlal v1.2d, v7.2s, v17.2s
+; CHECK-DOT-NEXT:    smlal2 v0.2d, v6.4s, v16.4s
+; CHECK-DOT-NEXT:    smlal2 v1.2d, v7.4s, v17.4s
+; CHECK-DOT-NEXT:    smlal v0.2d, v4.2s, v5.2s
+; CHECK-DOT-NEXT:    smlal v1.2d, v2.2s, v3.2s
+; CHECK-DOT-NEXT:    smlal2 v0.2d, v4.4s, v5.4s
+; CHECK-DOT-NEXT:    smlal2 v1.2d, v2.4s, v3.4s
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: sudot_8to64:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-I8MM-NEXT:    usdot v4.4s, v3.16b, v2.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
-; CHECK-NEWLOWERING-I8MM-NEXT:    saddw2 v0.2d, v0.2d, v4.4s
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: sudot_8to64:
+; CHECK-DOT-I8MM:       // %bb.0: // %entry
+; CHECK-DOT-I8MM-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-DOT-I8MM-NEXT:    usdot v4.4s, v3.16b, v2.16b
+; CHECK-DOT-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
+; CHECK-DOT-I8MM-NEXT:    saddw2 v0.2d, v0.2d, v4.4s
+; CHECK-DOT-I8MM-NEXT:    ret
 entry:
   %a.wide = sext <16 x i8> %a to <16 x i64>
   %b.wide = zext <16 x i8> %b to <16 x i64>
@@ -626,12 +745,6 @@ entry:
 }
 
 define <4 x i32> @udot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){
-; CHECK-DOT-LABEL: udot_no_bin_op:
-; CHECK-DOT:       // %bb.0:
-; CHECK-DOT-NEXT:    movi v2.16b, #1
-; CHECK-DOT-NEXT:    udot v0.4s, v1.16b, v2.16b
-; CHECK-DOT-NEXT:    ret
-;
 ; CHECK-NODOT-LABEL: udot_no_bin_op:
 ; CHECK-NODOT:       // %bb.0:
 ; CHECK-NODOT-NEXT:    ushll v2.8h, v1.8b, #0
@@ -641,77 +754,53 @@ define <4 x i32> @udot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){
 ; CHECK-NODOT-NEXT:    uaddw v0.4s, v0.4s, v1.4h
 ; CHECK-NODOT-NEXT:    uaddw2 v0.4s, v0.4s, v1.8h
 ; CHECK-NODOT-NEXT:    ret
+;
+; CHECK-DOT-LABEL: udot_no_bin_op:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    movi v2.16b, #1
+; CHECK-DOT-NEXT:    udot v0.4s, v1.16b, v2.16b
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-DOT-I8MM-LABEL: udot_no_bin_op:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    movi v2.16b, #1
+; CHECK-DOT-I8MM-NEXT:    udot v0.4s, v1.16b, v2.16b
+; CHECK-DOT-I8MM-NEXT:    ret
   %a.wide = zext <16 x i8> %a to <16 x i32>
   %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %a.wide)
   ret <4 x i32> %partial.reduce
 }
 
 define <4 x i32> @udot_no_bin_op_in_loop(ptr %p){
-; CHECK-NODOT-LABEL: udot_no_bin_op_in_loop:
-; CHECK-NODOT:       // %bb.0: // %entry
-; CHECK-NODOT-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NODOT-NEXT:    mov x8, xzr
-; CHECK-NODOT-NEXT:  .LBB16_1: // %vector.body
-; CHECK-NODOT-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NODOT-NEXT:    ldr q2, [x0, x8]
-; CHECK-NODOT-NEXT:    mov v0.16b, v1.16b
-; CHECK-NODOT-NEXT:    add x8, x8, #16
-; CHECK-NODOT-NEXT:    cmp x8, #16
-; CHECK-NODOT-NEXT:    ushll v3.8h, v2.8b, #0
-; CHECK-NODOT-NEXT:    ushll2 v2.8h, v2.16b, #0
-; CHECK-NODOT-NEXT:    uaddw v1.4s, v1.4s, v3.4h
-; CHECK-NODOT-NEXT:    uaddw2 v1.4s, v1.4s, v3.8h
-; CHECK-NODOT-NEXT:    uaddw v1.4s, v1.4s, v2.4h
-; CHECK-NODOT-NEXT:    uaddw2 v1.4s, v1.4s, v2.8h
-; CHECK-NODOT-NEXT:    b.ne .LBB16_1
-; CHECK-NODOT-NEXT:  // %bb.2: // %end
-; CHECK-NODOT-NEXT:    ret
-;
-; CHECK-I8MM-LABEL: udot_no_bin_op_in_loop:
-; CHECK-I8MM:       // %bb.0: // %entry
-; CHECK-I8MM-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    movi v2.16b, #1
-; CHECK-I8MM-NEXT:    mov x8, xzr
-; CHECK-I8MM-NEXT:  .LBB16_1: // %vector.body
-; CHECK-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-I8MM-NEXT:    ldr q3, [x0, x8]
-; CHECK-I8MM-NEXT:    mov v0.16b, v1.16b
-; CHECK-I8MM-NEXT:    add x8, x8, #16
-; CHECK-I8MM-NEXT:    cmp x8, #16
-; CHECK-I8MM-NEXT:    udot v1.4s, v3.16b, v2.16b
-; CHECK-I8MM-NEXT:    b.ne .LBB16_1
-; CHECK-I8MM-NEXT:  // %bb.2: // %end
-; CHECK-I8MM-NEXT:    ret
-;
-; CHECK-NEWLOWERING-I8MM-LABEL: udot_no_bin_op_in_loop:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-I8MM-NEXT:    adrp x8, .LCPI16_0
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-NEWLOWERING-I8MM-NEXT:    adrp x9, .LCPI16_2
-; CHECK-NEWLOWERING-I8MM-NEXT:    ldr q1, [x8, :lo12:.LCPI16_0]
-; CHECK-NEWLOWERING-I8MM-NEXT:    adrp x8, .LCPI16_1
-; CHECK-NEWLOWERING-I8MM-NEXT:    adrp x10, .LCPI16_3
-; CHECK-NEWLOWERING-I8MM-NEXT:    ldr q3, [x8, :lo12:.LCPI16_1]
-; CHECK-NEWLOWERING-I8MM-NEXT:    ldr q4, [x9, :lo12:.LCPI16_2]
-; CHECK-NEWLOWERING-I8MM-NEXT:    ldr q5, [x10, :lo12:.LCPI16_3]
-; CHECK-NEWLOWERING-I8MM-NEXT:    mov x8, xzr
-; CHECK-NEWLOWERING-I8MM-NEXT:  .LBB16_1: // %vector.body
-; CHECK-NEWLOWERING-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEWLOWERING-I8MM-NEXT:    ldr q6, [x0, x8]
-; CHECK-NEWLOWERING-I8MM-NEXT:    mov v0.16b, v2.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    add x8, x8, #16
-; CHECK-NEWLOWERING-I8MM-NEXT:    cmp x8, #16
-; CHECK-NEWLOWERING-I8MM-NEXT:    tbl v7.16b, { v6.16b }, v3.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    tbl v16.16b, { v6.16b }, v4.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    tbl v17.16b, { v6.16b }, v5.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    tbl v6.16b, { v6.16b }, v1.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    add v2.4s, v2.4s, v17.4s
-; CHECK-NEWLOWERING-I8MM-NEXT:    add v7.4s, v16.4s, v7.4s
-; CHECK-NEWLOWERING-I8MM-NEXT:    add v2.4s, v2.4s, v7.4s
-; CHECK-NEWLOWERING-I8MM-NEXT:    add v2.4s, v2.4s, v6.4s
-; CHECK-NEWLOWERING-I8MM-NEXT:    b.ne .LBB16_1
-; CHECK-NEWLOWERING-I8MM-NEXT:  // %bb.2: // %end
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-COMMON-LABEL: udot_no_bin_op_in_loop:
+; CHECK-COMMON:       // %bb.0: // %entry
+; CHECK-COMMON-NEXT:    adrp x8, .LCPI16_0
+; CHECK-COMMON-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-COMMON-NEXT:    adrp x9, .LCPI16_2
+; CHECK-COMMON-NEXT:    ldr q1, [x8, :lo12:.LCPI16_0]
+; CHECK-COMMON-NEXT:    adrp x8, .LCPI16_1
+; CHECK-COMMON-NEXT:    adrp x10, .LCPI16_3
+; CHECK-COMMON-NEXT:    ldr q3, [x8, :lo12:.LCPI16_1]
+; CHECK-COMMON-NEXT:    ldr q4, [x9, :lo12:.LCPI16_2]
+; CHECK-COMMON-NEXT:    ldr q5, [x10, :lo12:.LCPI16_3]
+; CHECK-COMMON-NEXT:    mov x8, xzr
+; CHECK-COMMON-NEXT:  .LBB16_1: // %vector.body
+; CHECK-COMMON-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-COMMON-NEXT:    ldr q6, [x0, x8]
+; CHECK-COMMON-NEXT:    mov v0.16b, v2.16b
+; CHECK-COMMON-NEXT:    add x8, x8, #16
+; CHECK-COMMON-NEXT:    cmp x8, #16
+; CHECK-COMMON-NEXT:    tbl v7.16b, { v6.16b }, v3.16b
+; CHECK-COMMON-NEXT:    tbl v16.16b, { v6.16b }, v4.16b
+; CHECK-COMMON-NEXT:    tbl v17.16b, { v6.16b }, v5.16b
+; CHECK-COMMON-NEXT:    tbl v6.16b, { v6.16b }, v1.16b
+; CHECK-COMMON-NEXT:    add v2.4s, v2.4s, v17.4s
+; CHECK-COMMON-NEXT:    add v7.4s, v16.4s, v7.4s
+; CHECK-COMMON-NEXT:    add v2.4s, v2.4s, v7.4s
+; CHECK-COMMON-NEXT:    add v2.4s, v2.4s, v6.4s
+; CHECK-COMMON-NEXT:    b.ne .LBB16_1
+; CHECK-COMMON-NEXT:  // %bb.2: // %end
+; CHECK-COMMON-NEXT:    ret
 entry:
   br label %vector.body
 
@@ -731,12 +820,6 @@ end:
 }
 
 define <4 x i32> @sdot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){
-; CHECK-DOT-LABEL: sdot_no_bin_op:
-; CHECK-DOT:       // %bb.0:
-; CHECK-DOT-NEXT:    movi v2.16b, #1
-; CHECK-DOT-NEXT:    sdot v0.4s, v1.16b, v2.16b
-; CHECK-DOT-NEXT:    ret
-;
 ; CHECK-NODOT-LABEL: sdot_no_bin_op:
 ; CHECK-NODOT:       // %bb.0:
 ; CHECK-NODOT-NEXT:    sshll v2.8h, v1.8b, #0
@@ -746,18 +829,24 @@ define <4 x i32> @sdot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){
 ; CHECK-NODOT-NEXT:    saddw v0.4s, v0.4s, v1.4h
 ; CHECK-NODOT-NEXT:    saddw2 v0.4s, v0.4s, v1.8h
 ; CHECK-NODOT-NEXT:    ret
+;
+; CHECK-DOT-LABEL: sdot_no_bin_op:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    movi v2.16b, #1
+; CHECK-DOT-NEXT:    sdot v0.4s, v1.16b, v2.16b
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-DOT-I8MM-LABEL: sdot_no_bin_op:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    movi v2.16b, #1
+; CHECK-DOT-I8MM-NEXT:    sdot v0.4s, v1.16b, v2.16b
+; CHECK-DOT-I8MM-NEXT:    ret
   %a.wide = sext <16 x i8> %a to <16 x i32>
   %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %a.wide)
   ret <4 x i32> %partial.reduce
 }
 
 define <2 x i32> @udot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a){
-; CHECK-DOT-LABEL: udot_no_bin_op_narrow:
-; CHECK-DOT:       // %bb.0:
-; CHECK-DOT-NEXT:    movi v2.8b, #1
-; CHECK-DOT-NEXT:    udot v0.2s, v1.8b, v2.8b
-; CHECK-DOT-NEXT:    ret
-;
 ; CHECK-NODOT-LABEL: udot_no_bin_op_narrow:
 ; CHECK-NODOT:       // %bb.0:
 ; CHECK-NODOT-NEXT:    ushll v1.8h, v1.8b, #0
@@ -772,18 +861,24 @@ define <2 x i32> @udot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a){
 ; CHECK-NODOT-NEXT:    uaddw v0.4s, v0.4s, v1.4h
 ; CHECK-NODOT-NEXT:    add v0.2s, v2.2s, v0.2s
 ; CHECK-NODOT-NEXT:    ret
+;
+; CHECK-DOT-LABEL: udot_no_bin_op_narrow:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    movi v2.8b, #1
+; CHECK-DOT-NEXT:    udot v0.2s, v1.8b, v2.8b
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-DOT-I8MM-LABEL: udot_no_bin_op_narrow:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    movi v2.8b, #1
+; CHECK-DOT-I8MM-NEXT:    udot v0.2s, v1.8b, v2.8b
+; CHECK-DOT-I8MM-NEXT:    ret
   %a.wide = zext <8 x i8> %a to <8 x i32>
   %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %a.wide)
   ret <2 x i32> %partial.reduce
 }
 
 define <2 x i32> @sdot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a){
-; CHECK-DOT-LABEL: sdot_no_bin_op_narrow:
-; CHECK-DOT:       // %bb.0:
-; CHECK-DOT-NEXT:    movi v2.8b, #1
-; CHECK-DOT-NEXT:    sdot v0.2s, v1.8b, v2.8b
-; CHECK-DOT-NEXT:    ret
-;
 ; CHECK-NODOT-LABEL: sdot_no_bin_op_narrow:
 ; CHECK-NODOT:       // %bb.0:
 ; CHECK-NODOT-NEXT:    sshll v1.8h, v1.8b, #0
@@ -798,6 +893,18 @@ define <2 x i32> @sdot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a){
 ; CHECK-NODOT-NEXT:    saddw v0.4s, v0.4s, v1.4h
 ; CHECK-NODOT-NEXT:    add v0.2s, v2.2s, v0.2s
 ; CHECK-NODOT-NEXT:    ret
+;
+; CHECK-DOT-LABEL: sdot_no_bin_op_narrow:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    movi v2.8b, #1
+; CHECK-DOT-NEXT:    sdot v0.2s, v1.8b, v2.8b
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-DOT-I8MM-LABEL: sdot_no_bin_op_narrow:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    movi v2.8b, #1
+; CHECK-DOT-I8MM-NEXT:    sdot v0.2s, v1.8b, v2.8b
+; CHECK-DOT-I8MM-NEXT:    ret
   %a.wide = sext <8 x i8> %a to <8 x i32>
   %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %a.wide)
   ret <2 x i32> %partial.reduce
@@ -822,23 +929,23 @@ define <4 x i64> @udot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){
 ; CHECK-NODOT-NEXT:    uaddw2 v0.2d, v0.2d, v3.4s
 ; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: udot_no_bin_op_8to64:
-; CHECK-I8MM:       // %bb.0:
-; CHECK-I8MM-NEXT:    movi v3.16b, #1
-; CHECK-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    udot v4.4s, v2.16b, v3.16b
-; CHECK-I8MM-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
-; CHECK-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: udot_no_bin_op_8to64:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    movi v3.16b, #1
+; CHECK-DOT-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-DOT-NEXT:    udot v4.4s, v2.16b, v3.16b
+; CHECK-DOT-NEXT:    uaddw v0.2d, v0.2d, v4.2s
+; CHECK-DOT-NEXT:    uaddw2 v0.2d, v0.2d, v4.4s
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: udot_no_bin_op_8to64:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0:
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v3.16b, #1
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-I8MM-NEXT:    udot v4.4s, v2.16b, v3.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    uaddw v0.2d, v0.2d, v4.2s
-; CHECK-NEWLOWERING-I8MM-NEXT:    uaddw2 v0.2d, v0.2d, v4.4s
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: udot_no_bin_op_8to64:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    movi v3.16b, #1
+; CHECK-DOT-I8MM-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-DOT-I8MM-NEXT:    udot v4.4s, v2.16b, v3.16b
+; CHECK-DOT-I8MM-NEXT:    uaddw v0.2d, v0.2d, v4.2s
+; CHECK-DOT-I8MM-NEXT:    uaddw2 v0.2d, v0.2d, v4.4s
+; CHECK-DOT-I8MM-NEXT:    ret
   %a.wide = zext <16 x i8> %a to <16 x i64>
   %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide)
   ret <4 x i64> %partial.reduce
@@ -863,35 +970,35 @@ define <4 x i64> @sdot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){
 ; CHECK-NODOT-NEXT:    saddw2 v0.2d, v0.2d, v3.4s
 ; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: sdot_no_bin_op_8to64:
-; CHECK-I8MM:       // %bb.0:
-; CHECK-I8MM-NEXT:    movi v3.16b, #1
-; CHECK-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    sdot v4.4s, v2.16b, v3.16b
-; CHECK-I8MM-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
-; CHECK-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: sdot_no_bin_op_8to64:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    movi v3.16b, #1
+; CHECK-DOT-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-DOT-NEXT:    sdot v4.4s, v2.16b, v3.16b
+; CHECK-DOT-NEXT:    saddw v0.2d, v0.2d, v4.2s
+; CHECK-DOT-NEXT:    saddw2 v0.2d, v0.2d, v4.4s
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: sdot_no_bin_op_8to64:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0:
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v3.16b, #1
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-I8MM-NEXT:    sdot v4.4s, v2.16b, v3.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
-; CHECK-NEWLOWERING-I8MM-NEXT:    saddw2 v0.2d, v0.2d, v4.4s
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: sdot_no_bin_op_8to64:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    movi v3.16b, #1
+; CHECK-DOT-I8MM-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-DOT-I8MM-NEXT:    sdot v4.4s, v2.16b, v3.16b
+; CHECK-DOT-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
+; CHECK-DOT-I8MM-NEXT:    saddw2 v0.2d, v0.2d, v4.4s
+; CHECK-DOT-I8MM-NEXT:    ret
   %a.wide = sext <16 x i8> %a to <16 x i64>
   %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide)
   ret <4 x i64> %partial.reduce
 }
 
 define <4 x i32> @not_udot(<4 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{
-; CHECK-LABEL: not_udot:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    umull v1.8h, v2.8b, v1.8b
-; CHECK-NEXT:    uaddw v0.4s, v0.4s, v1.4h
-; CHECK-NEXT:    uaddw2 v0.4s, v0.4s, v1.8h
-; CHECK-NEXT:    ret
+; CHECK-COMMON-LABEL: not_udot:
+; CHECK-COMMON:       // %bb.0:
+; CHECK-COMMON-NEXT:    umull v1.8h, v2.8b, v1.8b
+; CHECK-COMMON-NEXT:    uaddw v0.4s, v0.4s, v1.4h
+; CHECK-COMMON-NEXT:    uaddw2 v0.4s, v0.4s, v1.8h
+; CHECK-COMMON-NEXT:    ret
   %u.wide = zext <8 x i8> %u to <8 x i32>
   %s.wide = zext <8 x i8> %s to <8 x i32>
   %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide
@@ -900,16 +1007,16 @@ define <4 x i32> @not_udot(<4 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{
 }
 
 define <2 x i32> @not_udot_narrow(<2 x i32> %acc, <4 x i8> %u, <4 x i8> %s) {
-; CHECK-LABEL: not_udot_narrow:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    bic v1.4h, #255, lsl #8
-; CHECK-NEXT:    bic v2.4h, #255, lsl #8
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    umull v3.4s, v2.4h, v1.4h
-; CHECK-NEXT:    umlal v0.4s, v2.4h, v1.4h
-; CHECK-NEXT:    ext v1.16b, v3.16b, v3.16b, #8
-; CHECK-NEXT:    add v0.2s, v1.2s, v0.2s
-; CHECK-NEXT:    ret
+; CHECK-COMMON-LABEL: not_udot_narrow:
+; CHECK-COMMON:       // %bb.0:
+; CHECK-COMMON-NEXT:    bic v1.4h, #255, lsl #8
+; CHECK-COMMON-NEXT:    bic v2.4h, #255, lsl #8
+; CHECK-COMMON-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-COMMON-NEXT:    umull v3.4s, v2.4h, v1.4h
+; CHECK-COMMON-NEXT:    umlal v0.4s, v2.4h, v1.4h
+; CHECK-COMMON-NEXT:    ext v1.16b, v3.16b, v3.16b, #8
+; CHECK-COMMON-NEXT:    add v0.2s, v1.2s, v0.2s
+; CHECK-COMMON-NEXT:    ret
   %u.wide = zext <4 x i8> %u to <4 x i32>
   %s.wide = zext <4 x i8> %s to <4 x i32>
   %mult = mul nuw nsw <4 x i32> %s.wide, %u.wide
@@ -918,18 +1025,18 @@ define <2 x i32> @not_udot_narrow(<2 x i32> %acc, <4 x i8> %u, <4 x i8> %s) {
 }
 
 define <2 x i64> @udot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b){
-; CHECK-LABEL: udot_different_types:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v2.8h, v2.8b, #0
-; CHECK-NEXT:    ushll v3.4s, v1.4h, #0
-; CHECK-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-NEXT:    ushll v4.4s, v2.4h, #0
-; CHECK-NEXT:    ushll2 v2.4s, v2.8h, #0
-; CHECK-NEXT:    umlal v0.2d, v3.2s, v4.2s
-; CHECK-NEXT:    umlal2 v0.2d, v3.4s, v4.4s
-; CHECK-NEXT:    umlal v0.2d, v1.2s, v2.2s
-; CHECK-NEXT:    umlal2 v0.2d, v1.4s, v2.4s
-; CHECK-NEXT:    ret
+; CHECK-COMMON-LABEL: udot_different_types:
+; CHECK-COMMON:       // %bb.0: // %entry
+; CHECK-COMMON-NEXT:    ushll v2.8h, v2.8b, #0
+; CHECK-COMMON-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-COMMON-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-COMMON-NEXT:    ushll v4.4s, v2.4h, #0
+; CHECK-COMMON-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-COMMON-NEXT:    umlal v0.2d, v3.2s, v4.2s
+; CHECK-COMMON-NEXT:    umlal2 v0.2d, v3.4s, v4.4s
+; CHECK-COMMON-NEXT:    umlal v0.2d, v1.2s, v2.2s
+; CHECK-COMMON-NEXT:    umlal2 v0.2d, v1.4s, v2.4s
+; CHECK-COMMON-NEXT:    ret
 entry:
   %a.wide = zext <8 x i16> %a to <8 x i64>
   %b.wide = zext <8 x i8> %b to <8 x i64>
@@ -939,18 +1046,18 @@ entry:
 }
 
 define <2 x i64> @sdot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b){
-; CHECK-LABEL: sdot_different_types:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll v2.8h, v2.8b, #0
-; CHECK-NEXT:    sshll v3.4s, v1.4h, #0
-; CHECK-NEXT:    sshll2 v1.4s, v1.8h, #0
-; CHECK-NEXT:    sshll v4.4s, v2.4h, #0
-; CHECK-NEXT:    sshll2 v2.4s, v2.8h, #0
-; CHECK-NEXT:    smlal v0.2d, v3.2s, v4.2s
-; CHECK-NEXT:    smlal2 v0.2d, v3.4s, v4.4s
-; CHECK-NEXT:    smlal v0.2d, v1.2s, v2.2s
-; CHECK-NEXT:    smlal2 v0.2d, v1.4s, v2.4s
-; CHECK-NEXT:    ret
+; CHECK-COMMON-LABEL: sdot_different_types:
+; CHECK-COMMON:       // %bb.0: // %entry
+; CHECK-COMMON-NEXT:    sshll v2.8h, v2.8b, #0
+; CHECK-COMMON-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-COMMON-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-COMMON-NEXT:    sshll v4.4s, v2.4h, #0
+; CHECK-COMMON-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-COMMON-NEXT:    smlal v0.2d, v3.2s, v4.2s
+; CHECK-COMMON-NEXT:    smlal2 v0.2d, v3.4s, v4.4s
+; CHECK-COMMON-NEXT:    smlal v0.2d, v1.2s, v2.2s
+; CHECK-COMMON-NEXT:    smlal2 v0.2d, v1.4s, v2.4s
+; CHECK-COMMON-NEXT:    ret
 entry:
   %a.wide = sext <8 x i16> %a to <8 x i64>
   %b.wide = sext <8 x i8> %b to <8 x i64>
@@ -960,18 +1067,18 @@ entry:
 }
 
 define <2 x i64> @usdot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b){
-; CHECK-LABEL: usdot_different_types:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll v2.8h, v2.8b, #0
-; CHECK-NEXT:    ushll v3.4s, v1.4h, #0
-; CHECK-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-NEXT:    sshll v4.4s, v2.4h, #0
-; CHECK-NEXT:    sshll2 v2.4s, v2.8h, #0
-; CHECK-NEXT:    smlal v0.2d, v3.2s, v4.2s
-; CHECK-NEXT:    smlal2 v0.2d, v3.4s, v4.4s
-; CHECK-NEXT:    smlal v0.2d, v1.2s, v2.2s
-; CHECK-NEXT:    smlal2 v0.2d, v1.4s, v2.4s
-; CHECK-NEXT:    ret
+; CHECK-COMMON-LABEL: usdot_different_types:
+; CHECK-COMMON:       // %bb.0: // %entry
+; CHECK-COMMON-NEXT:    sshll v2.8h, v2.8b, #0
+; CHECK-COMMON-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-COMMON-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-COMMON-NEXT:    sshll v4.4s, v2.4h, #0
+; CHECK-COMMON-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-COMMON-NEXT:    smlal v0.2d, v3.2s, v4.2s
+; CHECK-COMMON-NEXT:    smlal2 v0.2d, v3.4s, v4.4s
+; CHECK-COMMON-NEXT:    smlal v0.2d, v1.2s, v2.2s
+; CHECK-COMMON-NEXT:    smlal2 v0.2d, v1.4s, v2.4s
+; CHECK-COMMON-NEXT:    ret
 entry:
   %a.wide = zext <8 x i16> %a to <8 x i64>
   %b.wide = sext <8 x i8> %b to <8 x i64>
@@ -981,18 +1088,18 @@ entry:
 }
 
 define <2 x i64> @sudot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b){
-; CHECK-LABEL: sudot_different_types:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v2.8h, v2.8b, #0
-; CHECK-NEXT:    sshll v3.4s, v1.4h, #0
-; CHECK-NEXT:    sshll2 v1.4s, v1.8h, #0
-; CHECK-NEXT:    ushll v4.4s, v2.4h, #0
-; CHECK-NEXT:    ushll2 v2.4s, v2.8h, #0
-; CHECK-NEXT:    smlal v0.2d, v3.2s, v4.2s
-; CHECK-NEXT:    smlal2 v0.2d, v3.4s, v4.4s
-; CHECK-NEXT:    smlal v0.2d, v1.2s, v2.2s
-; CHECK-NEXT:    smlal2 v0.2d, v1.4s, v2.4s
-; CHECK-NEXT:    ret
+; CHECK-COMMON-LABEL: sudot_different_types:
+; CHECK-COMMON:       // %bb.0: // %entry
+; CHECK-COMMON-NEXT:    ushll v2.8h, v2.8b, #0
+; CHECK-COMMON-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-COMMON-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-COMMON-NEXT:    ushll v4.4s, v2.4h, #0
+; CHECK-COMMON-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-COMMON-NEXT:    smlal v0.2d, v3.2s, v4.2s
+; CHECK-COMMON-NEXT:    smlal2 v0.2d, v3.4s, v4.4s
+; CHECK-COMMON-NEXT:    smlal v0.2d, v1.2s, v2.2s
+; CHECK-COMMON-NEXT:    smlal2 v0.2d, v1.4s, v2.4s
+; CHECK-COMMON-NEXT:    ret
 entry:
   %a.wide = sext <8 x i16> %a to <8 x i64>
   %b.wide = zext <8 x i8> %b to <8 x i64>
@@ -1002,74 +1109,86 @@ entry:
 }
 
 define <4 x i32> @usdot_multiple_zext_users(ptr %p1, ptr %p2, ptr %p3) {
-; CHECK-NOI8MM-LABEL: usdot_multiple_zext_users:
-; CHECK-NOI8MM:       // %bb.0: // %entry
-; CHECK-NOI8MM-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NOI8MM-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NOI8MM-NEXT:    mov x8, xzr
-; CHECK-NOI8MM-NEXT:  .LBB28_1: // %vector.body
-; CHECK-NOI8MM-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NOI8MM-NEXT:    ldr q2, [x0, x8]
-; CHECK-NOI8MM-NEXT:    ldr q3, [x1, x8]
-; CHECK-NOI8MM-NEXT:    ldr q4, [x2, x8]
-; CHECK-NOI8MM-NEXT:    add x8, x8, #16
-; CHECK-NOI8MM-NEXT:    sshll v5.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT:    ushll v6.8h, v4.8b, #0
-; CHECK-NOI8MM-NEXT:    sshll v7.8h, v3.8b, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v2.8h, v2.16b, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v4.8h, v4.16b, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v3.8h, v3.16b, #0
-; CHECK-NOI8MM-NEXT:    cmp x8, #1024
-; CHECK-NOI8MM-NEXT:    smlal v0.4s, v5.4h, v6.4h
-; CHECK-NOI8MM-NEXT:    smlal v1.4s, v7.4h, v6.4h
-; CHECK-NOI8MM-NEXT:    smlal2 v0.4s, v5.8h, v6.8h
-; CHECK-NOI8MM-NEXT:    smlal2 v1.4s, v7.8h, v6.8h
-; CHECK-NOI8MM-NEXT:    smlal v0.4s, v2.4h, v4.4h
-; CHECK-NOI8MM-NEXT:    smlal v1.4s, v3.4h, v4.4h
-; CHECK-NOI8MM-NEXT:    smlal2 v0.4s, v2.8h, v4.8h
-; CHECK-NOI8MM-NEXT:    smlal2 v1.4s, v3.8h, v4.8h
-; CHECK-NOI8MM-NEXT:    b.ne .LBB28_1
-; CHECK-NOI8MM-NEXT:  // %bb.2: // %end
-; CHECK-NOI8MM-NEXT:    add v0.4s, v1.4s, v0.4s
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-NODOT-LABEL: usdot_multiple_zext_users:
+; CHECK-NODOT:       // %bb.0: // %entry
+; CHECK-NODOT-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NODOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NODOT-NEXT:    mov x8, xzr
+; CHECK-NODOT-NEXT:  .LBB28_1: // %vector.body
+; CHECK-NODOT-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NODOT-NEXT:    ldr q2, [x0, x8]
+; CHECK-NODOT-NEXT:    ldr q3, [x1, x8]
+; CHECK-NODOT-NEXT:    ldr q4, [x2, x8]
+; CHECK-NODOT-NEXT:    add x8, x8, #16
+; CHECK-NODOT-NEXT:    sshll v5.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    ushll v6.8h, v4.8b, #0
+; CHECK-NODOT-NEXT:    sshll v7.8h, v3.8b, #0
+; CHECK-NODOT-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-NODOT-NEXT:    ushll2 v4.8h, v4.16b, #0
+; CHECK-NODOT-NEXT:    sshll2 v3.8h, v3.16b, #0
+; CHECK-NODOT-NEXT:    cmp x8, #1024
+; CHECK-NODOT-NEXT:    smlal v0.4s, v5.4h, v6.4h
+; CHECK-NODOT-NEXT:    smlal v1.4s, v7.4h, v6.4h
+; CHECK-NODOT-NEXT:    smlal2 v0.4s, v5.8h, v6.8h
+; CHECK-NODOT-NEXT:    smlal2 v1.4s, v7.8h, v6.8h
+; CHECK-NODOT-NEXT:    smlal v0.4s, v2.4h, v4.4h
+; CHECK-NODOT-NEXT:    smlal v1.4s, v3.4h, v4.4h
+; CHECK-NODOT-NEXT:    smlal2 v0.4s, v2.8h, v4.8h
+; CHECK-NODOT-NEXT:    smlal2 v1.4s, v3.8h, v4.8h
+; CHECK-NODOT-NEXT:    b.ne .LBB28_1
+; CHECK-NODOT-NEXT:  // %bb.2: // %end
+; CHECK-NODOT-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: usdot_multiple_zext_users:
-; CHECK-I8MM:       // %bb.0: // %entry
-; CHECK-I8MM-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    mov x8, xzr
-; CHECK-I8MM-NEXT:  .LBB28_1: // %vector.body
-; CHECK-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-I8MM-NEXT:    ldr q2, [x0, x8]
-; CHECK-I8MM-NEXT:    ldr q3, [x1, x8]
-; CHECK-I8MM-NEXT:    ldr q4, [x2, x8]
-; CHECK-I8MM-NEXT:    add x8, x8, #16
-; CHECK-I8MM-NEXT:    usdot v0.4s, v4.16b, v2.16b
-; CHECK-I8MM-NEXT:    usdot v1.4s, v4.16b, v3.16b
-; CHECK-I8MM-NEXT:    cmp x8, #1024
-; CHECK-I8MM-NEXT:    b.ne .LBB28_1
-; CHECK-I8MM-NEXT:  // %bb.2: // %end
-; CHECK-I8MM-NEXT:    add v0.4s, v1.4s, v0.4s
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: usdot_multiple_zext_users:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-DOT-NEXT:    mov x8, xzr
+; CHECK-DOT-NEXT:  .LBB28_1: // %vector.body
+; CHECK-DOT-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-DOT-NEXT:    ldr q2, [x0, x8]
+; CHECK-DOT-NEXT:    ldr q3, [x1, x8]
+; CHECK-DOT-NEXT:    ldr q4, [x2, x8]
+; CHECK-DOT-NEXT:    add x8, x8, #16
+; CHECK-DOT-NEXT:    sshll v5.8h, v2.8b, #0
+; CHECK-DOT-NEXT:    ushll v6.8h, v4.8b, #0
+; CHECK-DOT-NEXT:    sshll v7.8h, v3.8b, #0
+; CHECK-DOT-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-DOT-NEXT:    ushll2 v4.8h, v4.16b, #0
+; CHECK-DOT-NEXT:    sshll2 v3.8h, v3.16b, #0
+; CHECK-DOT-NEXT:    cmp x8, #1024
+; CHECK-DOT-NEXT:    smlal v0.4s, v5.4h, v6.4h
+; CHECK-DOT-NEXT:    smlal v1.4s, v7.4h, v6.4h
+; CHECK-DOT-NEXT:    smlal2 v0.4s, v5.8h, v6.8h
+; CHECK-DOT-NEXT:    smlal2 v1.4s, v7.8h, v6.8h
+; CHECK-DOT-NEXT:    smlal v0.4s, v2.4h, v4.4h
+; CHECK-DOT-NEXT:    smlal v1.4s, v3.4h, v4.4h
+; CHECK-DOT-NEXT:    smlal2 v0.4s, v2.8h, v4.8h
+; CHECK-DOT-NEXT:    smlal2 v1.4s, v3.8h, v4.8h
+; CHECK-DOT-NEXT:    b.ne .LBB28_1
+; CHECK-DOT-NEXT:  // %bb.2: // %end
+; CHECK-DOT-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: usdot_multiple_zext_users:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NEWLOWERING-I8MM-NEXT:    mov x8, xzr
-; CHECK-NEWLOWERING-I8MM-NEXT:  .LBB28_1: // %vector.body
-; CHECK-NEWLOWERING-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEWLOWERING-I8MM-NEXT:    ldr q2, [x0, x8]
-; CHECK-NEWLOWERING-I8MM-NEXT:    ldr q3, [x1, x8]
-; CHECK-NEWLOWERING-I8MM-NEXT:    ldr q4, [x2, x8]
-; CHECK-NEWLOWERING-I8MM-NEXT:    add x8, x8, #16
-; CHECK-NEWLOWERING-I8MM-NEXT:    usdot v0.4s, v4.16b, v2.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    usdot v1.4s, v4.16b, v3.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    cmp x8, #1024
-; CHECK-NEWLOWERING-I8MM-NEXT:    b.ne .LBB28_1
-; CHECK-NEWLOWERING-I8MM-NEXT:  // %bb.2: // %end
-; CHECK-NEWLOWERING-I8MM-NEXT:    add v0.4s, v1.4s, v0.4s
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: usdot_multiple_zext_users:
+; CHECK-DOT-I8MM:       // %bb.0: // %entry
+; CHECK-DOT-I8MM-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-DOT-I8MM-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-DOT-I8MM-NEXT:    mov x8, xzr
+; CHECK-DOT-I8MM-NEXT:  .LBB28_1: // %vector.body
+; CHECK-DOT-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-DOT-I8MM-NEXT:    ldr q2, [x0, x8]
+; CHECK-DOT-I8MM-NEXT:    ldr q3, [x1, x8]
+; CHECK-DOT-I8MM-NEXT:    ldr q4, [x2, x8]
+; CHECK-DOT-I8MM-NEXT:    add x8, x8, #16
+; CHECK-DOT-I8MM-NEXT:    usdot v0.4s, v4.16b, v2.16b
+; CHECK-DOT-I8MM-NEXT:    usdot v1.4s, v4.16b, v3.16b
+; CHECK-DOT-I8MM-NEXT:    cmp x8, #1024
+; CHECK-DOT-I8MM-NEXT:    b.ne .LBB28_1
+; CHECK-DOT-I8MM-NEXT:  // %bb.2: // %end
+; CHECK-DOT-I8MM-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-DOT-I8MM-NEXT:    ret
 entry:
   br label %vector.body
 
@@ -1100,15 +1219,15 @@ end:
 }
 
 define <2 x i64> @udot_16to64(<2 x i64> %acc, <8 x i16> %input){
-; CHECK-LABEL: udot_16to64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v2.4s, v1.4h, #0
-; CHECK-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-NEXT:    uaddw v0.2d, v0.2d, v2.2s
-; CHECK-NEXT:    uaddw2 v0.2d, v0.2d, v2.4s
-; CHECK-NEXT:    uaddw v0.2d, v0.2d, v1.2s
-; CHECK-NEXT:    uaddw2 v0.2d, v0.2d, v1.4s
-; CHECK-NEXT:    ret
+; CHECK-COMMON-LABEL: udot_16to64:
+; CHECK-COMMON:       // %bb.0: // %entry
+; CHECK-COMMON-NEXT:    ushll v2.4s, v1.4h, #0
+; CHECK-COMMON-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-COMMON-NEXT:    uaddw v0.2d, v0.2d, v2.2s
+; CHECK-COMMON-NEXT:    uaddw2 v0.2d, v0.2d, v2.4s
+; CHECK-COMMON-NEXT:    uaddw v0.2d, v0.2d, v1.2s
+; CHECK-COMMON-NEXT:    uaddw2 v0.2d, v0.2d, v1.4s
+; CHECK-COMMON-NEXT:    ret
 entry:
     %input.wide = zext <8 x i16> %input to <8 x i64>
     %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add(<2 x i64> %acc, <8 x i64> %input.wide)

diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll
index af813ff16a202..33d5ac4cd299e 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mattr=+dotprod,+i8mm -aarch64-enable-partial-reduce-nodes=true < %s | FileCheck %s --check-prefixes=COMMON,NEON
-; RUN: llc -mattr=+sve,+dotprod,+i8mm -aarch64-enable-partial-reduce-nodes=true < %s | FileCheck %s --check-prefixes=COMMON,SVE
-; RUN: llc -mattr=+sme,+i8mm -aarch64-enable-partial-reduce-nodes=true -force-streaming < %s | FileCheck %s --check-prefix=SME
+; RUN: llc -mattr=+dotprod,+i8mm < %s | FileCheck %s --check-prefixes=COMMON,NEON
+; RUN: llc -mattr=+sve,+dotprod,+i8mm < %s | FileCheck %s --check-prefixes=COMMON,SVE
+; RUN: llc -mattr=+sme,+i8mm -force-streaming < %s | FileCheck %s --check-prefix=SME
 
 target triple = "aarch64"
 

diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
index 221a15e5c8fe6..b2cde51e99619 100644
--- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
@@ -1,20 +1,23 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=aarch64 -mattr=+sve2,+i8mm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-I8MM
-; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NOI8MM
-; RUN: llc -mtriple=aarch64 -mattr=+sve,+i8mm -aarch64-enable-partial-reduce-nodes %s -o - | FileCheck %s --check-prefixes=CHECK-NEWLOWERING,CHECK-NEWLOWERING-SVE
-; RUN: llc -mtriple=aarch64 -mattr=+sve2,+i8mm -aarch64-enable-partial-reduce-nodes %s -o - | FileCheck %s --check-prefixes=CHECK-NEWLOWERING,CHECK-NEWLOWERING-SVE2
-; RUN: llc -mtriple=aarch64 -mattr=+sve,+sme,+i8mm -force-streaming -aarch64-enable-partial-reduce-nodes %s -o - | FileCheck %s --check-prefixes=CHECK-NEWLOWERING,CHECK-NEWLOWERING-SME
+; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s --check-prefix=CHECK-SVE2
+; RUN: llc -mtriple=aarch64 -mattr=+sve2,+i8mm %s -o - | FileCheck %s --check-prefix=CHECK-SVE2-I8MM
+; RUN: llc -mtriple=aarch64 -mattr=+sve2,+sme,+i8mm -force-streaming %s -o - | FileCheck %s --check-prefix=CHECK-SME
 
 define <vscale x 4 x i32> @udot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
-; CHECK-LABEL: udot:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    udot z0.s, z1.b, z2.b
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: udot:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    udot z0.s, z1.b, z2.b
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: udot:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    udot z0.s, z1.b, z2.b
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: udot:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    udot z0.s, z1.b, z2.b
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: udot:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    udot z0.s, z1.b, z2.b
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
   %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i32>
@@ -24,15 +27,20 @@ entry:
 }
 
 define <vscale x 2 x i64> @udot_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
-; CHECK-LABEL: udot_wide:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    udot z0.d, z1.h, z2.h
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: udot_wide:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    udot z0.d, z1.h, z2.h
+; CHECK-SVE2-NEXT:    ret
+;
+; CHECK-SVE2-I8MM-LABEL: udot_wide:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    udot z0.d, z1.h, z2.h
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: udot_wide:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    udot z0.d, z1.h, z2.h
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SME-LABEL: udot_wide:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    udot z0.d, z1.h, z2.h
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
   %b.wide = zext <vscale x 8 x i16> %b to <vscale x 8 x i64>
@@ -42,15 +50,20 @@ entry:
 }
 
 define <vscale x 4 x i32> @sdot(<vscale x 4 x i32> %accc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
-; CHECK-LABEL: sdot:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sdot z0.s, z1.b, z2.b
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: sdot:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    sdot z0.s, z1.b, z2.b
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: sdot:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    sdot z0.s, z1.b, z2.b
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: sdot:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    sdot z0.s, z1.b, z2.b
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: sdot:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    sdot z0.s, z1.b, z2.b
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
   %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i32>
@@ -60,15 +73,20 @@ entry:
 }
 
 define <vscale x 2 x i64> @sdot_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
-; CHECK-LABEL: sdot_wide:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sdot z0.d, z1.h, z2.h
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: sdot_wide:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    sdot z0.d, z1.h, z2.h
+; CHECK-SVE2-NEXT:    ret
+;
+; CHECK-SVE2-I8MM-LABEL: sdot_wide:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    sdot z0.d, z1.h, z2.h
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: sdot_wide:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    sdot z0.d, z1.h, z2.h
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SME-LABEL: sdot_wide:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    sdot z0.d, z1.h, z2.h
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
   %b.wide = sext <vscale x 8 x i16> %b to <vscale x 8 x i64>
@@ -78,36 +96,36 @@ entry:
 }
 
 define <vscale x 4 x i32> @usdot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
-; CHECK-I8MM-LABEL: usdot:
-; CHECK-I8MM:       // %bb.0: // %entry
-; CHECK-I8MM-NEXT:    usdot z0.s, z1.b, z2.b
-; CHECK-I8MM-NEXT:    ret
+; CHECK-SVE2-LABEL: usdot:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    uunpklo z3.h, z1.b
+; CHECK-SVE2-NEXT:    sunpklo z4.h, z2.b
+; CHECK-SVE2-NEXT:    ptrue p0.s
+; CHECK-SVE2-NEXT:    uunpkhi z1.h, z1.b
+; CHECK-SVE2-NEXT:    sunpkhi z2.h, z2.b
+; CHECK-SVE2-NEXT:    uunpklo z5.s, z3.h
+; CHECK-SVE2-NEXT:    sunpklo z6.s, z4.h
+; CHECK-SVE2-NEXT:    uunpkhi z3.s, z3.h
+; CHECK-SVE2-NEXT:    sunpkhi z4.s, z4.h
+; CHECK-SVE2-NEXT:    mla z0.s, p0/m, z5.s, z6.s
+; CHECK-SVE2-NEXT:    uunpklo z5.s, z1.h
+; CHECK-SVE2-NEXT:    sunpklo z6.s, z2.h
+; CHECK-SVE2-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SVE2-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SVE2-NEXT:    mla z0.s, p0/m, z3.s, z4.s
+; CHECK-SVE2-NEXT:    mla z0.s, p0/m, z5.s, z6.s
+; CHECK-SVE2-NEXT:    mla z0.s, p0/m, z1.s, z2.s
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NOI8MM-LABEL: usdot:
-; CHECK-NOI8MM:       // %bb.0: // %entry
-; CHECK-NOI8MM-NEXT:    uunpklo z3.h, z1.b
-; CHECK-NOI8MM-NEXT:    sunpklo z4.h, z2.b
-; CHECK-NOI8MM-NEXT:    ptrue p0.s
-; CHECK-NOI8MM-NEXT:    uunpkhi z1.h, z1.b
-; CHECK-NOI8MM-NEXT:    sunpkhi z2.h, z2.b
-; CHECK-NOI8MM-NEXT:    uunpklo z5.s, z3.h
-; CHECK-NOI8MM-NEXT:    sunpklo z6.s, z4.h
-; CHECK-NOI8MM-NEXT:    uunpkhi z3.s, z3.h
-; CHECK-NOI8MM-NEXT:    sunpkhi z4.s, z4.h
-; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z5.s, z6.s
-; CHECK-NOI8MM-NEXT:    uunpklo z5.s, z1.h
-; CHECK-NOI8MM-NEXT:    sunpklo z6.s, z2.h
-; CHECK-NOI8MM-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NOI8MM-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z3.s, z4.s
-; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z5.s, z6.s
-; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z1.s, z2.s
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: usdot:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    usdot z0.s, z1.b, z2.b
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: usdot:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    usdot z0.s, z1.b, z2.b
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SME-LABEL: usdot:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    usdot z0.s, z1.b, z2.b
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
   %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i32>
@@ -117,36 +135,36 @@ entry:
 }
 
 define <vscale x 4 x i32> @sudot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
-; CHECK-I8MM-LABEL: sudot:
-; CHECK-I8MM:       // %bb.0: // %entry
-; CHECK-I8MM-NEXT:    usdot z0.s, z2.b, z1.b
-; CHECK-I8MM-NEXT:    ret
+; CHECK-SVE2-LABEL: sudot:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    sunpklo z3.h, z1.b
+; CHECK-SVE2-NEXT:    uunpklo z4.h, z2.b
+; CHECK-SVE2-NEXT:    ptrue p0.s
+; CHECK-SVE2-NEXT:    sunpkhi z1.h, z1.b
+; CHECK-SVE2-NEXT:    uunpkhi z2.h, z2.b
+; CHECK-SVE2-NEXT:    sunpklo z5.s, z3.h
+; CHECK-SVE2-NEXT:    uunpklo z6.s, z4.h
+; CHECK-SVE2-NEXT:    sunpkhi z3.s, z3.h
+; CHECK-SVE2-NEXT:    uunpkhi z4.s, z4.h
+; CHECK-SVE2-NEXT:    mla z0.s, p0/m, z5.s, z6.s
+; CHECK-SVE2-NEXT:    sunpklo z5.s, z1.h
+; CHECK-SVE2-NEXT:    uunpklo z6.s, z2.h
+; CHECK-SVE2-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SVE2-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SVE2-NEXT:    mla z0.s, p0/m, z3.s, z4.s
+; CHECK-SVE2-NEXT:    mla z0.s, p0/m, z5.s, z6.s
+; CHECK-SVE2-NEXT:    mla z0.s, p0/m, z1.s, z2.s
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NOI8MM-LABEL: sudot:
-; CHECK-NOI8MM:       // %bb.0: // %entry
-; CHECK-NOI8MM-NEXT:    sunpklo z3.h, z1.b
-; CHECK-NOI8MM-NEXT:    uunpklo z4.h, z2.b
-; CHECK-NOI8MM-NEXT:    ptrue p0.s
-; CHECK-NOI8MM-NEXT:    sunpkhi z1.h, z1.b
-; CHECK-NOI8MM-NEXT:    uunpkhi z2.h, z2.b
-; CHECK-NOI8MM-NEXT:    sunpklo z5.s, z3.h
-; CHECK-NOI8MM-NEXT:    uunpklo z6.s, z4.h
-; CHECK-NOI8MM-NEXT:    sunpkhi z3.s, z3.h
-; CHECK-NOI8MM-NEXT:    uunpkhi z4.s, z4.h
-; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z5.s, z6.s
-; CHECK-NOI8MM-NEXT:    sunpklo z5.s, z1.h
-; CHECK-NOI8MM-NEXT:    uunpklo z6.s, z2.h
-; CHECK-NOI8MM-NEXT:    sunpkhi z1.s, z1.h
-; CHECK-NOI8MM-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z3.s, z4.s
-; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z5.s, z6.s
-; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z1.s, z2.s
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: sudot:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    usdot z0.s, z2.b, z1.b
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: sudot:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    usdot z0.s, z2.b, z1.b
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SME-LABEL: sudot:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    usdot z0.s, z2.b, z1.b
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
   %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i32>
@@ -156,41 +174,29 @@ entry:
 }
 
 define <vscale x 4 x i64> @udot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
-; CHECK-LABEL: udot_8to64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEXT:    udot z4.s, z2.b, z3.b
-; CHECK-NEXT:    sunpklo z2.d, z4.s
-; CHECK-NEXT:    sunpkhi z3.d, z4.s
-; CHECK-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEXT:    add z1.d, z1.d, z3.d
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: udot_8to64:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-SVE2-NEXT:    udot z4.s, z2.b, z3.b
+; CHECK-SVE2-NEXT:    uaddwb z0.d, z0.d, z4.s
+; CHECK-SVE2-NEXT:    uaddwt z0.d, z0.d, z4.s
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: udot_8to64:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-SVE-NEXT:    udot z4.s, z2.b, z3.b
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z2.d, z4.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z3.d, z4.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z3.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: udot_8to64:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-SVE2-I8MM-NEXT:    udot z4.s, z2.b, z3.b
+; CHECK-SVE2-I8MM-NEXT:    uaddwb z0.d, z0.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    uaddwt z0.d, z0.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE2-LABEL: udot_8to64:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-SVE2-NEXT:    udot z4.s, z2.b, z3.b
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwb z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwt z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SME-LABEL: udot_8to64:
-; CHECK-NEWLOWERING-SME:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SME-NEXT:    mov z4.s, #0 // =0x0
-; CHECK-NEWLOWERING-SME-NEXT:    udot z4.s, z2.b, z3.b
-; CHECK-NEWLOWERING-SME-NEXT:    uaddwb z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SME-NEXT:    uaddwt z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SME-NEXT:    ret
+; CHECK-SME-LABEL: udot_8to64:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    mov z4.s, #0 // =0x0
+; CHECK-SME-NEXT:    udot z4.s, z2.b, z3.b
+; CHECK-SME-NEXT:    uaddwb z0.d, z0.d, z4.s
+; CHECK-SME-NEXT:    uaddwt z0.d, z0.d, z4.s
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
   %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i64>
@@ -201,41 +207,29 @@ entry:
 }
 
 define <vscale x 4 x i64> @sdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b){
-; CHECK-LABEL: sdot_8to64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEXT:    sdot z4.s, z2.b, z3.b
-; CHECK-NEXT:    sunpklo z2.d, z4.s
-; CHECK-NEXT:    sunpkhi z3.d, z4.s
-; CHECK-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEXT:    add z1.d, z1.d, z3.d
-; CHECK-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE-LABEL: sdot_8to64:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-SVE-NEXT:    sdot z4.s, z2.b, z3.b
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpklo z2.d, z4.s
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpkhi z3.d, z4.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z3.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
+; CHECK-SVE2-LABEL: sdot_8to64:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-SVE2-NEXT:    sdot z4.s, z2.b, z3.b
+; CHECK-SVE2-NEXT:    saddwb z0.d, z0.d, z4.s
+; CHECK-SVE2-NEXT:    saddwt z0.d, z0.d, z4.s
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE2-LABEL: sdot_8to64:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-SVE2-NEXT:    sdot z4.s, z2.b, z3.b
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwb z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwt z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: sdot_8to64:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-SVE2-I8MM-NEXT:    sdot z4.s, z2.b, z3.b
+; CHECK-SVE2-I8MM-NEXT:    saddwb z0.d, z0.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    saddwt z0.d, z0.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SME-LABEL: sdot_8to64:
-; CHECK-NEWLOWERING-SME:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SME-NEXT:    mov z4.s, #0 // =0x0
-; CHECK-NEWLOWERING-SME-NEXT:    sdot z4.s, z2.b, z3.b
-; CHECK-NEWLOWERING-SME-NEXT:    saddwb z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SME-NEXT:    saddwt z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SME-NEXT:    ret
+; CHECK-SME-LABEL: sdot_8to64:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    mov z4.s, #0 // =0x0
+; CHECK-SME-NEXT:    sdot z4.s, z2.b, z3.b
+; CHECK-SME-NEXT:    saddwb z0.d, z0.d, z4.s
+; CHECK-SME-NEXT:    saddwt z0.d, z0.d, z4.s
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
   %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i64>
@@ -246,82 +240,62 @@ entry:
 }
 
 define <vscale x 4 x i64> @usdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b){
-; CHECK-I8MM-LABEL: usdot_8to64:
-; CHECK-I8MM:       // %bb.0: // %entry
-; CHECK-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    usdot z4.s, z2.b, z3.b
-; CHECK-I8MM-NEXT:    sunpklo z2.d, z4.s
-; CHECK-I8MM-NEXT:    sunpkhi z3.d, z4.s
-; CHECK-I8MM-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-I8MM-NEXT:    add z1.d, z1.d, z3.d
-; CHECK-I8MM-NEXT:    ret
-;
-; CHECK-NOI8MM-LABEL: usdot_8to64:
-; CHECK-NOI8MM:       // %bb.0: // %entry
-; CHECK-NOI8MM-NEXT:    uunpkhi z4.h, z2.b
-; CHECK-NOI8MM-NEXT:    uunpklo z2.h, z2.b
-; CHECK-NOI8MM-NEXT:    sunpkhi z5.h, z3.b
-; CHECK-NOI8MM-NEXT:    sunpklo z3.h, z3.b
-; CHECK-NOI8MM-NEXT:    ptrue p0.d
-; CHECK-NOI8MM-NEXT:    uunpklo z6.s, z4.h
-; CHECK-NOI8MM-NEXT:    uunpklo z7.s, z2.h
-; CHECK-NOI8MM-NEXT:    sunpklo z24.s, z5.h
-; CHECK-NOI8MM-NEXT:    sunpklo z25.s, z3.h
-; CHECK-NOI8MM-NEXT:    uunpkhi z4.s, z4.h
-; CHECK-NOI8MM-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NOI8MM-NEXT:    sunpkhi z5.s, z5.h
-; CHECK-NOI8MM-NEXT:    sunpkhi z3.s, z3.h
-; CHECK-NOI8MM-NEXT:    uunpklo z26.d, z6.s
-; CHECK-NOI8MM-NEXT:    uunpklo z27.d, z7.s
-; CHECK-NOI8MM-NEXT:    sunpklo z28.d, z24.s
-; CHECK-NOI8MM-NEXT:    sunpklo z29.d, z25.s
-; CHECK-NOI8MM-NEXT:    uunpkhi z6.d, z6.s
-; CHECK-NOI8MM-NEXT:    uunpkhi z7.d, z7.s
-; CHECK-NOI8MM-NEXT:    sunpkhi z24.d, z24.s
-; CHECK-NOI8MM-NEXT:    sunpkhi z25.d, z25.s
-; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z26.d, z28.d
-; CHECK-NOI8MM-NEXT:    uunpklo z26.d, z4.s
-; CHECK-NOI8MM-NEXT:    sunpklo z28.d, z5.s
-; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z27.d, z29.d
-; CHECK-NOI8MM-NEXT:    uunpklo z27.d, z2.s
-; CHECK-NOI8MM-NEXT:    sunpklo z29.d, z3.s
-; CHECK-NOI8MM-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NOI8MM-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NOI8MM-NEXT:    sunpkhi z5.d, z5.s
-; CHECK-NOI8MM-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z6.d, z24.d
-; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z7.d, z25.d
-; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z26.d, z28.d
-; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z27.d, z29.d
-; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z4.d, z5.d
-; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z2.d, z3.d
-; CHECK-NOI8MM-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE-LABEL: usdot_8to64:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-SVE-NEXT:    usdot z4.s, z2.b, z3.b
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpklo z2.d, z4.s
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpkhi z3.d, z4.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z3.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
+; CHECK-SVE2-LABEL: usdot_8to64:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    uunpkhi z4.h, z2.b
+; CHECK-SVE2-NEXT:    uunpklo z2.h, z2.b
+; CHECK-SVE2-NEXT:    sunpkhi z5.h, z3.b
+; CHECK-SVE2-NEXT:    sunpklo z3.h, z3.b
+; CHECK-SVE2-NEXT:    ptrue p0.d
+; CHECK-SVE2-NEXT:    uunpklo z6.s, z4.h
+; CHECK-SVE2-NEXT:    uunpklo z7.s, z2.h
+; CHECK-SVE2-NEXT:    sunpklo z24.s, z5.h
+; CHECK-SVE2-NEXT:    sunpklo z25.s, z3.h
+; CHECK-SVE2-NEXT:    uunpkhi z4.s, z4.h
+; CHECK-SVE2-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SVE2-NEXT:    sunpkhi z5.s, z5.h
+; CHECK-SVE2-NEXT:    sunpkhi z3.s, z3.h
+; CHECK-SVE2-NEXT:    uunpklo z26.d, z6.s
+; CHECK-SVE2-NEXT:    uunpklo z27.d, z7.s
+; CHECK-SVE2-NEXT:    sunpklo z28.d, z24.s
+; CHECK-SVE2-NEXT:    sunpklo z29.d, z25.s
+; CHECK-SVE2-NEXT:    uunpkhi z6.d, z6.s
+; CHECK-SVE2-NEXT:    uunpkhi z7.d, z7.s
+; CHECK-SVE2-NEXT:    sunpkhi z24.d, z24.s
+; CHECK-SVE2-NEXT:    sunpkhi z25.d, z25.s
+; CHECK-SVE2-NEXT:    mla z1.d, p0/m, z26.d, z28.d
+; CHECK-SVE2-NEXT:    uunpklo z26.d, z4.s
+; CHECK-SVE2-NEXT:    sunpklo z28.d, z5.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z27.d, z29.d
+; CHECK-SVE2-NEXT:    uunpklo z27.d, z2.s
+; CHECK-SVE2-NEXT:    sunpklo z29.d, z3.s
+; CHECK-SVE2-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-SVE2-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-SVE2-NEXT:    sunpkhi z5.d, z5.s
+; CHECK-SVE2-NEXT:    sunpkhi z3.d, z3.s
+; CHECK-SVE2-NEXT:    mla z1.d, p0/m, z6.d, z24.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z7.d, z25.d
+; CHECK-SVE2-NEXT:    mla z1.d, p0/m, z26.d, z28.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z27.d, z29.d
+; CHECK-SVE2-NEXT:    mla z1.d, p0/m, z4.d, z5.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z2.d, z3.d
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE2-LABEL: usdot_8to64:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-SVE2-NEXT:    usdot z4.s, z2.b, z3.b
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwb z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwt z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: usdot_8to64:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-SVE2-I8MM-NEXT:    usdot z4.s, z2.b, z3.b
+; CHECK-SVE2-I8MM-NEXT:    saddwb z0.d, z0.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    saddwt z0.d, z0.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SME-LABEL: usdot_8to64:
-; CHECK-NEWLOWERING-SME:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SME-NEXT:    mov z4.s, #0 // =0x0
-; CHECK-NEWLOWERING-SME-NEXT:    usdot z4.s, z2.b, z3.b
-; CHECK-NEWLOWERING-SME-NEXT:    saddwb z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SME-NEXT:    saddwt z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SME-NEXT:    ret
+; CHECK-SME-LABEL: usdot_8to64:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    mov z4.s, #0 // =0x0
+; CHECK-SME-NEXT:    usdot z4.s, z2.b, z3.b
+; CHECK-SME-NEXT:    saddwb z0.d, z0.d, z4.s
+; CHECK-SME-NEXT:    saddwt z0.d, z0.d, z4.s
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
   %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i64>
@@ -332,82 +306,62 @@ entry:
 }
 
 define <vscale x 4 x i64> @sudot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
-; CHECK-I8MM-LABEL: sudot_8to64:
-; CHECK-I8MM:       // %bb.0: // %entry
-; CHECK-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    usdot z4.s, z3.b, z2.b
-; CHECK-I8MM-NEXT:    sunpklo z2.d, z4.s
-; CHECK-I8MM-NEXT:    sunpkhi z3.d, z4.s
-; CHECK-I8MM-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-I8MM-NEXT:    add z1.d, z1.d, z3.d
-; CHECK-I8MM-NEXT:    ret
+; CHECK-SVE2-LABEL: sudot_8to64:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    sunpkhi z4.h, z2.b
+; CHECK-SVE2-NEXT:    sunpklo z2.h, z2.b
+; CHECK-SVE2-NEXT:    uunpkhi z5.h, z3.b
+; CHECK-SVE2-NEXT:    uunpklo z3.h, z3.b
+; CHECK-SVE2-NEXT:    ptrue p0.d
+; CHECK-SVE2-NEXT:    sunpklo z6.s, z4.h
+; CHECK-SVE2-NEXT:    sunpklo z7.s, z2.h
+; CHECK-SVE2-NEXT:    uunpklo z24.s, z5.h
+; CHECK-SVE2-NEXT:    uunpklo z25.s, z3.h
+; CHECK-SVE2-NEXT:    sunpkhi z4.s, z4.h
+; CHECK-SVE2-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SVE2-NEXT:    uunpkhi z5.s, z5.h
+; CHECK-SVE2-NEXT:    uunpkhi z3.s, z3.h
+; CHECK-SVE2-NEXT:    sunpklo z26.d, z6.s
+; CHECK-SVE2-NEXT:    sunpklo z27.d, z7.s
+; CHECK-SVE2-NEXT:    uunpklo z28.d, z24.s
+; CHECK-SVE2-NEXT:    uunpklo z29.d, z25.s
+; CHECK-SVE2-NEXT:    sunpkhi z6.d, z6.s
+; CHECK-SVE2-NEXT:    sunpkhi z7.d, z7.s
+; CHECK-SVE2-NEXT:    uunpkhi z24.d, z24.s
+; CHECK-SVE2-NEXT:    uunpkhi z25.d, z25.s
+; CHECK-SVE2-NEXT:    mla z1.d, p0/m, z26.d, z28.d
+; CHECK-SVE2-NEXT:    sunpklo z26.d, z4.s
+; CHECK-SVE2-NEXT:    uunpklo z28.d, z5.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z27.d, z29.d
+; CHECK-SVE2-NEXT:    sunpklo z27.d, z2.s
+; CHECK-SVE2-NEXT:    uunpklo z29.d, z3.s
+; CHECK-SVE2-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-SVE2-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-SVE2-NEXT:    uunpkhi z5.d, z5.s
+; CHECK-SVE2-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-SVE2-NEXT:    mla z1.d, p0/m, z6.d, z24.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z7.d, z25.d
+; CHECK-SVE2-NEXT:    mla z1.d, p0/m, z26.d, z28.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z27.d, z29.d
+; CHECK-SVE2-NEXT:    mla z1.d, p0/m, z4.d, z5.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z2.d, z3.d
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NOI8MM-LABEL: sudot_8to64:
-; CHECK-NOI8MM:       // %bb.0: // %entry
-; CHECK-NOI8MM-NEXT:    sunpkhi z4.h, z2.b
-; CHECK-NOI8MM-NEXT:    sunpklo z2.h, z2.b
-; CHECK-NOI8MM-NEXT:    uunpkhi z5.h, z3.b
-; CHECK-NOI8MM-NEXT:    uunpklo z3.h, z3.b
-; CHECK-NOI8MM-NEXT:    ptrue p0.d
-; CHECK-NOI8MM-NEXT:    sunpklo z6.s, z4.h
-; CHECK-NOI8MM-NEXT:    sunpklo z7.s, z2.h
-; CHECK-NOI8MM-NEXT:    uunpklo z24.s, z5.h
-; CHECK-NOI8MM-NEXT:    uunpklo z25.s, z3.h
-; CHECK-NOI8MM-NEXT:    sunpkhi z4.s, z4.h
-; CHECK-NOI8MM-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NOI8MM-NEXT:    uunpkhi z5.s, z5.h
-; CHECK-NOI8MM-NEXT:    uunpkhi z3.s, z3.h
-; CHECK-NOI8MM-NEXT:    sunpklo z26.d, z6.s
-; CHECK-NOI8MM-NEXT:    sunpklo z27.d, z7.s
-; CHECK-NOI8MM-NEXT:    uunpklo z28.d, z24.s
-; CHECK-NOI8MM-NEXT:    uunpklo z29.d, z25.s
-; CHECK-NOI8MM-NEXT:    sunpkhi z6.d, z6.s
-; CHECK-NOI8MM-NEXT:    sunpkhi z7.d, z7.s
-; CHECK-NOI8MM-NEXT:    uunpkhi z24.d, z24.s
-; CHECK-NOI8MM-NEXT:    uunpkhi z25.d, z25.s
-; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z26.d, z28.d
-; CHECK-NOI8MM-NEXT:    sunpklo z26.d, z4.s
-; CHECK-NOI8MM-NEXT:    uunpklo z28.d, z5.s
-; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z27.d, z29.d
-; CHECK-NOI8MM-NEXT:    sunpklo z27.d, z2.s
-; CHECK-NOI8MM-NEXT:    uunpklo z29.d, z3.s
-; CHECK-NOI8MM-NEXT:    sunpkhi z4.d, z4.s
-; CHECK-NOI8MM-NEXT:    sunpkhi z2.d, z2.s
-; CHECK-NOI8MM-NEXT:    uunpkhi z5.d, z5.s
-; CHECK-NOI8MM-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z6.d, z24.d
-; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z7.d, z25.d
-; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z26.d, z28.d
-; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z27.d, z29.d
-; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z4.d, z5.d
-; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z2.d, z3.d
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: sudot_8to64:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-SVE2-I8MM-NEXT:    usdot z4.s, z3.b, z2.b
+; CHECK-SVE2-I8MM-NEXT:    saddwb z0.d, z0.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    saddwt z0.d, z0.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: sudot_8to64:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-SVE-NEXT:    usdot z4.s, z3.b, z2.b
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpklo z2.d, z4.s
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpkhi z3.d, z4.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z3.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE2-LABEL: sudot_8to64:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-SVE2-NEXT:    usdot z4.s, z3.b, z2.b
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwb z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwt z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SME-LABEL: sudot_8to64:
-; CHECK-NEWLOWERING-SME:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SME-NEXT:    mov z4.s, #0 // =0x0
-; CHECK-NEWLOWERING-SME-NEXT:    usdot z4.s, z3.b, z2.b
-; CHECK-NEWLOWERING-SME-NEXT:    saddwb z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SME-NEXT:    saddwt z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SME-NEXT:    ret
+; CHECK-SME-LABEL: sudot_8to64:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    mov z4.s, #0 // =0x0
+; CHECK-SME-NEXT:    usdot z4.s, z3.b, z2.b
+; CHECK-SME-NEXT:    saddwb z0.d, z0.d, z4.s
+; CHECK-SME-NEXT:    saddwt z0.d, z0.d, z4.s
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
   %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i64>
@@ -418,51 +372,69 @@ entry:
 }
 
 define <vscale x 4 x i32> @udot_no_bin_op(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a){
-; CHECK-LABEL: udot_no_bin_op:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z2.b, #1 // =0x1
-; CHECK-NEXT:    udot z0.s, z1.b, z2.b
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: udot_no_bin_op:
+; CHECK-SVE2:       // %bb.0:
+; CHECK-SVE2-NEXT:    mov z2.b, #1 // =0x1
+; CHECK-SVE2-NEXT:    udot z0.s, z1.b, z2.b
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: udot_no_bin_op:
-; CHECK-NEWLOWERING:       // %bb.0:
-; CHECK-NEWLOWERING-NEXT:    mov z2.b, #1 // =0x1
-; CHECK-NEWLOWERING-NEXT:    udot z0.s, z1.b, z2.b
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: udot_no_bin_op:
+; CHECK-SVE2-I8MM:       // %bb.0:
+; CHECK-SVE2-I8MM-NEXT:    mov z2.b, #1 // =0x1
+; CHECK-SVE2-I8MM-NEXT:    udot z0.s, z1.b, z2.b
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: udot_no_bin_op:
+; CHECK-SME:       // %bb.0:
+; CHECK-SME-NEXT:    mov z2.b, #1 // =0x1
+; CHECK-SME-NEXT:    udot z0.s, z1.b, z2.b
+; CHECK-SME-NEXT:    ret
   %a.ext = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
   %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %a.ext)
   ret <vscale x 4 x i32> %partial.reduce
 }
 
 define <vscale x 4 x i32> @sdot_no_bin_op(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a){
-; CHECK-LABEL: sdot_no_bin_op:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z2.b, #1 // =0x1
-; CHECK-NEXT:    sdot z0.s, z1.b, z2.b
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: sdot_no_bin_op:
+; CHECK-SVE2:       // %bb.0:
+; CHECK-SVE2-NEXT:    mov z2.b, #1 // =0x1
+; CHECK-SVE2-NEXT:    sdot z0.s, z1.b, z2.b
+; CHECK-SVE2-NEXT:    ret
+;
+; CHECK-SVE2-I8MM-LABEL: sdot_no_bin_op:
+; CHECK-SVE2-I8MM:       // %bb.0:
+; CHECK-SVE2-I8MM-NEXT:    mov z2.b, #1 // =0x1
+; CHECK-SVE2-I8MM-NEXT:    sdot z0.s, z1.b, z2.b
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: sdot_no_bin_op:
-; CHECK-NEWLOWERING:       // %bb.0:
-; CHECK-NEWLOWERING-NEXT:    mov z2.b, #1 // =0x1
-; CHECK-NEWLOWERING-NEXT:    sdot z0.s, z1.b, z2.b
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SME-LABEL: sdot_no_bin_op:
+; CHECK-SME:       // %bb.0:
+; CHECK-SME-NEXT:    mov z2.b, #1 // =0x1
+; CHECK-SME-NEXT:    sdot z0.s, z1.b, z2.b
+; CHECK-SME-NEXT:    ret
   %a.ext = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
   %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %a.ext)
   ret <vscale x 4 x i32> %partial.reduce
 }
 
 define <vscale x 2 x i64> @udot_no_bin_op_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b){
-; CHECK-LABEL: udot_no_bin_op_wide:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z2.h, #1 // =0x1
-; CHECK-NEXT:    udot z0.d, z1.h, z2.h
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: udot_no_bin_op_wide:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    mov z2.h, #1 // =0x1
+; CHECK-SVE2-NEXT:    udot z0.d, z1.h, z2.h
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: udot_no_bin_op_wide:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    mov z2.h, #1 // =0x1
-; CHECK-NEWLOWERING-NEXT:    udot z0.d, z1.h, z2.h
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: udot_no_bin_op_wide:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    mov z2.h, #1 // =0x1
+; CHECK-SVE2-I8MM-NEXT:    udot z0.d, z1.h, z2.h
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: udot_no_bin_op_wide:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    mov z2.h, #1 // =0x1
+; CHECK-SME-NEXT:    udot z0.d, z1.h, z2.h
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
   %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %a.wide)
@@ -470,17 +442,23 @@ entry:
 }
 
 define <vscale x 2 x i64> @sdot_no_bin_op_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b){
-; CHECK-LABEL: sdot_no_bin_op_wide:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z2.h, #1 // =0x1
-; CHECK-NEXT:    sdot z0.d, z1.h, z2.h
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: sdot_no_bin_op_wide:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    mov z2.h, #1 // =0x1
+; CHECK-SVE2-NEXT:    sdot z0.d, z1.h, z2.h
+; CHECK-SVE2-NEXT:    ret
+;
+; CHECK-SVE2-I8MM-LABEL: sdot_no_bin_op_wide:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    mov z2.h, #1 // =0x1
+; CHECK-SVE2-I8MM-NEXT:    sdot z0.d, z1.h, z2.h
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: sdot_no_bin_op_wide:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    mov z2.h, #1 // =0x1
-; CHECK-NEWLOWERING-NEXT:    sdot z0.d, z1.h, z2.h
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SME-LABEL: sdot_no_bin_op_wide:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    mov z2.h, #1 // =0x1
+; CHECK-SME-NEXT:    sdot z0.d, z1.h, z2.h
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
   %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %a.wide)
@@ -488,137 +466,93 @@ entry:
 }
 
 define <vscale x 4 x i64> @udot_no_bin_op_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a){
-; CHECK-LABEL: udot_no_bin_op_8to64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-NEXT:    mov z4.b, #1 // =0x1
-; CHECK-NEXT:    udot z3.s, z2.b, z4.b
-; CHECK-NEXT:    sunpklo z2.d, z3.s
-; CHECK-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEXT:    add z1.d, z1.d, z3.d
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: udot_no_bin_op_8to64:
+; CHECK-SVE2:       // %bb.0:
+; CHECK-SVE2-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-SVE2-NEXT:    mov z4.b, #1 // =0x1
+; CHECK-SVE2-NEXT:    udot z3.s, z2.b, z4.b
+; CHECK-SVE2-NEXT:    uaddwb z0.d, z0.d, z3.s
+; CHECK-SVE2-NEXT:    uaddwt z0.d, z0.d, z3.s
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: udot_no_bin_op_8to64:
-; CHECK-NEWLOWERING-SVE:       // %bb.0:
-; CHECK-NEWLOWERING-SVE-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-NEWLOWERING-SVE-NEXT:    mov z4.b, #1 // =0x1
-; CHECK-NEWLOWERING-SVE-NEXT:    udot z3.s, z2.b, z4.b
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z2.d, z3.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z3.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: udot_no_bin_op_8to64:
+; CHECK-SVE2-I8MM:       // %bb.0:
+; CHECK-SVE2-I8MM-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-SVE2-I8MM-NEXT:    mov z4.b, #1 // =0x1
+; CHECK-SVE2-I8MM-NEXT:    udot z3.s, z2.b, z4.b
+; CHECK-SVE2-I8MM-NEXT:    uaddwb z0.d, z0.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    uaddwt z0.d, z0.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE2-LABEL: udot_no_bin_op_8to64:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0:
-; CHECK-NEWLOWERING-SVE2-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-NEWLOWERING-SVE2-NEXT:    mov z4.b, #1 // =0x1
-; CHECK-NEWLOWERING-SVE2-NEXT:    udot z3.s, z2.b, z4.b
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwb z0.d, z0.d, z3.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwt z0.d, z0.d, z3.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SME-LABEL: udot_no_bin_op_8to64:
-; CHECK-NEWLOWERING-SME:       // %bb.0:
-; CHECK-NEWLOWERING-SME-NEXT:    mov z3.b, #1 // =0x1
-; CHECK-NEWLOWERING-SME-NEXT:    mov z4.s, #0 // =0x0
-; CHECK-NEWLOWERING-SME-NEXT:    udot z4.s, z2.b, z3.b
-; CHECK-NEWLOWERING-SME-NEXT:    uaddwb z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SME-NEXT:    uaddwt z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SME-NEXT:    ret
+; CHECK-SME-LABEL: udot_no_bin_op_8to64:
+; CHECK-SME:       // %bb.0:
+; CHECK-SME-NEXT:    mov z3.b, #1 // =0x1
+; CHECK-SME-NEXT:    mov z4.s, #0 // =0x0
+; CHECK-SME-NEXT:    udot z4.s, z2.b, z3.b
+; CHECK-SME-NEXT:    uaddwb z0.d, z0.d, z4.s
+; CHECK-SME-NEXT:    uaddwt z0.d, z0.d, z4.s
+; CHECK-SME-NEXT:    ret
   %a.ext = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
   %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(<vscale x 4 x i64> %acc, <vscale x 16 x i64> %a.ext)
   ret <vscale x 4 x i64> %partial.reduce
 }
 
 define <vscale x 4 x i64> @sdot_no_bin_op_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a){
-; CHECK-LABEL: sdot_no_bin_op_8to64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-NEXT:    mov z4.b, #1 // =0x1
-; CHECK-NEXT:    sdot z3.s, z2.b, z4.b
-; CHECK-NEXT:    sunpklo z2.d, z3.s
-; CHECK-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEXT:    add z1.d, z1.d, z3.d
-; CHECK-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE-LABEL: sdot_no_bin_op_8to64:
-; CHECK-NEWLOWERING-SVE:       // %bb.0:
-; CHECK-NEWLOWERING-SVE-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-NEWLOWERING-SVE-NEXT:    mov z4.b, #1 // =0x1
-; CHECK-NEWLOWERING-SVE-NEXT:    sdot z3.s, z2.b, z4.b
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpklo z2.d, z3.s
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z3.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
+; CHECK-SVE2-LABEL: sdot_no_bin_op_8to64:
+; CHECK-SVE2:       // %bb.0:
+; CHECK-SVE2-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-SVE2-NEXT:    mov z4.b, #1 // =0x1
+; CHECK-SVE2-NEXT:    sdot z3.s, z2.b, z4.b
+; CHECK-SVE2-NEXT:    saddwb z0.d, z0.d, z3.s
+; CHECK-SVE2-NEXT:    saddwt z0.d, z0.d, z3.s
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE2-LABEL: sdot_no_bin_op_8to64:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0:
-; CHECK-NEWLOWERING-SVE2-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-NEWLOWERING-SVE2-NEXT:    mov z4.b, #1 // =0x1
-; CHECK-NEWLOWERING-SVE2-NEXT:    sdot z3.s, z2.b, z4.b
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwb z0.d, z0.d, z3.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwt z0.d, z0.d, z3.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: sdot_no_bin_op_8to64:
+; CHECK-SVE2-I8MM:       // %bb.0:
+; CHECK-SVE2-I8MM-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-SVE2-I8MM-NEXT:    mov z4.b, #1 // =0x1
+; CHECK-SVE2-I8MM-NEXT:    sdot z3.s, z2.b, z4.b
+; CHECK-SVE2-I8MM-NEXT:    saddwb z0.d, z0.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    saddwt z0.d, z0.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SME-LABEL: sdot_no_bin_op_8to64:
-; CHECK-NEWLOWERING-SME:       // %bb.0:
-; CHECK-NEWLOWERING-SME-NEXT:    mov z3.b, #1 // =0x1
-; CHECK-NEWLOWERING-SME-NEXT:    mov z4.s, #0 // =0x0
-; CHECK-NEWLOWERING-SME-NEXT:    sdot z4.s, z2.b, z3.b
-; CHECK-NEWLOWERING-SME-NEXT:    saddwb z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SME-NEXT:    saddwt z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SME-NEXT:    ret
+; CHECK-SME-LABEL: sdot_no_bin_op_8to64:
+; CHECK-SME:       // %bb.0:
+; CHECK-SME-NEXT:    mov z3.b, #1 // =0x1
+; CHECK-SME-NEXT:    mov z4.s, #0 // =0x0
+; CHECK-SME-NEXT:    sdot z4.s, z2.b, z3.b
+; CHECK-SME-NEXT:    saddwb z0.d, z0.d, z4.s
+; CHECK-SME-NEXT:    saddwt z0.d, z0.d, z4.s
+; CHECK-SME-NEXT:    ret
   %a.ext = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
   %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(<vscale x 4 x i64> %acc, <vscale x 16 x i64> %a.ext)
   ret <vscale x 4 x i64> %partial.reduce
 }
 
 define <vscale x 4 x i32> @not_udot(<vscale x 4 x i32> %acc, <vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
-; CHECK-LABEL: not_udot:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and z1.h, z1.h, #0xff
-; CHECK-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEXT:    uunpklo z4.s, z2.h
-; CHECK-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEXT:    mla z0.s, p0/m, z3.s, z4.s
-; CHECK-NEXT:    mla z0.s, p0/m, z1.s, z2.s
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: not_udot:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SVE2-NEXT:    and z1.h, z1.h, #0xff
+; CHECK-SVE2-NEXT:    umlalb z0.s, z1.h, z2.h
+; CHECK-SVE2-NEXT:    umlalt z0.s, z1.h, z2.h
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: not_udot:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    and z1.h, z1.h, #0xff
-; CHECK-NEWLOWERING-SVE-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEWLOWERING-SVE-NEXT:    ptrue p0.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z4.s, z2.h
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-SVE-NEXT:    mla z0.s, p0/m, z3.s, z4.s
-; CHECK-NEWLOWERING-SVE-NEXT:    mla z0.s, p0/m, z1.s, z2.s
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: not_udot:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SVE2-I8MM-NEXT:    and z1.h, z1.h, #0xff
+; CHECK-SVE2-I8MM-NEXT:    umlalb z0.s, z1.h, z2.h
+; CHECK-SVE2-I8MM-NEXT:    umlalt z0.s, z1.h, z2.h
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE2-LABEL: not_udot:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEWLOWERING-SVE2-NEXT:    and z1.h, z1.h, #0xff
-; CHECK-NEWLOWERING-SVE2-NEXT:    umlalb z0.s, z1.h, z2.h
-; CHECK-NEWLOWERING-SVE2-NEXT:    umlalt z0.s, z1.h, z2.h
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SME-LABEL: not_udot:
-; CHECK-NEWLOWERING-SME:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SME-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEWLOWERING-SME-NEXT:    and z1.h, z1.h, #0xff
-; CHECK-NEWLOWERING-SME-NEXT:    umlalb z0.s, z1.h, z2.h
-; CHECK-NEWLOWERING-SME-NEXT:    umlalt z0.s, z1.h, z2.h
-; CHECK-NEWLOWERING-SME-NEXT:    ret
+; CHECK-SME-LABEL: not_udot:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SME-NEXT:    and z1.h, z1.h, #0xff
+; CHECK-SME-NEXT:    umlalb z0.s, z1.h, z2.h
+; CHECK-SME-NEXT:    umlalt z0.s, z1.h, z2.h
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 8 x i8> %a to <vscale x 8 x i32>
   %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i32>
@@ -628,47 +562,29 @@ entry:
 }
 
 define <vscale x 2 x i64> @not_udot_wide(<vscale x 2 x i64> %acc, <vscale x 4 x i16> %a, <vscale x 4 x i16> %b) {
-; CHECK-LABEL: not_udot_wide:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and z1.s, z1.s, #0xffff
-; CHECK-NEXT:    and z2.s, z2.s, #0xffff
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    uunpklo z3.d, z1.s
-; CHECK-NEXT:    uunpklo z4.d, z2.s
-; CHECK-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE-LABEL: not_udot_wide:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    and z1.s, z1.s, #0xffff
-; CHECK-NEWLOWERING-SVE-NEXT:    and z2.s, z2.s, #0xffff
-; CHECK-NEWLOWERING-SVE-NEXT:    ptrue p0.d
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z3.d, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z4.d, z2.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-SVE-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEWLOWERING-SVE-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
+; CHECK-SVE2-LABEL: not_udot_wide:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    and z2.s, z2.s, #0xffff
+; CHECK-SVE2-NEXT:    and z1.s, z1.s, #0xffff
+; CHECK-SVE2-NEXT:    umlalb z0.d, z1.s, z2.s
+; CHECK-SVE2-NEXT:    umlalt z0.d, z1.s, z2.s
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE2-LABEL: not_udot_wide:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    and z2.s, z2.s, #0xffff
-; CHECK-NEWLOWERING-SVE2-NEXT:    and z1.s, z1.s, #0xffff
-; CHECK-NEWLOWERING-SVE2-NEXT:    umlalb z0.d, z1.s, z2.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    umlalt z0.d, z1.s, z2.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: not_udot_wide:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    and z2.s, z2.s, #0xffff
+; CHECK-SVE2-I8MM-NEXT:    and z1.s, z1.s, #0xffff
+; CHECK-SVE2-I8MM-NEXT:    umlalb z0.d, z1.s, z2.s
+; CHECK-SVE2-I8MM-NEXT:    umlalt z0.d, z1.s, z2.s
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SME-LABEL: not_udot_wide:
-; CHECK-NEWLOWERING-SME:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SME-NEXT:    and z2.s, z2.s, #0xffff
-; CHECK-NEWLOWERING-SME-NEXT:    and z1.s, z1.s, #0xffff
-; CHECK-NEWLOWERING-SME-NEXT:    umlalb z0.d, z1.s, z2.s
-; CHECK-NEWLOWERING-SME-NEXT:    umlalt z0.d, z1.s, z2.s
-; CHECK-NEWLOWERING-SME-NEXT:    ret
+; CHECK-SME-LABEL: not_udot_wide:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    and z2.s, z2.s, #0xffff
+; CHECK-SME-NEXT:    and z1.s, z1.s, #0xffff
+; CHECK-SME-NEXT:    umlalb z0.d, z1.s, z2.s
+; CHECK-SME-NEXT:    umlalt z0.d, z1.s, z2.s
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 4 x i16> %a to <vscale x 4 x i64>
   %b.wide = zext <vscale x 4 x i16> %b to <vscale x 4 x i64>
@@ -678,47 +594,68 @@ entry:
 }
 
 define <vscale x 2 x i64> @not_usdot(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
-; CHECK-LABEL: not_usdot:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEXT:    sunpklo z4.s, z2.h
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEXT:    uunpklo z5.d, z3.s
-; CHECK-NEXT:    sunpklo z6.d, z4.s
-; CHECK-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEXT:    sunpkhi z4.d, z4.s
-; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    uunpklo z5.d, z1.s
-; CHECK-NEXT:    sunpklo z6.d, z2.s
-; CHECK-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEXT:    sunpkhi z2.d, z2.s
-; CHECK-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: not_usdot:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    uunpklo z3.s, z1.h
+; CHECK-SVE2-NEXT:    sunpklo z4.s, z2.h
+; CHECK-SVE2-NEXT:    ptrue p0.d
+; CHECK-SVE2-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SVE2-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SVE2-NEXT:    uunpklo z5.d, z3.s
+; CHECK-SVE2-NEXT:    sunpklo z6.d, z4.s
+; CHECK-SVE2-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-SVE2-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-NEXT:    uunpklo z5.d, z1.s
+; CHECK-SVE2-NEXT:    sunpklo z6.d, z2.s
+; CHECK-SVE2-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-SVE2-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: not_usdot:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    sunpklo z4.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    ptrue p0.d
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z5.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z6.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    uunpklo z5.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z6.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: not_usdot:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z3.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z4.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    ptrue p0.d
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z5.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z6.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z5.d, z1.s
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z6.d, z2.s
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: not_usdot:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    uunpklo z3.s, z1.h
+; CHECK-SME-NEXT:    sunpklo z4.s, z2.h
+; CHECK-SME-NEXT:    ptrue p0.d
+; CHECK-SME-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SME-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SME-NEXT:    uunpklo z5.d, z3.s
+; CHECK-SME-NEXT:    sunpklo z6.d, z4.s
+; CHECK-SME-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-SME-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SME-NEXT:    uunpklo z5.d, z1.s
+; CHECK-SME-NEXT:    sunpklo z6.d, z2.s
+; CHECK-SME-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-SME-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
   %b.wide = sext <vscale x 8 x i16> %b to <vscale x 8 x i64>
@@ -728,47 +665,68 @@ entry:
 }
 
 define <vscale x 2 x i64> @not_sudot(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
-; CHECK-LABEL: not_sudot:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sunpklo z3.s, z1.h
-; CHECK-NEXT:    uunpklo z4.s, z2.h
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sunpkhi z1.s, z1.h
-; CHECK-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEXT:    sunpklo z5.d, z3.s
-; CHECK-NEXT:    uunpklo z6.d, z4.s
-; CHECK-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    sunpklo z5.d, z1.s
-; CHECK-NEXT:    uunpklo z6.d, z2.s
-; CHECK-NEXT:    sunpkhi z1.d, z1.s
-; CHECK-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: not_sudot:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    sunpklo z3.s, z1.h
+; CHECK-SVE2-NEXT:    uunpklo z4.s, z2.h
+; CHECK-SVE2-NEXT:    ptrue p0.d
+; CHECK-SVE2-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SVE2-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SVE2-NEXT:    sunpklo z5.d, z3.s
+; CHECK-SVE2-NEXT:    uunpklo z6.d, z4.s
+; CHECK-SVE2-NEXT:    sunpkhi z3.d, z3.s
+; CHECK-SVE2-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-NEXT:    sunpklo z5.d, z1.s
+; CHECK-SVE2-NEXT:    uunpklo z6.d, z2.s
+; CHECK-SVE2-NEXT:    sunpkhi z1.d, z1.s
+; CHECK-SVE2-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SVE2-NEXT:    ret
+;
+; CHECK-SVE2-I8MM-LABEL: not_sudot:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z3.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z4.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    ptrue p0.d
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z5.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z6.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z3.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z5.d, z1.s
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z6.d, z2.s
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z1.d, z1.s
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: not_sudot:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    sunpklo z3.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z4.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    ptrue p0.d
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    sunpklo z5.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z6.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    sunpklo z5.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z6.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SME-LABEL: not_sudot:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    sunpklo z3.s, z1.h
+; CHECK-SME-NEXT:    uunpklo z4.s, z2.h
+; CHECK-SME-NEXT:    ptrue p0.d
+; CHECK-SME-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SME-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SME-NEXT:    sunpklo z5.d, z3.s
+; CHECK-SME-NEXT:    uunpklo z6.d, z4.s
+; CHECK-SME-NEXT:    sunpkhi z3.d, z3.s
+; CHECK-SME-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SME-NEXT:    sunpklo z5.d, z1.s
+; CHECK-SME-NEXT:    uunpklo z6.d, z2.s
+; CHECK-SME-NEXT:    sunpkhi z1.d, z1.s
+; CHECK-SME-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
   %b.wide = zext <vscale x 8 x i16> %b to <vscale x 8 x i64>
@@ -778,49 +736,71 @@ entry:
 }
 
 define <vscale x 2 x i64> @udot_different_types(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i8> %b){
-; CHECK-LABEL: udot_different_types:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEXT:    uunpklo z4.s, z2.h
-; CHECK-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEXT:    uunpklo z5.d, z3.s
-; CHECK-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEXT:    uunpklo z6.d, z4.s
-; CHECK-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    uunpklo z5.d, z1.s
-; CHECK-NEXT:    uunpklo z6.d, z2.s
-; CHECK-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: udot_different_types:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SVE2-NEXT:    uunpklo z3.s, z1.h
+; CHECK-SVE2-NEXT:    ptrue p0.d
+; CHECK-SVE2-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SVE2-NEXT:    uunpklo z4.s, z2.h
+; CHECK-SVE2-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SVE2-NEXT:    uunpklo z5.d, z3.s
+; CHECK-SVE2-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-SVE2-NEXT:    uunpklo z6.d, z4.s
+; CHECK-SVE2-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-NEXT:    uunpklo z5.d, z1.s
+; CHECK-SVE2-NEXT:    uunpklo z6.d, z2.s
+; CHECK-SVE2-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-SVE2-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: udot_different_types:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEWLOWERING-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    ptrue p0.d
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z4.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z5.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z6.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    uunpklo z5.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z6.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: udot_different_types:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z3.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    ptrue p0.d
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z4.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z5.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z6.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z5.d, z1.s
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z6.d, z2.s
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: udot_different_types:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SME-NEXT:    uunpklo z3.s, z1.h
+; CHECK-SME-NEXT:    ptrue p0.d
+; CHECK-SME-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SME-NEXT:    uunpklo z4.s, z2.h
+; CHECK-SME-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SME-NEXT:    uunpklo z5.d, z3.s
+; CHECK-SME-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-SME-NEXT:    uunpklo z6.d, z4.s
+; CHECK-SME-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SME-NEXT:    uunpklo z5.d, z1.s
+; CHECK-SME-NEXT:    uunpklo z6.d, z2.s
+; CHECK-SME-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-SME-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
   %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i64>
@@ -830,51 +810,74 @@ entry:
 }
 
 define <vscale x 2 x i64> @sdot_different_types(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i8> %b){
-; CHECK-LABEL: sdot_different_types:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    sunpklo z3.s, z1.h
-; CHECK-NEXT:    sunpkhi z1.s, z1.h
-; CHECK-NEXT:    sxtb z2.h, p0/m, z2.h
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sunpklo z5.d, z3.s
-; CHECK-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEXT:    sunpklo z4.s, z2.h
-; CHECK-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEXT:    sunpklo z6.d, z4.s
-; CHECK-NEXT:    sunpkhi z4.d, z4.s
-; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    sunpklo z5.d, z1.s
-; CHECK-NEXT:    sunpklo z6.d, z2.s
-; CHECK-NEXT:    sunpkhi z1.d, z1.s
-; CHECK-NEXT:    sunpkhi z2.d, z2.s
-; CHECK-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: sdot_different_types:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    ptrue p0.h
+; CHECK-SVE2-NEXT:    sunpklo z3.s, z1.h
+; CHECK-SVE2-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SVE2-NEXT:    sxtb z2.h, p0/m, z2.h
+; CHECK-SVE2-NEXT:    ptrue p0.d
+; CHECK-SVE2-NEXT:    sunpklo z5.d, z3.s
+; CHECK-SVE2-NEXT:    sunpkhi z3.d, z3.s
+; CHECK-SVE2-NEXT:    sunpklo z4.s, z2.h
+; CHECK-SVE2-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SVE2-NEXT:    sunpklo z6.d, z4.s
+; CHECK-SVE2-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-NEXT:    sunpklo z5.d, z1.s
+; CHECK-SVE2-NEXT:    sunpklo z6.d, z2.s
+; CHECK-SVE2-NEXT:    sunpkhi z1.d, z1.s
+; CHECK-SVE2-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SVE2-NEXT:    ret
+;
+; CHECK-SVE2-I8MM-LABEL: sdot_different_types:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    ptrue p0.h
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z3.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    sxtb z2.h, p0/m, z2.h
+; CHECK-SVE2-I8MM-NEXT:    ptrue p0.d
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z5.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z3.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z4.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z6.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z5.d, z1.s
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z6.d, z2.s
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z1.d, z1.s
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: sdot_different_types:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    ptrue p0.h
-; CHECK-NEWLOWERING-NEXT:    sunpklo z3.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    sxtb z2.h, p0/m, z2.h
-; CHECK-NEWLOWERING-NEXT:    ptrue p0.d
-; CHECK-NEWLOWERING-NEXT:    sunpklo z5.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z4.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    sunpklo z6.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    sunpklo z5.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z6.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SME-LABEL: sdot_different_types:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    ptrue p0.h
+; CHECK-SME-NEXT:    sunpklo z3.s, z1.h
+; CHECK-SME-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SME-NEXT:    sxtb z2.h, p0/m, z2.h
+; CHECK-SME-NEXT:    ptrue p0.d
+; CHECK-SME-NEXT:    sunpklo z5.d, z3.s
+; CHECK-SME-NEXT:    sunpkhi z3.d, z3.s
+; CHECK-SME-NEXT:    sunpklo z4.s, z2.h
+; CHECK-SME-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SME-NEXT:    sunpklo z6.d, z4.s
+; CHECK-SME-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SME-NEXT:    sunpklo z5.d, z1.s
+; CHECK-SME-NEXT:    sunpklo z6.d, z2.s
+; CHECK-SME-NEXT:    sunpkhi z1.d, z1.s
+; CHECK-SME-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
   %b.wide = sext <vscale x 8 x i8> %b to <vscale x 8 x i64>
@@ -884,51 +887,74 @@ entry:
 }
 
 define <vscale x 2 x i64> @usdot_different_types(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i8> %b){
-; CHECK-LABEL: usdot_different_types:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEXT:    sxtb z2.h, p0/m, z2.h
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    uunpklo z5.d, z3.s
-; CHECK-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEXT:    sunpklo z4.s, z2.h
-; CHECK-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEXT:    sunpklo z6.d, z4.s
-; CHECK-NEXT:    sunpkhi z4.d, z4.s
-; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    uunpklo z5.d, z1.s
-; CHECK-NEXT:    sunpklo z6.d, z2.s
-; CHECK-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEXT:    sunpkhi z2.d, z2.s
-; CHECK-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: usdot_different_types:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    ptrue p0.h
+; CHECK-SVE2-NEXT:    uunpklo z3.s, z1.h
+; CHECK-SVE2-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SVE2-NEXT:    sxtb z2.h, p0/m, z2.h
+; CHECK-SVE2-NEXT:    ptrue p0.d
+; CHECK-SVE2-NEXT:    uunpklo z5.d, z3.s
+; CHECK-SVE2-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-SVE2-NEXT:    sunpklo z4.s, z2.h
+; CHECK-SVE2-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SVE2-NEXT:    sunpklo z6.d, z4.s
+; CHECK-SVE2-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-NEXT:    uunpklo z5.d, z1.s
+; CHECK-SVE2-NEXT:    sunpklo z6.d, z2.s
+; CHECK-SVE2-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-SVE2-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: usdot_different_types:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    ptrue p0.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    sxtb z2.h, p0/m, z2.h
-; CHECK-NEWLOWERING-NEXT:    ptrue p0.d
-; CHECK-NEWLOWERING-NEXT:    uunpklo z5.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z4.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    sunpklo z6.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    uunpklo z5.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z6.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: usdot_different_types:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    ptrue p0.h
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z3.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    sxtb z2.h, p0/m, z2.h
+; CHECK-SVE2-I8MM-NEXT:    ptrue p0.d
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z5.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z4.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z6.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z5.d, z1.s
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z6.d, z2.s
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: usdot_different_types:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    ptrue p0.h
+; CHECK-SME-NEXT:    uunpklo z3.s, z1.h
+; CHECK-SME-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SME-NEXT:    sxtb z2.h, p0/m, z2.h
+; CHECK-SME-NEXT:    ptrue p0.d
+; CHECK-SME-NEXT:    uunpklo z5.d, z3.s
+; CHECK-SME-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-SME-NEXT:    sunpklo z4.s, z2.h
+; CHECK-SME-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SME-NEXT:    sunpklo z6.d, z4.s
+; CHECK-SME-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SME-NEXT:    uunpklo z5.d, z1.s
+; CHECK-SME-NEXT:    sunpklo z6.d, z2.s
+; CHECK-SME-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-SME-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
   %b.wide = sext <vscale x 8 x i8> %b to <vscale x 8 x i64>
@@ -938,49 +964,71 @@ entry:
 }
 
 define <vscale x 2 x i64> @sudot_different_types(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i8> %b){
-; CHECK-LABEL: sudot_different_types:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEXT:    sunpklo z3.s, z1.h
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sunpkhi z1.s, z1.h
-; CHECK-NEXT:    uunpklo z4.s, z2.h
-; CHECK-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEXT:    sunpklo z5.d, z3.s
-; CHECK-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEXT:    uunpklo z6.d, z4.s
-; CHECK-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    sunpklo z5.d, z1.s
-; CHECK-NEXT:    uunpklo z6.d, z2.s
-; CHECK-NEXT:    sunpkhi z1.d, z1.s
-; CHECK-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: sudot_different_types:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SVE2-NEXT:    sunpklo z3.s, z1.h
+; CHECK-SVE2-NEXT:    ptrue p0.d
+; CHECK-SVE2-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SVE2-NEXT:    uunpklo z4.s, z2.h
+; CHECK-SVE2-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SVE2-NEXT:    sunpklo z5.d, z3.s
+; CHECK-SVE2-NEXT:    sunpkhi z3.d, z3.s
+; CHECK-SVE2-NEXT:    uunpklo z6.d, z4.s
+; CHECK-SVE2-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-NEXT:    sunpklo z5.d, z1.s
+; CHECK-SVE2-NEXT:    uunpklo z6.d, z2.s
+; CHECK-SVE2-NEXT:    sunpkhi z1.d, z1.s
+; CHECK-SVE2-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SVE2-NEXT:    ret
+;
+; CHECK-SVE2-I8MM-LABEL: sudot_different_types:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z3.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    ptrue p0.d
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z4.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z5.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z3.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z6.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z5.d, z1.s
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z6.d, z2.s
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z1.d, z1.s
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: sudot_different_types:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEWLOWERING-NEXT:    sunpklo z3.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    ptrue p0.d
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z4.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    sunpklo z5.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z6.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    sunpklo z5.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z6.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SME-LABEL: sudot_different_types:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SME-NEXT:    sunpklo z3.s, z1.h
+; CHECK-SME-NEXT:    ptrue p0.d
+; CHECK-SME-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SME-NEXT:    uunpklo z4.s, z2.h
+; CHECK-SME-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SME-NEXT:    sunpklo z5.d, z3.s
+; CHECK-SME-NEXT:    sunpkhi z3.d, z3.s
+; CHECK-SME-NEXT:    uunpklo z6.d, z4.s
+; CHECK-SME-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SME-NEXT:    sunpklo z5.d, z1.s
+; CHECK-SME-NEXT:    uunpklo z6.d, z2.s
+; CHECK-SME-NEXT:    sunpkhi z1.d, z1.s
+; CHECK-SME-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
   %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i64>
@@ -990,29 +1038,26 @@ entry:
 }
 
 define <vscale x 2 x i16> @udot_nxv8i8_promote (<vscale x 2 x i16> %acc, <vscale x 8 x i8> %a, <vscale x 8 x i8> %b){
-; CHECK-LABEL: udot_nxv8i8_promote:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and z1.h, z1.h, #0xff
-; CHECK-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEXT:    mul z1.h, z1.h, z2.h
-; CHECK-NEXT:    uunpklo z2.s, z1.h
-; CHECK-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEXT:    uunpklo z3.d, z2.s
-; CHECK-NEXT:    uunpklo z4.d, z1.s
-; CHECK-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEXT:    add z0.d, z0.d, z3.d
-; CHECK-NEXT:    add z2.d, z2.d, z4.d
-; CHECK-NEXT:    add z0.d, z1.d, z0.d
-; CHECK-NEXT:    add z0.d, z2.d, z0.d
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: udot_nxv8i8_promote:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SVE2-NEXT:    and z1.h, z1.h, #0xff
+; CHECK-SVE2-NEXT:    udot z0.d, z1.h, z2.h
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: udot_nxv8i8_promote:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEWLOWERING-NEXT:    and z1.h, z1.h, #0xff
-; CHECK-NEWLOWERING-NEXT:    udot z0.d, z1.h, z2.h
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: udot_nxv8i8_promote:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SVE2-I8MM-NEXT:    and z1.h, z1.h, #0xff
+; CHECK-SVE2-I8MM-NEXT:    udot z0.d, z1.h, z2.h
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: udot_nxv8i8_promote:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SME-NEXT:    and z1.h, z1.h, #0xff
+; CHECK-SME-NEXT:    udot z0.d, z1.h, z2.h
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 8 x i8> %a to <vscale x 8 x i16>
   %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i16>
@@ -1022,31 +1067,29 @@ entry:
 }
 
 define <vscale x 2 x i16> @sdot_nxv8i8_promote (<vscale x 2 x i16> %acc, <vscale x 8 x i8> %a, <vscale x 8 x i8> %b){
-; CHECK-LABEL: sdot_nxv8i8_promote:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    sxtb z1.h, p0/m, z1.h
-; CHECK-NEXT:    sxtb z2.h, p0/m, z2.h
-; CHECK-NEXT:    mul z1.h, z1.h, z2.h
-; CHECK-NEXT:    uunpklo z2.s, z1.h
-; CHECK-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEXT:    uunpklo z3.d, z2.s
-; CHECK-NEXT:    uunpklo z4.d, z1.s
-; CHECK-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEXT:    add z0.d, z0.d, z3.d
-; CHECK-NEXT:    add z2.d, z2.d, z4.d
-; CHECK-NEXT:    add z0.d, z1.d, z0.d
-; CHECK-NEXT:    add z0.d, z2.d, z0.d
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: sdot_nxv8i8_promote:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    ptrue p0.h
+; CHECK-SVE2-NEXT:    sxtb z2.h, p0/m, z2.h
+; CHECK-SVE2-NEXT:    sxtb z1.h, p0/m, z1.h
+; CHECK-SVE2-NEXT:    sdot z0.d, z1.h, z2.h
+; CHECK-SVE2-NEXT:    ret
+;
+; CHECK-SVE2-I8MM-LABEL: sdot_nxv8i8_promote:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    ptrue p0.h
+; CHECK-SVE2-I8MM-NEXT:    sxtb z2.h, p0/m, z2.h
+; CHECK-SVE2-I8MM-NEXT:    sxtb z1.h, p0/m, z1.h
+; CHECK-SVE2-I8MM-NEXT:    sdot z0.d, z1.h, z2.h
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: sdot_nxv8i8_promote:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    ptrue p0.h
-; CHECK-NEWLOWERING-NEXT:    sxtb z2.h, p0/m, z2.h
-; CHECK-NEWLOWERING-NEXT:    sxtb z1.h, p0/m, z1.h
-; CHECK-NEWLOWERING-NEXT:    sdot z0.d, z1.h, z2.h
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SME-LABEL: sdot_nxv8i8_promote:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    ptrue p0.h
+; CHECK-SME-NEXT:    sxtb z2.h, p0/m, z2.h
+; CHECK-SME-NEXT:    sxtb z1.h, p0/m, z1.h
+; CHECK-SME-NEXT:    sdot z0.d, z1.h, z2.h
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 8 x i8> %a to <vscale x 8 x i16>
   %b.wide = sext <vscale x 8 x i8> %b to <vscale x 8 x i16>
@@ -1056,35 +1099,26 @@ entry:
 }
 
 define <vscale x 4 x i64> @partial_reduce_only_split_acc(<vscale x 4 x i64> %acc, <vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
-; CHECK-LABEL: partial_reduce_only_split_acc:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEXT:    and z3.h, z3.h, #0xff
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    uunpkhi z4.s, z2.h
-; CHECK-NEXT:    uunpklo z2.s, z2.h
-; CHECK-NEXT:    uunpkhi z5.s, z3.h
-; CHECK-NEXT:    uunpklo z3.s, z3.h
-; CHECK-NEXT:    uunpklo z6.d, z4.s
-; CHECK-NEXT:    uunpklo z7.d, z2.s
-; CHECK-NEXT:    uunpklo z24.d, z5.s
-; CHECK-NEXT:    uunpklo z25.d, z3.s
-; CHECK-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEXT:    uunpkhi z5.d, z5.s
-; CHECK-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEXT:    mla z1.d, p0/m, z6.d, z24.d
-; CHECK-NEXT:    mla z0.d, p0/m, z7.d, z25.d
-; CHECK-NEXT:    mla z1.d, p0/m, z4.d, z5.d
-; CHECK-NEXT:    mla z0.d, p0/m, z2.d, z3.d
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: partial_reduce_only_split_acc:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    and z3.h, z3.h, #0xff
+; CHECK-SVE2-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SVE2-NEXT:    udot z0.d, z2.h, z3.h
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: partial_reduce_only_split_acc:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    and z3.h, z3.h, #0xff
-; CHECK-NEWLOWERING-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEWLOWERING-NEXT:    udot z0.d, z2.h, z3.h
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: partial_reduce_only_split_acc:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    and z3.h, z3.h, #0xff
+; CHECK-SVE2-I8MM-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SVE2-I8MM-NEXT:    udot z0.d, z2.h, z3.h
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: partial_reduce_only_split_acc:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    and z3.h, z3.h, #0xff
+; CHECK-SME-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SME-NEXT:    udot z0.d, z2.h, z3.h
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 8 x i8> %a to <vscale x 8 x i64>
   %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i64>
@@ -1095,25 +1129,23 @@ entry:
 }
 
 define <vscale x 4 x i32> @sdot_imm(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a) {
-; CHECK-LABEL: sdot_imm:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sunpklo z2.h, z1.b
-; CHECK-NEXT:    sunpkhi z1.h, z1.b
-; CHECK-NEXT:    sunpklo z3.s, z2.h
-; CHECK-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEXT:    sub z0.s, z0.s, z3.s
-; CHECK-NEXT:    sunpklo z3.s, z1.h
-; CHECK-NEXT:    sunpkhi z1.s, z1.h
-; CHECK-NEXT:    sub z0.s, z0.s, z2.s
-; CHECK-NEXT:    sub z0.s, z0.s, z3.s
-; CHECK-NEXT:    sub z0.s, z0.s, z1.s
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: sdot_imm:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    mov z2.b, #-1 // =0xffffffffffffffff
+; CHECK-SVE2-NEXT:    sdot z0.s, z1.b, z2.b
+; CHECK-SVE2-NEXT:    ret
+;
+; CHECK-SVE2-I8MM-LABEL: sdot_imm:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    mov z2.b, #-1 // =0xffffffffffffffff
+; CHECK-SVE2-I8MM-NEXT:    sdot z0.s, z1.b, z2.b
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: sdot_imm:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    mov z2.b, #-1 // =0xffffffffffffffff
-; CHECK-NEWLOWERING-NEXT:    sdot z0.s, z1.b, z2.b
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SME-LABEL: sdot_imm:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    mov z2.b, #-1 // =0xffffffffffffffff
+; CHECK-SME-NEXT:    sdot z0.s, z1.b, z2.b
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
   %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, splat(i32 -1)
@@ -1122,41 +1154,59 @@ entry:
 }
 
 define <vscale x 4 x i32> @sdot_imm_does_not_fit(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a) {
-; CHECK-LABEL: sdot_imm_does_not_fit:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sunpklo z2.h, z1.b
-; CHECK-NEXT:    sunpkhi z1.h, z1.b
-; CHECK-NEXT:    sunpklo z3.s, z2.h
-; CHECK-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEXT:    sunpklo z4.s, z1.h
-; CHECK-NEXT:    sunpkhi z1.s, z1.h
-; CHECK-NEXT:    lsl z4.s, z4.s, #8
-; CHECK-NEXT:    lsl z2.s, z2.s, #8
-; CHECK-NEXT:    lsl z3.s, z3.s, #8
-; CHECK-NEXT:    lsl z1.s, z1.s, #8
-; CHECK-NEXT:    add z0.s, z0.s, z3.s
-; CHECK-NEXT:    add z2.s, z2.s, z4.s
-; CHECK-NEXT:    add z0.s, z0.s, z2.s
-; CHECK-NEXT:    add z0.s, z0.s, z1.s
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: sdot_imm_does_not_fit:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    sunpklo z2.h, z1.b
+; CHECK-SVE2-NEXT:    sunpkhi z1.h, z1.b
+; CHECK-SVE2-NEXT:    sunpklo z3.s, z2.h
+; CHECK-SVE2-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SVE2-NEXT:    sunpklo z4.s, z1.h
+; CHECK-SVE2-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SVE2-NEXT:    lsl z4.s, z4.s, #8
+; CHECK-SVE2-NEXT:    lsl z2.s, z2.s, #8
+; CHECK-SVE2-NEXT:    lsl z3.s, z3.s, #8
+; CHECK-SVE2-NEXT:    lsl z1.s, z1.s, #8
+; CHECK-SVE2-NEXT:    add z0.s, z0.s, z3.s
+; CHECK-SVE2-NEXT:    add z2.s, z2.s, z4.s
+; CHECK-SVE2-NEXT:    add z0.s, z0.s, z2.s
+; CHECK-SVE2-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: sdot_imm_does_not_fit:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    sunpklo z2.h, z1.b
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.h, z1.b
-; CHECK-NEWLOWERING-NEXT:    sunpklo z3.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    sunpklo z4.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    lsl z4.s, z4.s, #8
-; CHECK-NEWLOWERING-NEXT:    lsl z2.s, z2.s, #8
-; CHECK-NEWLOWERING-NEXT:    lsl z3.s, z3.s, #8
-; CHECK-NEWLOWERING-NEXT:    lsl z1.s, z1.s, #8
-; CHECK-NEWLOWERING-NEXT:    add z0.s, z0.s, z3.s
-; CHECK-NEWLOWERING-NEXT:    add z2.s, z2.s, z4.s
-; CHECK-NEWLOWERING-NEXT:    add z0.s, z0.s, z2.s
-; CHECK-NEWLOWERING-NEXT:    add z0.s, z0.s, z1.s
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: sdot_imm_does_not_fit:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z2.h, z1.b
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z1.h, z1.b
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z3.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z4.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    lsl z4.s, z4.s, #8
+; CHECK-SVE2-I8MM-NEXT:    lsl z2.s, z2.s, #8
+; CHECK-SVE2-I8MM-NEXT:    lsl z3.s, z3.s, #8
+; CHECK-SVE2-I8MM-NEXT:    lsl z1.s, z1.s, #8
+; CHECK-SVE2-I8MM-NEXT:    add z0.s, z0.s, z3.s
+; CHECK-SVE2-I8MM-NEXT:    add z2.s, z2.s, z4.s
+; CHECK-SVE2-I8MM-NEXT:    add z0.s, z0.s, z2.s
+; CHECK-SVE2-I8MM-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: sdot_imm_does_not_fit:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    sunpklo z2.h, z1.b
+; CHECK-SME-NEXT:    sunpkhi z1.h, z1.b
+; CHECK-SME-NEXT:    sunpklo z3.s, z2.h
+; CHECK-SME-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SME-NEXT:    sunpklo z4.s, z1.h
+; CHECK-SME-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SME-NEXT:    lsl z4.s, z4.s, #8
+; CHECK-SME-NEXT:    lsl z2.s, z2.s, #8
+; CHECK-SME-NEXT:    lsl z3.s, z3.s, #8
+; CHECK-SME-NEXT:    lsl z1.s, z1.s, #8
+; CHECK-SME-NEXT:    add z0.s, z0.s, z3.s
+; CHECK-SME-NEXT:    add z2.s, z2.s, z4.s
+; CHECK-SME-NEXT:    add z0.s, z0.s, z2.s
+; CHECK-SME-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
   %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, splat(i32 256)
@@ -1165,27 +1215,23 @@ entry:
 }
 
 define <vscale x 4 x i32> @udot_imm(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a) {
-; CHECK-LABEL: udot_imm:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uunpklo z3.h, z1.b
-; CHECK-NEXT:    mov z2.s, #255 // =0xff
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    uunpkhi z1.h, z1.b
-; CHECK-NEXT:    uunpklo z4.s, z3.h
-; CHECK-NEXT:    uunpkhi z3.s, z3.h
-; CHECK-NEXT:    mla z0.s, p0/m, z4.s, z2.s
-; CHECK-NEXT:    uunpklo z4.s, z1.h
-; CHECK-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEXT:    mla z0.s, p0/m, z3.s, z2.s
-; CHECK-NEXT:    mla z0.s, p0/m, z4.s, z2.s
-; CHECK-NEXT:    mla z0.s, p0/m, z1.s, z2.s
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: udot_imm:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    mov z2.b, #-1 // =0xffffffffffffffff
+; CHECK-SVE2-NEXT:    udot z0.s, z1.b, z2.b
+; CHECK-SVE2-NEXT:    ret
+;
+; CHECK-SVE2-I8MM-LABEL: udot_imm:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    mov z2.b, #-1 // =0xffffffffffffffff
+; CHECK-SVE2-I8MM-NEXT:    udot z0.s, z1.b, z2.b
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: udot_imm:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    mov z2.b, #-1 // =0xffffffffffffffff
-; CHECK-NEWLOWERING-NEXT:    udot z0.s, z1.b, z2.b
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SME-LABEL: udot_imm:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    mov z2.b, #-1 // =0xffffffffffffffff
+; CHECK-SME-NEXT:    udot z0.s, z1.b, z2.b
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
   %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, splat(i32 255)
@@ -1194,41 +1240,59 @@ entry:
 }
 
 define <vscale x 4 x i32> @udot_imm_does_not_fit(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a) {
-; CHECK-LABEL: udot_imm_does_not_fit:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uunpklo z2.h, z1.b
-; CHECK-NEXT:    uunpkhi z1.h, z1.b
-; CHECK-NEXT:    uunpklo z3.s, z2.h
-; CHECK-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEXT:    uunpklo z4.s, z1.h
-; CHECK-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEXT:    lsl z4.s, z4.s, #8
-; CHECK-NEXT:    lsl z2.s, z2.s, #8
-; CHECK-NEXT:    lsl z3.s, z3.s, #8
-; CHECK-NEXT:    lsl z1.s, z1.s, #8
-; CHECK-NEXT:    add z0.s, z0.s, z3.s
-; CHECK-NEXT:    add z2.s, z2.s, z4.s
-; CHECK-NEXT:    add z0.s, z0.s, z2.s
-; CHECK-NEXT:    add z0.s, z0.s, z1.s
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: udot_imm_does_not_fit:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    uunpklo z2.h, z1.b
+; CHECK-SVE2-NEXT:    uunpkhi z1.h, z1.b
+; CHECK-SVE2-NEXT:    uunpklo z3.s, z2.h
+; CHECK-SVE2-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SVE2-NEXT:    uunpklo z4.s, z1.h
+; CHECK-SVE2-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SVE2-NEXT:    lsl z4.s, z4.s, #8
+; CHECK-SVE2-NEXT:    lsl z2.s, z2.s, #8
+; CHECK-SVE2-NEXT:    lsl z3.s, z3.s, #8
+; CHECK-SVE2-NEXT:    lsl z1.s, z1.s, #8
+; CHECK-SVE2-NEXT:    add z0.s, z0.s, z3.s
+; CHECK-SVE2-NEXT:    add z2.s, z2.s, z4.s
+; CHECK-SVE2-NEXT:    add z0.s, z0.s, z2.s
+; CHECK-SVE2-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-SVE2-NEXT:    ret
+;
+; CHECK-SVE2-I8MM-LABEL: udot_imm_does_not_fit:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z2.h, z1.b
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z1.h, z1.b
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z3.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z4.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    lsl z4.s, z4.s, #8
+; CHECK-SVE2-I8MM-NEXT:    lsl z2.s, z2.s, #8
+; CHECK-SVE2-I8MM-NEXT:    lsl z3.s, z3.s, #8
+; CHECK-SVE2-I8MM-NEXT:    lsl z1.s, z1.s, #8
+; CHECK-SVE2-I8MM-NEXT:    add z0.s, z0.s, z3.s
+; CHECK-SVE2-I8MM-NEXT:    add z2.s, z2.s, z4.s
+; CHECK-SVE2-I8MM-NEXT:    add z0.s, z0.s, z2.s
+; CHECK-SVE2-I8MM-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: udot_imm_does_not_fit:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    uunpklo z2.h, z1.b
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.h, z1.b
-; CHECK-NEWLOWERING-NEXT:    uunpklo z3.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z4.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    lsl z4.s, z4.s, #8
-; CHECK-NEWLOWERING-NEXT:    lsl z2.s, z2.s, #8
-; CHECK-NEWLOWERING-NEXT:    lsl z3.s, z3.s, #8
-; CHECK-NEWLOWERING-NEXT:    lsl z1.s, z1.s, #8
-; CHECK-NEWLOWERING-NEXT:    add z0.s, z0.s, z3.s
-; CHECK-NEWLOWERING-NEXT:    add z2.s, z2.s, z4.s
-; CHECK-NEWLOWERING-NEXT:    add z0.s, z0.s, z2.s
-; CHECK-NEWLOWERING-NEXT:    add z0.s, z0.s, z1.s
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SME-LABEL: udot_imm_does_not_fit:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    uunpklo z2.h, z1.b
+; CHECK-SME-NEXT:    uunpkhi z1.h, z1.b
+; CHECK-SME-NEXT:    uunpklo z3.s, z2.h
+; CHECK-SME-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SME-NEXT:    uunpklo z4.s, z1.h
+; CHECK-SME-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SME-NEXT:    lsl z4.s, z4.s, #8
+; CHECK-SME-NEXT:    lsl z2.s, z2.s, #8
+; CHECK-SME-NEXT:    lsl z3.s, z3.s, #8
+; CHECK-SME-NEXT:    lsl z1.s, z1.s, #8
+; CHECK-SME-NEXT:    add z0.s, z0.s, z3.s
+; CHECK-SME-NEXT:    add z2.s, z2.s, z4.s
+; CHECK-SME-NEXT:    add z0.s, z0.s, z2.s
+; CHECK-SME-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
   %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, splat(i32 256)

diff  --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll
index 428dd4c3a0154..e62979d077fd2 100644
--- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll
+++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll
@@ -1,16 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s --check-prefixes=CHECK-SVE2
-; RUN: llc -mtriple=aarch64 -mattr=+sve %s -o - | FileCheck %s --check-prefixes=CHECK-SVE
-; RUN: llc -mtriple=aarch64 -mattr=+sve -aarch64-enable-partial-reduce-nodes %s -o - | FileCheck %s --check-prefixes=CHECK-NEWLOWERING-SVE
-; RUN: llc -mtriple=aarch64 -mattr=+sve2 -aarch64-enable-partial-reduce-nodes %s -o - | FileCheck %s --check-prefixes=CHECK-NEWLOWERING-SVE2
+; RUN: llc -mtriple=aarch64 -mattr=+sve %s -o - | FileCheck %s --check-prefix=CHECK-SVE
+; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s --check-prefix=CHECK-SVE2
 
 define <vscale x 2 x i64> @signed_wide_add_nxv4i32(<vscale x 2 x i64> %acc, <vscale x 4 x i32> %input){
-; CHECK-SVE2-LABEL: signed_wide_add_nxv4i32:
-; CHECK-SVE2:       // %bb.0: // %entry
-; CHECK-SVE2-NEXT:    saddwb z0.d, z0.d, z1.s
-; CHECK-SVE2-NEXT:    saddwt z0.d, z0.d, z1.s
-; CHECK-SVE2-NEXT:    ret
-;
 ; CHECK-SVE-LABEL: signed_wide_add_nxv4i32:
 ; CHECK-SVE:       // %bb.0: // %entry
 ; CHECK-SVE-NEXT:    sunpklo z2.d, z1.s
@@ -19,19 +11,11 @@ define <vscale x 2 x i64> @signed_wide_add_nxv4i32(<vscale x 2 x i64> %acc, <vsc
 ; CHECK-SVE-NEXT:    add z0.d, z0.d, z1.d
 ; CHECK-SVE-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: signed_wide_add_nxv4i32:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpklo z2.d, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z1.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE2-LABEL: signed_wide_add_nxv4i32:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwb z0.d, z0.d, z1.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwt z0.d, z0.d, z1.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-LABEL: signed_wide_add_nxv4i32:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    saddwb z0.d, z0.d, z1.s
+; CHECK-SVE2-NEXT:    saddwt z0.d, z0.d, z1.s
+; CHECK-SVE2-NEXT:    ret
 entry:
     %input.wide = sext <vscale x 4 x i32> %input to <vscale x 4 x i64>
     %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv4i64(<vscale x 2 x i64> %acc, <vscale x 4 x i64> %input.wide)
@@ -39,12 +23,6 @@ entry:
 }
 
 define <vscale x 2 x i64> @unsigned_wide_add_nxv4i32(<vscale x 2 x i64> %acc, <vscale x 4 x i32> %input){
-; CHECK-SVE2-LABEL: unsigned_wide_add_nxv4i32:
-; CHECK-SVE2:       // %bb.0: // %entry
-; CHECK-SVE2-NEXT:    uaddwb z0.d, z0.d, z1.s
-; CHECK-SVE2-NEXT:    uaddwt z0.d, z0.d, z1.s
-; CHECK-SVE2-NEXT:    ret
-;
 ; CHECK-SVE-LABEL: unsigned_wide_add_nxv4i32:
 ; CHECK-SVE:       // %bb.0: // %entry
 ; CHECK-SVE-NEXT:    uunpklo z2.d, z1.s
@@ -53,19 +31,11 @@ define <vscale x 2 x i64> @unsigned_wide_add_nxv4i32(<vscale x 2 x i64> %acc, <v
 ; CHECK-SVE-NEXT:    add z0.d, z0.d, z1.d
 ; CHECK-SVE-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: unsigned_wide_add_nxv4i32:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z2.d, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z1.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE2-LABEL: unsigned_wide_add_nxv4i32:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwb z0.d, z0.d, z1.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwt z0.d, z0.d, z1.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-LABEL: unsigned_wide_add_nxv4i32:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    uaddwb z0.d, z0.d, z1.s
+; CHECK-SVE2-NEXT:    uaddwt z0.d, z0.d, z1.s
+; CHECK-SVE2-NEXT:    ret
 entry:
     %input.wide = zext <vscale x 4 x i32> %input to <vscale x 4 x i64>
     %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv4i64(<vscale x 2 x i64> %acc, <vscale x 4 x i64> %input.wide)
@@ -73,12 +43,6 @@ entry:
 }
 
 define <vscale x 4 x i32> @signed_wide_add_nxv8i16(<vscale x 4 x i32> %acc, <vscale x 8 x i16> %input){
-; CHECK-SVE2-LABEL: signed_wide_add_nxv8i16:
-; CHECK-SVE2:       // %bb.0: // %entry
-; CHECK-SVE2-NEXT:    saddwb z0.s, z0.s, z1.h
-; CHECK-SVE2-NEXT:    saddwt z0.s, z0.s, z1.h
-; CHECK-SVE2-NEXT:    ret
-;
 ; CHECK-SVE-LABEL: signed_wide_add_nxv8i16:
 ; CHECK-SVE:       // %bb.0: // %entry
 ; CHECK-SVE-NEXT:    sunpklo z2.s, z1.h
@@ -87,19 +51,11 @@ define <vscale x 4 x i32> @signed_wide_add_nxv8i16(<vscale x 4 x i32> %acc, <vsc
 ; CHECK-SVE-NEXT:    add z0.s, z0.s, z1.s
 ; CHECK-SVE-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: signed_wide_add_nxv8i16:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpklo z2.s, z1.h
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.s, z0.s, z2.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.s, z0.s, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE2-LABEL: signed_wide_add_nxv8i16:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwb z0.s, z0.s, z1.h
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwt z0.s, z0.s, z1.h
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-LABEL: signed_wide_add_nxv8i16:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    saddwb z0.s, z0.s, z1.h
+; CHECK-SVE2-NEXT:    saddwt z0.s, z0.s, z1.h
+; CHECK-SVE2-NEXT:    ret
 entry:
     %input.wide = sext <vscale x 8 x i16> %input to <vscale x 8 x i32>
     %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv8i32(<vscale x 4 x i32> %acc, <vscale x 8 x i32> %input.wide)
@@ -107,12 +63,6 @@ entry:
 }
 
 define <vscale x 4 x i32> @unsigned_wide_add_nxv8i16(<vscale x 4 x i32> %acc, <vscale x 8 x i16> %input){
-; CHECK-SVE2-LABEL: unsigned_wide_add_nxv8i16:
-; CHECK-SVE2:       // %bb.0: // %entry
-; CHECK-SVE2-NEXT:    uaddwb z0.s, z0.s, z1.h
-; CHECK-SVE2-NEXT:    uaddwt z0.s, z0.s, z1.h
-; CHECK-SVE2-NEXT:    ret
-;
 ; CHECK-SVE-LABEL: unsigned_wide_add_nxv8i16:
 ; CHECK-SVE:       // %bb.0: // %entry
 ; CHECK-SVE-NEXT:    uunpklo z2.s, z1.h
@@ -121,19 +71,11 @@ define <vscale x 4 x i32> @unsigned_wide_add_nxv8i16(<vscale x 4 x i32> %acc, <v
 ; CHECK-SVE-NEXT:    add z0.s, z0.s, z1.s
 ; CHECK-SVE-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: unsigned_wide_add_nxv8i16:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z2.s, z1.h
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.s, z0.s, z2.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.s, z0.s, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE2-LABEL: unsigned_wide_add_nxv8i16:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwb z0.s, z0.s, z1.h
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwt z0.s, z0.s, z1.h
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-LABEL: unsigned_wide_add_nxv8i16:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    uaddwb z0.s, z0.s, z1.h
+; CHECK-SVE2-NEXT:    uaddwt z0.s, z0.s, z1.h
+; CHECK-SVE2-NEXT:    ret
 entry:
     %input.wide = zext <vscale x 8 x i16> %input to <vscale x 8 x i32>
     %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv8i32(<vscale x 4 x i32> %acc, <vscale x 8 x i32> %input.wide)
@@ -141,12 +83,6 @@ entry:
 }
 
 define <vscale x 8 x i16> @signed_wide_add_nxv16i8(<vscale x 8 x i16> %acc, <vscale x 16 x i8> %input){
-; CHECK-SVE2-LABEL: signed_wide_add_nxv16i8:
-; CHECK-SVE2:       // %bb.0: // %entry
-; CHECK-SVE2-NEXT:    saddwb z0.h, z0.h, z1.b
-; CHECK-SVE2-NEXT:    saddwt z0.h, z0.h, z1.b
-; CHECK-SVE2-NEXT:    ret
-;
 ; CHECK-SVE-LABEL: signed_wide_add_nxv16i8:
 ; CHECK-SVE:       // %bb.0: // %entry
 ; CHECK-SVE-NEXT:    sunpklo z2.h, z1.b
@@ -155,19 +91,11 @@ define <vscale x 8 x i16> @signed_wide_add_nxv16i8(<vscale x 8 x i16> %acc, <vsc
 ; CHECK-SVE-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-SVE-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: signed_wide_add_nxv16i8:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpklo z2.h, z1.b
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpkhi z1.h, z1.b
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.h, z0.h, z2.h
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.h, z0.h, z1.h
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE2-LABEL: signed_wide_add_nxv16i8:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwb z0.h, z0.h, z1.b
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwt z0.h, z0.h, z1.b
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-LABEL: signed_wide_add_nxv16i8:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    saddwb z0.h, z0.h, z1.b
+; CHECK-SVE2-NEXT:    saddwt z0.h, z0.h, z1.b
+; CHECK-SVE2-NEXT:    ret
 entry:
     %input.wide = sext <vscale x 16 x i8> %input to <vscale x 16 x i16>
     %partial.reduce = tail call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i16(<vscale x 8 x i16> %acc, <vscale x 16 x i16> %input.wide)
@@ -175,12 +103,6 @@ entry:
 }
 
 define <vscale x 8 x i16> @unsigned_wide_add_nxv16i8(<vscale x 8 x i16> %acc, <vscale x 16 x i8> %input){
-; CHECK-SVE2-LABEL: unsigned_wide_add_nxv16i8:
-; CHECK-SVE2:       // %bb.0: // %entry
-; CHECK-SVE2-NEXT:    uaddwb z0.h, z0.h, z1.b
-; CHECK-SVE2-NEXT:    uaddwt z0.h, z0.h, z1.b
-; CHECK-SVE2-NEXT:    ret
-;
 ; CHECK-SVE-LABEL: unsigned_wide_add_nxv16i8:
 ; CHECK-SVE:       // %bb.0: // %entry
 ; CHECK-SVE-NEXT:    uunpklo z2.h, z1.b
@@ -189,19 +111,11 @@ define <vscale x 8 x i16> @unsigned_wide_add_nxv16i8(<vscale x 8 x i16> %acc, <v
 ; CHECK-SVE-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-SVE-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: unsigned_wide_add_nxv16i8:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z2.h, z1.b
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z1.h, z1.b
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.h, z0.h, z2.h
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.h, z0.h, z1.h
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE2-LABEL: unsigned_wide_add_nxv16i8:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwb z0.h, z0.h, z1.b
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwt z0.h, z0.h, z1.b
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-LABEL: unsigned_wide_add_nxv16i8:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    uaddwb z0.h, z0.h, z1.b
+; CHECK-SVE2-NEXT:    uaddwt z0.h, z0.h, z1.b
+; CHECK-SVE2-NEXT:    ret
 entry:
     %input.wide = zext <vscale x 16 x i8> %input to <vscale x 16 x i16>
     %partial.reduce = tail call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i16(<vscale x 8 x i16> %acc, <vscale x 16 x i16> %input.wide)
@@ -209,16 +123,6 @@ entry:
 }
 
 define <vscale x 2 x i32> @signed_wide_add_nxv4i16(<vscale x 2 x i32> %acc, <vscale x 4 x i16> %input){
-; CHECK-SVE2-LABEL: signed_wide_add_nxv4i16:
-; CHECK-SVE2:       // %bb.0: // %entry
-; CHECK-SVE2-NEXT:    ptrue p0.s
-; CHECK-SVE2-NEXT:    sxth z1.s, p0/m, z1.s
-; CHECK-SVE2-NEXT:    uunpklo z2.d, z1.s
-; CHECK-SVE2-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-SVE2-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-SVE2-NEXT:    add z0.d, z1.d, z0.d
-; CHECK-SVE2-NEXT:    ret
-;
 ; CHECK-SVE-LABEL: signed_wide_add_nxv4i16:
 ; CHECK-SVE:       // %bb.0: // %entry
 ; CHECK-SVE-NEXT:    ptrue p0.s
@@ -229,23 +133,13 @@ define <vscale x 2 x i32> @signed_wide_add_nxv4i16(<vscale x 2 x i32> %acc, <vsc
 ; CHECK-SVE-NEXT:    add z0.d, z1.d, z0.d
 ; CHECK-SVE-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: signed_wide_add_nxv4i16:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    ptrue p0.s
-; CHECK-NEWLOWERING-SVE-NEXT:    sxth z1.s, p0/m, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z2.d, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z1.d, z0.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE2-LABEL: signed_wide_add_nxv4i16:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    ptrue p0.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    sxth z1.s, p0/m, z1.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwb z0.d, z0.d, z1.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwt z0.d, z0.d, z1.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-LABEL: signed_wide_add_nxv4i16:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    ptrue p0.s
+; CHECK-SVE2-NEXT:    sxth z1.s, p0/m, z1.s
+; CHECK-SVE2-NEXT:    saddwb z0.d, z0.d, z1.s
+; CHECK-SVE2-NEXT:    saddwt z0.d, z0.d, z1.s
+; CHECK-SVE2-NEXT:    ret
 entry:
     %input.wide = sext <vscale x 4 x i16> %input to <vscale x 4 x i32>
     %partial.reduce = tail call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv4i32(<vscale x 2 x i32> %acc, <vscale x 4 x i32> %input.wide)
@@ -253,15 +147,6 @@ entry:
 }
 
 define <vscale x 2 x i32> @unsigned_wide_add_nxv4i16(<vscale x 2 x i32> %acc, <vscale x 4 x i16> %input){
-; CHECK-SVE2-LABEL: unsigned_wide_add_nxv4i16:
-; CHECK-SVE2:       // %bb.0: // %entry
-; CHECK-SVE2-NEXT:    and z1.s, z1.s, #0xffff
-; CHECK-SVE2-NEXT:    uunpklo z2.d, z1.s
-; CHECK-SVE2-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-SVE2-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-SVE2-NEXT:    add z0.d, z1.d, z0.d
-; CHECK-SVE2-NEXT:    ret
-;
 ; CHECK-SVE-LABEL: unsigned_wide_add_nxv4i16:
 ; CHECK-SVE:       // %bb.0: // %entry
 ; CHECK-SVE-NEXT:    and z1.s, z1.s, #0xffff
@@ -271,21 +156,12 @@ define <vscale x 2 x i32> @unsigned_wide_add_nxv4i16(<vscale x 2 x i32> %acc, <v
 ; CHECK-SVE-NEXT:    add z0.d, z1.d, z0.d
 ; CHECK-SVE-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: unsigned_wide_add_nxv4i16:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    and z1.s, z1.s, #0xffff
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z2.d, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z1.d, z0.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE2-LABEL: unsigned_wide_add_nxv4i16:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    and z1.s, z1.s, #0xffff
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwb z0.d, z0.d, z1.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwt z0.d, z0.d, z1.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-LABEL: unsigned_wide_add_nxv4i16:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    and z1.s, z1.s, #0xffff
+; CHECK-SVE2-NEXT:    uaddwb z0.d, z0.d, z1.s
+; CHECK-SVE2-NEXT:    uaddwt z0.d, z0.d, z1.s
+; CHECK-SVE2-NEXT:    ret
 entry:
     %input.wide = zext <vscale x 4 x i16> %input to <vscale x 4 x i32>
     %partial.reduce = tail call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv4i32(<vscale x 2 x i32> %acc, <vscale x 4 x i32> %input.wide)
@@ -293,18 +169,6 @@ entry:
 }
 
 define <vscale x 4 x i64> @signed_wide_add_nxv8i32(<vscale x 4 x i64> %acc, <vscale x 8 x i32> %input){
-; CHECK-SVE2-LABEL: signed_wide_add_nxv8i32:
-; CHECK-SVE2:       // %bb.0: // %entry
-; CHECK-SVE2-NEXT:    sunpklo z4.d, z3.s
-; CHECK-SVE2-NEXT:    sunpklo z5.d, z2.s
-; CHECK-SVE2-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-SVE2-NEXT:    sunpkhi z2.d, z2.s
-; CHECK-SVE2-NEXT:    add z0.d, z0.d, z5.d
-; CHECK-SVE2-NEXT:    add z1.d, z1.d, z4.d
-; CHECK-SVE2-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-SVE2-NEXT:    add z1.d, z1.d, z3.d
-; CHECK-SVE2-NEXT:    ret
-;
 ; CHECK-SVE-LABEL: signed_wide_add_nxv8i32:
 ; CHECK-SVE:       // %bb.0: // %entry
 ; CHECK-SVE-NEXT:    sunpklo z4.d, z3.s
@@ -317,25 +181,13 @@ define <vscale x 4 x i64> @signed_wide_add_nxv8i32(<vscale x 4 x i64> %acc, <vsc
 ; CHECK-SVE-NEXT:    add z1.d, z1.d, z3.d
 ; CHECK-SVE-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: signed_wide_add_nxv8i32:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpklo z4.d, z3.s
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpklo z5.d, z2.s
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z5.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z1.d, z1.d, z4.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z1.d, z1.d, z3.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE2-LABEL: signed_wide_add_nxv8i32:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwb z1.d, z1.d, z3.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwb z0.d, z0.d, z2.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwt z1.d, z1.d, z3.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwt z0.d, z0.d, z2.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-LABEL: signed_wide_add_nxv8i32:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    saddwb z1.d, z1.d, z3.s
+; CHECK-SVE2-NEXT:    saddwb z0.d, z0.d, z2.s
+; CHECK-SVE2-NEXT:    saddwt z1.d, z1.d, z3.s
+; CHECK-SVE2-NEXT:    saddwt z0.d, z0.d, z2.s
+; CHECK-SVE2-NEXT:    ret
 entry:
     %input.wide = sext <vscale x 8 x i32> %input to <vscale x 8 x i64>
     %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv8i64(<vscale x 4 x i64> %acc, <vscale x 8 x i64> %input.wide)
@@ -343,18 +195,6 @@ entry:
 }
 
 define <vscale x 4 x i64> @unsigned_wide_add_nxv8i32(<vscale x 4 x i64> %acc, <vscale x 8 x i32> %input){
-; CHECK-SVE2-LABEL: unsigned_wide_add_nxv8i32:
-; CHECK-SVE2:       // %bb.0: // %entry
-; CHECK-SVE2-NEXT:    uunpklo z4.d, z3.s
-; CHECK-SVE2-NEXT:    uunpklo z5.d, z2.s
-; CHECK-SVE2-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-SVE2-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-SVE2-NEXT:    add z0.d, z0.d, z5.d
-; CHECK-SVE2-NEXT:    add z1.d, z1.d, z4.d
-; CHECK-SVE2-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-SVE2-NEXT:    add z1.d, z1.d, z3.d
-; CHECK-SVE2-NEXT:    ret
-;
 ; CHECK-SVE-LABEL: unsigned_wide_add_nxv8i32:
 ; CHECK-SVE:       // %bb.0: // %entry
 ; CHECK-SVE-NEXT:    uunpklo z4.d, z3.s
@@ -367,25 +207,13 @@ define <vscale x 4 x i64> @unsigned_wide_add_nxv8i32(<vscale x 4 x i64> %acc, <v
 ; CHECK-SVE-NEXT:    add z1.d, z1.d, z3.d
 ; CHECK-SVE-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: unsigned_wide_add_nxv8i32:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z4.d, z3.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z5.d, z2.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z5.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z1.d, z1.d, z4.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z1.d, z1.d, z3.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE2-LABEL: unsigned_wide_add_nxv8i32:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwb z1.d, z1.d, z3.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwb z0.d, z0.d, z2.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwt z1.d, z1.d, z3.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwt z0.d, z0.d, z2.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-LABEL: unsigned_wide_add_nxv8i32:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    uaddwb z1.d, z1.d, z3.s
+; CHECK-SVE2-NEXT:    uaddwb z0.d, z0.d, z2.s
+; CHECK-SVE2-NEXT:    uaddwt z1.d, z1.d, z3.s
+; CHECK-SVE2-NEXT:    uaddwt z0.d, z0.d, z2.s
+; CHECK-SVE2-NEXT:    ret
 entry:
     %input.wide = zext <vscale x 8 x i32> %input to <vscale x 8 x i64>
     %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv8i64(<vscale x 4 x i64> %acc, <vscale x 8 x i64> %input.wide)


        


More information about the llvm-commits mailing list