[llvm] [AArch64][GlobalISel] Improve MULL generation (PR #112405)

David Green via llvm-commits llvm-commits at lists.llvm.org
Mon Feb 17 15:12:08 PST 2025


https://github.com/davemgreen updated https://github.com/llvm/llvm-project/pull/112405

>From 3d7034c551954358730e142433e13fb4db44463d Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Mon, 17 Feb 2025 23:11:59 +0000
Subject: [PATCH] [AArch64][GlobalISel] Improve MULL generation

This splits the existing post-legalizer lowering of vector umull/smull into two
parts - one to perform the optimization of mul(ext,ext) -> mull and one to
perform the v2i64 mul scalarization. The mull part is moved to the
post-legalizer combiner and has been taught a few extra tricks from SDAG, using
known bits to convert mul(sext, zext) or mul(zext, zero-upper-bits) into
smull/umull. This can be important to prevent v2i64 scalarization of muls.
---
 llvm/lib/Target/AArch64/AArch64Combine.td     |  22 +-
 .../GISel/AArch64PostLegalizerCombiner.cpp    | 117 +++
 .../GISel/AArch64PostLegalizerLowering.cpp    |  63 +-
 llvm/test/CodeGen/AArch64/aarch64-smull.ll    | 408 +++-------
 llvm/test/CodeGen/AArch64/neon-dotreduce.ll   | 700 ++++++------------
 llvm/test/CodeGen/AArch64/neon-extmul.ll      | 118 +--
 6 files changed, 482 insertions(+), 946 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index ce1980697abbb..3bce9e2b13472 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -217,11 +217,19 @@ def mul_const : GICombineRule<
   (apply [{ applyAArch64MulConstCombine(*${root}, MRI, B, ${matchinfo}); }])
 >;
 
-def lower_mull : GICombineRule<
-  (defs root:$root),
-  (match (wip_match_opcode G_MUL):$root,
-          [{ return matchExtMulToMULL(*${root}, MRI); }]),
-  (apply [{ applyExtMulToMULL(*${root}, MRI, B, Observer); }])
+def mull_matchdata : GIDefMatchData<"std::tuple<bool, Register, Register>">;
+def extmultomull : GICombineRule<
+  (defs root:$root, mull_matchdata:$matchinfo),
+  (match (G_MUL $dst, $src1, $src2):$root,
+          [{ return matchExtMulToMULL(*${root}, MRI, KB, ${matchinfo}); }]),
+  (apply [{ applyExtMulToMULL(*${root}, MRI, B, Observer, ${matchinfo}); }])
+>;
+
+def lower_mulv2s64 : GICombineRule<
+  (defs root:$root, mull_matchdata:$matchinfo),
+  (match (G_MUL $dst, $src1, $src2):$root,
+          [{ return matchMulv2s64(*${root}, MRI); }]),
+  (apply [{ applyMulv2s64(*${root}, MRI, B, Observer); }])
 >;
 
 def build_vector_to_dup : GICombineRule<
@@ -316,7 +324,7 @@ def AArch64PostLegalizerLowering
                         icmp_lowering, build_vector_lowering,
                         lower_vector_fcmp, form_truncstore,
                         vector_sext_inreg_to_shift,
-                        unmerge_ext_to_unmerge, lower_mull,
+                        unmerge_ext_to_unmerge, lower_mulv2s64,
                         vector_unmerge_lowering, insertelt_nonconst]> {
 }
 
@@ -339,5 +347,5 @@ def AArch64PostLegalizerCombiner
                         select_to_minmax, or_to_bsp, combine_concat_vector,
                         commute_constant_to_rhs,
                         push_freeze_to_prevent_poison_from_propagating,
-                        combine_mul_cmlt, combine_use_vector_truncate]> {
+                        combine_mul_cmlt, combine_use_vector_truncate, extmultomull]> {
 }
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
index cf6b2ce9c5341..72014f9ba5d3f 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
@@ -438,6 +438,123 @@ void applyCombineMulCMLT(MachineInstr &MI, MachineRegisterInfo &MRI,
   MI.eraseFromParent();
 }
 
+// Match mul of extended or known-narrow vector operands => {u/s}mull
+bool matchExtMulToMULL(MachineInstr &MI, MachineRegisterInfo &MRI,
+                       GISelKnownBits *KB,
+                       std::tuple<bool, Register, Register> &MatchInfo) {
+  // Get the destination type and the instructions defining both mul operands
+  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+  MachineInstr *I1 = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
+  MachineInstr *I2 = getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI);
+  unsigned I1Opc = I1->getOpcode();
+  unsigned I2Opc = I2->getOpcode();
+  unsigned EltSize = DstTy.getScalarSizeInBits();
+
+  if (!DstTy.isVector() || I1->getNumOperands() < 2 || I2->getNumOperands() < 2)
+    return false;
+
+  auto IsAtLeastDoubleExtend = [&](Register R) {
+    LLT Ty = MRI.getType(R);
+    return EltSize >= Ty.getScalarSizeInBits() * 2;
+  };
+
+  // If both operands were extended from at most half the width, use {U/S}MULL
+  bool IsZExt1 =
+      I1Opc == TargetOpcode::G_ZEXT || I1Opc == TargetOpcode::G_ANYEXT;
+  bool IsZExt2 =
+      I2Opc == TargetOpcode::G_ZEXT || I2Opc == TargetOpcode::G_ANYEXT;
+  if (IsZExt1 && IsZExt2 && IsAtLeastDoubleExtend(I1->getOperand(1).getReg()) &&
+      IsAtLeastDoubleExtend(I2->getOperand(1).getReg())) {
+    get<0>(MatchInfo) = true;
+    get<1>(MatchInfo) = I1->getOperand(1).getReg();
+    get<2>(MatchInfo) = I2->getOperand(1).getReg();
+    return true;
+  }
+
+  bool IsSExt1 =
+      I1Opc == TargetOpcode::G_SEXT || I1Opc == TargetOpcode::G_ANYEXT;
+  bool IsSExt2 =
+      I2Opc == TargetOpcode::G_SEXT || I2Opc == TargetOpcode::G_ANYEXT;
+  if (IsSExt1 && IsSExt2 && IsAtLeastDoubleExtend(I1->getOperand(1).getReg()) &&
+      IsAtLeastDoubleExtend(I2->getOperand(1).getReg())) {
+    get<0>(MatchInfo) = false;
+    get<1>(MatchInfo) = I1->getOperand(1).getReg();
+    get<2>(MatchInfo) = I2->getOperand(1).getReg();
+    return true;
+  }
+
+  // Select UMULL if the non-extended operand's upper half is known to be zero.
+  APInt Mask = APInt::getHighBitsSet(EltSize, EltSize / 2);
+  if (KB && (IsZExt1 || IsZExt2) &&
+      IsAtLeastDoubleExtend(IsZExt1 ? I1->getOperand(1).getReg()
+                                    : I2->getOperand(1).getReg())) {
+    Register ZExtOp =
+        IsZExt1 ? MI.getOperand(2).getReg() : MI.getOperand(1).getReg();
+    if (KB->maskedValueIsZero(ZExtOp, Mask)) {
+      get<0>(MatchInfo) = true;
+      get<1>(MatchInfo) = IsZExt1 ? I1->getOperand(1).getReg() : ZExtOp;
+      get<2>(MatchInfo) = IsZExt1 ? ZExtOp : I2->getOperand(1).getReg();
+      return true;
+    }
+  } else if (KB && DstTy == LLT::fixed_vector(2, 64) &&
+             KB->maskedValueIsZero(MI.getOperand(1).getReg(), Mask) &&
+             KB->maskedValueIsZero(MI.getOperand(2).getReg(), Mask)) {
+    get<0>(MatchInfo) = true;
+    get<1>(MatchInfo) = MI.getOperand(1).getReg();
+    get<2>(MatchInfo) = MI.getOperand(2).getReg();
+    return true;
+  }
+
+  if (KB && (IsSExt1 || IsSExt2) &&
+      IsAtLeastDoubleExtend(IsSExt1 ? I1->getOperand(1).getReg()
+                                    : I2->getOperand(1).getReg())) {
+    Register SExtOp =
+        IsSExt1 ? MI.getOperand(2).getReg() : MI.getOperand(1).getReg();
+    if (KB->computeNumSignBits(SExtOp) > EltSize / 2) {
+      get<0>(MatchInfo) = false;
+      get<1>(MatchInfo) = IsSExt1 ? I1->getOperand(1).getReg() : SExtOp;
+      get<2>(MatchInfo) = IsSExt1 ? SExtOp : I2->getOperand(1).getReg();
+      return true;
+    }
+  } else if (KB && DstTy == LLT::fixed_vector(2, 64) &&
+             KB->computeNumSignBits(MI.getOperand(1).getReg()) > EltSize / 2 &&
+             KB->computeNumSignBits(MI.getOperand(2).getReg()) > EltSize / 2) {
+    get<0>(MatchInfo) = false;
+    get<1>(MatchInfo) = MI.getOperand(1).getReg();
+    get<2>(MatchInfo) = MI.getOperand(2).getReg();
+    return true;
+  }
+
+  return false;
+}
+
+void applyExtMulToMULL(MachineInstr &MI, MachineRegisterInfo &MRI,
+                       MachineIRBuilder &B, GISelChangeObserver &Observer,
+                       std::tuple<bool, Register, Register> &MatchInfo) {
+  assert(MI.getOpcode() == TargetOpcode::G_MUL &&
+         "Expected a G_MUL instruction");
+
+  // Unpack the destination type and the MatchInfo (unsigned flag, operands)
+  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+  bool IsZExt = get<0>(MatchInfo);
+  Register Src1Reg = get<1>(MatchInfo);
+  Register Src2Reg = get<2>(MatchInfo);
+  LLT Src1Ty = MRI.getType(Src1Reg);
+  LLT Src2Ty = MRI.getType(Src2Reg);
+  LLT HalfDstTy = DstTy.changeElementSize(DstTy.getScalarSizeInBits() / 2);
+  unsigned ExtOpc = IsZExt ? TargetOpcode::G_ZEXT : TargetOpcode::G_SEXT;
+
+  if (Src1Ty.getScalarSizeInBits() * 2 != DstTy.getScalarSizeInBits())
+    Src1Reg = B.buildExtOrTrunc(ExtOpc, {HalfDstTy}, {Src1Reg}).getReg(0);
+  if (Src2Ty.getScalarSizeInBits() * 2 != DstTy.getScalarSizeInBits())
+    Src2Reg = B.buildExtOrTrunc(ExtOpc, {HalfDstTy}, {Src2Reg}).getReg(0);
+
+  B.setInstrAndDebugLoc(MI);
+  B.buildInstr(IsZExt ? AArch64::G_UMULL : AArch64::G_SMULL,
+               {MI.getOperand(0).getReg()}, {Src1Reg, Src2Reg});
+  MI.eraseFromParent();
+}
+
 class AArch64PostLegalizerCombinerImpl : public Combiner {
 protected:
   const CombinerHelper Helper;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
index 6bba70d45a61d..7e3d8cb5893da 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
@@ -1190,68 +1190,25 @@ void applyUnmergeExtToUnmerge(MachineInstr &MI, MachineRegisterInfo &MRI,
 // Doing these two matches in one function to ensure that the order of matching
 // will always be the same.
 // Try lowering MUL to MULL before trying to scalarize if needed.
-bool matchExtMulToMULL(MachineInstr &MI, MachineRegisterInfo &MRI) {
+bool matchMulv2s64(MachineInstr &MI, MachineRegisterInfo &MRI) {
   // Get the instructions that defined the source operand
   LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
-  MachineInstr *I1 = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
-  MachineInstr *I2 = getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI);
-
-  if (DstTy.isVector()) {
-    // If the source operands were EXTENDED before, then {U/S}MULL can be used
-    unsigned I1Opc = I1->getOpcode();
-    unsigned I2Opc = I2->getOpcode();
-    if (((I1Opc == TargetOpcode::G_ZEXT && I2Opc == TargetOpcode::G_ZEXT) ||
-         (I1Opc == TargetOpcode::G_SEXT && I2Opc == TargetOpcode::G_SEXT)) &&
-        (MRI.getType(I1->getOperand(0).getReg()).getScalarSizeInBits() ==
-         MRI.getType(I1->getOperand(1).getReg()).getScalarSizeInBits() * 2) &&
-        (MRI.getType(I2->getOperand(0).getReg()).getScalarSizeInBits() ==
-         MRI.getType(I2->getOperand(1).getReg()).getScalarSizeInBits() * 2)) {
-      return true;
-    }
-    // If result type is v2s64, scalarise the instruction
-    else if (DstTy == LLT::fixed_vector(2, 64)) {
-      return true;
-    }
-  }
-  return false;
+  return DstTy == LLT::fixed_vector(2, 64);
 }
 
-void applyExtMulToMULL(MachineInstr &MI, MachineRegisterInfo &MRI,
-                       MachineIRBuilder &B, GISelChangeObserver &Observer) {
+void applyMulv2s64(MachineInstr &MI, MachineRegisterInfo &MRI,
+                   MachineIRBuilder &B, GISelChangeObserver &Observer) {
   assert(MI.getOpcode() == TargetOpcode::G_MUL &&
          "Expected a G_MUL instruction");
 
   // Get the instructions that defined the source operand
   LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
-  MachineInstr *I1 = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
-  MachineInstr *I2 = getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI);
-
-  // If the source operands were EXTENDED before, then {U/S}MULL can be used
-  unsigned I1Opc = I1->getOpcode();
-  unsigned I2Opc = I2->getOpcode();
-  if (((I1Opc == TargetOpcode::G_ZEXT && I2Opc == TargetOpcode::G_ZEXT) ||
-       (I1Opc == TargetOpcode::G_SEXT && I2Opc == TargetOpcode::G_SEXT)) &&
-      (MRI.getType(I1->getOperand(0).getReg()).getScalarSizeInBits() ==
-       MRI.getType(I1->getOperand(1).getReg()).getScalarSizeInBits() * 2) &&
-      (MRI.getType(I2->getOperand(0).getReg()).getScalarSizeInBits() ==
-       MRI.getType(I2->getOperand(1).getReg()).getScalarSizeInBits() * 2)) {
-
-    B.setInstrAndDebugLoc(MI);
-    B.buildInstr(I1->getOpcode() == TargetOpcode::G_ZEXT ? AArch64::G_UMULL
-                                                         : AArch64::G_SMULL,
-                 {MI.getOperand(0).getReg()},
-                 {I1->getOperand(1).getReg(), I2->getOperand(1).getReg()});
-    MI.eraseFromParent();
-  }
-  // If result type is v2s64, scalarise the instruction
-  else if (DstTy == LLT::fixed_vector(2, 64)) {
-    LegalizerHelper Helper(*MI.getMF(), Observer, B);
-    B.setInstrAndDebugLoc(MI);
-    Helper.fewerElementsVector(
-        MI, 0,
-        DstTy.changeElementCount(
-            DstTy.getElementCount().divideCoefficientBy(2)));
-  }
+  assert(DstTy == LLT::fixed_vector(2, 64) && "Expected v2s64 Mul");
+  LegalizerHelper Helper(*MI.getMF(), Observer, B);
+  B.setInstrAndDebugLoc(MI);
+  Helper.fewerElementsVector(
+      MI, 0,
+      DstTy.changeElementCount(DstTy.getElementCount().divideCoefficientBy(2)));
 }
 
 class AArch64PostLegalizerLoweringImpl : public Combiner {
diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
index 3b589d3480179..714be46a015f4 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -73,14 +73,10 @@ define <8 x i32> @smull_zext_v8i8_v8i32(ptr %A, ptr %B) nounwind {
 ; CHECK-GI-LABEL: smull_zext_v8i8_v8i32:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    ldr d0, [x0]
-; CHECK-GI-NEXT:    ldr q1, [x1]
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll2 v3.4s, v0.8h, #0
-; CHECK-GI-NEXT:    sshll v0.4s, v1.4h, #0
-; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
-; CHECK-GI-NEXT:    mul v0.4s, v2.4s, v0.4s
-; CHECK-GI-NEXT:    mul v1.4s, v3.4s, v1.4s
+; CHECK-GI-NEXT:    ldr q2, [x1]
+; CHECK-GI-NEXT:    ushll v1.8h, v0.8b, #0
+; CHECK-GI-NEXT:    smull v0.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT:    smull2 v1.4s, v1.8h, v2.8h
 ; CHECK-GI-NEXT:    ret
   %load.A = load <8 x i8>, ptr %A
   %load.B = load <8 x i16>, ptr %B
@@ -112,14 +108,10 @@ define <8 x i32> @smull_zext_v8i8_v8i32_sext_first_operand(ptr %A, ptr %B) nounw
 ; CHECK-GI-LABEL: smull_zext_v8i8_v8i32_sext_first_operand:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    ldr d0, [x1]
-; CHECK-GI-NEXT:    ldr q1, [x0]
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    sshll v2.4s, v1.4h, #0
-; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
-; CHECK-GI-NEXT:    ushll v3.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll2 v4.4s, v0.8h, #0
-; CHECK-GI-NEXT:    mul v0.4s, v2.4s, v3.4s
-; CHECK-GI-NEXT:    mul v1.4s, v1.4s, v4.4s
+; CHECK-GI-NEXT:    ldr q2, [x0]
+; CHECK-GI-NEXT:    ushll v1.8h, v0.8b, #0
+; CHECK-GI-NEXT:    smull v0.4s, v2.4h, v1.4h
+; CHECK-GI-NEXT:    smull2 v1.4s, v2.8h, v1.8h
 ; CHECK-GI-NEXT:    ret
   %load.A = load <8 x i16>, ptr %A
   %load.B = load <8 x i8>, ptr %B
@@ -258,20 +250,10 @@ define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind {
 ; CHECK-GI-NEXT:    movi d0, #0x00ffff0000ffff
 ; CHECK-GI-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-GI-NEXT:    and v0.8b, v1.8b, v0.8b
-; CHECK-GI-NEXT:    ldr d1, [x1]
-; CHECK-GI-NEXT:    sshll v1.2d, v1.2s, #0
-; CHECK-GI-NEXT:    mov w8, v0.s[0]
-; CHECK-GI-NEXT:    mov w9, v0.s[1]
-; CHECK-GI-NEXT:    mov x11, v1.d[1]
-; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    mov v0.d[1], x9
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    mov x10, v0.d[1]
-; CHECK-GI-NEXT:    mul x8, x8, x9
-; CHECK-GI-NEXT:    mul x9, x10, x11
-; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    mov v0.d[1], x9
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
+; CHECK-GI-NEXT:    ldr d0, [x1]
+; CHECK-GI-NEXT:    smull v0.2d, v1.2s, v0.2s
 ; CHECK-GI-NEXT:    ret
   %load.A = load <2 x i16>, ptr %A
   %load.B = load <2 x i32>, ptr %B
@@ -304,16 +286,7 @@ define <2 x i64> @smull_zext_and_v2i32_v2i64(ptr %A, ptr %B) nounwind {
 ; CHECK-GI-NEXT:    ldr d1, [x0]
 ; CHECK-GI-NEXT:    and v0.8b, v1.8b, v0.8b
 ; CHECK-GI-NEXT:    ldr d1, [x1]
-; CHECK-GI-NEXT:    sshll v1.2d, v1.2s, #0
-; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    mov x11, v1.d[1]
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    mov x10, v0.d[1]
-; CHECK-GI-NEXT:    mul x8, x8, x9
-; CHECK-GI-NEXT:    mul x9, x10, x11
-; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    mov v0.d[1], x9
+; CHECK-GI-NEXT:    smull v0.2d, v0.2s, v1.2s
 ; CHECK-GI-NEXT:    ret
   %load.A = load <2 x i32>, ptr %A
   %and.A = and <2 x i32> %load.A, <i32 u0x7FFFFFFF, i32 u0x7FFFFFFF>
@@ -935,24 +908,11 @@ define <2 x i64> @amlsl_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
 
 ; SMULL recognizing BUILD_VECTORs with sign/zero-extended elements.
 define <8 x i16> @smull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
-; CHECK-NEON-LABEL: smull_extvec_v8i8_v8i16:
-; CHECK-NEON:       // %bb.0:
-; CHECK-NEON-NEXT:    movi v1.8b, #244
-; CHECK-NEON-NEXT:    smull v0.8h, v0.8b, v1.8b
-; CHECK-NEON-NEXT:    ret
-;
-; CHECK-SVE-LABEL: smull_extvec_v8i8_v8i16:
-; CHECK-SVE:       // %bb.0:
-; CHECK-SVE-NEXT:    movi v1.8b, #244
-; CHECK-SVE-NEXT:    smull v0.8h, v0.8b, v1.8b
-; CHECK-SVE-NEXT:    ret
-;
-; CHECK-GI-LABEL: smull_extvec_v8i8_v8i16:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    mvni v1.8h, #11
-; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v1.8h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: smull_extvec_v8i8_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v1.8b, #244
+; CHECK-NEXT:    smull v0.8h, v0.8b, v1.8b
+; CHECK-NEXT:    ret
   %tmp3 = sext <8 x i8> %arg to <8 x i16>
   %tmp4 = mul <8 x i16> %tmp3, <i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12>
   ret <8 x i16> %tmp4
@@ -989,24 +949,11 @@ define <8 x i16> @smull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
 }
 
 define <4 x i32> @smull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
-; CHECK-NEON-LABEL: smull_extvec_v4i16_v4i32:
-; CHECK-NEON:       // %bb.0:
-; CHECK-NEON-NEXT:    mvni v1.4h, #11
-; CHECK-NEON-NEXT:    smull v0.4s, v0.4h, v1.4h
-; CHECK-NEON-NEXT:    ret
-;
-; CHECK-SVE-LABEL: smull_extvec_v4i16_v4i32:
-; CHECK-SVE:       // %bb.0:
-; CHECK-SVE-NEXT:    mvni v1.4h, #11
-; CHECK-SVE-NEXT:    smull v0.4s, v0.4h, v1.4h
-; CHECK-SVE-NEXT:    ret
-;
-; CHECK-GI-LABEL: smull_extvec_v4i16_v4i32:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    mvni v1.4s, #11
-; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    mul v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: smull_extvec_v4i16_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvni v1.4h, #11
+; CHECK-NEXT:    smull v0.4s, v0.4h, v1.4h
+; CHECK-NEXT:    ret
   %tmp3 = sext <4 x i16> %arg to <4 x i32>
   %tmp4 = mul <4 x i32> %tmp3, <i32 -12, i32 -12, i32 -12, i32 -12>
   ret <4 x i32> %tmp4
@@ -1030,16 +977,8 @@ define <2 x i64> @smull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
 ; CHECK-GI-LABEL: smull_extvec_v2i32_v2i64:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    adrp x8, .LCPI36_0
-; CHECK-GI-NEXT:    sshll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI36_0]
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    mov x10, v0.d[1]
-; CHECK-GI-NEXT:    mov x11, v1.d[1]
-; CHECK-GI-NEXT:    mul x8, x8, x9
-; CHECK-GI-NEXT:    mul x9, x10, x11
-; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    mov v0.d[1], x9
+; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI36_0]
+; CHECK-GI-NEXT:    smull v0.2d, v0.2s, v1.2s
 ; CHECK-GI-NEXT:    ret
   %tmp3 = sext <2 x i32> %arg to <2 x i64>
   %tmp4 = mul <2 x i64> %tmp3, <i64 -1234, i64 -1234>
@@ -1047,24 +986,11 @@ define <2 x i64> @smull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
 }
 
 define <8 x i16> @umull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
-; CHECK-NEON-LABEL: umull_extvec_v8i8_v8i16:
-; CHECK-NEON:       // %bb.0:
-; CHECK-NEON-NEXT:    movi v1.8b, #12
-; CHECK-NEON-NEXT:    umull v0.8h, v0.8b, v1.8b
-; CHECK-NEON-NEXT:    ret
-;
-; CHECK-SVE-LABEL: umull_extvec_v8i8_v8i16:
-; CHECK-SVE:       // %bb.0:
-; CHECK-SVE-NEXT:    movi v1.8b, #12
-; CHECK-SVE-NEXT:    umull v0.8h, v0.8b, v1.8b
-; CHECK-SVE-NEXT:    ret
-;
-; CHECK-GI-LABEL: umull_extvec_v8i8_v8i16:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    movi v1.8h, #12
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v1.8h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: umull_extvec_v8i8_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v1.8b, #12
+; CHECK-NEXT:    umull v0.8h, v0.8b, v1.8b
+; CHECK-NEXT:    ret
   %tmp3 = zext <8 x i8> %arg to <8 x i16>
   %tmp4 = mul <8 x i16> %tmp3, <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12>
   ret <8 x i16> %tmp4
@@ -1118,9 +1044,8 @@ define <4 x i32> @umull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
 ; CHECK-GI-LABEL: umull_extvec_v4i16_v4i32:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    adrp x8, .LCPI39_0
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI39_0]
-; CHECK-GI-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI39_0]
+; CHECK-GI-NEXT:    umull v0.4s, v0.4h, v1.4h
 ; CHECK-GI-NEXT:    ret
   %tmp3 = zext <4 x i16> %arg to <4 x i32>
   %tmp4 = mul <4 x i32> %tmp3, <i32 1234, i32 1234, i32 1234, i32 1234>
@@ -1145,16 +1070,8 @@ define <2 x i64> @umull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
 ; CHECK-GI-LABEL: umull_extvec_v2i32_v2i64:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    adrp x8, .LCPI40_0
-; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI40_0]
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    mov x10, v0.d[1]
-; CHECK-GI-NEXT:    mov x11, v1.d[1]
-; CHECK-GI-NEXT:    mul x8, x8, x9
-; CHECK-GI-NEXT:    mul x9, x10, x11
-; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    mov v0.d[1], x9
+; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI40_0]
+; CHECK-GI-NEXT:    umull v0.2d, v0.2s, v1.2s
 ; CHECK-GI-NEXT:    ret
   %tmp3 = zext <2 x i32> %arg to <2 x i64>
   %tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234>
@@ -1178,10 +1095,9 @@ define <8 x i16> @amull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
 ;
 ; CHECK-GI-LABEL: amull_extvec_v8i8_v8i16:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    movi v1.8h, #12
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    movi v1.8b, #12
 ; CHECK-GI-NEXT:    movi v2.2d, #0xff00ff00ff00ff
-; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    umull v0.8h, v0.8b, v1.8b
 ; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-GI-NEXT:    ret
   %tmp3 = zext <8 x i8> %arg to <8 x i16>
@@ -1212,10 +1128,9 @@ define <4 x i32> @amull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
 ; CHECK-GI-LABEL: amull_extvec_v4i16_v4i32:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    adrp x8, .LCPI42_0
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
 ; CHECK-GI-NEXT:    movi v2.2d, #0x00ffff0000ffff
-; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI42_0]
-; CHECK-GI-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI42_0]
+; CHECK-GI-NEXT:    umull v0.4s, v0.4h, v1.4h
 ; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-GI-NEXT:    ret
   %tmp3 = zext <4 x i16> %arg to <4 x i32>
@@ -1246,18 +1161,10 @@ define <2 x i64> @amull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
 ; CHECK-GI-LABEL: amull_extvec_v2i32_v2i64:
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    adrp x8, .LCPI43_0
-; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI43_0]
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    mov x10, v0.d[1]
-; CHECK-GI-NEXT:    mov x11, v1.d[1]
-; CHECK-GI-NEXT:    movi v1.2d, #0x000000ffffffff
-; CHECK-GI-NEXT:    mul x8, x8, x9
-; CHECK-GI-NEXT:    mul x9, x10, x11
-; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    mov v0.d[1], x9
-; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    movi v2.2d, #0x000000ffffffff
+; CHECK-GI-NEXT:    ldr d1, [x8, :lo12:.LCPI43_0]
+; CHECK-GI-NEXT:    umull v0.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
 ; CHECK-GI-NEXT:    ret
   %tmp3 = zext <2 x i32> %arg to <2 x i64>
   %tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234>
@@ -1635,9 +1542,9 @@ define <8 x i16> @umull_and_v8i16(<8 x i8> %src1, <8 x i16> %src2) {
 ; CHECK-GI-LABEL: umull_and_v8i16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    movi v2.2d, #0xff00ff00ff00ff
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    xtn v1.8b, v1.8h
+; CHECK-GI-NEXT:    umull v0.8h, v0.8b, v1.8b
 ; CHECK-GI-NEXT:    ret
 entry:
   %in1 = zext <8 x i8> %src1 to <8 x i16>
@@ -1664,9 +1571,9 @@ define <8 x i16> @umull_and_v8i16_c(<8 x i8> %src1, <8 x i16> %src2) {
 ; CHECK-GI-LABEL: umull_and_v8i16_c:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    movi v2.2d, #0xff00ff00ff00ff
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-GI-NEXT:    mul v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT:    xtn v1.8b, v1.8h
+; CHECK-GI-NEXT:    umull v0.8h, v1.8b, v0.8b
 ; CHECK-GI-NEXT:    ret
 entry:
   %in1 = zext <8 x i8> %src1 to <8 x i16>
@@ -1705,9 +1612,8 @@ define <8 x i16> @umull_andconst_v8i16(<8 x i8> %src1, <8 x i16> %src2) {
 ;
 ; CHECK-GI-LABEL: umull_andconst_v8i16:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    movi v1.2d, #0xff00ff00ff00ff
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    movi d1, #0xffffffffffffffff
+; CHECK-GI-NEXT:    umull v0.8h, v0.8b, v1.8b
 ; CHECK-GI-NEXT:    ret
 entry:
   %in1 = zext <8 x i8> %src1 to <8 x i16>
@@ -1751,29 +1657,13 @@ entry:
 }
 
 define <4 x i32> @umull_and_v4i32(<4 x i16> %src1, <4 x i32> %src2) {
-; CHECK-NEON-LABEL: umull_and_v4i32:
-; CHECK-NEON:       // %bb.0: // %entry
-; CHECK-NEON-NEXT:    movi v2.2d, #0x0000ff000000ff
-; CHECK-NEON-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-NEON-NEXT:    xtn v1.4h, v1.4s
-; CHECK-NEON-NEXT:    umull v0.4s, v0.4h, v1.4h
-; CHECK-NEON-NEXT:    ret
-;
-; CHECK-SVE-LABEL: umull_and_v4i32:
-; CHECK-SVE:       // %bb.0: // %entry
-; CHECK-SVE-NEXT:    movi v2.2d, #0x0000ff000000ff
-; CHECK-SVE-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-SVE-NEXT:    xtn v1.4h, v1.4s
-; CHECK-SVE-NEXT:    umull v0.4s, v0.4h, v1.4h
-; CHECK-SVE-NEXT:    ret
-;
-; CHECK-GI-LABEL: umull_and_v4i32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    movi v2.2d, #0x0000ff000000ff
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-GI-NEXT:    mul v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: umull_and_v4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v2.2d, #0x0000ff000000ff
+; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    xtn v1.4h, v1.4s
+; CHECK-NEXT:    umull v0.4s, v0.4h, v1.4h
+; CHECK-NEXT:    ret
 entry:
   %in1 = zext <4 x i16> %src1 to <4 x i32>
   %in2 = and <4 x i32> %src2, <i32 255, i32 255, i32 255, i32 255>
@@ -1805,12 +1695,13 @@ define <8 x i32> @umull_and_v8i32(<8 x i16> %src1, <8 x i32> %src2) {
 ; CHECK-GI-LABEL: umull_and_v8i32:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    movi v3.2d, #0x0000ff000000ff
-; CHECK-GI-NEXT:    ushll v4.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll2 v5.4s, v0.8h, #0
-; CHECK-GI-NEXT:    and v0.16b, v1.16b, v3.16b
-; CHECK-GI-NEXT:    and v1.16b, v2.16b, v3.16b
-; CHECK-GI-NEXT:    mul v0.4s, v4.4s, v0.4s
-; CHECK-GI-NEXT:    mul v1.4s, v5.4s, v1.4s
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT:    and v2.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT:    mov d3, v0.d[1]
+; CHECK-GI-NEXT:    xtn v1.4h, v1.4s
+; CHECK-GI-NEXT:    xtn v2.4h, v2.4s
+; CHECK-GI-NEXT:    umull v0.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    umull v1.4s, v3.4h, v2.4h
 ; CHECK-GI-NEXT:    ret
 entry:
   %in1 = zext <8 x i16> %src1 to <8 x i32>
@@ -1855,36 +1746,13 @@ entry:
 }
 
 define <2 x i64> @umull_and_v2i64(<2 x i32> %src1, <2 x i64> %src2) {
-; CHECK-NEON-LABEL: umull_and_v2i64:
-; CHECK-NEON:       // %bb.0: // %entry
-; CHECK-NEON-NEXT:    movi v2.2d, #0x000000000000ff
-; CHECK-NEON-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-NEON-NEXT:    xtn v1.2s, v1.2d
-; CHECK-NEON-NEXT:    umull v0.2d, v0.2s, v1.2s
-; CHECK-NEON-NEXT:    ret
-;
-; CHECK-SVE-LABEL: umull_and_v2i64:
-; CHECK-SVE:       // %bb.0: // %entry
-; CHECK-SVE-NEXT:    movi v2.2d, #0x000000000000ff
-; CHECK-SVE-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-SVE-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SVE-NEXT:    umull v0.2d, v0.2s, v1.2s
-; CHECK-SVE-NEXT:    ret
-;
-; CHECK-GI-LABEL: umull_and_v2i64:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    movi v2.2d, #0x000000000000ff
-; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    mov x10, v0.d[1]
-; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    mov x11, v1.d[1]
-; CHECK-GI-NEXT:    mul x8, x8, x9
-; CHECK-GI-NEXT:    mul x9, x10, x11
-; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    mov v0.d[1], x9
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: umull_and_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v2.2d, #0x000000000000ff
+; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    xtn v1.2s, v1.2d
+; CHECK-NEXT:    umull v0.2d, v0.2s, v1.2s
+; CHECK-NEXT:    ret
 entry:
   %in1 = zext <2 x i32> %src1 to <2 x i64>
   %in2 = and <2 x i64> %src2, <i64 255, i64 255>
@@ -1916,26 +1784,13 @@ define <4 x i64> @umull_and_v4i64(<4 x i32> %src1, <4 x i64> %src2) {
 ; CHECK-GI-LABEL: umull_and_v4i64:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    movi v3.2d, #0x000000000000ff
-; CHECK-GI-NEXT:    ushll v4.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-GI-NEXT:    fmov x8, d4
-; CHECK-GI-NEXT:    mov x10, v4.d[1]
-; CHECK-GI-NEXT:    mov x13, v0.d[1]
 ; CHECK-GI-NEXT:    and v1.16b, v1.16b, v3.16b
 ; CHECK-GI-NEXT:    and v2.16b, v2.16b, v3.16b
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    fmov x12, d2
-; CHECK-GI-NEXT:    mov x11, v1.d[1]
-; CHECK-GI-NEXT:    mov x14, v2.d[1]
-; CHECK-GI-NEXT:    mul x8, x8, x9
-; CHECK-GI-NEXT:    fmov x9, d0
-; CHECK-GI-NEXT:    mul x10, x10, x11
-; CHECK-GI-NEXT:    mul x9, x9, x12
-; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    mul x11, x13, x14
-; CHECK-GI-NEXT:    mov v1.d[0], x9
-; CHECK-GI-NEXT:    mov v0.d[1], x10
-; CHECK-GI-NEXT:    mov v1.d[1], x11
+; CHECK-GI-NEXT:    mov d3, v0.d[1]
+; CHECK-GI-NEXT:    xtn v1.2s, v1.2d
+; CHECK-GI-NEXT:    xtn v2.2s, v2.2d
+; CHECK-GI-NEXT:    umull v0.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    umull v1.2d, v3.2s, v2.2s
 ; CHECK-GI-NEXT:    ret
 entry:
   %in1 = zext <4 x i32> %src1 to <4 x i64>
@@ -2397,33 +2252,12 @@ define <2 x i32> @do_stuff(<2 x i64> %0, <2 x i64> %1) {
 }
 
 define <2 x i64> @lsr(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-NEON-LABEL: lsr:
-; CHECK-NEON:       // %bb.0:
-; CHECK-NEON-NEXT:    shrn v0.2s, v0.2d, #32
-; CHECK-NEON-NEXT:    shrn v1.2s, v1.2d, #32
-; CHECK-NEON-NEXT:    umull v0.2d, v0.2s, v1.2s
-; CHECK-NEON-NEXT:    ret
-;
-; CHECK-SVE-LABEL: lsr:
-; CHECK-SVE:       // %bb.0:
-; CHECK-SVE-NEXT:    shrn v0.2s, v0.2d, #32
-; CHECK-SVE-NEXT:    shrn v1.2s, v1.2d, #32
-; CHECK-SVE-NEXT:    umull v0.2d, v0.2s, v1.2s
-; CHECK-SVE-NEXT:    ret
-;
-; CHECK-GI-LABEL: lsr:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    ushr v0.2d, v0.2d, #32
-; CHECK-GI-NEXT:    ushr v1.2d, v1.2d, #32
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    mov x10, v0.d[1]
-; CHECK-GI-NEXT:    mov x11, v1.d[1]
-; CHECK-GI-NEXT:    mul x8, x8, x9
-; CHECK-GI-NEXT:    mul x9, x10, x11
-; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    mov v0.d[1], x9
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: lsr:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shrn v0.2s, v0.2d, #32
+; CHECK-NEXT:    shrn v1.2s, v1.2d, #32
+; CHECK-NEXT:    umull v0.2d, v0.2s, v1.2s
+; CHECK-NEXT:    ret
     %x = lshr <2 x i64> %a, <i64 32, i64 32>
     %y = lshr <2 x i64> %b, <i64 32, i64 32>
     %z = mul nsw <2 x i64> %x, %y
@@ -2431,34 +2265,12 @@ define <2 x i64> @lsr(<2 x i64> %a, <2 x i64> %b) {
 }
 
 define <2 x i64> @lsr_const(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-NEON-LABEL: lsr_const:
-; CHECK-NEON:       // %bb.0:
-; CHECK-NEON-NEXT:    movi v1.2s, #31
-; CHECK-NEON-NEXT:    shrn v0.2s, v0.2d, #32
-; CHECK-NEON-NEXT:    umull v0.2d, v0.2s, v1.2s
-; CHECK-NEON-NEXT:    ret
-;
-; CHECK-SVE-LABEL: lsr_const:
-; CHECK-SVE:       // %bb.0:
-; CHECK-SVE-NEXT:    movi v1.2s, #31
-; CHECK-SVE-NEXT:    shrn v0.2s, v0.2d, #32
-; CHECK-SVE-NEXT:    umull v0.2d, v0.2s, v1.2s
-; CHECK-SVE-NEXT:    ret
-;
-; CHECK-GI-LABEL: lsr_const:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    adrp x8, .LCPI79_0
-; CHECK-GI-NEXT:    ushr v0.2d, v0.2d, #32
-; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI79_0]
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    mov x10, v0.d[1]
-; CHECK-GI-NEXT:    mov x11, v1.d[1]
-; CHECK-GI-NEXT:    mul x8, x8, x9
-; CHECK-GI-NEXT:    mul x9, x10, x11
-; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    mov v0.d[1], x9
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: lsr_const:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v1.2s, #31
+; CHECK-NEXT:    shrn v0.2s, v0.2d, #32
+; CHECK-NEXT:    umull v0.2d, v0.2s, v1.2s
+; CHECK-NEXT:    ret
     %x = lshr <2 x i64> %a, <i64 32, i64 32>
     %z = mul nsw <2 x i64> %x, <i64 31, i64 31>
     ret <2 x i64> %z
@@ -2629,10 +2441,10 @@ define <8 x i16> @smulladdl_const_v8i8_v8i16(<8 x i8> %A, <8 x i8> %C) nounwind
 ;
 ; CHECK-GI-LABEL: smulladdl_const_v8i8_v8i16:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    movi v2.8h, #10
-; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v2.8h
-; CHECK-GI-NEXT:    saddw v0.8h, v0.8h, v1.8b
+; CHECK-GI-NEXT:    movi v2.8b, #10
+; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    smlal v1.8h, v0.8b, v2.8b
+; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    ret
   %tmp1 = sext <8 x i8> %A to <8 x i16>
   %tmp3 = sext <8 x i8> %C to <8 x i16>
@@ -2658,10 +2470,10 @@ define <8 x i16> @umulladdl_const_v8i8_v8i16(<8 x i8> %A, <8 x i8> %C) nounwind
 ;
 ; CHECK-GI-LABEL: umulladdl_const_v8i8_v8i16:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    movi v2.8h, #10
-; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v2.8h
-; CHECK-GI-NEXT:    uaddw v0.8h, v0.8h, v1.8b
+; CHECK-GI-NEXT:    movi v2.8b, #10
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    umlal v1.8h, v0.8b, v2.8b
+; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    ret
   %tmp1 = zext <8 x i8> %A to <8 x i16>
   %tmp3 = zext <8 x i8> %C to <8 x i16>
@@ -2942,18 +2754,10 @@ define <2 x i64> @smulladdl_const_v2i32_v2i64(<2 x i32> %A, <2 x i32> %C) nounwi
 ;
 ; CHECK-GI-LABEL: smulladdl_const_v2i32_v2i64:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    adrp x8, .LCPI98_0
-; CHECK-GI-NEXT:    sshll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI98_0]
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d2
-; CHECK-GI-NEXT:    mov x10, v0.d[1]
-; CHECK-GI-NEXT:    mov x11, v2.d[1]
-; CHECK-GI-NEXT:    mul x8, x8, x9
-; CHECK-GI-NEXT:    mul x9, x10, x11
-; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    mov v0.d[1], x9
-; CHECK-GI-NEXT:    saddw v0.2d, v0.2d, v1.2s
+; CHECK-GI-NEXT:    movi v2.2s, #10
+; CHECK-GI-NEXT:    sshll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT:    smlal v1.2d, v0.2s, v2.2s
+; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    ret
   %tmp1 = sext <2 x i32> %A to <2 x i64>
   %tmp3 = sext <2 x i32> %C to <2 x i64>
@@ -2979,18 +2783,10 @@ define <2 x i64> @umulladdl_const_v2i32_v2i64(<2 x i32> %A, <2 x i32> %C) nounwi
 ;
 ; CHECK-GI-LABEL: umulladdl_const_v2i32_v2i64:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    adrp x8, .LCPI99_0
-; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI99_0]
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d2
-; CHECK-GI-NEXT:    mov x10, v0.d[1]
-; CHECK-GI-NEXT:    mov x11, v2.d[1]
-; CHECK-GI-NEXT:    mul x8, x8, x9
-; CHECK-GI-NEXT:    mul x9, x10, x11
-; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    mov v0.d[1], x9
-; CHECK-GI-NEXT:    uaddw v0.2d, v0.2d, v1.2s
+; CHECK-GI-NEXT:    movi v2.2s, #10
+; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT:    umlal v1.2d, v0.2s, v2.2s
+; CHECK-GI-NEXT:    mov v0.16b, v1.16b
 ; CHECK-GI-NEXT:    ret
   %tmp1 = zext <2 x i32> %A to <2 x i64>
   %tmp3 = zext <2 x i32> %C to <2 x i64>
diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
index 6fb4e219d39f4..2d3fda704908e 100644
--- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
+++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
@@ -812,12 +812,8 @@ define i32 @test_usdot_v8i8(ptr nocapture readonly %a, ptr nocapture readonly %b
 ; CHECK-GI-NEXT:    ldr d1, [x1]
 ; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-GI-NEXT:    ushll2 v2.4s, v0.8h, #0
-; CHECK-GI-NEXT:    sshll2 v3.4s, v1.8h, #0
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-GI-NEXT:    mul v2.4s, v3.4s, v2.4s
-; CHECK-GI-NEXT:    mla v2.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    smull v2.4s, v1.4h, v0.4h
+; CHECK-GI-NEXT:    smlal2 v2.4s, v1.8h, v0.8h
 ; CHECK-GI-NEXT:    addv s0, v2.4s
 ; CHECK-GI-NEXT:    fmov w0, s0
 ; CHECK-GI-NEXT:    ret
@@ -848,12 +844,8 @@ define i32 @test_usdot_swapped_operands_v8i8(ptr nocapture readonly %a, ptr noca
 ; CHECK-GI-NEXT:    ldr d1, [x1]
 ; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
 ; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-GI-NEXT:    sshll2 v2.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ushll2 v3.4s, v1.8h, #0
-; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-GI-NEXT:    mul v2.4s, v3.4s, v2.4s
-; CHECK-GI-NEXT:    mla v2.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    smull v2.4s, v1.4h, v0.4h
+; CHECK-GI-NEXT:    smlal2 v2.4s, v1.8h, v0.8h
 ; CHECK-GI-NEXT:    addv s0, v2.4s
 ; CHECK-GI-NEXT:    fmov w0, s0
 ; CHECK-GI-NEXT:    ret
@@ -965,18 +957,10 @@ define i32 @test_usdot_v16i8(ptr nocapture readonly %a, ptr nocapture readonly %
 ; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
 ; CHECK-GI-NEXT:    sshll v3.8h, v1.8b, #0
 ; CHECK-GI-NEXT:    sshll2 v1.8h, v1.16b, #0
-; CHECK-GI-NEXT:    ushll2 v4.4s, v2.8h, #0
-; CHECK-GI-NEXT:    ushll2 v5.4s, v0.8h, #0
-; CHECK-GI-NEXT:    sshll2 v6.4s, v3.8h, #0
-; CHECK-GI-NEXT:    sshll2 v7.4s, v1.8h, #0
-; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll v3.4s, v3.4h, #0
-; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-GI-NEXT:    mul v4.4s, v6.4s, v4.4s
-; CHECK-GI-NEXT:    mul v5.4s, v7.4s, v5.4s
-; CHECK-GI-NEXT:    mla v4.4s, v3.4s, v2.4s
-; CHECK-GI-NEXT:    mla v5.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    smull v4.4s, v3.4h, v2.4h
+; CHECK-GI-NEXT:    smull v5.4s, v1.4h, v0.4h
+; CHECK-GI-NEXT:    smlal2 v4.4s, v3.8h, v2.8h
+; CHECK-GI-NEXT:    smlal2 v5.4s, v1.8h, v0.8h
 ; CHECK-GI-NEXT:    add v0.4s, v4.4s, v5.4s
 ; CHECK-GI-NEXT:    addv s0, v0.4s
 ; CHECK-GI-NEXT:    fmov w8, s0
@@ -1013,18 +997,10 @@ define i32 @test_usdot_swapped_operands_v16i8(ptr nocapture readonly %a, ptr noc
 ; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
 ; CHECK-GI-NEXT:    ushll v3.8h, v1.8b, #0
 ; CHECK-GI-NEXT:    ushll2 v1.8h, v1.16b, #0
-; CHECK-GI-NEXT:    sshll2 v4.4s, v2.8h, #0
-; CHECK-GI-NEXT:    sshll2 v5.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ushll2 v6.4s, v3.8h, #0
-; CHECK-GI-NEXT:    ushll2 v7.4s, v1.8h, #0
-; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
-; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v3.4s, v3.4h, #0
-; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-GI-NEXT:    mul v4.4s, v6.4s, v4.4s
-; CHECK-GI-NEXT:    mul v5.4s, v7.4s, v5.4s
-; CHECK-GI-NEXT:    mla v4.4s, v3.4s, v2.4s
-; CHECK-GI-NEXT:    mla v5.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    smull v4.4s, v3.4h, v2.4h
+; CHECK-GI-NEXT:    smull v5.4s, v1.4h, v0.4h
+; CHECK-GI-NEXT:    smlal2 v4.4s, v3.8h, v2.8h
+; CHECK-GI-NEXT:    smlal2 v5.4s, v1.8h, v0.8h
 ; CHECK-GI-NEXT:    add v0.4s, v4.4s, v5.4s
 ; CHECK-GI-NEXT:    addv s0, v0.4s
 ; CHECK-GI-NEXT:    fmov w8, s0
@@ -1332,18 +1308,10 @@ define i32 @test_usdot_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i
 ; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
 ; CHECK-GI-NEXT:    ushll v2.8h, v2.8b, #0
 ; CHECK-GI-NEXT:    sshll v3.8h, v3.8b, #0
-; CHECK-GI-NEXT:    ushll2 v4.4s, v0.8h, #0
-; CHECK-GI-NEXT:    sshll2 v5.4s, v1.8h, #0
-; CHECK-GI-NEXT:    ushll2 v6.4s, v2.8h, #0
-; CHECK-GI-NEXT:    sshll2 v7.4s, v3.8h, #0
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
-; CHECK-GI-NEXT:    sshll v3.4s, v3.4h, #0
-; CHECK-GI-NEXT:    mul v4.4s, v4.4s, v5.4s
-; CHECK-GI-NEXT:    mul v5.4s, v6.4s, v7.4s
-; CHECK-GI-NEXT:    mla v4.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT:    mla v5.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT:    smull v4.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    smull v5.4s, v2.4h, v3.4h
+; CHECK-GI-NEXT:    smlal2 v4.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT:    smlal2 v5.4s, v2.8h, v3.8h
 ; CHECK-GI-NEXT:    addv s0, v4.4s
 ; CHECK-GI-NEXT:    addv s1, v5.4s
 ; CHECK-GI-NEXT:    fmov w8, s0
@@ -1381,18 +1349,10 @@ define i32 @test_usdot_swapped_operands_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8
 ; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
 ; CHECK-GI-NEXT:    sshll v2.8h, v2.8b, #0
 ; CHECK-GI-NEXT:    ushll v3.8h, v3.8b, #0
-; CHECK-GI-NEXT:    sshll2 v4.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ushll2 v5.4s, v1.8h, #0
-; CHECK-GI-NEXT:    sshll2 v6.4s, v2.8h, #0
-; CHECK-GI-NEXT:    ushll2 v7.4s, v3.8h, #0
-; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
-; CHECK-GI-NEXT:    ushll v3.4s, v3.4h, #0
-; CHECK-GI-NEXT:    mul v4.4s, v4.4s, v5.4s
-; CHECK-GI-NEXT:    mul v5.4s, v6.4s, v7.4s
-; CHECK-GI-NEXT:    mla v4.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT:    mla v5.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT:    smull v4.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    smull v5.4s, v2.4h, v3.4h
+; CHECK-GI-NEXT:    smlal2 v4.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT:    smlal2 v5.4s, v2.8h, v3.8h
 ; CHECK-GI-NEXT:    addv s0, v4.4s
 ; CHECK-GI-NEXT:    addv s1, v5.4s
 ; CHECK-GI-NEXT:    fmov w8, s0
@@ -1431,33 +1391,17 @@ define i32 @test_usdot_v16i8_double(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <1
 ; CHECK-GI-NEXT:    sshll v5.8h, v1.8b, #0
 ; CHECK-GI-NEXT:    sshll2 v1.8h, v1.16b, #0
 ; CHECK-GI-NEXT:    ushll v6.8h, v2.8b, #0
-; CHECK-GI-NEXT:    sshll v7.8h, v3.8b, #0
 ; CHECK-GI-NEXT:    ushll2 v2.8h, v2.16b, #0
+; CHECK-GI-NEXT:    sshll v7.8h, v3.8b, #0
 ; CHECK-GI-NEXT:    sshll2 v3.8h, v3.16b, #0
-; CHECK-GI-NEXT:    ushll2 v16.4s, v4.8h, #0
-; CHECK-GI-NEXT:    ushll2 v17.4s, v0.8h, #0
-; CHECK-GI-NEXT:    sshll2 v18.4s, v5.8h, #0
-; CHECK-GI-NEXT:    sshll2 v19.4s, v1.8h, #0
-; CHECK-GI-NEXT:    ushll2 v20.4s, v6.8h, #0
-; CHECK-GI-NEXT:    sshll2 v21.4s, v7.8h, #0
-; CHECK-GI-NEXT:    ushll2 v22.4s, v2.8h, #0
-; CHECK-GI-NEXT:    sshll2 v23.4s, v3.8h, #0
-; CHECK-GI-NEXT:    ushll v4.4s, v4.4h, #0
-; CHECK-GI-NEXT:    mul v16.4s, v16.4s, v18.4s
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll v5.4s, v5.4h, #0
-; CHECK-GI-NEXT:    mul v17.4s, v17.4s, v19.4s
-; CHECK-GI-NEXT:    mul v18.4s, v20.4s, v21.4s
-; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-GI-NEXT:    mul v19.4s, v22.4s, v23.4s
-; CHECK-GI-NEXT:    ushll v6.4s, v6.4h, #0
-; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
-; CHECK-GI-NEXT:    sshll v7.4s, v7.4h, #0
-; CHECK-GI-NEXT:    sshll v3.4s, v3.4h, #0
-; CHECK-GI-NEXT:    mla v16.4s, v4.4s, v5.4s
-; CHECK-GI-NEXT:    mla v17.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT:    mla v18.4s, v6.4s, v7.4s
-; CHECK-GI-NEXT:    mla v19.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT:    smull v16.4s, v4.4h, v5.4h
+; CHECK-GI-NEXT:    smull v17.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    smull v18.4s, v6.4h, v7.4h
+; CHECK-GI-NEXT:    smull v19.4s, v2.4h, v3.4h
+; CHECK-GI-NEXT:    smlal2 v16.4s, v4.8h, v5.8h
+; CHECK-GI-NEXT:    smlal2 v17.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT:    smlal2 v18.4s, v6.8h, v7.8h
+; CHECK-GI-NEXT:    smlal2 v19.4s, v2.8h, v3.8h
 ; CHECK-GI-NEXT:    add v0.4s, v16.4s, v17.4s
 ; CHECK-GI-NEXT:    add v1.4s, v18.4s, v19.4s
 ; CHECK-GI-NEXT:    addv s0, v0.4s
@@ -1499,33 +1443,17 @@ define i32 @test_usdot_swapped_operands_v16i8_double(<16 x i8> %a, <16 x i8> %b,
 ; CHECK-GI-NEXT:    ushll v5.8h, v1.8b, #0
 ; CHECK-GI-NEXT:    ushll2 v1.8h, v1.16b, #0
 ; CHECK-GI-NEXT:    sshll v6.8h, v2.8b, #0
-; CHECK-GI-NEXT:    ushll v7.8h, v3.8b, #0
 ; CHECK-GI-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-GI-NEXT:    ushll v7.8h, v3.8b, #0
 ; CHECK-GI-NEXT:    ushll2 v3.8h, v3.16b, #0
-; CHECK-GI-NEXT:    sshll2 v16.4s, v4.8h, #0
-; CHECK-GI-NEXT:    sshll2 v17.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ushll2 v18.4s, v5.8h, #0
-; CHECK-GI-NEXT:    ushll2 v19.4s, v1.8h, #0
-; CHECK-GI-NEXT:    sshll2 v20.4s, v6.8h, #0
-; CHECK-GI-NEXT:    ushll2 v21.4s, v7.8h, #0
-; CHECK-GI-NEXT:    sshll2 v22.4s, v2.8h, #0
-; CHECK-GI-NEXT:    ushll2 v23.4s, v3.8h, #0
-; CHECK-GI-NEXT:    sshll v4.4s, v4.4h, #0
-; CHECK-GI-NEXT:    mul v16.4s, v16.4s, v18.4s
-; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v5.4s, v5.4h, #0
-; CHECK-GI-NEXT:    mul v17.4s, v17.4s, v19.4s
-; CHECK-GI-NEXT:    mul v18.4s, v20.4s, v21.4s
-; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-GI-NEXT:    mul v19.4s, v22.4s, v23.4s
-; CHECK-GI-NEXT:    sshll v6.4s, v6.4h, #0
-; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
-; CHECK-GI-NEXT:    ushll v7.4s, v7.4h, #0
-; CHECK-GI-NEXT:    ushll v3.4s, v3.4h, #0
-; CHECK-GI-NEXT:    mla v16.4s, v4.4s, v5.4s
-; CHECK-GI-NEXT:    mla v17.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT:    mla v18.4s, v6.4s, v7.4s
-; CHECK-GI-NEXT:    mla v19.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT:    smull v16.4s, v4.4h, v5.4h
+; CHECK-GI-NEXT:    smull v17.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    smull v18.4s, v6.4h, v7.4h
+; CHECK-GI-NEXT:    smull v19.4s, v2.4h, v3.4h
+; CHECK-GI-NEXT:    smlal2 v16.4s, v4.8h, v5.8h
+; CHECK-GI-NEXT:    smlal2 v17.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT:    smlal2 v18.4s, v6.8h, v7.8h
+; CHECK-GI-NEXT:    smlal2 v19.4s, v2.8h, v3.8h
 ; CHECK-GI-NEXT:    add v0.4s, v16.4s, v17.4s
 ; CHECK-GI-NEXT:    add v1.4s, v18.4s, v19.4s
 ; CHECK-GI-NEXT:    addv s0, v0.4s
@@ -3858,30 +3786,14 @@ define i32 @test_usdot_v32i8(ptr nocapture readonly %a, ptr nocapture readonly %
 ; CHECK-GI-NEXT:    ushll2 v2.8h, v2.16b, #0
 ; CHECK-GI-NEXT:    ushll v7.8h, v3.8b, #0
 ; CHECK-GI-NEXT:    ushll2 v3.8h, v3.16b, #0
-; CHECK-GI-NEXT:    sshll2 v16.4s, v4.8h, #0
-; CHECK-GI-NEXT:    sshll2 v17.4s, v0.8h, #0
-; CHECK-GI-NEXT:    sshll2 v18.4s, v5.8h, #0
-; CHECK-GI-NEXT:    sshll2 v19.4s, v1.8h, #0
-; CHECK-GI-NEXT:    ushll2 v20.4s, v6.8h, #0
-; CHECK-GI-NEXT:    ushll2 v21.4s, v2.8h, #0
-; CHECK-GI-NEXT:    ushll2 v22.4s, v7.8h, #0
-; CHECK-GI-NEXT:    ushll2 v23.4s, v3.8h, #0
-; CHECK-GI-NEXT:    sshll v4.4s, v4.4h, #0
-; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll v5.4s, v5.4h, #0
-; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
-; CHECK-GI-NEXT:    mul v16.4s, v16.4s, v20.4s
-; CHECK-GI-NEXT:    mul v17.4s, v17.4s, v21.4s
-; CHECK-GI-NEXT:    ushll v6.4s, v6.4h, #0
-; CHECK-GI-NEXT:    mul v18.4s, v18.4s, v22.4s
-; CHECK-GI-NEXT:    mul v19.4s, v19.4s, v23.4s
-; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
-; CHECK-GI-NEXT:    ushll v7.4s, v7.4h, #0
-; CHECK-GI-NEXT:    ushll v3.4s, v3.4h, #0
-; CHECK-GI-NEXT:    mla v16.4s, v4.4s, v6.4s
-; CHECK-GI-NEXT:    mla v17.4s, v0.4s, v2.4s
-; CHECK-GI-NEXT:    mla v18.4s, v5.4s, v7.4s
-; CHECK-GI-NEXT:    mla v19.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT:    smull v16.4s, v4.4h, v6.4h
+; CHECK-GI-NEXT:    smull v17.4s, v0.4h, v2.4h
+; CHECK-GI-NEXT:    smull v18.4s, v5.4h, v7.4h
+; CHECK-GI-NEXT:    smull v19.4s, v1.4h, v3.4h
+; CHECK-GI-NEXT:    smlal2 v16.4s, v4.8h, v6.8h
+; CHECK-GI-NEXT:    smlal2 v17.4s, v0.8h, v2.8h
+; CHECK-GI-NEXT:    smlal2 v18.4s, v5.8h, v7.8h
+; CHECK-GI-NEXT:    smlal2 v19.4s, v1.8h, v3.8h
 ; CHECK-GI-NEXT:    add v0.4s, v16.4s, v17.4s
 ; CHECK-GI-NEXT:    add v1.4s, v18.4s, v19.4s
 ; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
@@ -3920,19 +3832,6 @@ define i32 @test_usdot_v32i8_double(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <3
 ;
 ; CHECK-GI-LABEL: test_usdot_v32i8_double:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
-; CHECK-GI-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
-; CHECK-GI-NEXT:    .cfi_offset b8, -8
-; CHECK-GI-NEXT:    .cfi_offset b9, -16
-; CHECK-GI-NEXT:    .cfi_offset b10, -24
-; CHECK-GI-NEXT:    .cfi_offset b11, -32
-; CHECK-GI-NEXT:    .cfi_offset b12, -40
-; CHECK-GI-NEXT:    .cfi_offset b13, -48
-; CHECK-GI-NEXT:    .cfi_offset b14, -56
-; CHECK-GI-NEXT:    .cfi_offset b15, -64
 ; CHECK-GI-NEXT:    ushll v16.8h, v0.8b, #0
 ; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
 ; CHECK-GI-NEXT:    ushll v17.8h, v1.8b, #0
@@ -3941,69 +3840,34 @@ define i32 @test_usdot_v32i8_double(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <3
 ; CHECK-GI-NEXT:    sshll2 v2.8h, v2.16b, #0
 ; CHECK-GI-NEXT:    sshll v19.8h, v3.8b, #0
 ; CHECK-GI-NEXT:    sshll2 v3.8h, v3.16b, #0
-; CHECK-GI-NEXT:    ushll v27.8h, v4.8b, #0
+; CHECK-GI-NEXT:    ushll v20.8h, v4.8b, #0
 ; CHECK-GI-NEXT:    ushll2 v4.8h, v4.16b, #0
-; CHECK-GI-NEXT:    ushll v28.8h, v5.8b, #0
-; CHECK-GI-NEXT:    sshll v29.8h, v6.8b, #0
-; CHECK-GI-NEXT:    sshll2 v6.8h, v6.16b, #0
+; CHECK-GI-NEXT:    ushll v21.8h, v5.8b, #0
 ; CHECK-GI-NEXT:    ushll2 v5.8h, v5.16b, #0
-; CHECK-GI-NEXT:    sshll v30.8h, v7.8b, #0
+; CHECK-GI-NEXT:    sshll v22.8h, v6.8b, #0
+; CHECK-GI-NEXT:    sshll2 v6.8h, v6.16b, #0
+; CHECK-GI-NEXT:    sshll v23.8h, v7.8b, #0
 ; CHECK-GI-NEXT:    sshll2 v7.8h, v7.16b, #0
-; CHECK-GI-NEXT:    ushll2 v20.4s, v16.8h, #0
-; CHECK-GI-NEXT:    ushll2 v21.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ushll2 v22.4s, v17.8h, #0
-; CHECK-GI-NEXT:    ushll2 v23.4s, v1.8h, #0
-; CHECK-GI-NEXT:    sshll2 v24.4s, v18.8h, #0
-; CHECK-GI-NEXT:    sshll2 v25.4s, v2.8h, #0
-; CHECK-GI-NEXT:    sshll2 v26.4s, v19.8h, #0
-; CHECK-GI-NEXT:    sshll2 v31.4s, v3.8h, #0
-; CHECK-GI-NEXT:    ushll2 v8.4s, v27.8h, #0
-; CHECK-GI-NEXT:    ushll2 v9.4s, v4.8h, #0
-; CHECK-GI-NEXT:    ushll2 v10.4s, v28.8h, #0
-; CHECK-GI-NEXT:    sshll2 v11.4s, v29.8h, #0
-; CHECK-GI-NEXT:    sshll2 v12.4s, v6.8h, #0
-; CHECK-GI-NEXT:    ushll2 v13.4s, v5.8h, #0
-; CHECK-GI-NEXT:    sshll2 v14.4s, v30.8h, #0
-; CHECK-GI-NEXT:    sshll2 v15.4s, v7.8h, #0
-; CHECK-GI-NEXT:    mul v20.4s, v20.4s, v24.4s
-; CHECK-GI-NEXT:    mul v21.4s, v21.4s, v25.4s
-; CHECK-GI-NEXT:    mul v22.4s, v22.4s, v26.4s
-; CHECK-GI-NEXT:    mul v23.4s, v23.4s, v31.4s
-; CHECK-GI-NEXT:    mul v24.4s, v8.4s, v11.4s
-; CHECK-GI-NEXT:    mul v25.4s, v9.4s, v12.4s
-; CHECK-GI-NEXT:    ushll v16.4s, v16.4h, #0
-; CHECK-GI-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mul v26.4s, v10.4s, v14.4s
-; CHECK-GI-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mul v31.4s, v13.4s, v15.4s
-; CHECK-GI-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    ushll v17.4s, v17.4h, #0
-; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-GI-NEXT:    sshll v18.4s, v18.4h, #0
-; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
-; CHECK-GI-NEXT:    sshll v19.4s, v19.4h, #0
-; CHECK-GI-NEXT:    sshll v3.4s, v3.4h, #0
-; CHECK-GI-NEXT:    ushll v27.4s, v27.4h, #0
-; CHECK-GI-NEXT:    ushll v4.4s, v4.4h, #0
-; CHECK-GI-NEXT:    ushll v28.4s, v28.4h, #0
-; CHECK-GI-NEXT:    ushll v5.4s, v5.4h, #0
-; CHECK-GI-NEXT:    sshll v29.4s, v29.4h, #0
-; CHECK-GI-NEXT:    sshll v6.4s, v6.4h, #0
-; CHECK-GI-NEXT:    sshll v30.4s, v30.4h, #0
-; CHECK-GI-NEXT:    sshll v7.4s, v7.4h, #0
-; CHECK-GI-NEXT:    mla v20.4s, v16.4s, v18.4s
-; CHECK-GI-NEXT:    mla v21.4s, v0.4s, v2.4s
-; CHECK-GI-NEXT:    mla v22.4s, v17.4s, v19.4s
-; CHECK-GI-NEXT:    mla v23.4s, v1.4s, v3.4s
-; CHECK-GI-NEXT:    mla v24.4s, v27.4s, v29.4s
-; CHECK-GI-NEXT:    mla v25.4s, v4.4s, v6.4s
-; CHECK-GI-NEXT:    mla v26.4s, v28.4s, v30.4s
-; CHECK-GI-NEXT:    mla v31.4s, v5.4s, v7.4s
-; CHECK-GI-NEXT:    add v0.4s, v20.4s, v21.4s
-; CHECK-GI-NEXT:    add v1.4s, v22.4s, v23.4s
-; CHECK-GI-NEXT:    add v2.4s, v24.4s, v25.4s
-; CHECK-GI-NEXT:    add v3.4s, v26.4s, v31.4s
+; CHECK-GI-NEXT:    smull v24.4s, v16.4h, v18.4h
+; CHECK-GI-NEXT:    smull v25.4s, v0.4h, v2.4h
+; CHECK-GI-NEXT:    smull v26.4s, v17.4h, v19.4h
+; CHECK-GI-NEXT:    smull v27.4s, v1.4h, v3.4h
+; CHECK-GI-NEXT:    smull v28.4s, v20.4h, v22.4h
+; CHECK-GI-NEXT:    smull v29.4s, v4.4h, v6.4h
+; CHECK-GI-NEXT:    smull v30.4s, v21.4h, v23.4h
+; CHECK-GI-NEXT:    smull v31.4s, v5.4h, v7.4h
+; CHECK-GI-NEXT:    smlal2 v24.4s, v16.8h, v18.8h
+; CHECK-GI-NEXT:    smlal2 v25.4s, v0.8h, v2.8h
+; CHECK-GI-NEXT:    smlal2 v26.4s, v17.8h, v19.8h
+; CHECK-GI-NEXT:    smlal2 v27.4s, v1.8h, v3.8h
+; CHECK-GI-NEXT:    smlal2 v28.4s, v20.8h, v22.8h
+; CHECK-GI-NEXT:    smlal2 v29.4s, v4.8h, v6.8h
+; CHECK-GI-NEXT:    smlal2 v30.4s, v21.8h, v23.8h
+; CHECK-GI-NEXT:    smlal2 v31.4s, v5.8h, v7.8h
+; CHECK-GI-NEXT:    add v0.4s, v24.4s, v25.4s
+; CHECK-GI-NEXT:    add v1.4s, v26.4s, v27.4s
+; CHECK-GI-NEXT:    add v2.4s, v28.4s, v29.4s
+; CHECK-GI-NEXT:    add v3.4s, v30.4s, v31.4s
 ; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-GI-NEXT:    add v1.4s, v2.4s, v3.4s
 ; CHECK-GI-NEXT:    addv s0, v0.4s
@@ -4011,7 +3875,6 @@ define i32 @test_usdot_v32i8_double(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <3
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    fmov w9, s1
 ; CHECK-GI-NEXT:    add w0, w8, w9
-; CHECK-GI-NEXT:    ldp d15, d14, [sp], #64 // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ret
 entry:
   %az = zext <32 x i8> %a to <32 x i32>
@@ -7415,101 +7278,52 @@ define i32 @test_usdot_v64i8(ptr nocapture readonly %a, ptr nocapture readonly %
 ;
 ; CHECK-GI-LABEL: test_usdot_v64i8:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
-; CHECK-GI-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
-; CHECK-GI-NEXT:    .cfi_offset b8, -8
-; CHECK-GI-NEXT:    .cfi_offset b9, -16
-; CHECK-GI-NEXT:    .cfi_offset b10, -24
-; CHECK-GI-NEXT:    .cfi_offset b11, -32
-; CHECK-GI-NEXT:    .cfi_offset b12, -40
-; CHECK-GI-NEXT:    .cfi_offset b13, -48
-; CHECK-GI-NEXT:    .cfi_offset b14, -56
-; CHECK-GI-NEXT:    .cfi_offset b15, -64
-; CHECK-GI-NEXT:    ldp q0, q1, [x1]
-; CHECK-GI-NEXT:    ldp q21, q17, [x0]
-; CHECK-GI-NEXT:    ldp q3, q19, [x1, #32]
-; CHECK-GI-NEXT:    ldp q18, q4, [x0, #32]
-; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
-; CHECK-GI-NEXT:    sshll2 v5.8h, v0.16b, #0
-; CHECK-GI-NEXT:    sshll v7.8h, v1.8b, #0
-; CHECK-GI-NEXT:    sshll2 v22.8h, v1.16b, #0
-; CHECK-GI-NEXT:    sshll v23.8h, v3.8b, #0
-; CHECK-GI-NEXT:    sshll2 v24.8h, v3.16b, #0
-; CHECK-GI-NEXT:    sshll v25.8h, v19.8b, #0
-; CHECK-GI-NEXT:    sshll2 v26.8h, v19.16b, #0
-; CHECK-GI-NEXT:    ushll v27.8h, v21.8b, #0
-; CHECK-GI-NEXT:    ushll2 v28.8h, v21.16b, #0
-; CHECK-GI-NEXT:    ushll v30.8h, v17.8b, #0
-; CHECK-GI-NEXT:    ushll2 v17.8h, v17.16b, #0
-; CHECK-GI-NEXT:    ushll v8.8h, v18.8b, #0
-; CHECK-GI-NEXT:    ushll2 v18.8h, v18.16b, #0
-; CHECK-GI-NEXT:    ushll v9.8h, v4.8b, #0
+; CHECK-GI-NEXT:    ldp q0, q1, [x0]
+; CHECK-GI-NEXT:    ldp q2, q5, [x1]
+; CHECK-GI-NEXT:    ldp q3, q4, [x0, #32]
+; CHECK-GI-NEXT:    ldp q6, q7, [x1, #32]
+; CHECK-GI-NEXT:    ushll v20.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v16.8h, v2.8b, #0
+; CHECK-GI-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-GI-NEXT:    sshll v17.8h, v5.8b, #0
+; CHECK-GI-NEXT:    sshll2 v5.8h, v5.16b, #0
+; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    ushll v21.8h, v1.8b, #0
+; CHECK-GI-NEXT:    sshll v18.8h, v6.8b, #0
+; CHECK-GI-NEXT:    sshll2 v6.8h, v6.16b, #0
+; CHECK-GI-NEXT:    sshll v19.8h, v7.8b, #0
+; CHECK-GI-NEXT:    sshll2 v7.8h, v7.16b, #0
+; CHECK-GI-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT:    ushll v22.8h, v3.8b, #0
+; CHECK-GI-NEXT:    ushll2 v3.8h, v3.16b, #0
+; CHECK-GI-NEXT:    ushll v23.8h, v4.8b, #0
 ; CHECK-GI-NEXT:    ushll2 v4.8h, v4.16b, #0
-; CHECK-GI-NEXT:    sshll v0.4s, v2.4h, #0
-; CHECK-GI-NEXT:    sshll2 v6.4s, v2.8h, #0
-; CHECK-GI-NEXT:    sshll v1.4s, v5.4h, #0
-; CHECK-GI-NEXT:    sshll2 v16.4s, v5.8h, #0
-; CHECK-GI-NEXT:    sshll v2.4s, v7.4h, #0
-; CHECK-GI-NEXT:    sshll2 v20.4s, v7.8h, #0
-; CHECK-GI-NEXT:    sshll v3.4s, v22.4h, #0
-; CHECK-GI-NEXT:    sshll2 v22.4s, v22.8h, #0
-; CHECK-GI-NEXT:    sshll v5.4s, v23.4h, #0
-; CHECK-GI-NEXT:    sshll2 v23.4s, v23.8h, #0
-; CHECK-GI-NEXT:    sshll v7.4s, v24.4h, #0
-; CHECK-GI-NEXT:    sshll2 v24.4s, v24.8h, #0
-; CHECK-GI-NEXT:    sshll v19.4s, v25.4h, #0
-; CHECK-GI-NEXT:    sshll2 v25.4s, v25.8h, #0
-; CHECK-GI-NEXT:    sshll v21.4s, v26.4h, #0
-; CHECK-GI-NEXT:    sshll2 v26.4s, v26.8h, #0
-; CHECK-GI-NEXT:    ushll v29.4s, v27.4h, #0
-; CHECK-GI-NEXT:    ushll2 v27.4s, v27.8h, #0
-; CHECK-GI-NEXT:    ushll v31.4s, v28.4h, #0
-; CHECK-GI-NEXT:    ushll2 v28.4s, v28.8h, #0
-; CHECK-GI-NEXT:    ushll v10.4s, v30.4h, #0
-; CHECK-GI-NEXT:    ushll2 v30.4s, v30.8h, #0
-; CHECK-GI-NEXT:    ushll v11.4s, v17.4h, #0
-; CHECK-GI-NEXT:    ushll2 v17.4s, v17.8h, #0
-; CHECK-GI-NEXT:    ushll2 v12.4s, v8.8h, #0
-; CHECK-GI-NEXT:    ushll2 v13.4s, v18.8h, #0
-; CHECK-GI-NEXT:    ushll2 v14.4s, v9.8h, #0
-; CHECK-GI-NEXT:    ushll2 v15.4s, v4.8h, #0
-; CHECK-GI-NEXT:    mul v6.4s, v6.4s, v27.4s
-; CHECK-GI-NEXT:    mul v16.4s, v16.4s, v28.4s
-; CHECK-GI-NEXT:    mul v20.4s, v20.4s, v30.4s
-; CHECK-GI-NEXT:    mul v17.4s, v22.4s, v17.4s
-; CHECK-GI-NEXT:    ushll v8.4s, v8.4h, #0
-; CHECK-GI-NEXT:    mul v22.4s, v23.4s, v12.4s
-; CHECK-GI-NEXT:    mul v23.4s, v24.4s, v13.4s
-; CHECK-GI-NEXT:    mul v24.4s, v25.4s, v14.4s
-; CHECK-GI-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mul v25.4s, v26.4s, v15.4s
-; CHECK-GI-NEXT:    ushll v18.4s, v18.4h, #0
-; CHECK-GI-NEXT:    ushll v26.4s, v9.4h, #0
-; CHECK-GI-NEXT:    ushll v4.4s, v4.4h, #0
-; CHECK-GI-NEXT:    mla v6.4s, v0.4s, v29.4s
-; CHECK-GI-NEXT:    mla v16.4s, v1.4s, v31.4s
-; CHECK-GI-NEXT:    mla v20.4s, v2.4s, v10.4s
-; CHECK-GI-NEXT:    mla v17.4s, v3.4s, v11.4s
-; CHECK-GI-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mla v22.4s, v5.4s, v8.4s
-; CHECK-GI-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mla v23.4s, v7.4s, v18.4s
-; CHECK-GI-NEXT:    mla v24.4s, v19.4s, v26.4s
-; CHECK-GI-NEXT:    mla v25.4s, v21.4s, v4.4s
-; CHECK-GI-NEXT:    add v0.4s, v6.4s, v16.4s
-; CHECK-GI-NEXT:    add v1.4s, v20.4s, v17.4s
-; CHECK-GI-NEXT:    add v2.4s, v22.4s, v23.4s
-; CHECK-GI-NEXT:    add v3.4s, v24.4s, v25.4s
+; CHECK-GI-NEXT:    smull v24.4s, v16.4h, v20.4h
+; CHECK-GI-NEXT:    smull v25.4s, v2.4h, v0.4h
+; CHECK-GI-NEXT:    smull v26.4s, v17.4h, v21.4h
+; CHECK-GI-NEXT:    smull v27.4s, v5.4h, v1.4h
+; CHECK-GI-NEXT:    smull v28.4s, v18.4h, v22.4h
+; CHECK-GI-NEXT:    smull v29.4s, v6.4h, v3.4h
+; CHECK-GI-NEXT:    smull v30.4s, v19.4h, v23.4h
+; CHECK-GI-NEXT:    smull v31.4s, v7.4h, v4.4h
+; CHECK-GI-NEXT:    smlal2 v24.4s, v16.8h, v20.8h
+; CHECK-GI-NEXT:    smlal2 v25.4s, v2.8h, v0.8h
+; CHECK-GI-NEXT:    smlal2 v26.4s, v17.8h, v21.8h
+; CHECK-GI-NEXT:    smlal2 v27.4s, v5.8h, v1.8h
+; CHECK-GI-NEXT:    smlal2 v28.4s, v18.8h, v22.8h
+; CHECK-GI-NEXT:    smlal2 v29.4s, v6.8h, v3.8h
+; CHECK-GI-NEXT:    smlal2 v30.4s, v19.8h, v23.8h
+; CHECK-GI-NEXT:    smlal2 v31.4s, v7.8h, v4.8h
+; CHECK-GI-NEXT:    add v0.4s, v24.4s, v25.4s
+; CHECK-GI-NEXT:    add v1.4s, v26.4s, v27.4s
+; CHECK-GI-NEXT:    add v2.4s, v28.4s, v29.4s
+; CHECK-GI-NEXT:    add v3.4s, v30.4s, v31.4s
 ; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-GI-NEXT:    add v1.4s, v2.4s, v3.4s
 ; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-GI-NEXT:    addv s0, v0.4s
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    add w0, w8, w2
-; CHECK-GI-NEXT:    ldp d15, d14, [sp], #64 // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    ret
 entry:
   %0 = load <64 x i8>, ptr %a
@@ -7558,13 +7372,13 @@ define i32 @test_usdot_v64i8_double(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <6
 ;
 ; CHECK-GI-LABEL: test_usdot_v64i8_double:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sub sp, sp, #304
-; CHECK-GI-NEXT:    stp d15, d14, [sp, #224] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    stp d13, d12, [sp, #240] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    stp d11, d10, [sp, #256] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    stp d9, d8, [sp, #272] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    str x29, [sp, #288] // 8-byte Folded Spill
-; CHECK-GI-NEXT:    .cfi_def_cfa_offset 304
+; CHECK-GI-NEXT:    sub sp, sp, #240
+; CHECK-GI-NEXT:    stp d15, d14, [sp, #160] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp d13, d12, [sp, #176] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp d11, d10, [sp, #192] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #208] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x29, [sp, #224] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 240
 ; CHECK-GI-NEXT:    .cfi_offset w29, -16
 ; CHECK-GI-NEXT:    .cfi_offset b8, -24
 ; CHECK-GI-NEXT:    .cfi_offset b9, -32
@@ -7574,190 +7388,114 @@ define i32 @test_usdot_v64i8_double(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <6
 ; CHECK-GI-NEXT:    .cfi_offset b13, -64
 ; CHECK-GI-NEXT:    .cfi_offset b14, -72
 ; CHECK-GI-NEXT:    .cfi_offset b15, -80
-; CHECK-GI-NEXT:    ushll v17.8h, v0.8b, #0
-; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-GI-NEXT:    ldr x29, [sp, #288] // 8-byte Folded Reload
-; CHECK-GI-NEXT:    mov v20.16b, v3.16b
-; CHECK-GI-NEXT:    ushll v16.8h, v1.8b, #0
-; CHECK-GI-NEXT:    ushll2 v18.8h, v1.16b, #0
-; CHECK-GI-NEXT:    ushll v26.8h, v2.8b, #0
-; CHECK-GI-NEXT:    ldp q27, q28, [sp, #304]
-; CHECK-GI-NEXT:    ushll2 v29.8h, v2.16b, #0
-; CHECK-GI-NEXT:    ushll v2.4s, v17.4h, #0
-; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll v8.8h, v4.8b, #0
-; CHECK-GI-NEXT:    ldp q23, q21, [sp, #368]
-; CHECK-GI-NEXT:    sshll2 v9.8h, v4.16b, #0
-; CHECK-GI-NEXT:    sshll2 v11.8h, v5.16b, #0
-; CHECK-GI-NEXT:    mov v25.16b, v7.16b
-; CHECK-GI-NEXT:    ushll2 v19.4s, v17.8h, #0
-; CHECK-GI-NEXT:    stp q1, q2, [sp, #192] // 32-byte Folded Spill
-; CHECK-GI-NEXT:    ushll2 v3.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ushll2 v17.4s, v18.8h, #0
-; CHECK-GI-NEXT:    ldp q24, q22, [sp, #336]
-; CHECK-GI-NEXT:    sshll v10.8h, v5.8b, #0
-; CHECK-GI-NEXT:    sshll v12.8h, v6.8b, #0
-; CHECK-GI-NEXT:    sshll2 v13.8h, v6.16b, #0
-; CHECK-GI-NEXT:    mov v2.16b, v20.16b
-; CHECK-GI-NEXT:    sshll2 v0.4s, v8.8h, #0
-; CHECK-GI-NEXT:    sshll2 v4.4s, v9.8h, #0
-; CHECK-GI-NEXT:    sshll2 v6.4s, v11.8h, #0
-; CHECK-GI-NEXT:    ushll2 v7.4s, v16.8h, #0
-; CHECK-GI-NEXT:    ushll2 v31.4s, v29.8h, #0
-; CHECK-GI-NEXT:    sshll2 v5.4s, v10.8h, #0
-; CHECK-GI-NEXT:    sshll2 v1.4s, v13.8h, #0
-; CHECK-GI-NEXT:    ushll2 v30.4s, v26.8h, #0
-; CHECK-GI-NEXT:    ushll v14.8h, v2.8b, #0
-; CHECK-GI-NEXT:    mul v20.4s, v19.4s, v0.4s
-; CHECK-GI-NEXT:    mul v19.4s, v3.4s, v4.4s
-; CHECK-GI-NEXT:    sshll v0.8h, v25.8b, #0
-; CHECK-GI-NEXT:    mul v4.4s, v17.4s, v6.4s
-; CHECK-GI-NEXT:    sshll2 v15.4s, v12.8h, #0
-; CHECK-GI-NEXT:    ldp q17, q3, [sp, #400]
-; CHECK-GI-NEXT:    mul v5.4s, v7.4s, v5.4s
-; CHECK-GI-NEXT:    mul v7.4s, v31.4s, v1.4s
-; CHECK-GI-NEXT:    ushll2 v31.8h, v2.16b, #0
-; CHECK-GI-NEXT:    sshll2 v25.8h, v25.16b, #0
-; CHECK-GI-NEXT:    sshll2 v1.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ushll v2.4s, v14.4h, #0
-; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT:    str q3, [sp, #96] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    ushll2 v3.4s, v14.8h, #0
-; CHECK-GI-NEXT:    mul v6.4s, v30.4s, v15.4s
-; CHECK-GI-NEXT:    str q31, [sp, #160] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    ushll v30.4s, v26.4h, #0
-; CHECK-GI-NEXT:    sshll v26.4s, v8.4h, #0
-; CHECK-GI-NEXT:    ushll v14.8h, v27.8b, #0
-; CHECK-GI-NEXT:    ushll v15.4s, v29.4h, #0
-; CHECK-GI-NEXT:    sshll v29.4s, v9.4h, #0
-; CHECK-GI-NEXT:    mul v1.4s, v3.4s, v1.4s
-; CHECK-GI-NEXT:    ushll2 v3.4s, v31.8h, #0
-; CHECK-GI-NEXT:    ushll v31.8h, v28.8b, #0
-; CHECK-GI-NEXT:    ushll v16.4s, v16.4h, #0
-; CHECK-GI-NEXT:    sshll v8.4s, v10.4h, #0
-; CHECK-GI-NEXT:    sshll v9.4s, v11.4h, #0
-; CHECK-GI-NEXT:    sshll v10.4s, v12.4h, #0
-; CHECK-GI-NEXT:    sshll v11.4s, v13.4h, #0
-; CHECK-GI-NEXT:    ushll v18.4s, v18.4h, #0
-; CHECK-GI-NEXT:    stp q3, q25, [sp, #112] // 32-byte Folded Spill
-; CHECK-GI-NEXT:    ldr q3, [sp, #208] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ushll2 v28.8h, v28.16b, #0
-; CHECK-GI-NEXT:    mla v1.4s, v2.4s, v0.4s
-; CHECK-GI-NEXT:    ushll2 v0.4s, v31.8h, #0
-; CHECK-GI-NEXT:    mla v5.4s, v16.4s, v8.4s
-; CHECK-GI-NEXT:    mla v20.4s, v3.4s, v26.4s
-; CHECK-GI-NEXT:    sshll2 v3.4s, v25.8h, #0
-; CHECK-GI-NEXT:    mla v6.4s, v30.4s, v10.4s
-; CHECK-GI-NEXT:    mla v7.4s, v15.4s, v11.4s
-; CHECK-GI-NEXT:    sshll v25.8h, v23.8b, #0
-; CHECK-GI-NEXT:    mla v4.4s, v18.4s, v9.4s
-; CHECK-GI-NEXT:    ushll v30.8h, v22.8b, #0
-; CHECK-GI-NEXT:    ushll2 v26.8h, v22.16b, #0
-; CHECK-GI-NEXT:    sshll v22.8h, v21.8b, #0
-; CHECK-GI-NEXT:    str q3, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    ldr q3, [sp, #192] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ushll2 v8.8h, v27.16b, #0
-; CHECK-GI-NEXT:    str q1, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    ldr q9, [sp, #32] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ushll2 v1.4s, v14.8h, #0
-; CHECK-GI-NEXT:    stp q7, q6, [sp, #64] // 32-byte Folded Spill
-; CHECK-GI-NEXT:    mla v19.4s, v3.4s, v29.4s
-; CHECK-GI-NEXT:    sshll2 v7.4s, v25.8h, #0
-; CHECK-GI-NEXT:    str q5, [sp, #176] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    ushll v29.8h, v24.8b, #0
-; CHECK-GI-NEXT:    ushll2 v27.8h, v24.16b, #0
-; CHECK-GI-NEXT:    stp q0, q1, [sp] // 32-byte Folded Spill
-; CHECK-GI-NEXT:    ldp q0, q16, [sp, #96] // 32-byte Folded Reload
-; CHECK-GI-NEXT:    str q4, [sp, #144] // 16-byte Folded Spill
-; CHECK-GI-NEXT:    sshll2 v24.8h, v23.16b, #0
-; CHECK-GI-NEXT:    ushll2 v18.4s, v26.8h, #0
-; CHECK-GI-NEXT:    stp q19, q20, [sp, #192] // 32-byte Folded Spill
-; CHECK-GI-NEXT:    sshll2 v20.8h, v21.16b, #0
-; CHECK-GI-NEXT:    sshll v21.8h, v17.8b, #0
+; CHECK-GI-NEXT:    ushll v31.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll2 v8.8h, v0.16b, #0
+; CHECK-GI-NEXT:    ldr x29, [sp, #224] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    sshll v11.8h, v4.8b, #0
+; CHECK-GI-NEXT:    sshll2 v12.8h, v4.16b, #0
+; CHECK-GI-NEXT:    ushll v9.8h, v1.8b, #0
+; CHECK-GI-NEXT:    ushll2 v10.8h, v1.16b, #0
+; CHECK-GI-NEXT:    ldp q25, q22, [sp, #240]
+; CHECK-GI-NEXT:    sshll v13.8h, v5.8b, #0
+; CHECK-GI-NEXT:    sshll2 v14.8h, v5.16b, #0
+; CHECK-GI-NEXT:    ushll v0.8h, v2.8b, #0
+; CHECK-GI-NEXT:    smull v19.4s, v31.4h, v11.4h
+; CHECK-GI-NEXT:    ldp q21, q18, [sp, #272]
+; CHECK-GI-NEXT:    smull v20.4s, v8.4h, v12.4h
+; CHECK-GI-NEXT:    sshll v4.8h, v6.8b, #0
+; CHECK-GI-NEXT:    sshll2 v5.8h, v6.16b, #0
+; CHECK-GI-NEXT:    smull v28.4s, v9.4h, v13.4h
+; CHECK-GI-NEXT:    ldp q17, q16, [sp, #304]
+; CHECK-GI-NEXT:    smull v27.4s, v10.4h, v14.4h
+; CHECK-GI-NEXT:    sshll v6.8h, v7.8b, #0
+; CHECK-GI-NEXT:    ushll2 v1.8h, v2.16b, #0
+; CHECK-GI-NEXT:    smlal2 v19.4s, v31.8h, v11.8h
+; CHECK-GI-NEXT:    ldp q30, q29, [sp, #336]
+; CHECK-GI-NEXT:    smlal2 v20.4s, v8.8h, v12.8h
+; CHECK-GI-NEXT:    ushll v2.8h, v3.8b, #0
+; CHECK-GI-NEXT:    ushll2 v3.8h, v3.16b, #0
+; CHECK-GI-NEXT:    smlal2 v28.4s, v9.8h, v13.8h
+; CHECK-GI-NEXT:    stp q0, q6, [sp, #48] // 32-byte Folded Spill
+; CHECK-GI-NEXT:    sshll2 v7.8h, v7.16b, #0
+; CHECK-GI-NEXT:    smlal2 v27.4s, v10.8h, v14.8h
+; CHECK-GI-NEXT:    smull v26.4s, v0.4h, v4.4h
+; CHECK-GI-NEXT:    ushll v31.8h, v25.8b, #0
+; CHECK-GI-NEXT:    str q19, [sp, #144] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    ushll2 v25.8h, v25.16b, #0
+; CHECK-GI-NEXT:    ushll v8.8h, v22.8b, #0
+; CHECK-GI-NEXT:    stp q2, q1, [sp] // 32-byte Folded Spill
+; CHECK-GI-NEXT:    ushll2 v22.8h, v22.16b, #0
+; CHECK-GI-NEXT:    ushll v9.8h, v21.8b, #0
+; CHECK-GI-NEXT:    stp q5, q28, [sp, #80] // 32-byte Folded Spill
+; CHECK-GI-NEXT:    ushll2 v21.8h, v21.16b, #0
+; CHECK-GI-NEXT:    ushll v10.8h, v18.8b, #0
+; CHECK-GI-NEXT:    stp q4, q20, [sp, #112] // 32-byte Folded Spill
+; CHECK-GI-NEXT:    ushll2 v20.8h, v18.16b, #0
+; CHECK-GI-NEXT:    sshll v11.8h, v17.8b, #0
+; CHECK-GI-NEXT:    str q27, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    ldr q28, [sp, #112] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr q27, [sp, #48] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    sshll2 v19.8h, v17.16b, #0
-; CHECK-GI-NEXT:    sshll2 v17.8h, v0.16b, #0
-; CHECK-GI-NEXT:    mul v16.4s, v16.4s, v9.4s
-; CHECK-GI-NEXT:    ldr q9, [sp, #16] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    sshll v23.8h, v0.8b, #0
-; CHECK-GI-NEXT:    sshll2 v2.4s, v22.8h, #0
-; CHECK-GI-NEXT:    ushll2 v12.4s, v27.8h, #0
-; CHECK-GI-NEXT:    ushll v26.4s, v26.4h, #0
-; CHECK-GI-NEXT:    ushll2 v10.4s, v28.8h, #0
-; CHECK-GI-NEXT:    sshll2 v0.4s, v17.8h, #0
-; CHECK-GI-NEXT:    mul v7.4s, v9.4s, v7.4s
-; CHECK-GI-NEXT:    ldr q9, [sp] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    sshll2 v5.4s, v19.8h, #0
-; CHECK-GI-NEXT:    sshll v17.4s, v17.4h, #0
-; CHECK-GI-NEXT:    sshll2 v3.4s, v20.8h, #0
-; CHECK-GI-NEXT:    mul v2.4s, v9.4s, v2.4s
-; CHECK-GI-NEXT:    ldr q9, [sp, #128] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ushll2 v15.4s, v8.8h, #0
-; CHECK-GI-NEXT:    mul v0.4s, v18.4s, v0.4s
-; CHECK-GI-NEXT:    ldr q18, [sp, #160] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ushll2 v11.4s, v29.8h, #0
-; CHECK-GI-NEXT:    sshll v9.4s, v9.4h, #0
-; CHECK-GI-NEXT:    ushll2 v13.4s, v30.8h, #0
-; CHECK-GI-NEXT:    sshll2 v1.4s, v24.8h, #0
-; CHECK-GI-NEXT:    ushll v18.4s, v18.4h, #0
-; CHECK-GI-NEXT:    sshll2 v4.4s, v21.8h, #0
-; CHECK-GI-NEXT:    sshll2 v6.4s, v23.8h, #0
-; CHECK-GI-NEXT:    mul v5.4s, v12.4s, v5.4s
-; CHECK-GI-NEXT:    ushll v27.4s, v27.4h, #0
-; CHECK-GI-NEXT:    sshll v19.4s, v19.4h, #0
-; CHECK-GI-NEXT:    mla v0.4s, v26.4s, v17.4s
-; CHECK-GI-NEXT:    mul v3.4s, v10.4s, v3.4s
-; CHECK-GI-NEXT:    mul v1.4s, v15.4s, v1.4s
-; CHECK-GI-NEXT:    mla v16.4s, v18.4s, v9.4s
-; CHECK-GI-NEXT:    ldp q18, q17, [sp, #192] // 32-byte Folded Reload
-; CHECK-GI-NEXT:    mul v4.4s, v11.4s, v4.4s
-; CHECK-GI-NEXT:    mul v6.4s, v13.4s, v6.4s
-; CHECK-GI-NEXT:    ushll v28.4s, v28.4h, #0
-; CHECK-GI-NEXT:    ldp d13, d12, [sp, #240] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    sshll v20.4s, v20.4h, #0
-; CHECK-GI-NEXT:    ushll v10.4s, v14.4h, #0
-; CHECK-GI-NEXT:    ldp d15, d14, [sp, #224] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    ushll v8.4s, v8.4h, #0
-; CHECK-GI-NEXT:    ushll v31.4s, v31.4h, #0
-; CHECK-GI-NEXT:    ushll v29.4s, v29.4h, #0
-; CHECK-GI-NEXT:    ushll v30.4s, v30.4h, #0
-; CHECK-GI-NEXT:    sshll v25.4s, v25.4h, #0
-; CHECK-GI-NEXT:    sshll v24.4s, v24.4h, #0
-; CHECK-GI-NEXT:    sshll v22.4s, v22.4h, #0
-; CHECK-GI-NEXT:    sshll v21.4s, v21.4h, #0
-; CHECK-GI-NEXT:    sshll v23.4s, v23.4h, #0
-; CHECK-GI-NEXT:    mla v5.4s, v27.4s, v19.4s
-; CHECK-GI-NEXT:    ldr q19, [sp, #144] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    add v17.4s, v17.4s, v18.4s
-; CHECK-GI-NEXT:    ldr q18, [sp, #176] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mla v3.4s, v28.4s, v20.4s
-; CHECK-GI-NEXT:    mla v7.4s, v10.4s, v25.4s
-; CHECK-GI-NEXT:    ldp d11, d10, [sp, #256] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    mla v1.4s, v8.4s, v24.4s
-; CHECK-GI-NEXT:    ldp d9, d8, [sp, #272] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    add v18.4s, v18.4s, v19.4s
-; CHECK-GI-NEXT:    ldp q20, q19, [sp, #64] // 32-byte Folded Reload
-; CHECK-GI-NEXT:    mla v2.4s, v31.4s, v22.4s
-; CHECK-GI-NEXT:    mla v4.4s, v29.4s, v21.4s
-; CHECK-GI-NEXT:    mla v6.4s, v30.4s, v23.4s
-; CHECK-GI-NEXT:    add v1.4s, v7.4s, v1.4s
-; CHECK-GI-NEXT:    add v19.4s, v19.4s, v20.4s
-; CHECK-GI-NEXT:    ldr q20, [sp, #48] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    add v2.4s, v2.4s, v3.4s
-; CHECK-GI-NEXT:    add v16.4s, v20.4s, v16.4s
-; CHECK-GI-NEXT:    add v3.4s, v4.4s, v5.4s
-; CHECK-GI-NEXT:    add v0.4s, v6.4s, v0.4s
-; CHECK-GI-NEXT:    add v4.4s, v17.4s, v18.4s
-; CHECK-GI-NEXT:    add v1.4s, v1.4s, v2.4s
-; CHECK-GI-NEXT:    add v5.4s, v19.4s, v16.4s
-; CHECK-GI-NEXT:    add v0.4s, v3.4s, v0.4s
-; CHECK-GI-NEXT:    add v2.4s, v4.4s, v5.4s
+; CHECK-GI-NEXT:    sshll v12.8h, v16.8b, #0
+; CHECK-GI-NEXT:    sshll2 v18.8h, v16.16b, #0
+; CHECK-GI-NEXT:    sshll v13.8h, v30.8b, #0
+; CHECK-GI-NEXT:    sshll2 v30.8h, v30.16b, #0
+; CHECK-GI-NEXT:    sshll v14.8h, v29.8b, #0
+; CHECK-GI-NEXT:    sshll2 v29.8h, v29.16b, #0
+; CHECK-GI-NEXT:    smull v23.4s, v1.4h, v5.4h
+; CHECK-GI-NEXT:    smull v15.4s, v3.4h, v7.4h
+; CHECK-GI-NEXT:    smull v24.4s, v2.4h, v6.4h
+; CHECK-GI-NEXT:    smull v17.4s, v31.4h, v11.4h
+; CHECK-GI-NEXT:    smull v6.4s, v25.4h, v19.4h
+; CHECK-GI-NEXT:    smull v16.4s, v8.4h, v12.4h
+; CHECK-GI-NEXT:    smull v4.4s, v22.4h, v18.4h
+; CHECK-GI-NEXT:    smull v5.4s, v9.4h, v13.4h
+; CHECK-GI-NEXT:    smull v2.4s, v21.4h, v30.4h
+; CHECK-GI-NEXT:    smull v1.4s, v10.4h, v14.4h
+; CHECK-GI-NEXT:    smull v0.4s, v20.4h, v29.4h
+; CHECK-GI-NEXT:    smlal2 v26.4s, v27.8h, v28.8h
+; CHECK-GI-NEXT:    ldr q28, [sp, #80] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr q27, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    smlal2 v15.4s, v3.8h, v7.8h
+; CHECK-GI-NEXT:    ldp q7, q3, [sp, #128] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    smlal2 v23.4s, v27.8h, v28.8h
+; CHECK-GI-NEXT:    ldr q28, [sp, #64] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldr q27, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    smlal2 v17.4s, v31.8h, v11.8h
+; CHECK-GI-NEXT:    smlal2 v6.4s, v25.8h, v19.8h
+; CHECK-GI-NEXT:    smlal2 v16.4s, v8.8h, v12.8h
+; CHECK-GI-NEXT:    smlal2 v24.4s, v27.8h, v28.8h
+; CHECK-GI-NEXT:    smlal2 v4.4s, v22.8h, v18.8h
+; CHECK-GI-NEXT:    ldr q18, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    smlal2 v5.4s, v9.8h, v13.8h
+; CHECK-GI-NEXT:    ldp d9, d8, [sp, #208] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ldp d13, d12, [sp, #176] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    smlal2 v2.4s, v21.8h, v30.8h
+; CHECK-GI-NEXT:    smlal2 v1.4s, v10.8h, v14.8h
+; CHECK-GI-NEXT:    ldp d11, d10, [sp, #192] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    smlal2 v0.4s, v20.8h, v29.8h
+; CHECK-GI-NEXT:    add v3.4s, v3.4s, v7.4s
+; CHECK-GI-NEXT:    ldr q7, [sp, #96] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    add v19.4s, v24.4s, v15.4s
+; CHECK-GI-NEXT:    ldp d15, d14, [sp, #160] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    add v7.4s, v7.4s, v18.4s
+; CHECK-GI-NEXT:    add v18.4s, v26.4s, v23.4s
+; CHECK-GI-NEXT:    add v6.4s, v17.4s, v6.4s
+; CHECK-GI-NEXT:    add v4.4s, v16.4s, v4.4s
+; CHECK-GI-NEXT:    add v2.4s, v5.4s, v2.4s
 ; CHECK-GI-NEXT:    add v0.4s, v1.4s, v0.4s
-; CHECK-GI-NEXT:    addv s1, v2.4s
+; CHECK-GI-NEXT:    add v1.4s, v3.4s, v7.4s
+; CHECK-GI-NEXT:    add v3.4s, v18.4s, v19.4s
+; CHECK-GI-NEXT:    add v4.4s, v6.4s, v4.4s
+; CHECK-GI-NEXT:    add v0.4s, v2.4s, v0.4s
+; CHECK-GI-NEXT:    add v1.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT:    add v0.4s, v4.4s, v0.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
 ; CHECK-GI-NEXT:    addv s0, v0.4s
 ; CHECK-GI-NEXT:    fmov w8, s1
 ; CHECK-GI-NEXT:    fmov w9, s0
 ; CHECK-GI-NEXT:    add w0, w8, w9
-; CHECK-GI-NEXT:    add sp, sp, #304
+; CHECK-GI-NEXT:    add sp, sp, #240
 ; CHECK-GI-NEXT:    ret
 entry:
   %az = zext <64 x i8> %a to <64 x i32>
diff --git a/llvm/test/CodeGen/AArch64/neon-extmul.ll b/llvm/test/CodeGen/AArch64/neon-extmul.ll
index f83ac8ed642cc..c82f8e19f329a 100644
--- a/llvm/test/CodeGen/AArch64/neon-extmul.ll
+++ b/llvm/test/CodeGen/AArch64/neon-extmul.ll
@@ -57,14 +57,10 @@ define <8 x i32> @extmulsu_v8i8_i32(<8 x i8> %s0, <8 x i8> %s1) {
 ;
 ; CHECK-GI-LABEL: extmulsu_v8i8_i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
 ; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll2 v3.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ushll v0.4s, v1.4h, #0
-; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-GI-NEXT:    mul v0.4s, v2.4s, v0.4s
-; CHECK-GI-NEXT:    mul v1.4s, v3.4s, v1.4s
+; CHECK-GI-NEXT:    smull v0.4s, v2.4h, v1.4h
+; CHECK-GI-NEXT:    smull2 v1.4s, v2.8h, v1.8h
 ; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <8 x i8> %s0 to <8 x i32>
@@ -138,12 +134,8 @@ define <8 x i32> @extmuladdsu_v8i8_i32(<8 x i8> %s0, <8 x i8> %s1, <8 x i32> %b)
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
 ; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-GI-NEXT:    sshll v4.4s, v0.4h, #0
-; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ushll v5.4s, v1.4h, #0
-; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-GI-NEXT:    mla v2.4s, v4.4s, v5.4s
-; CHECK-GI-NEXT:    mla v3.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    smlal v2.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    smlal2 v3.4s, v0.8h, v1.8h
 ; CHECK-GI-NEXT:    mov v0.16b, v2.16b
 ; CHECK-GI-NEXT:    mov v1.16b, v3.16b
 ; CHECK-GI-NEXT:    ret
@@ -242,48 +234,12 @@ define <8 x i64> @extaddsu_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1) {
 ; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
 ; CHECK-GI-NEXT:    sshll v2.4s, v0.4h, #0
 ; CHECK-GI-NEXT:    ushll v3.4s, v1.4h, #0
-; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
-; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-GI-NEXT:    sshll v4.2d, v2.2s, #0
-; CHECK-GI-NEXT:    ushll v5.2d, v3.2s, #0
-; CHECK-GI-NEXT:    sshll2 v2.2d, v2.4s, #0
-; CHECK-GI-NEXT:    ushll2 v3.2d, v3.4s, #0
-; CHECK-GI-NEXT:    sshll v6.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ushll v7.2d, v1.2s, #0
-; CHECK-GI-NEXT:    sshll2 v0.2d, v0.4s, #0
-; CHECK-GI-NEXT:    ushll2 v1.2d, v1.4s, #0
-; CHECK-GI-NEXT:    fmov x8, d4
-; CHECK-GI-NEXT:    fmov x9, d5
-; CHECK-GI-NEXT:    mov x12, v4.d[1]
-; CHECK-GI-NEXT:    fmov x10, d3
-; CHECK-GI-NEXT:    fmov x11, d7
-; CHECK-GI-NEXT:    mov x13, v5.d[1]
-; CHECK-GI-NEXT:    fmov x14, d1
-; CHECK-GI-NEXT:    mov x15, v2.d[1]
-; CHECK-GI-NEXT:    mov x16, v3.d[1]
-; CHECK-GI-NEXT:    mul x8, x8, x9
-; CHECK-GI-NEXT:    fmov x9, d2
-; CHECK-GI-NEXT:    mov x17, v7.d[1]
-; CHECK-GI-NEXT:    mov x18, v1.d[1]
-; CHECK-GI-NEXT:    mul x12, x12, x13
-; CHECK-GI-NEXT:    mov x13, v0.d[1]
-; CHECK-GI-NEXT:    mul x9, x9, x10
-; CHECK-GI-NEXT:    fmov x10, d6
-; CHECK-GI-NEXT:    mul x15, x15, x16
-; CHECK-GI-NEXT:    mul x10, x10, x11
-; CHECK-GI-NEXT:    fmov x11, d0
-; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    mov v1.d[0], x9
-; CHECK-GI-NEXT:    mul x13, x13, x18
-; CHECK-GI-NEXT:    mul x11, x11, x14
-; CHECK-GI-NEXT:    mov x14, v6.d[1]
-; CHECK-GI-NEXT:    mov v0.d[1], x12
-; CHECK-GI-NEXT:    mov v2.d[0], x10
-; CHECK-GI-NEXT:    mov v1.d[1], x15
-; CHECK-GI-NEXT:    mul x14, x14, x17
-; CHECK-GI-NEXT:    mov v3.d[0], x11
-; CHECK-GI-NEXT:    mov v2.d[1], x14
-; CHECK-GI-NEXT:    mov v3.d[1], x13
+; CHECK-GI-NEXT:    sshll2 v4.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll2 v5.4s, v1.8h, #0
+; CHECK-GI-NEXT:    smull v0.2d, v2.2s, v3.2s
+; CHECK-GI-NEXT:    smull2 v1.2d, v2.4s, v3.4s
+; CHECK-GI-NEXT:    smull v2.2d, v4.2s, v5.2s
+; CHECK-GI-NEXT:    smull2 v3.2d, v4.4s, v5.4s
 ; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <8 x i8> %s0 to <8 x i64>
@@ -395,50 +351,14 @@ define <8 x i64> @extmuladdsu_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1, <8 x i64> %b)
 ; CHECK-GI-NEXT:    ushll v7.4s, v1.4h, #0
 ; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
 ; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-GI-NEXT:    sshll v16.2d, v6.2s, #0
-; CHECK-GI-NEXT:    ushll v17.2d, v7.2s, #0
-; CHECK-GI-NEXT:    sshll2 v6.2d, v6.4s, #0
-; CHECK-GI-NEXT:    ushll2 v7.2d, v7.4s, #0
-; CHECK-GI-NEXT:    sshll v18.2d, v0.2s, #0
-; CHECK-GI-NEXT:    ushll v19.2d, v1.2s, #0
-; CHECK-GI-NEXT:    sshll2 v0.2d, v0.4s, #0
-; CHECK-GI-NEXT:    ushll2 v1.2d, v1.4s, #0
-; CHECK-GI-NEXT:    fmov x8, d16
-; CHECK-GI-NEXT:    fmov x9, d17
-; CHECK-GI-NEXT:    mov x12, v16.d[1]
-; CHECK-GI-NEXT:    fmov x10, d7
-; CHECK-GI-NEXT:    fmov x11, d19
-; CHECK-GI-NEXT:    mov x13, v17.d[1]
-; CHECK-GI-NEXT:    fmov x14, d1
-; CHECK-GI-NEXT:    mov x15, v6.d[1]
-; CHECK-GI-NEXT:    mov x16, v7.d[1]
-; CHECK-GI-NEXT:    mul x8, x8, x9
-; CHECK-GI-NEXT:    fmov x9, d6
-; CHECK-GI-NEXT:    mov x17, v19.d[1]
-; CHECK-GI-NEXT:    mov x18, v1.d[1]
-; CHECK-GI-NEXT:    mul x12, x12, x13
-; CHECK-GI-NEXT:    mov x13, v0.d[1]
-; CHECK-GI-NEXT:    mul x9, x9, x10
-; CHECK-GI-NEXT:    fmov x10, d18
-; CHECK-GI-NEXT:    mul x15, x15, x16
-; CHECK-GI-NEXT:    mul x10, x10, x11
-; CHECK-GI-NEXT:    fmov x11, d0
-; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    mov v1.d[0], x9
-; CHECK-GI-NEXT:    mul x13, x13, x18
-; CHECK-GI-NEXT:    mul x11, x11, x14
-; CHECK-GI-NEXT:    mov x14, v18.d[1]
-; CHECK-GI-NEXT:    mov v0.d[1], x12
-; CHECK-GI-NEXT:    mov v6.d[0], x10
-; CHECK-GI-NEXT:    mov v1.d[1], x15
-; CHECK-GI-NEXT:    mul x14, x14, x17
-; CHECK-GI-NEXT:    add v0.2d, v0.2d, v2.2d
-; CHECK-GI-NEXT:    mov v7.d[0], x11
-; CHECK-GI-NEXT:    add v1.2d, v1.2d, v3.2d
-; CHECK-GI-NEXT:    mov v6.d[1], x14
-; CHECK-GI-NEXT:    mov v7.d[1], x13
-; CHECK-GI-NEXT:    add v2.2d, v6.2d, v4.2d
-; CHECK-GI-NEXT:    add v3.2d, v7.2d, v5.2d
+; CHECK-GI-NEXT:    smlal v2.2d, v6.2s, v7.2s
+; CHECK-GI-NEXT:    smlal2 v3.2d, v6.4s, v7.4s
+; CHECK-GI-NEXT:    smlal v4.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    smlal2 v5.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT:    mov v0.16b, v2.16b
+; CHECK-GI-NEXT:    mov v1.16b, v3.16b
+; CHECK-GI-NEXT:    mov v2.16b, v4.16b
+; CHECK-GI-NEXT:    mov v3.16b, v5.16b
 ; CHECK-GI-NEXT:    ret
 entry:
   %s0s = sext <8 x i8> %s0 to <8 x i64>



More information about the llvm-commits mailing list