[llvm] [AArch64][GlobalISel] Select UMULL instruction (PR #65469)

via llvm-commits llvm-commits at lists.llvm.org
Wed Sep 6 05:34:51 PDT 2023


https://github.com/chuongg3 created https://github.com/llvm/llvm-project/pull/65469:

Global ISel now selects `UMULL` and `UMULL2` instructions.
G_MUL instructions whose input operands both come from `ZEXT` operations (or both from `SEXT` operations) are turned into UMULL (or SMULL, respectively).

G_MUL instructions with a v2s64 result type are always scalarised, except for:
`mul ( unmerge( ext ), unmerge( ext ))` 

This way the extend can be unmerged, which lets the unmerge in the middle fold away:
`mul ( unmerge( ext ), unmerge( ext ))` =>
`mul ( unmerge( merge( ext( unmerge ))), unmerge( merge( ext( unmerge ))))` =>
`mul ( ext( unmerge ), ext( unmerge ))`

>From eb78b42d30b7650673f8a9ab9d225cee03a0ad83 Mon Sep 17 00:00:00 2001
From: Tuan Chuong Goh <chuong.goh at arm.com>
Date: Wed, 6 Sep 2023 11:04:20 +0100
Subject: [PATCH] [AArch64][GlobalISel] Select UMULL instruction

---
 llvm/lib/Target/AArch64/AArch64InstrGISel.td  |   15 +
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    |   75 +-
 .../AArch64/GISel/AArch64LegalizerInfo.h      |    3 +
 .../GlobalISel/legalizer-info-validation.mir  |   26 +-
 llvm/test/CodeGen/AArch64/aarch64-smull.ll    | 1543 +++++++++++++----
 llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll |  291 +++-
 6 files changed, 1512 insertions(+), 441 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
index f9f860607b5877..02d3b68486b825 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -215,6 +215,18 @@ def G_PREFETCH : AArch64GenericInstruction {
   let hasSideEffects = 1;
 }
 
+def G_UMULL : AArch64GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src1, type0:$src2);
+  let hasSideEffects = 0;
+}
+
+def G_SMULL : AArch64GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src1, type0:$src2);
+  let hasSideEffects = 0;
+}
+
 // Generic bitwise insert if true.
 def G_BIT : AArch64GenericInstruction {
   let OutOperandList = (outs type0:$dst);
@@ -254,6 +266,9 @@ def : GINodeEquiv<G_FCMLTZ, AArch64fcmltz>;
 
 def : GINodeEquiv<G_BIT, AArch64bit>;
 
+def : GINodeEquiv<G_UMULL, AArch64umull>;
+def : GINodeEquiv<G_SMULL, AArch64smull>;
+
 def : GINodeEquiv<G_EXTRACT_VECTOR_ELT, vector_extract>;
 
 def : GINodeEquiv<G_PREFETCH, AArch64Prefetch>;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index e2df8fb1321df8..3c43771088d7bc 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -119,13 +119,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .clampScalar(0, s32, s64);
 
   getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
-      .legalFor({s32, s64, v2s32, v4s32, v4s16, v8s16, v16s8, v8s8})
-      .scalarizeIf(
-          [=](const LegalityQuery &Query) {
-            return Query.Opcode == G_MUL && Query.Types[0] == v2s64;
-          },
-          0)
-      .legalFor({v2s64})
+      .customIf([=](const LegalityQuery &Query) {
+        return Query.Opcode == G_MUL &&
+               (Query.Types[0] == v4s32 || Query.Types[0] == v8s16 ||
+                Query.Types[0] == v2s64);
+      })
+      .legalFor({s32, s64, v2s32, v2s64, v4s32, v4s16, v8s16, v16s8, v8s8})
       .widenScalarToNextPow2(0)
       .clampScalar(0, s32, s64)
       .clampMaxNumElements(0, s8, 16)
@@ -1023,11 +1022,73 @@ bool AArch64LegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
     return legalizeFCopySign(MI, Helper);
   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
     return legalizeExtractVectorElt(MI, MRI, Helper);
+  case TargetOpcode::G_MUL:
+    return legalizeMULL(MI, MRI, MIRBuilder, Helper);
   }
 
   llvm_unreachable("expected switch to return");
 }
 
+/// Custom-legalize a vector G_MUL. If both operands are G_ZEXT (or both
+/// G_SEXT) that double the scalar width, emit G_UMULL (or G_SMULL) on the
+/// extend sources. Otherwise scalarize a v2s64 multiply, unless both operands
+/// are G_UNMERGE_VALUES of same-kind extends, in which case it is left as is.
+/// Returns false only if scalarization fails.
+bool AArch64LegalizerInfo::legalizeMULL(MachineInstr &MI,
+                                        MachineRegisterInfo &MRI,
+                                        MachineIRBuilder &MIRBuilder,
+                                        LegalizerHelper &Helper) const {
+  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+  // Instructions defining the two multiplicands, looking through copies.
+  MachineInstr *I1 = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
+  MachineInstr *I2 = getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI);
+  unsigned I1Opcode = I1->getOpcode();
+  unsigned I2Opcode = I2->getOpcode();
+  // True when both opcodes are G_ZEXT or both are G_SEXT.
+  auto IsSameExt = [](unsigned Opc1, unsigned Opc2) {
+    return Opc1 == Opc2 &&
+           (Opc1 == TargetOpcode::G_ZEXT || Opc1 == TargetOpcode::G_SEXT);
+  };
+  // True when the extend's result scalar is twice as wide as its source's.
+  auto DoublesWidth = [&MRI](const MachineInstr *Ext) {
+    return MRI.getType(Ext->getOperand(0).getReg()).getScalarSizeInBits() ==
+           MRI.getType(Ext->getOperand(1).getReg()).getScalarSizeInBits() * 2;
+  };
+
+  if (IsSameExt(I1Opcode, I2Opcode) && DoublesWidth(I1) && DoublesWidth(I2)) {
+    // {S/U}MULL{2} of the sources of the extends.
+    MIRBuilder.buildInstr(
+        I1Opcode == TargetOpcode::G_ZEXT ? AArch64::G_UMULL : AArch64::G_SMULL,
+        {MI.getOperand(0).getReg()},
+        {I1->getOperand(1).getReg(), I2->getOperand(1).getReg()});
+    // Erase only the multiply: the extends may still have other users, and
+    // I1 == I2 when a value is multiplied by itself (would be a double erase).
+    MI.eraseFromParent();
+    return true;
+  }
+
+  // Only v2s64 needs more work (previously done in getActionDefinitionsBuilder)
+  if (DstTy.getNumElements() != 2 || DstTy.getScalarSizeInBits() != 64)
+    return true;
+  // mul(unmerge(ext), unmerge(ext)): do NOT scalarize; the extend should be
+  // unmerged then merged, folding this unmerge (source = its last operand).
+  if (I1Opcode == TargetOpcode::G_UNMERGE_VALUES &&
+      I2Opcode == TargetOpcode::G_UNMERGE_VALUES) {
+    MachineInstr *Ext1 = getDefIgnoringCopies(
+        I1->getOperand(I1->getNumOperands() - 1).getReg(), MRI);
+    MachineInstr *Ext2 = getDefIgnoringCopies(
+        I2->getOperand(I2->getNumOperands() - 1).getReg(), MRI);
+    if (IsSameExt(Ext1->getOpcode(), Ext2->getOpcode()))
+      return true;
+  }
+
+  // Scalarize by halving the element count; report failure to the caller.
+  LLT HalfTy =
+      DstTy.changeElementCount(DstTy.getElementCount().divideCoefficientBy(2));
+  return Helper.fewerElementsVector(MI, 0, HalfTy) !=
+         LegalizerHelper::UnableToLegalize;
+}
+
 bool AArch64LegalizerInfo::legalizeFunnelShift(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &MIRBuilder,
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
index 853d5a2305ac68..eb89966f0c14e3 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
@@ -51,6 +51,9 @@ class AArch64LegalizerInfo : public LegalizerInfo {
                                LegalizerHelper &Helper) const;
   bool legalizeRotate(MachineInstr &MI, MachineRegisterInfo &MRI,
                       LegalizerHelper &Helper) const;
+  bool legalizeMULL(MachineInstr &MI, MachineRegisterInfo &MRI,
+                    MachineIRBuilder &MIRBuilder,
+                    LegalizerHelper &Helper) const;
   bool legalizeFunnelShift(MachineInstr &MI, MachineRegisterInfo &MRI,
                            MachineIRBuilder &MIRBuilder,
                            GISelChangeObserver &Observer,
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index b38868a530264e..e4f1738a8843c2 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -14,20 +14,20 @@
 
 
 # DEBUG:      G_ADD (opcode [[ADD_OPC:[0-9]+]]): 1 type index, 0 imm indices
-# DEBUG-NEXT: .. the first uncovered type index: 1, OK
-# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 #
 # DEBUG-NEXT: G_SUB (opcode [[SUB_OPC:[0-9]+]]): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. opcode [[SUB_OPC]] is aliased to [[ADD_OPC]]
-# DEBUG-NEXT: .. the first uncovered type index: 1, OK
-# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 #
 # DEBUG-NEXT: G_MUL (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. the first uncovered type index: 1, OK
-# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 #
-# DEBUG-NEXT: G_SDIV (opcode {{[0-9]+}}): 1 type index
+# DEBUG-NEXT: G_SDIV (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. the first uncovered type index: 1, OK
 # DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 #
@@ -57,18 +57,18 @@
 #
 # DEBUG-NEXT: G_AND (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. the first uncovered type index: 1, OK
-# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 #
 # DEBUG-NEXT: G_OR (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. the first uncovered type index: 1, OK
-# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 #
 # DEBUG-NEXT: G_XOR (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. the first uncovered type index: 1, OK
-# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 #
 # DEBUG-NEXT: G_IMPLICIT_DEF (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
index c1470239995c99..32218e459aecb9 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NEON
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SVE
+; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 define <8 x i16> @smull_v8i8_v8i16(ptr %A, ptr %B) nounwind {
 ; CHECK-LABEL: smull_v8i8_v8i16:
@@ -48,14 +49,36 @@ define <2 x i64> @smull_v2i32_v2i64(ptr %A, ptr %B) nounwind {
 }
 
 define <8 x i32> @smull_zext_v8i8_v8i32(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: smull_zext_v8i8_v8i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    ldr q2, [x1]
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    smull2 v1.4s, v0.8h, v2.8h
-; CHECK-NEXT:    smull v0.4s, v0.4h, v2.4h
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: smull_zext_v8i8_v8i32:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    ldr d0, [x0]
+; CHECK-NEON-NEXT:    ldr q2, [x1]
+; CHECK-NEON-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-NEON-NEXT:    smull2 v1.4s, v0.8h, v2.8h
+; CHECK-NEON-NEXT:    smull v0.4s, v0.4h, v2.4h
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: smull_zext_v8i8_v8i32:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    ldr d0, [x0]
+; CHECK-SVE-NEXT:    ldr q2, [x1]
+; CHECK-SVE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SVE-NEXT:    smull2 v1.4s, v0.8h, v2.8h
+; CHECK-SVE-NEXT:    smull v0.4s, v0.4h, v2.4h
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: smull_zext_v8i8_v8i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    ldr q1, [x1]
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v3.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v0.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    mul v0.4s, v2.4s, v0.4s
+; CHECK-GI-NEXT:    mul v1.4s, v3.4s, v1.4s
+; CHECK-GI-NEXT:    ret
   %load.A = load <8 x i8>, ptr %A
   %load.B = load <8 x i16>, ptr %B
   %zext.A = zext <8 x i8> %load.A to <8 x i32>
@@ -65,14 +88,36 @@ define <8 x i32> @smull_zext_v8i8_v8i32(ptr %A, ptr %B) nounwind {
 }
 
 define <8 x i32> @smull_zext_v8i8_v8i32_sext_first_operand(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: smull_zext_v8i8_v8i32_sext_first_operand:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x1]
-; CHECK-NEXT:    ldr q2, [x0]
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    smull2 v1.4s, v2.8h, v0.8h
-; CHECK-NEXT:    smull v0.4s, v2.4h, v0.4h
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: smull_zext_v8i8_v8i32_sext_first_operand:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    ldr d0, [x1]
+; CHECK-NEON-NEXT:    ldr q2, [x0]
+; CHECK-NEON-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-NEON-NEXT:    smull2 v1.4s, v2.8h, v0.8h
+; CHECK-NEON-NEXT:    smull v0.4s, v2.4h, v0.4h
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: smull_zext_v8i8_v8i32_sext_first_operand:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    ldr d0, [x1]
+; CHECK-SVE-NEXT:    ldr q2, [x0]
+; CHECK-SVE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SVE-NEXT:    smull2 v1.4s, v2.8h, v0.8h
+; CHECK-SVE-NEXT:    smull v0.4s, v2.4h, v0.4h
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: smull_zext_v8i8_v8i32_sext_first_operand:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr d0, [x1]
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v4.4s, v0.8h, #0
+; CHECK-GI-NEXT:    mul v0.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT:    mul v1.4s, v1.4s, v4.4s
+; CHECK-GI-NEXT:    ret
   %load.A = load <8 x i16>, ptr %A
   %load.B = load <8 x i8>, ptr %B
   %sext.A = sext <8 x i16> %load.A to <8 x i32>
@@ -82,18 +127,46 @@ define <8 x i32> @smull_zext_v8i8_v8i32_sext_first_operand(ptr %A, ptr %B) nounw
 }
 
 define <8 x i32> @smull_zext_v8i8_v8i32_top_bit_is_1(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: smull_zext_v8i8_v8i32_top_bit_is_1:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    ldr q1, [x1]
-; CHECK-NEXT:    orr v0.8h, #128, lsl #8
-; CHECK-NEXT:    sshll v3.4s, v1.4h, #0
-; CHECK-NEXT:    sshll2 v1.4s, v1.8h, #0
-; CHECK-NEXT:    ushll v2.4s, v0.4h, #0
-; CHECK-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-NEXT:    mul v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    mul v0.4s, v2.4s, v3.4s
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: smull_zext_v8i8_v8i32_top_bit_is_1:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    ldr q0, [x0]
+; CHECK-NEON-NEXT:    ldr q1, [x1]
+; CHECK-NEON-NEXT:    orr v0.8h, #128, lsl #8
+; CHECK-NEON-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-NEON-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-NEON-NEXT:    ushll v2.4s, v0.4h, #0
+; CHECK-NEON-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-NEON-NEXT:    mul v1.4s, v0.4s, v1.4s
+; CHECK-NEON-NEXT:    mul v0.4s, v2.4s, v3.4s
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: smull_zext_v8i8_v8i32_top_bit_is_1:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    ldr q0, [x0]
+; CHECK-SVE-NEXT:    ldr q1, [x1]
+; CHECK-SVE-NEXT:    orr v0.8h, #128, lsl #8
+; CHECK-SVE-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-SVE-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-SVE-NEXT:    ushll v2.4s, v0.4h, #0
+; CHECK-SVE-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-SVE-NEXT:    mul v1.4s, v0.4s, v1.4s
+; CHECK-SVE-NEXT:    mul v0.4s, v2.4s, v3.4s
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: smull_zext_v8i8_v8i32_top_bit_is_1:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI5_0
+; CHECK-GI-NEXT:    ldr q1, [x0]
+; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI5_0]
+; CHECK-GI-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    ldr q1, [x1]
+; CHECK-GI-NEXT:    ushll v2.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v3.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll v0.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    mul v0.4s, v2.4s, v0.4s
+; CHECK-GI-NEXT:    mul v1.4s, v3.4s, v1.4s
+; CHECK-GI-NEXT:    ret
   %load.A = load <8 x i16>, ptr %A
   %or.A = or <8 x i16> %load.A, <i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000, i16 u0x8000>
   %load.B = load <8 x i16>, ptr %B
@@ -146,6 +219,21 @@ define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind {
 ; CHECK-SVE-NEXT:    mul z0.d, p0/m, z0.d, z1.d
 ; CHECK-SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: smull_zext_v2i32_v2i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr d0, [x1]
+; CHECK-GI-NEXT:    ldrh w8, [x0]
+; CHECK-GI-NEXT:    ldrh w10, [x0, #2]
+; CHECK-GI-NEXT:    sshll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    fmov x9, d0
+; CHECK-GI-NEXT:    mul x8, x8, x9
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    mul x9, x10, x9
+; CHECK-GI-NEXT:    fmov d0, x8
+; CHECK-GI-NEXT:    mov v0.d[1], x9
+; CHECK-GI-NEXT:    ret
   %load.A = load <2 x i16>, ptr %A
   %load.B = load <2 x i32>, ptr %B
   %zext.A = zext <2 x i16> %load.A to <2 x i64>
@@ -155,13 +243,42 @@ define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind {
 }
 
 define <2 x i64> @smull_zext_and_v2i32_v2i64(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: smull_zext_and_v2i32_v2i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    bic v0.2s, #128, lsl #24
-; CHECK-NEXT:    smull v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: smull_zext_and_v2i32_v2i64:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    ldr d0, [x0]
+; CHECK-NEON-NEXT:    ldr d1, [x1]
+; CHECK-NEON-NEXT:    bic v0.2s, #128, lsl #24
+; CHECK-NEON-NEXT:    smull v0.2d, v0.2s, v1.2s
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: smull_zext_and_v2i32_v2i64:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    ldr d0, [x0]
+; CHECK-SVE-NEXT:    ldr d1, [x1]
+; CHECK-SVE-NEXT:    bic v0.2s, #128, lsl #24
+; CHECK-SVE-NEXT:    smull v0.2d, v0.2s, v1.2s
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: smull_zext_and_v2i32_v2i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI8_0
+; CHECK-GI-NEXT:    ldr d1, [x0]
+; CHECK-GI-NEXT:    ldr d0, [x8, :lo12:.LCPI8_0]
+; CHECK-GI-NEXT:    and v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT:    ldr d1, [x1]
+; CHECK-GI-NEXT:    sshll v1.2d, v1.2s, #0
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    mov d3, v1.d[1]
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    mov d2, v0.d[1]
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    mul x8, x8, x9
+; CHECK-GI-NEXT:    fmov x10, d3
+; CHECK-GI-NEXT:    fmov x9, d2
+; CHECK-GI-NEXT:    mul x9, x9, x10
+; CHECK-GI-NEXT:    fmov d0, x8
+; CHECK-GI-NEXT:    mov v0.d[1], x9
+; CHECK-GI-NEXT:    ret
   %load.A = load <2 x i32>, ptr %A
   %and.A = and <2 x i32> %load.A, <i32 u0x7FFFFFFF, i32 u0x7FFFFFFF>
   %load.B = load <2 x i32>, ptr %B
@@ -217,13 +334,31 @@ define <2 x i64> @umull_v2i32_v2i64(ptr %A, ptr %B) nounwind {
 }
 
 define <8 x i16> @amull_v8i8_v8i16(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: amull_v8i8_v8i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    smull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    bic v0.8h, #255, lsl #8
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: amull_v8i8_v8i16:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    ldr d0, [x0]
+; CHECK-NEON-NEXT:    ldr d1, [x1]
+; CHECK-NEON-NEXT:    smull v0.8h, v0.8b, v1.8b
+; CHECK-NEON-NEXT:    bic v0.8h, #255, lsl #8
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: amull_v8i8_v8i16:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    ldr d0, [x0]
+; CHECK-SVE-NEXT:    ldr d1, [x1]
+; CHECK-SVE-NEXT:    smull v0.8h, v0.8b, v1.8b
+; CHECK-SVE-NEXT:    bic v0.8h, #255, lsl #8
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: amull_v8i8_v8i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    ldr d1, [x1]
+; CHECK-GI-NEXT:    adrp x8, .LCPI12_0
+; CHECK-GI-NEXT:    umull v0.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI12_0]
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <8 x i8>, ptr %A
   %tmp2 = load <8 x i8>, ptr %B
   %tmp3 = zext <8 x i8> %tmp1 to <8 x i16>
@@ -234,14 +369,33 @@ define <8 x i16> @amull_v8i8_v8i16(ptr %A, ptr %B) nounwind {
 }
 
 define <4 x i32> @amull_v4i16_v4i32(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: amull_v4i16_v4i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x0]
-; CHECK-NEXT:    ldr d2, [x1]
-; CHECK-NEXT:    movi v0.2d, #0x00ffff0000ffff
-; CHECK-NEXT:    smull v1.4s, v1.4h, v2.4h
-; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: amull_v4i16_v4i32:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    ldr d1, [x0]
+; CHECK-NEON-NEXT:    ldr d2, [x1]
+; CHECK-NEON-NEXT:    movi v0.2d, #0x00ffff0000ffff
+; CHECK-NEON-NEXT:    smull v1.4s, v1.4h, v2.4h
+; CHECK-NEON-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: amull_v4i16_v4i32:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    ldr d1, [x0]
+; CHECK-SVE-NEXT:    ldr d2, [x1]
+; CHECK-SVE-NEXT:    movi v0.2d, #0x00ffff0000ffff
+; CHECK-SVE-NEXT:    smull v1.4s, v1.4h, v2.4h
+; CHECK-SVE-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: amull_v4i16_v4i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    ldr d1, [x1]
+; CHECK-GI-NEXT:    adrp x8, .LCPI13_0
+; CHECK-GI-NEXT:    umull v0.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI13_0]
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <4 x i16>, ptr %A
   %tmp2 = load <4 x i16>, ptr %B
   %tmp3 = zext <4 x i16> %tmp1 to <4 x i32>
@@ -252,14 +406,33 @@ define <4 x i32> @amull_v4i16_v4i32(ptr %A, ptr %B) nounwind {
 }
 
 define <2 x i64> @amull_v2i32_v2i64(ptr %A, ptr %B) nounwind {
-; CHECK-LABEL: amull_v2i32_v2i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d1, [x0]
-; CHECK-NEXT:    ldr d2, [x1]
-; CHECK-NEXT:    movi v0.2d, #0x000000ffffffff
-; CHECK-NEXT:    smull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: amull_v2i32_v2i64:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    ldr d1, [x0]
+; CHECK-NEON-NEXT:    ldr d2, [x1]
+; CHECK-NEON-NEXT:    movi v0.2d, #0x000000ffffffff
+; CHECK-NEON-NEXT:    smull v1.2d, v1.2s, v2.2s
+; CHECK-NEON-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: amull_v2i32_v2i64:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    ldr d1, [x0]
+; CHECK-SVE-NEXT:    ldr d2, [x1]
+; CHECK-SVE-NEXT:    movi v0.2d, #0x000000ffffffff
+; CHECK-SVE-NEXT:    smull v1.2d, v1.2s, v2.2s
+; CHECK-SVE-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: amull_v2i32_v2i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    ldr d1, [x1]
+; CHECK-GI-NEXT:    adrp x8, .LCPI14_0
+; CHECK-GI-NEXT:    umull v0.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI14_0]
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <2 x i32>, ptr %A
   %tmp2 = load <2 x i32>, ptr %B
   %tmp3 = zext <2 x i32> %tmp1 to <2 x i64>
@@ -378,14 +551,34 @@ define <2 x i64> @umlal_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
 }
 
 define <8 x i16> @amlal_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
-; CHECK-LABEL: amlal_v8i8_v8i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    ldr d2, [x2]
-; CHECK-NEXT:    smlal v0.8h, v1.8b, v2.8b
-; CHECK-NEXT:    bic v0.8h, #255, lsl #8
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: amlal_v8i8_v8i16:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    ldr q0, [x0]
+; CHECK-NEON-NEXT:    ldr d1, [x1]
+; CHECK-NEON-NEXT:    ldr d2, [x2]
+; CHECK-NEON-NEXT:    smlal v0.8h, v1.8b, v2.8b
+; CHECK-NEON-NEXT:    bic v0.8h, #255, lsl #8
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: amlal_v8i8_v8i16:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    ldr q0, [x0]
+; CHECK-SVE-NEXT:    ldr d1, [x1]
+; CHECK-SVE-NEXT:    ldr d2, [x2]
+; CHECK-SVE-NEXT:    smlal v0.8h, v1.8b, v2.8b
+; CHECK-SVE-NEXT:    bic v0.8h, #255, lsl #8
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: amlal_v8i8_v8i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    ldr d1, [x1]
+; CHECK-GI-NEXT:    adrp x8, .LCPI21_0
+; CHECK-GI-NEXT:    ldr d2, [x2]
+; CHECK-GI-NEXT:    umlal v0.8h, v1.8b, v2.8b
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI21_0]
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %A
   %tmp2 = load <8 x i8>, ptr %B
   %tmp3 = load <8 x i8>, ptr %C
@@ -398,15 +591,36 @@ define <8 x i16> @amlal_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
 }
 
 define <4 x i32> @amlal_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
-; CHECK-LABEL: amlal_v4i16_v4i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    ldr d2, [x2]
-; CHECK-NEXT:    smlal v0.4s, v1.4h, v2.4h
-; CHECK-NEXT:    movi v1.2d, #0x00ffff0000ffff
-; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: amlal_v4i16_v4i32:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    ldr q0, [x0]
+; CHECK-NEON-NEXT:    ldr d1, [x1]
+; CHECK-NEON-NEXT:    ldr d2, [x2]
+; CHECK-NEON-NEXT:    smlal v0.4s, v1.4h, v2.4h
+; CHECK-NEON-NEXT:    movi v1.2d, #0x00ffff0000ffff
+; CHECK-NEON-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: amlal_v4i16_v4i32:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    ldr q0, [x0]
+; CHECK-SVE-NEXT:    ldr d1, [x1]
+; CHECK-SVE-NEXT:    ldr d2, [x2]
+; CHECK-SVE-NEXT:    smlal v0.4s, v1.4h, v2.4h
+; CHECK-SVE-NEXT:    movi v1.2d, #0x00ffff0000ffff
+; CHECK-SVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: amlal_v4i16_v4i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    ldr d1, [x1]
+; CHECK-GI-NEXT:    adrp x8, .LCPI22_0
+; CHECK-GI-NEXT:    ldr d2, [x2]
+; CHECK-GI-NEXT:    umlal v0.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI22_0]
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp2 = load <4 x i16>, ptr %B
   %tmp3 = load <4 x i16>, ptr %C
@@ -419,15 +633,36 @@ define <4 x i32> @amlal_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
 }
 
 define <2 x i64> @amlal_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
-; CHECK-LABEL: amlal_v2i32_v2i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    ldr d2, [x2]
-; CHECK-NEXT:    smlal v0.2d, v1.2s, v2.2s
-; CHECK-NEXT:    movi v1.2d, #0x000000ffffffff
-; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: amlal_v2i32_v2i64:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    ldr q0, [x0]
+; CHECK-NEON-NEXT:    ldr d1, [x1]
+; CHECK-NEON-NEXT:    ldr d2, [x2]
+; CHECK-NEON-NEXT:    smlal v0.2d, v1.2s, v2.2s
+; CHECK-NEON-NEXT:    movi v1.2d, #0x000000ffffffff
+; CHECK-NEON-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: amlal_v2i32_v2i64:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    ldr q0, [x0]
+; CHECK-SVE-NEXT:    ldr d1, [x1]
+; CHECK-SVE-NEXT:    ldr d2, [x2]
+; CHECK-SVE-NEXT:    smlal v0.2d, v1.2s, v2.2s
+; CHECK-SVE-NEXT:    movi v1.2d, #0x000000ffffffff
+; CHECK-SVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: amlal_v2i32_v2i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    ldr d1, [x1]
+; CHECK-GI-NEXT:    adrp x8, .LCPI23_0
+; CHECK-GI-NEXT:    ldr d2, [x2]
+; CHECK-GI-NEXT:    umlal v0.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI23_0]
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <2 x i64>, ptr %A
   %tmp2 = load <2 x i32>, ptr %B
   %tmp3 = load <2 x i32>, ptr %C
@@ -548,14 +783,34 @@ define <2 x i64> @umlsl_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
 }
 
 define <8 x i16> @amlsl_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
-; CHECK-LABEL: amlsl_v8i8_v8i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    ldr d2, [x2]
-; CHECK-NEXT:    smlsl v0.8h, v1.8b, v2.8b
-; CHECK-NEXT:    bic v0.8h, #255, lsl #8
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: amlsl_v8i8_v8i16:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    ldr q0, [x0]
+; CHECK-NEON-NEXT:    ldr d1, [x1]
+; CHECK-NEON-NEXT:    ldr d2, [x2]
+; CHECK-NEON-NEXT:    smlsl v0.8h, v1.8b, v2.8b
+; CHECK-NEON-NEXT:    bic v0.8h, #255, lsl #8
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: amlsl_v8i8_v8i16:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    ldr q0, [x0]
+; CHECK-SVE-NEXT:    ldr d1, [x1]
+; CHECK-SVE-NEXT:    ldr d2, [x2]
+; CHECK-SVE-NEXT:    smlsl v0.8h, v1.8b, v2.8b
+; CHECK-SVE-NEXT:    bic v0.8h, #255, lsl #8
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: amlsl_v8i8_v8i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    ldr d1, [x1]
+; CHECK-GI-NEXT:    adrp x8, .LCPI30_0
+; CHECK-GI-NEXT:    ldr d2, [x2]
+; CHECK-GI-NEXT:    umlsl v0.8h, v1.8b, v2.8b
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI30_0]
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <8 x i16>, ptr %A
   %tmp2 = load <8 x i8>, ptr %B
   %tmp3 = load <8 x i8>, ptr %C
@@ -568,15 +823,36 @@ define <8 x i16> @amlsl_v8i8_v8i16(ptr %A, ptr %B, ptr %C) nounwind {
 }
 
 define <4 x i32> @amlsl_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
-; CHECK-LABEL: amlsl_v4i16_v4i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    ldr d2, [x2]
-; CHECK-NEXT:    smlsl v0.4s, v1.4h, v2.4h
-; CHECK-NEXT:    movi v1.2d, #0x00ffff0000ffff
-; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: amlsl_v4i16_v4i32:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    ldr q0, [x0]
+; CHECK-NEON-NEXT:    ldr d1, [x1]
+; CHECK-NEON-NEXT:    ldr d2, [x2]
+; CHECK-NEON-NEXT:    smlsl v0.4s, v1.4h, v2.4h
+; CHECK-NEON-NEXT:    movi v1.2d, #0x00ffff0000ffff
+; CHECK-NEON-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: amlsl_v4i16_v4i32:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    ldr q0, [x0]
+; CHECK-SVE-NEXT:    ldr d1, [x1]
+; CHECK-SVE-NEXT:    ldr d2, [x2]
+; CHECK-SVE-NEXT:    smlsl v0.4s, v1.4h, v2.4h
+; CHECK-SVE-NEXT:    movi v1.2d, #0x00ffff0000ffff
+; CHECK-SVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: amlsl_v4i16_v4i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    ldr d1, [x1]
+; CHECK-GI-NEXT:    adrp x8, .LCPI31_0
+; CHECK-GI-NEXT:    ldr d2, [x2]
+; CHECK-GI-NEXT:    umlsl v0.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI31_0]
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <4 x i32>, ptr %A
   %tmp2 = load <4 x i16>, ptr %B
   %tmp3 = load <4 x i16>, ptr %C
@@ -589,15 +865,36 @@ define <4 x i32> @amlsl_v4i16_v4i32(ptr %A, ptr %B, ptr %C) nounwind {
 }
 
 define <2 x i64> @amlsl_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
-; CHECK-LABEL: amlsl_v2i32_v2i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    ldr d1, [x1]
-; CHECK-NEXT:    ldr d2, [x2]
-; CHECK-NEXT:    smlsl v0.2d, v1.2s, v2.2s
-; CHECK-NEXT:    movi v1.2d, #0x000000ffffffff
-; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: amlsl_v2i32_v2i64:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    ldr q0, [x0]
+; CHECK-NEON-NEXT:    ldr d1, [x1]
+; CHECK-NEON-NEXT:    ldr d2, [x2]
+; CHECK-NEON-NEXT:    smlsl v0.2d, v1.2s, v2.2s
+; CHECK-NEON-NEXT:    movi v1.2d, #0x000000ffffffff
+; CHECK-NEON-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: amlsl_v2i32_v2i64:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    ldr q0, [x0]
+; CHECK-SVE-NEXT:    ldr d1, [x1]
+; CHECK-SVE-NEXT:    ldr d2, [x2]
+; CHECK-SVE-NEXT:    smlsl v0.2d, v1.2s, v2.2s
+; CHECK-SVE-NEXT:    movi v1.2d, #0x000000ffffffff
+; CHECK-SVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: amlsl_v2i32_v2i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    ldr d1, [x1]
+; CHECK-GI-NEXT:    adrp x8, .LCPI32_0
+; CHECK-GI-NEXT:    ldr d2, [x2]
+; CHECK-GI-NEXT:    umlsl v0.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI32_0]
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ret
   %tmp1 = load <2 x i64>, ptr %A
   %tmp2 = load <2 x i32>, ptr %B
   %tmp3 = load <2 x i32>, ptr %C
@@ -611,11 +908,25 @@ define <2 x i64> @amlsl_v2i32_v2i64(ptr %A, ptr %B, ptr %C) nounwind {
 
 ; SMULL recognizing BUILD_VECTORs with sign/zero-extended elements.
 define <8 x i16> @smull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
-; CHECK-LABEL: smull_extvec_v8i8_v8i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.8b, #244
-; CHECK-NEXT:    smull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: smull_extvec_v8i8_v8i16:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    movi v1.8b, #244
+; CHECK-NEON-NEXT:    smull v0.8h, v0.8b, v1.8b
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: smull_extvec_v8i8_v8i16:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    movi v1.8b, #244
+; CHECK-SVE-NEXT:    smull v0.8h, v0.8b, v1.8b
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: smull_extvec_v8i8_v8i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI33_0
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI33_0]
+; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    ret
   %tmp3 = sext <8 x i8> %arg to <8 x i16>
   %tmp4 = mul <8 x i16> %tmp3, <i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12, i16 -12>
   ret <8 x i16> %tmp4
@@ -623,47 +934,111 @@ define <8 x i16> @smull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
 
 define <8 x i16> @smull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
 ; Do not use SMULL if the BUILD_VECTOR element values are too big.
-; CHECK-LABEL: smull_noextvec_v8i8_v8i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #64537 // =0xfc19
-; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    dup v1.8h, w8
-; CHECK-NEXT:    mul v0.8h, v0.8h, v1.8h
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: smull_noextvec_v8i8_v8i16:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    mov w8, #64537 // =0xfc19
+; CHECK-NEON-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-NEON-NEXT:    dup v1.8h, w8
+; CHECK-NEON-NEXT:    mul v0.8h, v0.8h, v1.8h
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: smull_noextvec_v8i8_v8i16:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    mov w8, #64537 // =0xfc19
+; CHECK-SVE-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-SVE-NEXT:    dup v1.8h, w8
+; CHECK-SVE-NEXT:    mul v0.8h, v0.8h, v1.8h
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: smull_noextvec_v8i8_v8i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI34_0
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI34_0]
+; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    ret
   %tmp3 = sext <8 x i8> %arg to <8 x i16>
   %tmp4 = mul <8 x i16> %tmp3, <i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999, i16 -999>
   ret <8 x i16> %tmp4
 }
 
 define <4 x i32> @smull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
-; CHECK-LABEL: smull_extvec_v4i16_v4i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mvni v1.4h, #11
-; CHECK-NEXT:    smull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: smull_extvec_v4i16_v4i32:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    mvni v1.4h, #11
+; CHECK-NEON-NEXT:    smull v0.4s, v0.4h, v1.4h
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: smull_extvec_v4i16_v4i32:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    mvni v1.4h, #11
+; CHECK-SVE-NEXT:    smull v0.4s, v0.4h, v1.4h
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: smull_extvec_v4i16_v4i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI35_0
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI35_0]
+; CHECK-GI-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    ret
   %tmp3 = sext <4 x i16> %arg to <4 x i32>
   %tmp4 = mul <4 x i32> %tmp3, <i32 -12, i32 -12, i32 -12, i32 -12>
   ret <4 x i32> %tmp4
 }
 
 define <2 x i64> @smull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
-; CHECK-LABEL: smull_extvec_v2i32_v2i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #-1234 // =0xfffffb2e
-; CHECK-NEXT:    dup v1.2s, w8
-; CHECK-NEXT:    smull v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: smull_extvec_v2i32_v2i64:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    mov w8, #-1234 // =0xfffffb2e
+; CHECK-NEON-NEXT:    dup v1.2s, w8
+; CHECK-NEON-NEXT:    smull v0.2d, v0.2s, v1.2s
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: smull_extvec_v2i32_v2i64:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    mov w8, #-1234 // =0xfffffb2e
+; CHECK-SVE-NEXT:    dup v1.2s, w8
+; CHECK-SVE-NEXT:    smull v0.2d, v0.2s, v1.2s
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: smull_extvec_v2i32_v2i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    sshll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    mov x8, #-1234 // =0xfffffffffffffb2e
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    fmov x9, d0
+; CHECK-GI-NEXT:    mul x9, x9, x8
+; CHECK-GI-NEXT:    fmov x10, d1
+; CHECK-GI-NEXT:    mul x8, x10, x8
+; CHECK-GI-NEXT:    fmov d0, x9
+; CHECK-GI-NEXT:    mov v0.d[1], x8
+; CHECK-GI-NEXT:    ret
   %tmp3 = sext <2 x i32> %arg to <2 x i64>
   %tmp4 = mul <2 x i64> %tmp3, <i64 -1234, i64 -1234>
   ret <2 x i64> %tmp4
 }
 
 define <8 x i16> @umull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
-; CHECK-LABEL: umull_extvec_v8i8_v8i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.8b, #12
-; CHECK-NEXT:    umull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: umull_extvec_v8i8_v8i16:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    movi v1.8b, #12
+; CHECK-NEON-NEXT:    umull v0.8h, v0.8b, v1.8b
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: umull_extvec_v8i8_v8i16:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    movi v1.8b, #12
+; CHECK-SVE-NEXT:    umull v0.8h, v0.8b, v1.8b
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: umull_extvec_v8i8_v8i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI37_0
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI37_0]
+; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    ret
   %tmp3 = zext <8 x i8> %arg to <8 x i16>
   %tmp4 = mul <8 x i16> %tmp3, <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12>
   ret <8 x i16> %tmp4
@@ -671,49 +1046,118 @@ define <8 x i16> @umull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
 
 define <8 x i16> @umull_noextvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
 ; Do not use SMULL if the BUILD_VECTOR element values are too big.
-; CHECK-LABEL: umull_noextvec_v8i8_v8i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #999 // =0x3e7
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    dup v1.8h, w8
-; CHECK-NEXT:    mul v0.8h, v0.8h, v1.8h
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: umull_noextvec_v8i8_v8i16:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    mov w8, #999 // =0x3e7
+; CHECK-NEON-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-NEON-NEXT:    dup v1.8h, w8
+; CHECK-NEON-NEXT:    mul v0.8h, v0.8h, v1.8h
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: umull_noextvec_v8i8_v8i16:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    mov w8, #999 // =0x3e7
+; CHECK-SVE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SVE-NEXT:    dup v1.8h, w8
+; CHECK-SVE-NEXT:    mul v0.8h, v0.8h, v1.8h
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: umull_noextvec_v8i8_v8i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI38_0
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI38_0]
+; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    ret
   %tmp3 = zext <8 x i8> %arg to <8 x i16>
   %tmp4 = mul <8 x i16> %tmp3, <i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999, i16 999>
   ret <8 x i16> %tmp4
 }
 
 define <4 x i32> @umull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
-; CHECK-LABEL: umull_extvec_v4i16_v4i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1234 // =0x4d2
-; CHECK-NEXT:    dup v1.4h, w8
-; CHECK-NEXT:    umull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: umull_extvec_v4i16_v4i32:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    mov w8, #1234 // =0x4d2
+; CHECK-NEON-NEXT:    dup v1.4h, w8
+; CHECK-NEON-NEXT:    umull v0.4s, v0.4h, v1.4h
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: umull_extvec_v4i16_v4i32:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    mov w8, #1234 // =0x4d2
+; CHECK-SVE-NEXT:    dup v1.4h, w8
+; CHECK-SVE-NEXT:    umull v0.4s, v0.4h, v1.4h
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: umull_extvec_v4i16_v4i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI39_0
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI39_0]
+; CHECK-GI-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    ret
   %tmp3 = zext <4 x i16> %arg to <4 x i32>
   %tmp4 = mul <4 x i32> %tmp3, <i32 1234, i32 1234, i32 1234, i32 1234>
   ret <4 x i32> %tmp4
 }
 
 define <2 x i64> @umull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
-; CHECK-LABEL: umull_extvec_v2i32_v2i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1234 // =0x4d2
-; CHECK-NEXT:    dup v1.2s, w8
-; CHECK-NEXT:    umull v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: umull_extvec_v2i32_v2i64:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    mov w8, #1234 // =0x4d2
+; CHECK-NEON-NEXT:    dup v1.2s, w8
+; CHECK-NEON-NEXT:    umull v0.2d, v0.2s, v1.2s
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: umull_extvec_v2i32_v2i64:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    mov w8, #1234 // =0x4d2
+; CHECK-SVE-NEXT:    dup v1.2s, w8
+; CHECK-SVE-NEXT:    umull v0.2d, v0.2s, v1.2s
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: umull_extvec_v2i32_v2i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    mov w8, #1234 // =0x4d2
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    fmov x9, d0
+; CHECK-GI-NEXT:    mul x9, x9, x8
+; CHECK-GI-NEXT:    fmov x10, d1
+; CHECK-GI-NEXT:    mul x8, x10, x8
+; CHECK-GI-NEXT:    fmov d0, x9
+; CHECK-GI-NEXT:    mov v0.d[1], x8
+; CHECK-GI-NEXT:    ret
   %tmp3 = zext <2 x i32> %arg to <2 x i64>
   %tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234>
   ret <2 x i64> %tmp4
 }
 
 define <8 x i16> @amull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
-; CHECK-LABEL: amull_extvec_v8i8_v8i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.8b, #12
-; CHECK-NEXT:    smull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    bic v0.8h, #255, lsl #8
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: amull_extvec_v8i8_v8i16:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    movi v1.8b, #12
+; CHECK-NEON-NEXT:    smull v0.8h, v0.8b, v1.8b
+; CHECK-NEON-NEXT:    bic v0.8h, #255, lsl #8
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: amull_extvec_v8i8_v8i16:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    movi v1.8b, #12
+; CHECK-SVE-NEXT:    smull v0.8h, v0.8b, v1.8b
+; CHECK-SVE-NEXT:    bic v0.8h, #255, lsl #8
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: amull_extvec_v8i8_v8i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI41_1
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI41_1]
+; CHECK-GI-NEXT:    adrp x8, .LCPI41_0
+; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI41_0]
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ret
   %tmp3 = zext <8 x i8> %arg to <8 x i16>
   %tmp4 = mul <8 x i16> %tmp3, <i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12, i16 12>
   %and = and <8 x i16> %tmp4, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
@@ -721,14 +1165,34 @@ define <8 x i16> @amull_extvec_v8i8_v8i16(<8 x i8> %arg) nounwind {
 }
 
 define <4 x i32> @amull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
-; CHECK-LABEL: amull_extvec_v4i16_v4i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1234 // =0x4d2
-; CHECK-NEXT:    dup v1.4h, w8
-; CHECK-NEXT:    smull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT:    movi v1.2d, #0x00ffff0000ffff
-; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: amull_extvec_v4i16_v4i32:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    mov w8, #1234 // =0x4d2
+; CHECK-NEON-NEXT:    dup v1.4h, w8
+; CHECK-NEON-NEXT:    smull v0.4s, v0.4h, v1.4h
+; CHECK-NEON-NEXT:    movi v1.2d, #0x00ffff0000ffff
+; CHECK-NEON-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: amull_extvec_v4i16_v4i32:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    mov w8, #1234 // =0x4d2
+; CHECK-SVE-NEXT:    dup v1.4h, w8
+; CHECK-SVE-NEXT:    smull v0.4s, v0.4h, v1.4h
+; CHECK-SVE-NEXT:    movi v1.2d, #0x00ffff0000ffff
+; CHECK-SVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: amull_extvec_v4i16_v4i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI42_1
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI42_1]
+; CHECK-GI-NEXT:    adrp x8, .LCPI42_0
+; CHECK-GI-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI42_0]
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ret
   %tmp3 = zext <4 x i16> %arg to <4 x i32>
   %tmp4 = mul <4 x i32> %tmp3, <i32 1234, i32 1234, i32 1234, i32 1234>
   %and = and <4 x i32> %tmp4, <i32 65535, i32 65535, i32 65535, i32 65535>
@@ -736,14 +1200,39 @@ define <4 x i32> @amull_extvec_v4i16_v4i32(<4 x i16> %arg) nounwind {
 }
 
 define <2 x i64> @amull_extvec_v2i32_v2i64(<2 x i32> %arg) nounwind {
-; CHECK-LABEL: amull_extvec_v2i32_v2i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #1234 // =0x4d2
-; CHECK-NEXT:    dup v1.2s, w8
-; CHECK-NEXT:    smull v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    movi v1.2d, #0x000000ffffffff
-; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: amull_extvec_v2i32_v2i64:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    mov w8, #1234 // =0x4d2
+; CHECK-NEON-NEXT:    dup v1.2s, w8
+; CHECK-NEON-NEXT:    smull v0.2d, v0.2s, v1.2s
+; CHECK-NEON-NEXT:    movi v1.2d, #0x000000ffffffff
+; CHECK-NEON-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: amull_extvec_v2i32_v2i64:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    mov w8, #1234 // =0x4d2
+; CHECK-SVE-NEXT:    dup v1.2s, w8
+; CHECK-SVE-NEXT:    smull v0.2d, v0.2s, v1.2s
+; CHECK-SVE-NEXT:    movi v1.2d, #0x000000ffffffff
+; CHECK-SVE-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: amull_extvec_v2i32_v2i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    mov w8, #1234 // =0x4d2
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    fmov x9, d0
+; CHECK-GI-NEXT:    mul x9, x9, x8
+; CHECK-GI-NEXT:    fmov x10, d1
+; CHECK-GI-NEXT:    mul x8, x10, x8
+; CHECK-GI-NEXT:    fmov d0, x9
+; CHECK-GI-NEXT:    mov v0.d[1], x8
+; CHECK-GI-NEXT:    adrp x8, .LCPI43_0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI43_0]
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-GI-NEXT:    ret
   %tmp3 = zext <2 x i32> %arg to <2 x i64>
   %tmp4 = mul <2 x i64> %tmp3, <i64 1234, i64 1234>
   %and = and <2 x i64> %tmp4, <i64 4294967295, i64 4294967295>
@@ -788,15 +1277,36 @@ ret <8 x i16> %3
 }
 
 define void @distribute(ptr %dst, ptr %src, i32 %mul) nounwind {
-; CHECK-LABEL: distribute:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldr q0, [x1]
-; CHECK-NEXT:    dup v1.8b, w2
-; CHECK-NEXT:    mov d2, v0.d[1]
-; CHECK-NEXT:    umull v2.8h, v2.8b, v1.8b
-; CHECK-NEXT:    umlal v2.8h, v0.8b, v1.8b
-; CHECK-NEXT:    str q2, [x0]
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: distribute:
+; CHECK-NEON:       // %bb.0: // %entry
+; CHECK-NEON-NEXT:    ldr q0, [x1]
+; CHECK-NEON-NEXT:    dup v1.8b, w2
+; CHECK-NEON-NEXT:    mov d2, v0.d[1]
+; CHECK-NEON-NEXT:    umull v2.8h, v2.8b, v1.8b
+; CHECK-NEON-NEXT:    umlal v2.8h, v0.8b, v1.8b
+; CHECK-NEON-NEXT:    str q2, [x0]
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: distribute:
+; CHECK-SVE:       // %bb.0: // %entry
+; CHECK-SVE-NEXT:    ldr q0, [x1]
+; CHECK-SVE-NEXT:    dup v1.8b, w2
+; CHECK-SVE-NEXT:    mov d2, v0.d[1]
+; CHECK-SVE-NEXT:    umull v2.8h, v2.8b, v1.8b
+; CHECK-SVE-NEXT:    umlal v2.8h, v0.8b, v1.8b
+; CHECK-SVE-NEXT:    str q2, [x0]
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: distribute:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ldr q0, [x1]
+; CHECK-GI-NEXT:    dup v2.8b, w2
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    uaddl v0.8h, v1.8b, v0.8b
+; CHECK-GI-NEXT:    ushll v1.8h, v2.8b, #0
+; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    str q0, [x0]
+; CHECK-GI-NEXT:    ret
 entry:
   %0 = trunc i32 %mul to i8
   %1 = insertelement <8 x i8> undef, i8 %0, i32 0
@@ -817,12 +1327,26 @@ entry:
 }
 
 define <16 x i16> @umull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
-; CHECK-LABEL: umull2_i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    umull2 v2.8h, v0.16b, v1.16b
-; CHECK-NEXT:    umull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    mov v1.16b, v2.16b
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: umull2_i8:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    umull2 v2.8h, v0.16b, v1.16b
+; CHECK-NEON-NEXT:    umull v0.8h, v0.8b, v1.8b
+; CHECK-NEON-NEXT:    mov v1.16b, v2.16b
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: umull2_i8:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    umull2 v2.8h, v0.16b, v1.16b
+; CHECK-SVE-NEXT:    umull v0.8h, v0.8b, v1.8b
+; CHECK-SVE-NEXT:    mov v1.16b, v2.16b
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: umull2_i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    umull v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    umull2 v1.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.16b, v2.16b
+; CHECK-GI-NEXT:    ret
   %arg1_ext = zext <16 x i8> %arg1 to <16 x i16>
   %arg2_ext = zext <16 x i8> %arg2 to <16 x i16>
   %mul = mul <16 x i16> %arg1_ext, %arg2_ext
@@ -830,12 +1354,26 @@ define <16 x i16> @umull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
 }
 
 define <16 x i16> @smull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
-; CHECK-LABEL: smull2_i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    smull2 v2.8h, v0.16b, v1.16b
-; CHECK-NEXT:    smull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    mov v1.16b, v2.16b
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: smull2_i8:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    smull2 v2.8h, v0.16b, v1.16b
+; CHECK-NEON-NEXT:    smull v0.8h, v0.8b, v1.8b
+; CHECK-NEON-NEXT:    mov v1.16b, v2.16b
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: smull2_i8:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    smull2 v2.8h, v0.16b, v1.16b
+; CHECK-SVE-NEXT:    smull v0.8h, v0.8b, v1.8b
+; CHECK-SVE-NEXT:    mov v1.16b, v2.16b
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: smull2_i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    smull v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    smull2 v1.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.16b, v2.16b
+; CHECK-GI-NEXT:    ret
   %arg1_ext = sext <16 x i8> %arg1 to <16 x i16>
   %arg2_ext = sext <16 x i8> %arg2 to <16 x i16>
   %mul = mul <16 x i16> %arg1_ext, %arg2_ext
@@ -843,12 +1381,26 @@ define <16 x i16> @smull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
 }
 
 define <8 x i32> @umull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
-; CHECK-LABEL: umull2_i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    umull2 v2.4s, v0.8h, v1.8h
-; CHECK-NEXT:    umull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT:    mov v1.16b, v2.16b
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: umull2_i16:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    umull2 v2.4s, v0.8h, v1.8h
+; CHECK-NEON-NEXT:    umull v0.4s, v0.4h, v1.4h
+; CHECK-NEON-NEXT:    mov v1.16b, v2.16b
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: umull2_i16:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    umull2 v2.4s, v0.8h, v1.8h
+; CHECK-SVE-NEXT:    umull v0.4s, v0.4h, v1.4h
+; CHECK-SVE-NEXT:    mov v1.16b, v2.16b
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: umull2_i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    umull v2.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    umull2 v1.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT:    mov v0.16b, v2.16b
+; CHECK-GI-NEXT:    ret
   %arg1_ext = zext <8 x i16> %arg1 to <8 x i32>
   %arg2_ext = zext <8 x i16> %arg2 to <8 x i32>
   %mul = mul <8 x i32> %arg1_ext, %arg2_ext
@@ -856,12 +1408,26 @@ define <8 x i32> @umull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
 }
 
 define <8 x i32> @smull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
-; CHECK-LABEL: smull2_i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    smull2 v2.4s, v0.8h, v1.8h
-; CHECK-NEXT:    smull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT:    mov v1.16b, v2.16b
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: smull2_i16:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    smull2 v2.4s, v0.8h, v1.8h
+; CHECK-NEON-NEXT:    smull v0.4s, v0.4h, v1.4h
+; CHECK-NEON-NEXT:    mov v1.16b, v2.16b
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: smull2_i16:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    smull2 v2.4s, v0.8h, v1.8h
+; CHECK-SVE-NEXT:    smull v0.4s, v0.4h, v1.4h
+; CHECK-SVE-NEXT:    mov v1.16b, v2.16b
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: smull2_i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    smull v2.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    smull2 v1.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT:    mov v0.16b, v2.16b
+; CHECK-GI-NEXT:    ret
   %arg1_ext = sext <8 x i16> %arg1 to <8 x i32>
   %arg2_ext = sext <8 x i16> %arg2 to <8 x i32>
   %mul = mul <8 x i32> %arg1_ext, %arg2_ext
@@ -869,12 +1435,26 @@ define <8 x i32> @smull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
 }
 
 define <4 x i64> @umull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
-; CHECK-LABEL: umull2_i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    umull2 v2.2d, v0.4s, v1.4s
-; CHECK-NEXT:    umull v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    mov v1.16b, v2.16b
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: umull2_i32:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    umull2 v2.2d, v0.4s, v1.4s
+; CHECK-NEON-NEXT:    umull v0.2d, v0.2s, v1.2s
+; CHECK-NEON-NEXT:    mov v1.16b, v2.16b
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: umull2_i32:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    umull2 v2.2d, v0.4s, v1.4s
+; CHECK-SVE-NEXT:    umull v0.2d, v0.2s, v1.2s
+; CHECK-SVE-NEXT:    mov v1.16b, v2.16b
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: umull2_i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    umull v2.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    umull2 v1.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT:    mov v0.16b, v2.16b
+; CHECK-GI-NEXT:    ret
   %arg1_ext = zext <4 x i32> %arg1 to <4 x i64>
   %arg2_ext = zext <4 x i32> %arg2 to <4 x i64>
   %mul = mul <4 x i64> %arg1_ext, %arg2_ext
@@ -882,12 +1462,26 @@ define <4 x i64> @umull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
 }
 
 define <4 x i64> @smull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
-; CHECK-LABEL: smull2_i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    smull2 v2.2d, v0.4s, v1.4s
-; CHECK-NEXT:    smull v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    mov v1.16b, v2.16b
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: smull2_i32:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    smull2 v2.2d, v0.4s, v1.4s
+; CHECK-NEON-NEXT:    smull v0.2d, v0.2s, v1.2s
+; CHECK-NEON-NEXT:    mov v1.16b, v2.16b
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: smull2_i32:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    smull2 v2.2d, v0.4s, v1.4s
+; CHECK-SVE-NEXT:    smull v0.2d, v0.2s, v1.2s
+; CHECK-SVE-NEXT:    mov v1.16b, v2.16b
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: smull2_i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    smull v2.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    smull2 v1.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT:    mov v0.16b, v2.16b
+; CHECK-GI-NEXT:    ret
   %arg1_ext = sext <4 x i32> %arg1 to <4 x i64>
   %arg2_ext = sext <4 x i32> %arg2 to <4 x i64>
   %mul = mul <4 x i64> %arg1_ext, %arg2_ext
@@ -895,14 +1489,33 @@ define <4 x i64> @smull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
 }
 
 define <16 x i16> @amull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
-; CHECK-LABEL: amull2_i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    smull v2.8h, v0.8b, v1.8b
-; CHECK-NEXT:    smull2 v1.8h, v0.16b, v1.16b
-; CHECK-NEXT:    bic v2.8h, #255, lsl #8
-; CHECK-NEXT:    bic v1.8h, #255, lsl #8
-; CHECK-NEXT:    mov v0.16b, v2.16b
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: amull2_i8:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    smull v2.8h, v0.8b, v1.8b
+; CHECK-NEON-NEXT:    smull2 v1.8h, v0.16b, v1.16b
+; CHECK-NEON-NEXT:    bic v2.8h, #255, lsl #8
+; CHECK-NEON-NEXT:    bic v1.8h, #255, lsl #8
+; CHECK-NEON-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: amull2_i8:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    smull v2.8h, v0.8b, v1.8b
+; CHECK-SVE-NEXT:    smull2 v1.8h, v0.16b, v1.16b
+; CHECK-SVE-NEXT:    bic v2.8h, #255, lsl #8
+; CHECK-SVE-NEXT:    bic v1.8h, #255, lsl #8
+; CHECK-SVE-NEXT:    mov v0.16b, v2.16b
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: amull2_i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    umull v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    umull2 v1.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT:    adrp x8, .LCPI53_0
+; CHECK-GI-NEXT:    ldr q3, [x8, :lo12:.LCPI53_0]
+; CHECK-GI-NEXT:    and v0.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT:    ret
   %arg1_ext = zext <16 x i8> %arg1 to <16 x i16>
   %arg2_ext = zext <16 x i8> %arg2 to <16 x i16>
   %mul = mul <16 x i16> %arg1_ext, %arg2_ext
@@ -911,14 +1524,33 @@ define <16 x i16> @amull2_i8(<16 x i8> %arg1, <16 x i8> %arg2) {
 }
 
 define <8 x i32> @amull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
-; CHECK-LABEL: amull2_i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.2d, #0x00ffff0000ffff
-; CHECK-NEXT:    smull v3.4s, v0.4h, v1.4h
-; CHECK-NEXT:    smull2 v0.4s, v0.8h, v1.8h
-; CHECK-NEXT:    and v1.16b, v0.16b, v2.16b
-; CHECK-NEXT:    and v0.16b, v3.16b, v2.16b
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: amull2_i16:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    movi v2.2d, #0x00ffff0000ffff
+; CHECK-NEON-NEXT:    smull v3.4s, v0.4h, v1.4h
+; CHECK-NEON-NEXT:    smull2 v0.4s, v0.8h, v1.8h
+; CHECK-NEON-NEXT:    and v1.16b, v0.16b, v2.16b
+; CHECK-NEON-NEXT:    and v0.16b, v3.16b, v2.16b
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: amull2_i16:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    movi v2.2d, #0x00ffff0000ffff
+; CHECK-SVE-NEXT:    smull v3.4s, v0.4h, v1.4h
+; CHECK-SVE-NEXT:    smull2 v0.4s, v0.8h, v1.8h
+; CHECK-SVE-NEXT:    and v1.16b, v0.16b, v2.16b
+; CHECK-SVE-NEXT:    and v0.16b, v3.16b, v2.16b
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: amull2_i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    umull v2.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    umull2 v1.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT:    adrp x8, .LCPI54_0
+; CHECK-GI-NEXT:    ldr q3, [x8, :lo12:.LCPI54_0]
+; CHECK-GI-NEXT:    and v0.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT:    ret
   %arg1_ext = zext <8 x i16> %arg1 to <8 x i32>
   %arg2_ext = zext <8 x i16> %arg2 to <8 x i32>
   %mul = mul <8 x i32> %arg1_ext, %arg2_ext
@@ -927,14 +1559,33 @@ define <8 x i32> @amull2_i16(<8 x i16> %arg1, <8 x i16> %arg2) {
 }
 
 define <4 x i64> @amull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
-; CHECK-LABEL: amull2_i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.2d, #0x000000ffffffff
-; CHECK-NEXT:    smull v3.2d, v0.2s, v1.2s
-; CHECK-NEXT:    smull2 v0.2d, v0.4s, v1.4s
-; CHECK-NEXT:    and v1.16b, v0.16b, v2.16b
-; CHECK-NEXT:    and v0.16b, v3.16b, v2.16b
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: amull2_i32:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    movi v2.2d, #0x000000ffffffff
+; CHECK-NEON-NEXT:    smull v3.2d, v0.2s, v1.2s
+; CHECK-NEON-NEXT:    smull2 v0.2d, v0.4s, v1.4s
+; CHECK-NEON-NEXT:    and v1.16b, v0.16b, v2.16b
+; CHECK-NEON-NEXT:    and v0.16b, v3.16b, v2.16b
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: amull2_i32:
+; CHECK-SVE:       // %bb.0:
+; CHECK-SVE-NEXT:    movi v2.2d, #0x000000ffffffff
+; CHECK-SVE-NEXT:    smull v3.2d, v0.2s, v1.2s
+; CHECK-SVE-NEXT:    smull2 v0.2d, v0.4s, v1.4s
+; CHECK-SVE-NEXT:    and v1.16b, v0.16b, v2.16b
+; CHECK-SVE-NEXT:    and v0.16b, v3.16b, v2.16b
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: amull2_i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    umull v2.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    umull2 v1.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT:    adrp x8, .LCPI55_0
+; CHECK-GI-NEXT:    ldr q3, [x8, :lo12:.LCPI55_0]
+; CHECK-GI-NEXT:    and v0.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT:    ret
   %arg1_ext = zext <4 x i32> %arg1 to <4 x i64>
   %arg2_ext = zext <4 x i32> %arg2 to <4 x i64>
   %mul = mul <4 x i64> %arg1_ext, %arg2_ext
@@ -944,12 +1595,28 @@ define <4 x i64> @amull2_i32(<4 x i32> %arg1, <4 x i32> %arg2) {
 
 
 define <8 x i16> @umull_and_v8i16(<8 x i8> %src1, <8 x i16> %src2) {
-; CHECK-LABEL: umull_and_v8i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    bic v1.8h, #255, lsl #8
-; CHECK-NEXT:    xtn v1.8b, v1.8h
-; CHECK-NEXT:    umull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: umull_and_v8i16:
+; CHECK-NEON:       // %bb.0: // %entry
+; CHECK-NEON-NEXT:    bic v1.8h, #255, lsl #8
+; CHECK-NEON-NEXT:    xtn v1.8b, v1.8h
+; CHECK-NEON-NEXT:    umull v0.8h, v0.8b, v1.8b
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: umull_and_v8i16:
+; CHECK-SVE:       // %bb.0: // %entry
+; CHECK-SVE-NEXT:    bic v1.8h, #255, lsl #8
+; CHECK-SVE-NEXT:    xtn v1.8b, v1.8h
+; CHECK-SVE-NEXT:    umull v0.8h, v0.8b, v1.8b
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: umull_and_v8i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI56_0
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI56_0]
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    ret
 entry:
   %in1 = zext <8 x i8> %src1 to <8 x i16>
   %in2 = and <8 x i16> %src2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
@@ -958,12 +1625,28 @@ entry:
 }
 
 define <8 x i16> @umull_and_v8i16_c(<8 x i8> %src1, <8 x i16> %src2) {
-; CHECK-LABEL: umull_and_v8i16_c:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    bic v1.8h, #255, lsl #8
-; CHECK-NEXT:    xtn v1.8b, v1.8h
-; CHECK-NEXT:    umull v0.8h, v1.8b, v0.8b
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: umull_and_v8i16_c:
+; CHECK-NEON:       // %bb.0: // %entry
+; CHECK-NEON-NEXT:    bic v1.8h, #255, lsl #8
+; CHECK-NEON-NEXT:    xtn v1.8b, v1.8h
+; CHECK-NEON-NEXT:    umull v0.8h, v1.8b, v0.8b
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: umull_and_v8i16_c:
+; CHECK-SVE:       // %bb.0: // %entry
+; CHECK-SVE-NEXT:    bic v1.8h, #255, lsl #8
+; CHECK-SVE-NEXT:    xtn v1.8b, v1.8h
+; CHECK-SVE-NEXT:    umull v0.8h, v1.8b, v0.8b
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: umull_and_v8i16_c:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI57_0
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI57_0]
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT:    mul v0.8h, v1.8h, v0.8h
+; CHECK-GI-NEXT:    ret
 entry:
   %in1 = zext <8 x i8> %src1 to <8 x i16>
   %in2 = and <8 x i16> %src2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
@@ -972,13 +1655,30 @@ entry:
 }
 
 define <8 x i16> @umull_and256_v8i16(<8 x i8> %src1, <8 x i16> %src2) {
-; CHECK-LABEL: umull_and256_v8i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v2.8h, #1, lsl #8
-; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    mul v0.8h, v0.8h, v1.8h
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: umull_and256_v8i16:
+; CHECK-NEON:       // %bb.0: // %entry
+; CHECK-NEON-NEXT:    movi v2.8h, #1, lsl #8
+; CHECK-NEON-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-NEON-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-NEON-NEXT:    mul v0.8h, v0.8h, v1.8h
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: umull_and256_v8i16:
+; CHECK-SVE:       // %bb.0: // %entry
+; CHECK-SVE-NEXT:    movi v2.8h, #1, lsl #8
+; CHECK-SVE-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SVE-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-SVE-NEXT:    mul v0.8h, v0.8h, v1.8h
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: umull_and256_v8i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI58_0
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI58_0]
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    ret
 entry:
   %in1 = zext <8 x i8> %src1 to <8 x i16>
   %in2 = and <8 x i16> %src2, <i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256>
@@ -987,11 +1687,25 @@ entry:
 }
 
 define <8 x i16> @umull_andconst_v8i16(<8 x i8> %src1, <8 x i16> %src2) {
-; CHECK-LABEL: umull_andconst_v8i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v1.2d, #0xffffffffffffffff
-; CHECK-NEXT:    umull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: umull_andconst_v8i16:
+; CHECK-NEON:       // %bb.0: // %entry
+; CHECK-NEON-NEXT:    movi v1.2d, #0xffffffffffffffff
+; CHECK-NEON-NEXT:    umull v0.8h, v0.8b, v1.8b
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: umull_andconst_v8i16:
+; CHECK-SVE:       // %bb.0: // %entry
+; CHECK-SVE-NEXT:    movi v1.2d, #0xffffffffffffffff
+; CHECK-SVE-NEXT:    umull v0.8h, v0.8b, v1.8b
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: umull_andconst_v8i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI59_0
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI59_0]
+; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    ret
 entry:
   %in1 = zext <8 x i8> %src1 to <8 x i16>
   %out = mul nsw <8 x i16> %in1, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
@@ -999,14 +1713,35 @@ entry:
 }
 
 define <8 x i16> @umull_smaller_v8i16(<8 x i4> %src1, <8 x i16> %src2) {
-; CHECK-LABEL: umull_smaller_v8i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v2.8b, #15
-; CHECK-NEXT:    bic v1.8h, #255, lsl #8
-; CHECK-NEXT:    xtn v1.8b, v1.8h
-; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT:    umull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: umull_smaller_v8i16:
+; CHECK-NEON:       // %bb.0: // %entry
+; CHECK-NEON-NEXT:    movi v2.8b, #15
+; CHECK-NEON-NEXT:    bic v1.8h, #255, lsl #8
+; CHECK-NEON-NEXT:    xtn v1.8b, v1.8h
+; CHECK-NEON-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-NEON-NEXT:    umull v0.8h, v0.8b, v1.8b
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: umull_smaller_v8i16:
+; CHECK-SVE:       // %bb.0: // %entry
+; CHECK-SVE-NEXT:    movi v2.8b, #15
+; CHECK-SVE-NEXT:    bic v1.8h, #255, lsl #8
+; CHECK-SVE-NEXT:    xtn v1.8b, v1.8h
+; CHECK-SVE-NEXT:    and v0.8b, v0.8b, v2.8b
+; CHECK-SVE-NEXT:    umull v0.8h, v0.8b, v1.8b
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: umull_smaller_v8i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI60_1
+; CHECK-GI-NEXT:    adrp x9, .LCPI60_0
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI60_1]
+; CHECK-GI-NEXT:    ldr q3, [x9, :lo12:.LCPI60_0]
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT:    mul v0.8h, v0.8h, v1.8h
+; CHECK-GI-NEXT:    ret
 entry:
   %in1 = zext <8 x i4> %src1 to <8 x i16>
   %in2 = and <8 x i16> %src2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
@@ -1015,13 +1750,30 @@ entry:
 }
 
 define <4 x i32> @umull_and_v4i32(<4 x i16> %src1, <4 x i32> %src2) {
-; CHECK-LABEL: umull_and_v4i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v2.2d, #0x0000ff000000ff
-; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    xtn v1.4h, v1.4s
-; CHECK-NEXT:    umull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: umull_and_v4i32:
+; CHECK-NEON:       // %bb.0: // %entry
+; CHECK-NEON-NEXT:    movi v2.2d, #0x0000ff000000ff
+; CHECK-NEON-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-NEON-NEXT:    xtn v1.4h, v1.4s
+; CHECK-NEON-NEXT:    umull v0.4s, v0.4h, v1.4h
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: umull_and_v4i32:
+; CHECK-SVE:       // %bb.0: // %entry
+; CHECK-SVE-NEXT:    movi v2.2d, #0x0000ff000000ff
+; CHECK-SVE-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-SVE-NEXT:    xtn v1.4h, v1.4s
+; CHECK-SVE-NEXT:    umull v0.4s, v0.4h, v1.4h
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: umull_and_v4i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI61_0
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI61_0]
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    ret
 entry:
   %in1 = zext <4 x i16> %src1 to <4 x i32>
   %in2 = and <4 x i32> %src2, <i32 255, i32 255, i32 255, i32 255>
@@ -1030,15 +1782,37 @@ entry:
 }
 
 define <8 x i32> @umull_and_v8i32(<8 x i16> %src1, <8 x i32> %src2) {
-; CHECK-LABEL: umull_and_v8i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v3.2d, #0x0000ff000000ff
-; CHECK-NEXT:    and v2.16b, v2.16b, v3.16b
-; CHECK-NEXT:    and v1.16b, v1.16b, v3.16b
-; CHECK-NEXT:    uzp1 v2.8h, v1.8h, v2.8h
-; CHECK-NEXT:    umull2 v1.4s, v0.8h, v2.8h
-; CHECK-NEXT:    umull v0.4s, v0.4h, v2.4h
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: umull_and_v8i32:
+; CHECK-NEON:       // %bb.0: // %entry
+; CHECK-NEON-NEXT:    movi v3.2d, #0x0000ff000000ff
+; CHECK-NEON-NEXT:    and v2.16b, v2.16b, v3.16b
+; CHECK-NEON-NEXT:    and v1.16b, v1.16b, v3.16b
+; CHECK-NEON-NEXT:    uzp1 v2.8h, v1.8h, v2.8h
+; CHECK-NEON-NEXT:    umull2 v1.4s, v0.8h, v2.8h
+; CHECK-NEON-NEXT:    umull v0.4s, v0.4h, v2.4h
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: umull_and_v8i32:
+; CHECK-SVE:       // %bb.0: // %entry
+; CHECK-SVE-NEXT:    movi v3.2d, #0x0000ff000000ff
+; CHECK-SVE-NEXT:    and v2.16b, v2.16b, v3.16b
+; CHECK-SVE-NEXT:    and v1.16b, v1.16b, v3.16b
+; CHECK-SVE-NEXT:    uzp1 v2.8h, v1.8h, v2.8h
+; CHECK-SVE-NEXT:    umull2 v1.4s, v0.8h, v2.8h
+; CHECK-SVE-NEXT:    umull v0.4s, v0.4h, v2.4h
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: umull_and_v8i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI62_0
+; CHECK-GI-NEXT:    ushll v4.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v5.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ldr q3, [x8, :lo12:.LCPI62_0]
+; CHECK-GI-NEXT:    and v0.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT:    and v1.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT:    mul v0.4s, v4.4s, v0.4s
+; CHECK-GI-NEXT:    mul v1.4s, v5.4s, v1.4s
+; CHECK-GI-NEXT:    ret
 entry:
   %in1 = zext <8 x i16> %src1 to <8 x i32>
   %in2 = and <8 x i32> %src2, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
@@ -1047,13 +1821,31 @@ entry:
 }
 
 define <8 x i32> @umull_and_v8i32_dup(<8 x i16> %src1, i32 %src2) {
-; CHECK-LABEL: umull_and_v8i32_dup:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and w8, w0, #0xff
-; CHECK-NEXT:    dup v2.8h, w8
-; CHECK-NEXT:    umull2 v1.4s, v0.8h, v2.8h
-; CHECK-NEXT:    umull v0.4s, v0.4h, v2.4h
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: umull_and_v8i32_dup:
+; CHECK-NEON:       // %bb.0: // %entry
+; CHECK-NEON-NEXT:    and w8, w0, #0xff
+; CHECK-NEON-NEXT:    dup v2.8h, w8
+; CHECK-NEON-NEXT:    umull2 v1.4s, v0.8h, v2.8h
+; CHECK-NEON-NEXT:    umull v0.4s, v0.4h, v2.4h
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: umull_and_v8i32_dup:
+; CHECK-SVE:       // %bb.0: // %entry
+; CHECK-SVE-NEXT:    and w8, w0, #0xff
+; CHECK-SVE-NEXT:    dup v2.8h, w8
+; CHECK-SVE-NEXT:    umull2 v1.4s, v0.8h, v2.8h
+; CHECK-SVE-NEXT:    umull v0.4s, v0.4h, v2.4h
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: umull_and_v8i32_dup:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    and w8, w0, #0xff
+; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v2.4s, v0.8h, #0
+; CHECK-GI-NEXT:    dup v3.4s, w8
+; CHECK-GI-NEXT:    mul v0.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT:    mul v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT:    ret
 entry:
   %in1 = zext <8 x i16> %src1 to <8 x i32>
   %in2 = and i32 %src2, 255
@@ -1064,13 +1856,39 @@ entry:
 }
 
 define <2 x i64> @umull_and_v2i64(<2 x i32> %src1, <2 x i64> %src2) {
-; CHECK-LABEL: umull_and_v2i64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v2.2d, #0x000000000000ff
-; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    xtn v1.2s, v1.2d
-; CHECK-NEXT:    umull v0.2d, v0.2s, v1.2s
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: umull_and_v2i64:
+; CHECK-NEON:       // %bb.0: // %entry
+; CHECK-NEON-NEXT:    movi v2.2d, #0x000000000000ff
+; CHECK-NEON-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-NEON-NEXT:    xtn v1.2s, v1.2d
+; CHECK-NEON-NEXT:    umull v0.2d, v0.2s, v1.2s
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: umull_and_v2i64:
+; CHECK-SVE:       // %bb.0: // %entry
+; CHECK-SVE-NEXT:    movi v2.2d, #0x000000000000ff
+; CHECK-SVE-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-SVE-NEXT:    xtn v1.2s, v1.2d
+; CHECK-SVE-NEXT:    umull v0.2d, v0.2s, v1.2s
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: umull_and_v2i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI64_0
+; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI64_0]
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT:    mov d2, v0.d[1]
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    mov d3, v1.d[1]
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    mul x8, x8, x9
+; CHECK-GI-NEXT:    fmov x9, d2
+; CHECK-GI-NEXT:    fmov x10, d3
+; CHECK-GI-NEXT:    mul x9, x9, x10
+; CHECK-GI-NEXT:    fmov d0, x8
+; CHECK-GI-NEXT:    mov v0.d[1], x9
+; CHECK-GI-NEXT:    ret
 entry:
   %in1 = zext <2 x i32> %src1 to <2 x i64>
   %in2 = and <2 x i64> %src2, <i64 255, i64 255>
@@ -1079,15 +1897,55 @@ entry:
 }
 
 define <4 x i64> @umull_and_v4i64(<4 x i32> %src1, <4 x i64> %src2) {
-; CHECK-LABEL: umull_and_v4i64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v3.2d, #0x000000000000ff
-; CHECK-NEXT:    and v2.16b, v2.16b, v3.16b
-; CHECK-NEXT:    and v1.16b, v1.16b, v3.16b
-; CHECK-NEXT:    uzp1 v2.4s, v1.4s, v2.4s
-; CHECK-NEXT:    umull2 v1.2d, v0.4s, v2.4s
-; CHECK-NEXT:    umull v0.2d, v0.2s, v2.2s
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: umull_and_v4i64:
+; CHECK-NEON:       // %bb.0: // %entry
+; CHECK-NEON-NEXT:    movi v3.2d, #0x000000000000ff
+; CHECK-NEON-NEXT:    and v2.16b, v2.16b, v3.16b
+; CHECK-NEON-NEXT:    and v1.16b, v1.16b, v3.16b
+; CHECK-NEON-NEXT:    uzp1 v2.4s, v1.4s, v2.4s
+; CHECK-NEON-NEXT:    umull2 v1.2d, v0.4s, v2.4s
+; CHECK-NEON-NEXT:    umull v0.2d, v0.2s, v2.2s
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: umull_and_v4i64:
+; CHECK-SVE:       // %bb.0: // %entry
+; CHECK-SVE-NEXT:    movi v3.2d, #0x000000000000ff
+; CHECK-SVE-NEXT:    and v2.16b, v2.16b, v3.16b
+; CHECK-SVE-NEXT:    and v1.16b, v1.16b, v3.16b
+; CHECK-SVE-NEXT:    uzp1 v2.4s, v1.4s, v2.4s
+; CHECK-SVE-NEXT:    umull2 v1.2d, v0.4s, v2.4s
+; CHECK-SVE-NEXT:    umull v0.2d, v0.2s, v2.2s
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: umull_and_v4i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    adrp x8, .LCPI65_0
+; CHECK-GI-NEXT:    ushll v4.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-GI-NEXT:    ldr q3, [x8, :lo12:.LCPI65_0]
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v3.16b
+; CHECK-GI-NEXT:    and v2.16b, v2.16b, v3.16b
+; CHECK-GI-NEXT:    fmov x8, d4
+; CHECK-GI-NEXT:    mov d3, v4.d[1]
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    mov d4, v1.d[1]
+; CHECK-GI-NEXT:    fmov x10, d2
+; CHECK-GI-NEXT:    mov d1, v0.d[1]
+; CHECK-GI-NEXT:    mul x8, x8, x9
+; CHECK-GI-NEXT:    fmov x9, d0
+; CHECK-GI-NEXT:    mov d0, v2.d[1]
+; CHECK-GI-NEXT:    fmov x11, d4
+; CHECK-GI-NEXT:    mul x9, x9, x10
+; CHECK-GI-NEXT:    fmov x10, d3
+; CHECK-GI-NEXT:    fmov x12, d0
+; CHECK-GI-NEXT:    fmov d0, x8
+; CHECK-GI-NEXT:    mul x10, x10, x11
+; CHECK-GI-NEXT:    fmov x11, d1
+; CHECK-GI-NEXT:    fmov d1, x9
+; CHECK-GI-NEXT:    mul x11, x11, x12
+; CHECK-GI-NEXT:    mov v0.d[1], x10
+; CHECK-GI-NEXT:    mov v1.d[1], x11
+; CHECK-GI-NEXT:    ret
 entry:
   %in1 = zext <4 x i32> %src1 to <4 x i64>
   %in2 = and <4 x i64> %src2, <i64 255, i64 255, i64 255, i64 255>
@@ -1096,13 +1954,46 @@ entry:
 }
 
 define <4 x i64> @umull_and_v4i64_dup(<4 x i32> %src1, i64 %src2) {
-; CHECK-LABEL: umull_and_v4i64_dup:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and w8, w0, #0xff
-; CHECK-NEXT:    dup v2.4s, w8
-; CHECK-NEXT:    umull2 v1.2d, v0.4s, v2.4s
-; CHECK-NEXT:    umull v0.2d, v0.2s, v2.2s
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: umull_and_v4i64_dup:
+; CHECK-NEON:       // %bb.0: // %entry
+; CHECK-NEON-NEXT:    and w8, w0, #0xff
+; CHECK-NEON-NEXT:    dup v2.4s, w8
+; CHECK-NEON-NEXT:    umull2 v1.2d, v0.4s, v2.4s
+; CHECK-NEON-NEXT:    umull v0.2d, v0.2s, v2.2s
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-SVE-LABEL: umull_and_v4i64_dup:
+; CHECK-SVE:       // %bb.0: // %entry
+; CHECK-SVE-NEXT:    and w8, w0, #0xff
+; CHECK-SVE-NEXT:    dup v2.4s, w8
+; CHECK-SVE-NEXT:    umull2 v1.2d, v0.4s, v2.4s
+; CHECK-SVE-NEXT:    umull v0.2d, v0.2s, v2.2s
+; CHECK-SVE-NEXT:    ret
+;
+; CHECK-GI-LABEL: umull_and_v4i64_dup:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    and x8, x0, #0xff
+; CHECK-GI-NEXT:    ushll v1.2d, v0.2s, #0
+; CHECK-GI-NEXT:    ushll2 v0.2d, v0.4s, #0
+; CHECK-GI-NEXT:    dup v2.2d, x8
+; CHECK-GI-NEXT:    mov d3, v1.d[1]
+; CHECK-GI-NEXT:    fmov x8, d1
+; CHECK-GI-NEXT:    fmov x10, d0
+; CHECK-GI-NEXT:    mov d1, v2.d[1]
+; CHECK-GI-NEXT:    fmov x9, d2
+; CHECK-GI-NEXT:    mov d2, v0.d[1]
+; CHECK-GI-NEXT:    mul x8, x8, x9
+; CHECK-GI-NEXT:    fmov x11, d1
+; CHECK-GI-NEXT:    fmov x12, d2
+; CHECK-GI-NEXT:    mul x9, x10, x9
+; CHECK-GI-NEXT:    fmov x10, d3
+; CHECK-GI-NEXT:    mul x10, x10, x11
+; CHECK-GI-NEXT:    fmov d0, x8
+; CHECK-GI-NEXT:    mul x11, x12, x11
+; CHECK-GI-NEXT:    fmov d1, x9
+; CHECK-GI-NEXT:    mov v0.d[1], x10
+; CHECK-GI-NEXT:    mov v1.d[1], x11
+; CHECK-GI-NEXT:    ret
 entry:
   %in1 = zext <4 x i32> %src1 to <4 x i64>
   %in2 = and i64 %src2, 255
diff --git a/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll b/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll
index a5154641400309..4c0d1efb99498f 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll
@@ -1,15 +1,23 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple aarch64-unknown-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple aarch64-unknown-linux-gnu | FileCheck %s --check-prefixes=CHECK-SD
+; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK-GI
 
 ; Tests for wider-than-legal extensions into mul/mla.
 
 define <16 x i16> @mul_i16(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: mul_i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    umull2 v2.8h, v0.16b, v1.16b
-; CHECK-NEXT:    umull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    mov v1.16b, v2.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: mul_i16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    umull2 v2.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    umull v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    mov v1.16b, v2.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: mul_i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    umull v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    umull2 v1.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.16b, v2.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %ea = zext <16 x i8> %a to <16 x i16>
   %eb = zext <16 x i8> %b to <16 x i16>
@@ -18,17 +26,29 @@ entry:
 }
 
 define <16 x i32> @mul_i32(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: mul_i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v2.8h, v0.8b, #0
-; CHECK-NEXT:    ushll v4.8h, v1.8b, #0
-; CHECK-NEXT:    ushll2 v5.8h, v0.16b, #0
-; CHECK-NEXT:    ushll2 v6.8h, v1.16b, #0
-; CHECK-NEXT:    umull v0.4s, v2.4h, v4.4h
-; CHECK-NEXT:    umull2 v1.4s, v2.8h, v4.8h
-; CHECK-NEXT:    umull2 v3.4s, v5.8h, v6.8h
-; CHECK-NEXT:    umull v2.4s, v5.4h, v6.4h
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: mul_i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-SD-NEXT:    ushll v4.8h, v1.8b, #0
+; CHECK-SD-NEXT:    ushll2 v5.8h, v0.16b, #0
+; CHECK-SD-NEXT:    ushll2 v6.8h, v1.16b, #0
+; CHECK-SD-NEXT:    umull v0.4s, v2.4h, v4.4h
+; CHECK-SD-NEXT:    umull2 v1.4s, v2.8h, v4.8h
+; CHECK-SD-NEXT:    umull2 v3.4s, v5.8h, v6.8h
+; CHECK-SD-NEXT:    umull v2.4s, v5.4h, v6.4h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: mul_i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-GI-NEXT:    ushll2 v4.8h, v0.16b, #0
+; CHECK-GI-NEXT:    ushll2 v5.8h, v1.16b, #0
+; CHECK-GI-NEXT:    umull v0.4s, v2.4h, v3.4h
+; CHECK-GI-NEXT:    umull2 v1.4s, v2.8h, v3.8h
+; CHECK-GI-NEXT:    umull v2.4s, v4.4h, v5.4h
+; CHECK-GI-NEXT:    umull2 v3.4s, v4.8h, v5.8h
+; CHECK-GI-NEXT:    ret
 entry:
   %ea = zext <16 x i8> %a to <16 x i32>
   %eb = zext <16 x i8> %b to <16 x i32>
@@ -37,29 +57,53 @@ entry:
 }
 
 define <16 x i64> @mul_i64(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: mul_i64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v2.8h, v0.8b, #0
-; CHECK-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-NEXT:    ushll v3.8h, v1.8b, #0
-; CHECK-NEXT:    ushll2 v1.8h, v1.16b, #0
-; CHECK-NEXT:    ushll v4.4s, v2.4h, #0
-; CHECK-NEXT:    ushll v5.4s, v0.4h, #0
-; CHECK-NEXT:    ushll v6.4s, v3.4h, #0
-; CHECK-NEXT:    ushll2 v2.4s, v2.8h, #0
-; CHECK-NEXT:    ushll v16.4s, v1.4h, #0
-; CHECK-NEXT:    ushll2 v7.4s, v3.8h, #0
-; CHECK-NEXT:    ushll2 v17.4s, v0.8h, #0
-; CHECK-NEXT:    ushll2 v18.4s, v1.8h, #0
-; CHECK-NEXT:    umull2 v1.2d, v4.4s, v6.4s
-; CHECK-NEXT:    umull v0.2d, v4.2s, v6.2s
-; CHECK-NEXT:    umull2 v3.2d, v2.4s, v7.4s
-; CHECK-NEXT:    umull v2.2d, v2.2s, v7.2s
-; CHECK-NEXT:    umull v4.2d, v5.2s, v16.2s
-; CHECK-NEXT:    umull2 v7.2d, v17.4s, v18.4s
-; CHECK-NEXT:    umull2 v5.2d, v5.4s, v16.4s
-; CHECK-NEXT:    umull v6.2d, v17.2s, v18.2s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: mul_i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-SD-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-SD-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-SD-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-SD-NEXT:    ushll v4.4s, v2.4h, #0
+; CHECK-SD-NEXT:    ushll v5.4s, v0.4h, #0
+; CHECK-SD-NEXT:    ushll v6.4s, v3.4h, #0
+; CHECK-SD-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-SD-NEXT:    ushll v16.4s, v1.4h, #0
+; CHECK-SD-NEXT:    ushll2 v7.4s, v3.8h, #0
+; CHECK-SD-NEXT:    ushll2 v17.4s, v0.8h, #0
+; CHECK-SD-NEXT:    ushll2 v18.4s, v1.8h, #0
+; CHECK-SD-NEXT:    umull2 v1.2d, v4.4s, v6.4s
+; CHECK-SD-NEXT:    umull v0.2d, v4.2s, v6.2s
+; CHECK-SD-NEXT:    umull2 v3.2d, v2.4s, v7.4s
+; CHECK-SD-NEXT:    umull v2.2d, v2.2s, v7.2s
+; CHECK-SD-NEXT:    umull v4.2d, v5.2s, v16.2s
+; CHECK-SD-NEXT:    umull2 v7.2d, v17.4s, v18.4s
+; CHECK-SD-NEXT:    umull2 v5.2d, v5.4s, v16.4s
+; CHECK-SD-NEXT:    umull v6.2d, v17.2s, v18.2s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: mul_i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT:    ushll v4.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ushll2 v5.4s, v2.8h, #0
+; CHECK-GI-NEXT:    ushll v2.4s, v3.4h, #0
+; CHECK-GI-NEXT:    ushll v6.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v3.4s, v3.8h, #0
+; CHECK-GI-NEXT:    ushll v7.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll2 v16.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll2 v17.4s, v1.8h, #0
+; CHECK-GI-NEXT:    umull v0.2d, v4.2s, v2.2s
+; CHECK-GI-NEXT:    umull2 v1.2d, v4.4s, v2.4s
+; CHECK-GI-NEXT:    umull v2.2d, v5.2s, v3.2s
+; CHECK-GI-NEXT:    umull2 v3.2d, v5.4s, v3.4s
+; CHECK-GI-NEXT:    umull v4.2d, v6.2s, v7.2s
+; CHECK-GI-NEXT:    umull2 v5.2d, v6.4s, v7.4s
+; CHECK-GI-NEXT:    umull v6.2d, v16.2s, v17.2s
+; CHECK-GI-NEXT:    umull2 v7.2d, v16.4s, v17.4s
+; CHECK-GI-NEXT:    ret
 entry:
   %ea = zext <16 x i8> %a to <16 x i64>
   %eb = zext <16 x i8> %b to <16 x i64>
@@ -69,13 +113,21 @@ entry:
 
 
 define <16 x i16> @mla_i16(<16 x i8> %a, <16 x i8> %b, <16 x i16> %c) {
-; CHECK-LABEL: mla_i16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    umlal2 v3.8h, v0.16b, v1.16b
-; CHECK-NEXT:    umlal v2.8h, v0.8b, v1.8b
-; CHECK-NEXT:    mov v0.16b, v2.16b
-; CHECK-NEXT:    mov v1.16b, v3.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: mla_i16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    umlal2 v3.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT:    umlal v2.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT:    mov v0.16b, v2.16b
+; CHECK-SD-NEXT:    mov v1.16b, v3.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: mla_i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    umlal v2.8h, v0.8b, v1.8b
+; CHECK-GI-NEXT:    umlal2 v3.8h, v0.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.16b, v2.16b
+; CHECK-GI-NEXT:    mov v1.16b, v3.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %ea = zext <16 x i8> %a to <16 x i16>
   %eb = zext <16 x i8> %b to <16 x i16>
@@ -85,21 +137,37 @@ entry:
 }
 
 define <16 x i32> @mla_i32(<16 x i8> %a, <16 x i8> %b, <16 x i32> %c) {
-; CHECK-LABEL: mla_i32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v6.8h, v0.8b, #0
-; CHECK-NEXT:    ushll v7.8h, v1.8b, #0
-; CHECK-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-NEXT:    ushll2 v1.8h, v1.16b, #0
-; CHECK-NEXT:    umlal v2.4s, v6.4h, v7.4h
-; CHECK-NEXT:    umlal2 v3.4s, v6.8h, v7.8h
-; CHECK-NEXT:    umlal2 v5.4s, v0.8h, v1.8h
-; CHECK-NEXT:    umlal v4.4s, v0.4h, v1.4h
-; CHECK-NEXT:    mov v0.16b, v2.16b
-; CHECK-NEXT:    mov v1.16b, v3.16b
-; CHECK-NEXT:    mov v2.16b, v4.16b
-; CHECK-NEXT:    mov v3.16b, v5.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: mla_i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll v6.8h, v0.8b, #0
+; CHECK-SD-NEXT:    ushll v7.8h, v1.8b, #0
+; CHECK-SD-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-SD-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-SD-NEXT:    umlal v2.4s, v6.4h, v7.4h
+; CHECK-SD-NEXT:    umlal2 v3.4s, v6.8h, v7.8h
+; CHECK-SD-NEXT:    umlal2 v5.4s, v0.8h, v1.8h
+; CHECK-SD-NEXT:    umlal v4.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT:    mov v0.16b, v2.16b
+; CHECK-SD-NEXT:    mov v1.16b, v3.16b
+; CHECK-SD-NEXT:    mov v2.16b, v4.16b
+; CHECK-SD-NEXT:    mov v3.16b, v5.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: mla_i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v6.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v7.8h, v1.8b, #0
+; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT:    umlal v2.4s, v6.4h, v7.4h
+; CHECK-GI-NEXT:    umlal2 v3.4s, v6.8h, v7.8h
+; CHECK-GI-NEXT:    umlal v4.4s, v0.4h, v1.4h
+; CHECK-GI-NEXT:    umlal2 v5.4s, v0.8h, v1.8h
+; CHECK-GI-NEXT:    mov v0.16b, v2.16b
+; CHECK-GI-NEXT:    mov v1.16b, v3.16b
+; CHECK-GI-NEXT:    mov v2.16b, v4.16b
+; CHECK-GI-NEXT:    mov v3.16b, v5.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %ea = zext <16 x i8> %a to <16 x i32>
   %eb = zext <16 x i8> %b to <16 x i32>
@@ -109,38 +177,71 @@ entry:
 }
 
 define <16 x i64> @mla_i64(<16 x i8> %a, <16 x i8> %b, <16 x i64> %c) {
-; CHECK-LABEL: mla_i64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov v17.16b, v7.16b
-; CHECK-NEXT:    mov v16.16b, v6.16b
-; CHECK-NEXT:    ushll v6.8h, v0.8b, #0
-; CHECK-NEXT:    ushll2 v0.8h, v0.16b, #0
-; CHECK-NEXT:    ushll v7.8h, v1.8b, #0
-; CHECK-NEXT:    ushll2 v1.8h, v1.16b, #0
-; CHECK-NEXT:    ushll v18.4s, v6.4h, #0
-; CHECK-NEXT:    ushll2 v21.4s, v6.8h, #0
-; CHECK-NEXT:    ushll v19.4s, v0.4h, #0
-; CHECK-NEXT:    ushll v20.4s, v7.4h, #0
-; CHECK-NEXT:    ushll v22.4s, v1.4h, #0
-; CHECK-NEXT:    ushll2 v23.4s, v7.8h, #0
-; CHECK-NEXT:    ldp q6, q7, [sp]
-; CHECK-NEXT:    ushll2 v0.4s, v0.8h, #0
-; CHECK-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-NEXT:    umlal2 v3.2d, v18.4s, v20.4s
-; CHECK-NEXT:    umlal v2.2d, v18.2s, v20.2s
-; CHECK-NEXT:    umlal v16.2d, v19.2s, v22.2s
-; CHECK-NEXT:    umlal2 v5.2d, v21.4s, v23.4s
-; CHECK-NEXT:    umlal v4.2d, v21.2s, v23.2s
-; CHECK-NEXT:    umlal2 v17.2d, v19.4s, v22.4s
-; CHECK-NEXT:    umlal2 v7.2d, v0.4s, v1.4s
-; CHECK-NEXT:    umlal v6.2d, v0.2s, v1.2s
-; CHECK-NEXT:    mov v0.16b, v2.16b
-; CHECK-NEXT:    mov v1.16b, v3.16b
-; CHECK-NEXT:    mov v2.16b, v4.16b
-; CHECK-NEXT:    mov v3.16b, v5.16b
-; CHECK-NEXT:    mov v4.16b, v16.16b
-; CHECK-NEXT:    mov v5.16b, v17.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: mla_i64:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    mov v17.16b, v7.16b
+; CHECK-SD-NEXT:    mov v16.16b, v6.16b
+; CHECK-SD-NEXT:    ushll v6.8h, v0.8b, #0
+; CHECK-SD-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-SD-NEXT:    ushll v7.8h, v1.8b, #0
+; CHECK-SD-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-SD-NEXT:    ushll v18.4s, v6.4h, #0
+; CHECK-SD-NEXT:    ushll2 v21.4s, v6.8h, #0
+; CHECK-SD-NEXT:    ushll v19.4s, v0.4h, #0
+; CHECK-SD-NEXT:    ushll v20.4s, v7.4h, #0
+; CHECK-SD-NEXT:    ushll v22.4s, v1.4h, #0
+; CHECK-SD-NEXT:    ushll2 v23.4s, v7.8h, #0
+; CHECK-SD-NEXT:    ldp q6, q7, [sp]
+; CHECK-SD-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-SD-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-SD-NEXT:    umlal2 v3.2d, v18.4s, v20.4s
+; CHECK-SD-NEXT:    umlal v2.2d, v18.2s, v20.2s
+; CHECK-SD-NEXT:    umlal v16.2d, v19.2s, v22.2s
+; CHECK-SD-NEXT:    umlal2 v5.2d, v21.4s, v23.4s
+; CHECK-SD-NEXT:    umlal v4.2d, v21.2s, v23.2s
+; CHECK-SD-NEXT:    umlal2 v17.2d, v19.4s, v22.4s
+; CHECK-SD-NEXT:    umlal2 v7.2d, v0.4s, v1.4s
+; CHECK-SD-NEXT:    umlal v6.2d, v0.2s, v1.2s
+; CHECK-SD-NEXT:    mov v0.16b, v2.16b
+; CHECK-SD-NEXT:    mov v1.16b, v3.16b
+; CHECK-SD-NEXT:    mov v2.16b, v4.16b
+; CHECK-SD-NEXT:    mov v3.16b, v5.16b
+; CHECK-SD-NEXT:    mov v4.16b, v16.16b
+; CHECK-SD-NEXT:    mov v5.16b, v17.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: mla_i64:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov v16.16b, v2.16b
+; CHECK-GI-NEXT:    mov v17.16b, v3.16b
+; CHECK-GI-NEXT:    mov v2.16b, v4.16b
+; CHECK-GI-NEXT:    mov v3.16b, v5.16b
+; CHECK-GI-NEXT:    mov v4.16b, v6.16b
+; CHECK-GI-NEXT:    mov v5.16b, v7.16b
+; CHECK-GI-NEXT:    ushll v6.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v7.8h, v1.8b, #0
+; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT:    ushll v18.4s, v6.4h, #0
+; CHECK-GI-NEXT:    ushll v20.4s, v7.4h, #0
+; CHECK-GI-NEXT:    ushll2 v19.4s, v6.8h, #0
+; CHECK-GI-NEXT:    ushll v21.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll2 v22.4s, v7.8h, #0
+; CHECK-GI-NEXT:    ushll v23.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ldp q6, q7, [sp]
+; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-GI-NEXT:    umlal v16.2d, v18.2s, v20.2s
+; CHECK-GI-NEXT:    umlal2 v17.2d, v18.4s, v20.4s
+; CHECK-GI-NEXT:    umlal v2.2d, v19.2s, v22.2s
+; CHECK-GI-NEXT:    umlal2 v3.2d, v19.4s, v22.4s
+; CHECK-GI-NEXT:    umlal v4.2d, v21.2s, v23.2s
+; CHECK-GI-NEXT:    umlal2 v5.2d, v21.4s, v23.4s
+; CHECK-GI-NEXT:    umlal v6.2d, v0.2s, v1.2s
+; CHECK-GI-NEXT:    umlal2 v7.2d, v0.4s, v1.4s
+; CHECK-GI-NEXT:    mov v0.16b, v16.16b
+; CHECK-GI-NEXT:    mov v1.16b, v17.16b
+; CHECK-GI-NEXT:    ret
 entry:
   %ea = zext <16 x i8> %a to <16 x i64>
   %eb = zext <16 x i8> %b to <16 x i64>



More information about the llvm-commits mailing list