[llvm] a35640f - [AArch64] Extend vecreduce to udot/sdot transformation to support usdot (#120094)

via llvm-commits llvm-commits at lists.llvm.org
Mon Dec 23 04:34:50 PST 2024


Author: Igor Kirillov
Date: 2024-12-23T12:34:46Z
New Revision: a35640f29e82dffbe87fb75af9b50c6e1312b455

URL: https://github.com/llvm/llvm-project/commit/a35640f29e82dffbe87fb75af9b50c6e1312b455
DIFF: https://github.com/llvm/llvm-project/commit/a35640f29e82dffbe87fb75af9b50c6e1312b455.diff

LOG: [AArch64] Extend vecreduce to udot/sdot transformation to support usdot (#120094)

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/AArch64/neon-dotreduce.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 505fae4e840f7e..e455fabfe2e8d9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18177,16 +18177,38 @@ static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
   unsigned ExtOpcode = Op0.getOpcode();
   SDValue A = Op0;
   SDValue B;
+  unsigned DotOpcode;
   if (ExtOpcode == ISD::MUL) {
     A = Op0.getOperand(0);
     B = Op0.getOperand(1);
-    if (A.getOpcode() != B.getOpcode() ||
-        A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
+    if (A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
       return SDValue();
-    ExtOpcode = A.getOpcode();
-  }
-  if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND)
+    auto OpCodeA = A.getOpcode();
+    if (OpCodeA != ISD::ZERO_EXTEND && OpCodeA != ISD::SIGN_EXTEND)
+      return SDValue();
+
+    auto OpCodeB = B.getOpcode();
+    if (OpCodeB != ISD::ZERO_EXTEND && OpCodeB != ISD::SIGN_EXTEND)
+      return SDValue();
+
+    if (OpCodeA == OpCodeB) {
+      DotOpcode =
+          OpCodeA == ISD::ZERO_EXTEND ? AArch64ISD::UDOT : AArch64ISD::SDOT;
+    } else {
+      // Check USDOT support
+      if (!ST->hasMatMulInt8())
+        return SDValue();
+      DotOpcode = AArch64ISD::USDOT;
+      if (OpCodeA == ISD::SIGN_EXTEND)
+        std::swap(A, B);
+    }
+  } else if (ExtOpcode == ISD::ZERO_EXTEND) {
+    DotOpcode = AArch64ISD::UDOT;
+  } else if (ExtOpcode == ISD::SIGN_EXTEND) {
+    DotOpcode = AArch64ISD::SDOT;
+  } else {
     return SDValue();
+  }
 
   EVT Op0VT = A.getOperand(0).getValueType();
   bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
@@ -18212,8 +18234,6 @@ static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
     NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
     TargetType = MVT::v2i32;
   }
-  auto DotOpcode =
-      (ExtOpcode == ISD::ZERO_EXTEND) ? AArch64ISD::UDOT : AArch64ISD::SDOT;
   // Handle the case where we need to generate only one Dot operation.
   if (NumOfVecReduce == 1) {
     SDValue Zeros = DAG.getConstant(0, DL, TargetType);

diff  --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
index c345c1e50bbbb7..748555d7bdfa15 100644
--- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
+++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple aarch64-linux-gnu -mattr=+dotprod    < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc -mtriple aarch64-linux-gnu -mattr=+dotprod -global-isel -global-isel-abort=2 < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; RUN: llc -mtriple aarch64-linux-gnu -mattr=+dotprod,+i8mm    < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple aarch64-linux-gnu -mattr=+dotprod,+i8mm -global-isel -global-isel-abort=2 < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 ; CHECK-GI:       warning: Instruction selection used fallback path for test_udot_v5i8
 ; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_udot_v5i8_nomla
@@ -290,6 +290,128 @@ entry:
   ret i32 %x
 }
 
+define i32 @test_usdot_v4i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
+; CHECK-SD-LABEL: test_usdot_v4i8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ldr s0, [x0]
+; CHECK-SD-NEXT:    ldr s1, [x1]
+; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT:    smull v0.4s, v1.4h, v0.4h
+; CHECK-SD-NEXT:    addv s0, v0.4s
+; CHECK-SD-NEXT:    fmov w8, s0
+; CHECK-SD-NEXT:    add w0, w8, w2
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_usdot_v4i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ldr w8, [x0]
+; CHECK-GI-NEXT:    ldr w9, [x1]
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    uxtb w8, w8
+; CHECK-GI-NEXT:    sxtb w9, w9
+; CHECK-GI-NEXT:    mov b1, v0.b[1]
+; CHECK-GI-NEXT:    mov b3, v0.b[2]
+; CHECK-GI-NEXT:    mov b5, v2.b[2]
+; CHECK-GI-NEXT:    mov b4, v0.b[3]
+; CHECK-GI-NEXT:    mov b0, v2.b[1]
+; CHECK-GI-NEXT:    mov b6, v2.b[3]
+; CHECK-GI-NEXT:    fmov s2, w9
+; CHECK-GI-NEXT:    fmov w10, s1
+; CHECK-GI-NEXT:    fmov w11, s3
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    fmov w13, s5
+; CHECK-GI-NEXT:    fmov w8, s4
+; CHECK-GI-NEXT:    fmov w12, s0
+; CHECK-GI-NEXT:    uxtb w10, w10
+; CHECK-GI-NEXT:    uxtb w11, w11
+; CHECK-GI-NEXT:    sxtb w13, w13
+; CHECK-GI-NEXT:    uxtb w8, w8
+; CHECK-GI-NEXT:    sxtb w12, w12
+; CHECK-GI-NEXT:    mov v1.h[1], w10
+; CHECK-GI-NEXT:    fmov w10, s6
+; CHECK-GI-NEXT:    fmov s0, w11
+; CHECK-GI-NEXT:    fmov s3, w13
+; CHECK-GI-NEXT:    mov v2.h[1], w12
+; CHECK-GI-NEXT:    sxtb w10, w10
+; CHECK-GI-NEXT:    mov v0.h[1], w8
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    mov v3.h[1], w10
+; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT:    mov v1.d[1], v0.d[0]
+; CHECK-GI-NEXT:    mov v2.d[1], v3.d[0]
+; CHECK-GI-NEXT:    mul v0.4s, v2.4s, v1.4s
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w0, w8, w2
+; CHECK-GI-NEXT:    ret
+entry:
+  %0 = load <4 x i8>, ptr %a
+  %1 = zext <4 x i8> %0 to <4 x i32>
+  %2 = load <4 x i8>, ptr %b
+  %3 = sext <4 x i8> %2 to <4 x i32>
+  %4 = mul nsw <4 x i32> %3, %1
+  %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4)
+  %op.extra = add nsw i32 %5, %sum
+  ret i32 %op.extra
+}
+
+define i32 @test_usdot_v4i8_double(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
+; CHECK-SD-LABEL: test_usdot_v4i8_double:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    ushll v3.4s, v3.4h, #0
+; CHECK-SD-NEXT:    bic v2.4h, #255, lsl #8
+; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8
+; CHECK-SD-NEXT:    shl v3.4s, v3.4s, #24
+; CHECK-SD-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-SD-NEXT:    shl v1.4s, v1.4s, #24
+; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT:    sshr v3.4s, v3.4s, #24
+; CHECK-SD-NEXT:    sshr v1.4s, v1.4s, #24
+; CHECK-SD-NEXT:    mul v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT:    mla v2.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    addv s0, v2.4s
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_usdot_v4i8_double:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT:    movi v4.2d, #0x0000ff000000ff
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    shl v1.4s, v1.4s, #24
+; CHECK-GI-NEXT:    shl v3.4s, v3.4s, #24
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v4.16b
+; CHECK-GI-NEXT:    and v2.16b, v2.16b, v4.16b
+; CHECK-GI-NEXT:    sshr v1.4s, v1.4s, #24
+; CHECK-GI-NEXT:    sshr v3.4s, v3.4s, #24
+; CHECK-GI-NEXT:    mul v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    mul v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
+entry:
+  %az = zext <4 x i8> %a to <4 x i32>
+  %bz = sext <4 x i8> %b to <4 x i32>
+  %m1 = mul nuw nsw <4 x i32> %az, %bz
+  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m1)
+  %cz = zext <4 x i8> %c to <4 x i32>
+  %dz = sext <4 x i8> %d to <4 x i32>
+  %m2 = mul nuw nsw <4 x i32> %cz, %dz
+  %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m2)
+  %x = add i32 %r1, %r2
+  ret i32 %x
+}
+
 define i32 @test_udot_v5i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
 ; CHECK-LABEL: test_udot_v5i8:
 ; CHECK:       // %bb.0: // %entry
@@ -508,6 +630,77 @@ entry:
   ret i32 %2
 }
 
+define i32 @test_usdot_v8i8(ptr nocapture readonly %a, ptr nocapture readonly %b) {
+; CHECK-SD-LABEL: test_usdot_v8i8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-SD-NEXT:    ldr d1, [x0]
+; CHECK-SD-NEXT:    ldr d2, [x1]
+; CHECK-SD-NEXT:    usdot v0.2s, v1.8b, v2.8b
+; CHECK-SD-NEXT:    addp v0.2s, v0.2s, v0.2s
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_usdot_v8i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    ldr d1, [x1]
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    ushll2 v2.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    mul v2.4s, v3.4s, v2.4s
+; CHECK-GI-NEXT:    mla v2.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    addv s0, v2.4s
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
+entry:
+  %0 = load <8 x i8>, ptr %a
+  %1 = zext <8 x i8> %0 to <8 x i32>
+  %2 = load <8 x i8>, ptr %b
+  %3 = sext <8 x i8> %2 to <8 x i32>
+  %4 = mul nsw <8 x i32> %3, %1
+  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
+  ret i32 %5
+}
+
+define i32 @test_usdot_swapped_operands_v8i8(ptr nocapture readonly %a, ptr nocapture readonly %b) {
+; CHECK-SD-LABEL: test_usdot_swapped_operands_v8i8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-SD-NEXT:    ldr d1, [x0]
+; CHECK-SD-NEXT:    ldr d2, [x1]
+; CHECK-SD-NEXT:    usdot v0.2s, v2.8b, v1.8b
+; CHECK-SD-NEXT:    addp v0.2s, v0.2s, v0.2s
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_usdot_swapped_operands_v8i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ldr d0, [x0]
+; CHECK-GI-NEXT:    ldr d1, [x1]
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    sshll2 v2.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll2 v3.4s, v1.8h, #0
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    mul v2.4s, v3.4s, v2.4s
+; CHECK-GI-NEXT:    mla v2.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    addv s0, v2.4s
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
+entry:
+  %0 = load <8 x i8>, ptr %a
+  %1 = sext <8 x i8> %0 to <8 x i32>
+  %2 = load <8 x i8>, ptr %b
+  %3 = zext <8 x i8> %2 to <8 x i32>
+  %4 = mul nsw <8 x i32> %3, %1
+  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
+  ret i32 %5
+}
 
 define i32 @test_udot_v16i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
 ; CHECK-LABEL: test_udot_v16i8:
@@ -587,6 +780,101 @@ entry:
   ret i32 %2
 }
 
+define i32 @test_usdot_v16i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
+; CHECK-SD-LABEL: test_usdot_v16i8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-SD-NEXT:    ldr q1, [x0]
+; CHECK-SD-NEXT:    ldr q2, [x1]
+; CHECK-SD-NEXT:    usdot v0.4s, v1.16b, v2.16b
+; CHECK-SD-NEXT:    addv s0, v0.4s
+; CHECK-SD-NEXT:    fmov w8, s0
+; CHECK-SD-NEXT:    add w0, w8, w2
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_usdot_v16i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    ldr q1, [x1]
+; CHECK-GI-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    sshll v3.8h, v1.8b, #0
+; CHECK-GI-NEXT:    sshll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT:    ushll2 v4.4s, v2.8h, #0
+; CHECK-GI-NEXT:    ushll2 v5.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll2 v6.4s, v3.8h, #0
+; CHECK-GI-NEXT:    sshll2 v7.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    mul v4.4s, v6.4s, v4.4s
+; CHECK-GI-NEXT:    mul v5.4s, v7.4s, v5.4s
+; CHECK-GI-NEXT:    mla v4.4s, v3.4s, v2.4s
+; CHECK-GI-NEXT:    mla v5.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    add v0.4s, v4.4s, v5.4s
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w0, w8, w2
+; CHECK-GI-NEXT:    ret
+entry:
+  %0 = load <16 x i8>, ptr %a
+  %1 = zext <16 x i8> %0 to <16 x i32>
+  %2 = load <16 x i8>, ptr %b
+  %3 = sext <16 x i8> %2 to <16 x i32>
+  %4 = mul nsw <16 x i32> %3, %1
+  %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
+  %op.extra = add nsw i32 %5, %sum
+  ret i32 %op.extra
+}
+
+define i32 @test_usdot_swapped_operands_v16i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
+; CHECK-SD-LABEL: test_usdot_swapped_operands_v16i8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-SD-NEXT:    ldr q1, [x0]
+; CHECK-SD-NEXT:    ldr q2, [x1]
+; CHECK-SD-NEXT:    usdot v0.4s, v2.16b, v1.16b
+; CHECK-SD-NEXT:    addv s0, v0.4s
+; CHECK-SD-NEXT:    fmov w8, s0
+; CHECK-SD-NEXT:    add w0, w8, w2
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_usdot_swapped_operands_v16i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ldr q0, [x0]
+; CHECK-GI-NEXT:    ldr q1, [x1]
+; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-GI-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT:    sshll2 v4.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll2 v5.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll2 v6.4s, v3.8h, #0
+; CHECK-GI-NEXT:    ushll2 v7.4s, v1.8h, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    mul v4.4s, v6.4s, v4.4s
+; CHECK-GI-NEXT:    mul v5.4s, v7.4s, v5.4s
+; CHECK-GI-NEXT:    mla v4.4s, v3.4s, v2.4s
+; CHECK-GI-NEXT:    mla v5.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    add v0.4s, v4.4s, v5.4s
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w0, w8, w2
+; CHECK-GI-NEXT:    ret
+entry:
+  %0 = load <16 x i8>, ptr %a
+  %1 = sext <16 x i8> %0 to <16 x i32>
+  %2 = load <16 x i8>, ptr %b
+  %3 = zext <16 x i8> %2 to <16 x i32>
+  %4 = mul nsw <16 x i32> %3, %1
+  %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
+  %op.extra = add nsw i32 %5, %sum
+  ret i32 %op.extra
+}
 
 define i32 @test_udot_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
 ; CHECK-SD-LABEL: test_udot_v8i8_double:
@@ -860,6 +1148,240 @@ entry:
   ret i32 %x
 }
 
+
+define i32 @test_usdot_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
+; CHECK-SD-LABEL: test_usdot_v8i8_double:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-SD-NEXT:    movi v5.2d, #0000000000000000
+; CHECK-SD-NEXT:    usdot v5.2s, v0.8b, v1.8b
+; CHECK-SD-NEXT:    usdot v4.2s, v2.8b, v3.8b
+; CHECK-SD-NEXT:    add v0.2s, v5.2s, v4.2s
+; CHECK-SD-NEXT:    addp v0.2s, v0.2s, v0.2s
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_usdot_v8i8_double:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    ushll v2.8h, v2.8b, #0
+; CHECK-GI-NEXT:    sshll v3.8h, v3.8b, #0
+; CHECK-GI-NEXT:    ushll2 v4.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll2 v5.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ushll2 v6.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll2 v7.4s, v3.8h, #0
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT:    mul v4.4s, v4.4s, v5.4s
+; CHECK-GI-NEXT:    mul v5.4s, v6.4s, v7.4s
+; CHECK-GI-NEXT:    mla v4.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    mla v5.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT:    addv s0, v4.4s
+; CHECK-GI-NEXT:    addv s1, v5.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
+entry:
+  %az = zext <8 x i8> %a to <8 x i32>
+  %bz = sext <8 x i8> %b to <8 x i32>
+  %m1 = mul nuw nsw <8 x i32> %az, %bz
+  %r1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m1)
+  %cz = zext <8 x i8> %c to <8 x i32>
+  %dz = sext <8 x i8> %d to <8 x i32>
+  %m2 = mul nuw nsw <8 x i32> %cz, %dz
+  %r2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m2)
+  %x = add i32 %r1, %r2
+  ret i32 %x
+}
+
+define i32 @test_usdot_swapped_operands_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
+; CHECK-SD-LABEL: test_usdot_swapped_operands_v8i8_double:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-SD-NEXT:    movi v5.2d, #0000000000000000
+; CHECK-SD-NEXT:    usdot v5.2s, v1.8b, v0.8b
+; CHECK-SD-NEXT:    usdot v4.2s, v3.8b, v2.8b
+; CHECK-SD-NEXT:    add v0.2s, v5.2s, v4.2s
+; CHECK-SD-NEXT:    addp v0.2s, v0.2s, v0.2s
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_usdot_swapped_operands_v8i8_double:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT:    sshll v2.8h, v2.8b, #0
+; CHECK-GI-NEXT:    ushll v3.8h, v3.8b, #0
+; CHECK-GI-NEXT:    sshll2 v4.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll2 v5.4s, v1.8h, #0
+; CHECK-GI-NEXT:    sshll2 v6.4s, v2.8h, #0
+; CHECK-GI-NEXT:    ushll2 v7.4s, v3.8h, #0
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT:    mul v4.4s, v4.4s, v5.4s
+; CHECK-GI-NEXT:    mul v5.4s, v6.4s, v7.4s
+; CHECK-GI-NEXT:    mla v4.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    mla v5.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT:    addv s0, v4.4s
+; CHECK-GI-NEXT:    addv s1, v5.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
+entry:
+  %az = sext <8 x i8> %a to <8 x i32>
+  %bz = zext <8 x i8> %b to <8 x i32>
+  %m1 = mul nuw nsw <8 x i32> %az, %bz
+  %r1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m1)
+  %cz = sext <8 x i8> %c to <8 x i32>
+  %dz = zext <8 x i8> %d to <8 x i32>
+  %m2 = mul nuw nsw <8 x i32> %cz, %dz
+  %r2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m2)
+  %x = add i32 %r1, %r2
+  ret i32 %x
+}
+
+define i32 @test_usdot_v16i8_double(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+; CHECK-SD-LABEL: test_usdot_v16i8_double:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-SD-NEXT:    movi v5.2d, #0000000000000000
+; CHECK-SD-NEXT:    usdot v5.4s, v0.16b, v1.16b
+; CHECK-SD-NEXT:    usdot v4.4s, v2.16b, v3.16b
+; CHECK-SD-NEXT:    add v0.4s, v5.4s, v4.4s
+; CHECK-SD-NEXT:    addv s0, v0.4s
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_usdot_v16i8_double:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ushll v4.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    sshll v5.8h, v1.8b, #0
+; CHECK-GI-NEXT:    sshll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT:    ushll v6.8h, v2.8b, #0
+; CHECK-GI-NEXT:    sshll v7.8h, v3.8b, #0
+; CHECK-GI-NEXT:    ushll2 v2.8h, v2.16b, #0
+; CHECK-GI-NEXT:    sshll2 v3.8h, v3.16b, #0
+; CHECK-GI-NEXT:    ushll2 v16.4s, v4.8h, #0
+; CHECK-GI-NEXT:    ushll2 v17.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll2 v18.4s, v5.8h, #0
+; CHECK-GI-NEXT:    sshll2 v19.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ushll2 v20.4s, v6.8h, #0
+; CHECK-GI-NEXT:    sshll2 v21.4s, v7.8h, #0
+; CHECK-GI-NEXT:    ushll2 v22.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll2 v23.4s, v3.8h, #0
+; CHECK-GI-NEXT:    ushll v4.4s, v4.4h, #0
+; CHECK-GI-NEXT:    mul v16.4s, v16.4s, v18.4s
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll v5.4s, v5.4h, #0
+; CHECK-GI-NEXT:    mul v17.4s, v17.4s, v19.4s
+; CHECK-GI-NEXT:    mul v18.4s, v20.4s, v21.4s
+; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    mul v19.4s, v22.4s, v23.4s
+; CHECK-GI-NEXT:    ushll v6.4s, v6.4h, #0
+; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll v7.4s, v7.4h, #0
+; CHECK-GI-NEXT:    sshll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT:    mla v16.4s, v4.4s, v5.4s
+; CHECK-GI-NEXT:    mla v17.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    mla v18.4s, v6.4s, v7.4s
+; CHECK-GI-NEXT:    mla v19.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT:    add v0.4s, v16.4s, v17.4s
+; CHECK-GI-NEXT:    add v1.4s, v18.4s, v19.4s
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
+entry:
+  %az = zext <16 x i8> %a to <16 x i32>
+  %bz = sext <16 x i8> %b to <16 x i32>
+  %m1 = mul nuw nsw <16 x i32> %az, %bz
+  %r1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m1)
+  %cz = zext <16 x i8> %c to <16 x i32>
+  %dz = sext <16 x i8> %d to <16 x i32>
+  %m2 = mul nuw nsw <16 x i32> %cz, %dz
+  %r2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m2)
+  %x = add i32 %r1, %r2
+  ret i32 %x
+}
+
+
+define i32 @test_usdot_swapped_operands_v16i8_double(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+; CHECK-SD-LABEL: test_usdot_swapped_operands_v16i8_double:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-SD-NEXT:    movi v5.2d, #0000000000000000
+; CHECK-SD-NEXT:    usdot v5.4s, v1.16b, v0.16b
+; CHECK-SD-NEXT:    usdot v4.4s, v3.16b, v2.16b
+; CHECK-SD-NEXT:    add v0.4s, v5.4s, v4.4s
+; CHECK-SD-NEXT:    addv s0, v0.4s
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_usdot_swapped_operands_v16i8_double:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sshll v4.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    ushll v5.8h, v1.8b, #0
+; CHECK-GI-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT:    sshll v6.8h, v2.8b, #0
+; CHECK-GI-NEXT:    ushll v7.8h, v3.8b, #0
+; CHECK-GI-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-GI-NEXT:    ushll2 v3.8h, v3.16b, #0
+; CHECK-GI-NEXT:    sshll2 v16.4s, v4.8h, #0
+; CHECK-GI-NEXT:    sshll2 v17.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll2 v18.4s, v5.8h, #0
+; CHECK-GI-NEXT:    ushll2 v19.4s, v1.8h, #0
+; CHECK-GI-NEXT:    sshll2 v20.4s, v6.8h, #0
+; CHECK-GI-NEXT:    ushll2 v21.4s, v7.8h, #0
+; CHECK-GI-NEXT:    sshll2 v22.4s, v2.8h, #0
+; CHECK-GI-NEXT:    ushll2 v23.4s, v3.8h, #0
+; CHECK-GI-NEXT:    sshll v4.4s, v4.4h, #0
+; CHECK-GI-NEXT:    mul v16.4s, v16.4s, v18.4s
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v5.4s, v5.4h, #0
+; CHECK-GI-NEXT:    mul v17.4s, v17.4s, v19.4s
+; CHECK-GI-NEXT:    mul v18.4s, v20.4s, v21.4s
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    mul v19.4s, v22.4s, v23.4s
+; CHECK-GI-NEXT:    sshll v6.4s, v6.4h, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ushll v7.4s, v7.4h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT:    mla v16.4s, v4.4s, v5.4s
+; CHECK-GI-NEXT:    mla v17.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    mla v18.4s, v6.4s, v7.4s
+; CHECK-GI-NEXT:    mla v19.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT:    add v0.4s, v16.4s, v17.4s
+; CHECK-GI-NEXT:    add v1.4s, v18.4s, v19.4s
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ret
+entry:
+  %az = sext <16 x i8> %a to <16 x i32>
+  %bz = zext <16 x i8> %b to <16 x i32>
+  %m1 = mul nuw nsw <16 x i32> %az, %bz
+  %r1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m1)
+  %cz = sext <16 x i8> %c to <16 x i32>
+  %dz = zext <16 x i8> %d to <16 x i32>
+  %m2 = mul nuw nsw <16 x i32> %cz, %dz
+  %r2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m2)
+  %x = add i32 %r1, %r2
+  ret i32 %x
+}
+
 define i32 @test_udot_v24i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
 ; CHECK-SD-LABEL: test_udot_v24i8:
 ; CHECK-SD:       // %bb.0: // %entry
@@ -1658,7 +2180,6 @@ entry:
   ret i32 %x
 }
 
-
 define i32 @test_udot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
 ; CHECK-LABEL: test_udot_v25i8:
 ; CHECK:       // %bb.0: // %entry
@@ -2301,6 +2822,202 @@ entry:
   ret i32 %x
 }
 
+define i32 @test_usdot_v32i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
+; CHECK-SD-LABEL: test_usdot_v32i8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-SD-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-SD-NEXT:    ldp q2, q3, [x0]
+; CHECK-SD-NEXT:    ldp q4, q5, [x1]
+; CHECK-SD-NEXT:    usdot v1.4s, v3.16b, v5.16b
+; CHECK-SD-NEXT:    usdot v0.4s, v2.16b, v4.16b
+; CHECK-SD-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    addv s0, v0.4s
+; CHECK-SD-NEXT:    fmov w8, s0
+; CHECK-SD-NEXT:    add w0, w8, w2
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_usdot_v32i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    ldp q0, q1, [x1]
+; CHECK-GI-NEXT:    ldp q2, q3, [x0]
+; CHECK-GI-NEXT:    sshll v4.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    sshll v5.8h, v1.8b, #0
+; CHECK-GI-NEXT:    sshll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT:    ushll v6.8h, v2.8b, #0
+; CHECK-GI-NEXT:    ushll2 v2.8h, v2.16b, #0
+; CHECK-GI-NEXT:    ushll v7.8h, v3.8b, #0
+; CHECK-GI-NEXT:    ushll2 v3.8h, v3.16b, #0
+; CHECK-GI-NEXT:    sshll2 v16.4s, v4.8h, #0
+; CHECK-GI-NEXT:    sshll2 v17.4s, v0.8h, #0
+; CHECK-GI-NEXT:    sshll2 v18.4s, v5.8h, #0
+; CHECK-GI-NEXT:    sshll2 v19.4s, v1.8h, #0
+; CHECK-GI-NEXT:    ushll2 v20.4s, v6.8h, #0
+; CHECK-GI-NEXT:    ushll2 v21.4s, v2.8h, #0
+; CHECK-GI-NEXT:    ushll2 v22.4s, v7.8h, #0
+; CHECK-GI-NEXT:    ushll2 v23.4s, v3.8h, #0
+; CHECK-GI-NEXT:    sshll v4.4s, v4.4h, #0
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll v5.4s, v5.4h, #0
+; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    mul v16.4s, v16.4s, v20.4s
+; CHECK-GI-NEXT:    mul v17.4s, v17.4s, v21.4s
+; CHECK-GI-NEXT:    ushll v6.4s, v6.4h, #0
+; CHECK-GI-NEXT:    mul v18.4s, v18.4s, v22.4s
+; CHECK-GI-NEXT:    mul v19.4s, v19.4s, v23.4s
+; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    ushll v7.4s, v7.4h, #0
+; CHECK-GI-NEXT:    ushll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT:    mla v16.4s, v4.4s, v6.4s
+; CHECK-GI-NEXT:    mla v17.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT:    mla v18.4s, v5.4s, v7.4s
+; CHECK-GI-NEXT:    mla v19.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT:    add v0.4s, v16.4s, v17.4s
+; CHECK-GI-NEXT:    add v1.4s, v18.4s, v19.4s
+; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w0, w8, w2
+; CHECK-GI-NEXT:    ret
+entry:
+  %0 = load <32 x i8>, ptr %a
+  %1 = zext <32 x i8> %0 to <32 x i32>
+  %2 = load <32 x i8>, ptr %b
+  %3 = sext <32 x i8> %2 to <32 x i32>
+  %4 = mul nsw <32 x i32> %3, %1
+  %5 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %4)
+  %op.extra = add nsw i32 %5, %sum
+  ret i32 %op.extra
+}
+
+define i32 @test_usdot_v32i8_double(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) {
+; CHECK-SD-LABEL: test_usdot_v32i8_double:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v16.2d, #0000000000000000
+; CHECK-SD-NEXT:    movi v17.2d, #0000000000000000
+; CHECK-SD-NEXT:    movi v18.2d, #0000000000000000
+; CHECK-SD-NEXT:    movi v19.2d, #0000000000000000
+; CHECK-SD-NEXT:    usdot v16.4s, v1.16b, v3.16b
+; CHECK-SD-NEXT:    usdot v18.4s, v0.16b, v2.16b
+; CHECK-SD-NEXT:    usdot v17.4s, v4.16b, v6.16b
+; CHECK-SD-NEXT:    usdot v19.4s, v5.16b, v7.16b
+; CHECK-SD-NEXT:    add v0.4s, v18.4s, v16.4s
+; CHECK-SD-NEXT:    add v1.4s, v17.4s, v19.4s
+; CHECK-SD-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    addv s0, v0.4s
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_usdot_v32i8_double:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT:    .cfi_offset b8, -8
+; CHECK-GI-NEXT:    .cfi_offset b9, -16
+; CHECK-GI-NEXT:    .cfi_offset b10, -24
+; CHECK-GI-NEXT:    .cfi_offset b11, -32
+; CHECK-GI-NEXT:    .cfi_offset b12, -40
+; CHECK-GI-NEXT:    .cfi_offset b13, -48
+; CHECK-GI-NEXT:    .cfi_offset b14, -56
+; CHECK-GI-NEXT:    .cfi_offset b15, -64
+; CHECK-GI-NEXT:    ushll v16.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    ushll v17.8h, v1.8b, #0
+; CHECK-GI-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT:    sshll v18.8h, v2.8b, #0
+; CHECK-GI-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-GI-NEXT:    sshll v19.8h, v3.8b, #0
+; CHECK-GI-NEXT:    sshll2 v3.8h, v3.16b, #0
+; CHECK-GI-NEXT:    ushll v27.8h, v4.8b, #0
+; CHECK-GI-NEXT:    ushll2 v4.8h, v4.16b, #0
+; CHECK-GI-NEXT:    ushll v28.8h, v5.8b, #0
+; CHECK-GI-NEXT:    sshll v29.8h, v6.8b, #0
+; CHECK-GI-NEXT:    sshll2 v6.8h, v6.16b, #0
+; CHECK-GI-NEXT:    ushll2 v5.8h, v5.16b, #0
+; CHECK-GI-NEXT:    sshll v30.8h, v7.8b, #0
+; CHECK-GI-NEXT:    sshll2 v7.8h, v7.16b, #0
+; CHECK-GI-NEXT:    ushll2 v20.4s, v16.8h, #0
+; CHECK-GI-NEXT:    ushll2 v21.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll2 v22.4s, v17.8h, #0
+; CHECK-GI-NEXT:    ushll2 v23.4s, v1.8h, #0
+; CHECK-GI-NEXT:    sshll2 v24.4s, v18.8h, #0
+; CHECK-GI-NEXT:    sshll2 v25.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll2 v26.4s, v19.8h, #0
+; CHECK-GI-NEXT:    sshll2 v31.4s, v3.8h, #0
+; CHECK-GI-NEXT:    ushll2 v8.4s, v27.8h, #0
+; CHECK-GI-NEXT:    ushll2 v9.4s, v4.8h, #0
+; CHECK-GI-NEXT:    ushll2 v10.4s, v28.8h, #0
+; CHECK-GI-NEXT:    sshll2 v11.4s, v29.8h, #0
+; CHECK-GI-NEXT:    sshll2 v12.4s, v6.8h, #0
+; CHECK-GI-NEXT:    ushll2 v13.4s, v5.8h, #0
+; CHECK-GI-NEXT:    sshll2 v14.4s, v30.8h, #0
+; CHECK-GI-NEXT:    sshll2 v15.4s, v7.8h, #0
+; CHECK-GI-NEXT:    mul v20.4s, v20.4s, v24.4s
+; CHECK-GI-NEXT:    mul v21.4s, v21.4s, v25.4s
+; CHECK-GI-NEXT:    mul v22.4s, v22.4s, v26.4s
+; CHECK-GI-NEXT:    mul v23.4s, v23.4s, v31.4s
+; CHECK-GI-NEXT:    mul v24.4s, v8.4s, v11.4s
+; CHECK-GI-NEXT:    mul v25.4s, v9.4s, v12.4s
+; CHECK-GI-NEXT:    ushll v16.4s, v16.4h, #0
+; CHECK-GI-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mul v26.4s, v10.4s, v14.4s
+; CHECK-GI-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mul v31.4s, v13.4s, v15.4s
+; CHECK-GI-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    ushll v17.4s, v17.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT:    sshll v18.4s, v18.4h, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll v19.4s, v19.4h, #0
+; CHECK-GI-NEXT:    sshll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT:    ushll v27.4s, v27.4h, #0
+; CHECK-GI-NEXT:    ushll v4.4s, v4.4h, #0
+; CHECK-GI-NEXT:    ushll v28.4s, v28.4h, #0
+; CHECK-GI-NEXT:    ushll v5.4s, v5.4h, #0
+; CHECK-GI-NEXT:    sshll v29.4s, v29.4h, #0
+; CHECK-GI-NEXT:    sshll v6.4s, v6.4h, #0
+; CHECK-GI-NEXT:    sshll v30.4s, v30.4h, #0
+; CHECK-GI-NEXT:    sshll v7.4s, v7.4h, #0
+; CHECK-GI-NEXT:    mla v20.4s, v16.4s, v18.4s
+; CHECK-GI-NEXT:    mla v21.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT:    mla v22.4s, v17.4s, v19.4s
+; CHECK-GI-NEXT:    mla v23.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT:    mla v24.4s, v27.4s, v29.4s
+; CHECK-GI-NEXT:    mla v25.4s, v4.4s, v6.4s
+; CHECK-GI-NEXT:    mla v26.4s, v28.4s, v30.4s
+; CHECK-GI-NEXT:    mla v31.4s, v5.4s, v7.4s
+; CHECK-GI-NEXT:    add v0.4s, v20.4s, v21.4s
+; CHECK-GI-NEXT:    add v1.4s, v22.4s, v23.4s
+; CHECK-GI-NEXT:    add v2.4s, v24.4s, v25.4s
+; CHECK-GI-NEXT:    add v3.4s, v26.4s, v31.4s
+; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    add v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    addv s1, v1.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    fmov w9, s1
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %az = zext <32 x i8> %a to <32 x i32>
+  %bz = sext <32 x i8> %b to <32 x i32>
+  %m1 = mul nuw nsw <32 x i32> %az, %bz
+  %r1 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %m1)
+  %cz = zext <32 x i8> %c to <32 x i32>
+  %dz = sext <32 x i8> %d to <32 x i32>
+  %m2 = mul nuw nsw <32 x i32> %cz, %dz
+  %r2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %m2)
+  %x = add i32 %r1, %r2
+  ret i32 %x
+}
+
+
 define i32 @test_udot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
 ; CHECK-LABEL: test_udot_v33i8:
 ; CHECK:       // %bb.0: // %entry
@@ -2866,6 +3583,7 @@ entry:
   %x = add i32 %r1, %r2
   ret i32 %x
 }
+
 define i32 @test_udot_v48i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
 ; CHECK-SD-LABEL: test_udot_v48i8:
 ; CHECK-SD:       // %bb.0: // %entry
@@ -4527,3 +5245,385 @@ entry:
   %x = add i32 %r1, %r2
   ret i32 %x
 }
+
+define i32 @test_usdot_v64i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
+; CHECK-SD-LABEL: test_usdot_v64i8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-SD-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-SD-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-SD-NEXT:    movi v5.2d, #0000000000000000
+; CHECK-SD-NEXT:    ldp q1, q2, [x0, #32]
+; CHECK-SD-NEXT:    ldp q6, q7, [x1, #32]
+; CHECK-SD-NEXT:    ldp q16, q17, [x0]
+; CHECK-SD-NEXT:    ldp q18, q19, [x1]
+; CHECK-SD-NEXT:    usdot v0.4s, v2.16b, v7.16b
+; CHECK-SD-NEXT:    usdot v5.4s, v1.16b, v6.16b
+; CHECK-SD-NEXT:    usdot v4.4s, v17.16b, v19.16b
+; CHECK-SD-NEXT:    usdot v3.4s, v16.16b, v18.16b
+; CHECK-SD-NEXT:    add v0.4s, v4.4s, v0.4s
+; CHECK-SD-NEXT:    add v1.4s, v3.4s, v5.4s
+; CHECK-SD-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT:    addv s0, v0.4s
+; CHECK-SD-NEXT:    fmov w8, s0
+; CHECK-SD-NEXT:    add w0, w8, w2
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_usdot_v64i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT:    .cfi_offset b8, -8
+; CHECK-GI-NEXT:    .cfi_offset b9, -16
+; CHECK-GI-NEXT:    .cfi_offset b10, -24
+; CHECK-GI-NEXT:    .cfi_offset b11, -32
+; CHECK-GI-NEXT:    .cfi_offset b12, -40
+; CHECK-GI-NEXT:    .cfi_offset b13, -48
+; CHECK-GI-NEXT:    .cfi_offset b14, -56
+; CHECK-GI-NEXT:    .cfi_offset b15, -64
+; CHECK-GI-NEXT:    ldp q0, q1, [x1]
+; CHECK-GI-NEXT:    ldp q21, q17, [x0]
+; CHECK-GI-NEXT:    ldp q3, q19, [x1, #32]
+; CHECK-GI-NEXT:    ldp q18, q4, [x0, #32]
+; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll2 v5.8h, v0.16b, #0
+; CHECK-GI-NEXT:    sshll v7.8h, v1.8b, #0
+; CHECK-GI-NEXT:    sshll2 v22.8h, v1.16b, #0
+; CHECK-GI-NEXT:    sshll v23.8h, v3.8b, #0
+; CHECK-GI-NEXT:    sshll2 v24.8h, v3.16b, #0
+; CHECK-GI-NEXT:    sshll v25.8h, v19.8b, #0
+; CHECK-GI-NEXT:    sshll2 v26.8h, v19.16b, #0
+; CHECK-GI-NEXT:    ushll v27.8h, v21.8b, #0
+; CHECK-GI-NEXT:    ushll2 v28.8h, v21.16b, #0
+; CHECK-GI-NEXT:    ushll v30.8h, v17.8b, #0
+; CHECK-GI-NEXT:    ushll2 v17.8h, v17.16b, #0
+; CHECK-GI-NEXT:    ushll v8.8h, v18.8b, #0
+; CHECK-GI-NEXT:    ushll2 v18.8h, v18.16b, #0
+; CHECK-GI-NEXT:    ushll v9.8h, v4.8b, #0
+; CHECK-GI-NEXT:    ushll2 v4.8h, v4.16b, #0
+; CHECK-GI-NEXT:    sshll v0.4s, v2.4h, #0
+; CHECK-GI-NEXT:    sshll2 v6.4s, v2.8h, #0
+; CHECK-GI-NEXT:    sshll v1.4s, v5.4h, #0
+; CHECK-GI-NEXT:    sshll2 v16.4s, v5.8h, #0
+; CHECK-GI-NEXT:    sshll v2.4s, v7.4h, #0
+; CHECK-GI-NEXT:    sshll2 v20.4s, v7.8h, #0
+; CHECK-GI-NEXT:    sshll v3.4s, v22.4h, #0
+; CHECK-GI-NEXT:    sshll2 v22.4s, v22.8h, #0
+; CHECK-GI-NEXT:    sshll v5.4s, v23.4h, #0
+; CHECK-GI-NEXT:    sshll2 v23.4s, v23.8h, #0
+; CHECK-GI-NEXT:    sshll v7.4s, v24.4h, #0
+; CHECK-GI-NEXT:    sshll2 v24.4s, v24.8h, #0
+; CHECK-GI-NEXT:    sshll v19.4s, v25.4h, #0
+; CHECK-GI-NEXT:    sshll2 v25.4s, v25.8h, #0
+; CHECK-GI-NEXT:    sshll v21.4s, v26.4h, #0
+; CHECK-GI-NEXT:    sshll2 v26.4s, v26.8h, #0
+; CHECK-GI-NEXT:    ushll v29.4s, v27.4h, #0
+; CHECK-GI-NEXT:    ushll2 v27.4s, v27.8h, #0
+; CHECK-GI-NEXT:    ushll v31.4s, v28.4h, #0
+; CHECK-GI-NEXT:    ushll2 v28.4s, v28.8h, #0
+; CHECK-GI-NEXT:    ushll v10.4s, v30.4h, #0
+; CHECK-GI-NEXT:    ushll2 v30.4s, v30.8h, #0
+; CHECK-GI-NEXT:    ushll v11.4s, v17.4h, #0
+; CHECK-GI-NEXT:    ushll2 v17.4s, v17.8h, #0
+; CHECK-GI-NEXT:    ushll2 v12.4s, v8.8h, #0
+; CHECK-GI-NEXT:    ushll2 v13.4s, v18.8h, #0
+; CHECK-GI-NEXT:    ushll2 v14.4s, v9.8h, #0
+; CHECK-GI-NEXT:    ushll2 v15.4s, v4.8h, #0
+; CHECK-GI-NEXT:    mul v6.4s, v6.4s, v27.4s
+; CHECK-GI-NEXT:    mul v16.4s, v16.4s, v28.4s
+; CHECK-GI-NEXT:    mul v20.4s, v20.4s, v30.4s
+; CHECK-GI-NEXT:    mul v17.4s, v22.4s, v17.4s
+; CHECK-GI-NEXT:    ushll v8.4s, v8.4h, #0
+; CHECK-GI-NEXT:    mul v22.4s, v23.4s, v12.4s
+; CHECK-GI-NEXT:    mul v23.4s, v24.4s, v13.4s
+; CHECK-GI-NEXT:    mul v24.4s, v25.4s, v14.4s
+; CHECK-GI-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mul v25.4s, v26.4s, v15.4s
+; CHECK-GI-NEXT:    ushll v18.4s, v18.4h, #0
+; CHECK-GI-NEXT:    ushll v26.4s, v9.4h, #0
+; CHECK-GI-NEXT:    ushll v4.4s, v4.4h, #0
+; CHECK-GI-NEXT:    mla v6.4s, v0.4s, v29.4s
+; CHECK-GI-NEXT:    mla v16.4s, v1.4s, v31.4s
+; CHECK-GI-NEXT:    mla v20.4s, v2.4s, v10.4s
+; CHECK-GI-NEXT:    mla v17.4s, v3.4s, v11.4s
+; CHECK-GI-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mla v22.4s, v5.4s, v8.4s
+; CHECK-GI-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mla v23.4s, v7.4s, v18.4s
+; CHECK-GI-NEXT:    mla v24.4s, v19.4s, v26.4s
+; CHECK-GI-NEXT:    mla v25.4s, v21.4s, v4.4s
+; CHECK-GI-NEXT:    add v0.4s, v6.4s, v16.4s
+; CHECK-GI-NEXT:    add v1.4s, v20.4s, v17.4s
+; CHECK-GI-NEXT:    add v2.4s, v22.4s, v23.4s
+; CHECK-GI-NEXT:    add v3.4s, v24.4s, v25.4s
+; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    add v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w8, s0
+; CHECK-GI-NEXT:    add w0, w8, w2
+; CHECK-GI-NEXT:    ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ret
+entry:
+  %0 = load <64 x i8>, ptr %a
+  %1 = zext <64 x i8> %0 to <64 x i32>
+  %2 = load <64 x i8>, ptr %b
+  %3 = sext <64 x i8> %2 to <64 x i32>
+  %4 = mul nsw <64 x i32> %3, %1
+  %5 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %4)
+  %op.extra = add nsw i32 %5, %sum
+  ret i32 %op.extra
+}
+
+define i32 @test_usdot_v64i8_double(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
+; CHECK-SD-LABEL: test_usdot_v64i8_double:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v18.2d, #0000000000000000
+; CHECK-SD-NEXT:    movi v21.2d, #0000000000000000
+; CHECK-SD-NEXT:    movi v22.2d, #0000000000000000
+; CHECK-SD-NEXT:    movi v23.2d, #0000000000000000
+; CHECK-SD-NEXT:    ldp q16, q17, [sp, #64]
+; CHECK-SD-NEXT:    movi v24.2d, #0000000000000000
+; CHECK-SD-NEXT:    movi v25.2d, #0000000000000000
+; CHECK-SD-NEXT:    movi v26.2d, #0000000000000000
+; CHECK-SD-NEXT:    movi v27.2d, #0000000000000000
+; CHECK-SD-NEXT:    ldp q19, q20, [sp, #96]
+; CHECK-SD-NEXT:    usdot v18.4s, v3.16b, v7.16b
+; CHECK-SD-NEXT:    ldp q3, q7, [sp, #32]
+; CHECK-SD-NEXT:    usdot v21.4s, v1.16b, v5.16b
+; CHECK-SD-NEXT:    ldp q1, q5, [sp]
+; CHECK-SD-NEXT:    usdot v22.4s, v2.16b, v6.16b
+; CHECK-SD-NEXT:    usdot v23.4s, v0.16b, v4.16b
+; CHECK-SD-NEXT:    usdot v24.4s, v7.16b, v20.16b
+; CHECK-SD-NEXT:    usdot v27.4s, v3.16b, v19.16b
+; CHECK-SD-NEXT:    usdot v26.4s, v5.16b, v17.16b
+; CHECK-SD-NEXT:    usdot v25.4s, v1.16b, v16.16b
+; CHECK-SD-NEXT:    add v0.4s, v21.4s, v18.4s
+; CHECK-SD-NEXT:    add v1.4s, v23.4s, v22.4s
+; CHECK-SD-NEXT:    add v2.4s, v26.4s, v24.4s
+; CHECK-SD-NEXT:    add v3.4s, v25.4s, v27.4s
+; CHECK-SD-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT:    add v1.4s, v3.4s, v2.4s
+; CHECK-SD-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    addv s0, v0.4s
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_usdot_v64i8_double:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    sub sp, sp, #304
+; CHECK-GI-NEXT:    stp d15, d14, [sp, #224] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp d13, d12, [sp, #240] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp d11, d10, [sp, #256] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    stp d9, d8, [sp, #272] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    str x29, [sp, #288] // 8-byte Folded Spill
+; CHECK-GI-NEXT:    .cfi_def_cfa_offset 304
+; CHECK-GI-NEXT:    .cfi_offset w29, -16
+; CHECK-GI-NEXT:    .cfi_offset b8, -24
+; CHECK-GI-NEXT:    .cfi_offset b9, -32
+; CHECK-GI-NEXT:    .cfi_offset b10, -40
+; CHECK-GI-NEXT:    .cfi_offset b11, -48
+; CHECK-GI-NEXT:    .cfi_offset b12, -56
+; CHECK-GI-NEXT:    .cfi_offset b13, -64
+; CHECK-GI-NEXT:    .cfi_offset b14, -72
+; CHECK-GI-NEXT:    .cfi_offset b15, -80
+; CHECK-GI-NEXT:    ushll v17.8h, v0.8b, #0
+; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT:    ldr x29, [sp, #288] // 8-byte Folded Reload
+; CHECK-GI-NEXT:    mov v20.16b, v3.16b
+; CHECK-GI-NEXT:    ushll v16.8h, v1.8b, #0
+; CHECK-GI-NEXT:    ushll2 v18.8h, v1.16b, #0
+; CHECK-GI-NEXT:    ushll v26.8h, v2.8b, #0
+; CHECK-GI-NEXT:    ldp q27, q28, [sp, #304]
+; CHECK-GI-NEXT:    ushll2 v29.8h, v2.16b, #0
+; CHECK-GI-NEXT:    ushll v2.4s, v17.4h, #0
+; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT:    sshll v8.8h, v4.8b, #0
+; CHECK-GI-NEXT:    ldp q23, q21, [sp, #368]
+; CHECK-GI-NEXT:    sshll2 v9.8h, v4.16b, #0
+; CHECK-GI-NEXT:    sshll2 v11.8h, v5.16b, #0
+; CHECK-GI-NEXT:    mov v25.16b, v7.16b
+; CHECK-GI-NEXT:    ushll2 v19.4s, v17.8h, #0
+; CHECK-GI-NEXT:    stp q1, q2, [sp, #192] // 32-byte Folded Spill
+; CHECK-GI-NEXT:    ushll2 v3.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll2 v17.4s, v18.8h, #0
+; CHECK-GI-NEXT:    ldp q24, q22, [sp, #336]
+; CHECK-GI-NEXT:    sshll v10.8h, v5.8b, #0
+; CHECK-GI-NEXT:    sshll v12.8h, v6.8b, #0
+; CHECK-GI-NEXT:    sshll2 v13.8h, v6.16b, #0
+; CHECK-GI-NEXT:    mov v2.16b, v20.16b
+; CHECK-GI-NEXT:    sshll2 v0.4s, v8.8h, #0
+; CHECK-GI-NEXT:    sshll2 v4.4s, v9.8h, #0
+; CHECK-GI-NEXT:    sshll2 v6.4s, v11.8h, #0
+; CHECK-GI-NEXT:    ushll2 v7.4s, v16.8h, #0
+; CHECK-GI-NEXT:    ushll2 v31.4s, v29.8h, #0
+; CHECK-GI-NEXT:    sshll2 v5.4s, v10.8h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v13.8h, #0
+; CHECK-GI-NEXT:    ushll2 v30.4s, v26.8h, #0
+; CHECK-GI-NEXT:    ushll v14.8h, v2.8b, #0
+; CHECK-GI-NEXT:    mul v20.4s, v19.4s, v0.4s
+; CHECK-GI-NEXT:    mul v19.4s, v3.4s, v4.4s
+; CHECK-GI-NEXT:    sshll v0.8h, v25.8b, #0
+; CHECK-GI-NEXT:    mul v4.4s, v17.4s, v6.4s
+; CHECK-GI-NEXT:    sshll2 v15.4s, v12.8h, #0
+; CHECK-GI-NEXT:    ldp q17, q3, [sp, #400]
+; CHECK-GI-NEXT:    mul v5.4s, v7.4s, v5.4s
+; CHECK-GI-NEXT:    mul v7.4s, v31.4s, v1.4s
+; CHECK-GI-NEXT:    ushll2 v31.8h, v2.16b, #0
+; CHECK-GI-NEXT:    sshll2 v25.8h, v25.16b, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v0.8h, #0
+; CHECK-GI-NEXT:    ushll v2.4s, v14.4h, #0
+; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT:    str q3, [sp, #96] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    ushll2 v3.4s, v14.8h, #0
+; CHECK-GI-NEXT:    mul v6.4s, v30.4s, v15.4s
+; CHECK-GI-NEXT:    str q31, [sp, #160] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    ushll v30.4s, v26.4h, #0
+; CHECK-GI-NEXT:    sshll v26.4s, v8.4h, #0
+; CHECK-GI-NEXT:    ushll v14.8h, v27.8b, #0
+; CHECK-GI-NEXT:    ushll v15.4s, v29.4h, #0
+; CHECK-GI-NEXT:    sshll v29.4s, v9.4h, #0
+; CHECK-GI-NEXT:    mul v1.4s, v3.4s, v1.4s
+; CHECK-GI-NEXT:    ushll2 v3.4s, v31.8h, #0
+; CHECK-GI-NEXT:    ushll v31.8h, v28.8b, #0
+; CHECK-GI-NEXT:    ushll v16.4s, v16.4h, #0
+; CHECK-GI-NEXT:    sshll v8.4s, v10.4h, #0
+; CHECK-GI-NEXT:    sshll v9.4s, v11.4h, #0
+; CHECK-GI-NEXT:    sshll v10.4s, v12.4h, #0
+; CHECK-GI-NEXT:    sshll v11.4s, v13.4h, #0
+; CHECK-GI-NEXT:    ushll v18.4s, v18.4h, #0
+; CHECK-GI-NEXT:    stp q3, q25, [sp, #112] // 32-byte Folded Spill
+; CHECK-GI-NEXT:    ldr q3, [sp, #208] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ushll2 v28.8h, v28.16b, #0
+; CHECK-GI-NEXT:    mla v1.4s, v2.4s, v0.4s
+; CHECK-GI-NEXT:    ushll2 v0.4s, v31.8h, #0
+; CHECK-GI-NEXT:    mla v5.4s, v16.4s, v8.4s
+; CHECK-GI-NEXT:    mla v20.4s, v3.4s, v26.4s
+; CHECK-GI-NEXT:    sshll2 v3.4s, v25.8h, #0
+; CHECK-GI-NEXT:    mla v6.4s, v30.4s, v10.4s
+; CHECK-GI-NEXT:    mla v7.4s, v15.4s, v11.4s
+; CHECK-GI-NEXT:    sshll v25.8h, v23.8b, #0
+; CHECK-GI-NEXT:    mla v4.4s, v18.4s, v9.4s
+; CHECK-GI-NEXT:    ushll v30.8h, v22.8b, #0
+; CHECK-GI-NEXT:    ushll2 v26.8h, v22.16b, #0
+; CHECK-GI-NEXT:    sshll v22.8h, v21.8b, #0
+; CHECK-GI-NEXT:    str q3, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    ldr q3, [sp, #192] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ushll2 v8.8h, v27.16b, #0
+; CHECK-GI-NEXT:    str q1, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    ldr q9, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ushll2 v1.4s, v14.8h, #0
+; CHECK-GI-NEXT:    stp q7, q6, [sp, #64] // 32-byte Folded Spill
+; CHECK-GI-NEXT:    mla v19.4s, v3.4s, v29.4s
+; CHECK-GI-NEXT:    sshll2 v7.4s, v25.8h, #0
+; CHECK-GI-NEXT:    str q5, [sp, #176] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    ushll v29.8h, v24.8b, #0
+; CHECK-GI-NEXT:    ushll2 v27.8h, v24.16b, #0
+; CHECK-GI-NEXT:    stp q0, q1, [sp] // 32-byte Folded Spill
+; CHECK-GI-NEXT:    ldp q0, q16, [sp, #96] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    str q4, [sp, #144] // 16-byte Folded Spill
+; CHECK-GI-NEXT:    sshll2 v24.8h, v23.16b, #0
+; CHECK-GI-NEXT:    ushll2 v18.4s, v26.8h, #0
+; CHECK-GI-NEXT:    stp q19, q20, [sp, #192] // 32-byte Folded Spill
+; CHECK-GI-NEXT:    sshll2 v20.8h, v21.16b, #0
+; CHECK-GI-NEXT:    sshll v21.8h, v17.8b, #0
+; CHECK-GI-NEXT:    sshll2 v19.8h, v17.16b, #0
+; CHECK-GI-NEXT:    sshll2 v17.8h, v0.16b, #0
+; CHECK-GI-NEXT:    mul v16.4s, v16.4s, v9.4s
+; CHECK-GI-NEXT:    ldr q9, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    sshll v23.8h, v0.8b, #0
+; CHECK-GI-NEXT:    sshll2 v2.4s, v22.8h, #0
+; CHECK-GI-NEXT:    ushll2 v12.4s, v27.8h, #0
+; CHECK-GI-NEXT:    ushll v26.4s, v26.4h, #0
+; CHECK-GI-NEXT:    ushll2 v10.4s, v28.8h, #0
+; CHECK-GI-NEXT:    sshll2 v0.4s, v17.8h, #0
+; CHECK-GI-NEXT:    mul v7.4s, v9.4s, v7.4s
+; CHECK-GI-NEXT:    ldr q9, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    sshll2 v5.4s, v19.8h, #0
+; CHECK-GI-NEXT:    sshll v17.4s, v17.4h, #0
+; CHECK-GI-NEXT:    sshll2 v3.4s, v20.8h, #0
+; CHECK-GI-NEXT:    mul v2.4s, v9.4s, v2.4s
+; CHECK-GI-NEXT:    ldr q9, [sp, #128] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ushll2 v15.4s, v8.8h, #0
+; CHECK-GI-NEXT:    mul v0.4s, v18.4s, v0.4s
+; CHECK-GI-NEXT:    ldr q18, [sp, #160] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ushll2 v11.4s, v29.8h, #0
+; CHECK-GI-NEXT:    sshll v9.4s, v9.4h, #0
+; CHECK-GI-NEXT:    ushll2 v13.4s, v30.8h, #0
+; CHECK-GI-NEXT:    sshll2 v1.4s, v24.8h, #0
+; CHECK-GI-NEXT:    ushll v18.4s, v18.4h, #0
+; CHECK-GI-NEXT:    sshll2 v4.4s, v21.8h, #0
+; CHECK-GI-NEXT:    sshll2 v6.4s, v23.8h, #0
+; CHECK-GI-NEXT:    mul v5.4s, v12.4s, v5.4s
+; CHECK-GI-NEXT:    ushll v27.4s, v27.4h, #0
+; CHECK-GI-NEXT:    sshll v19.4s, v19.4h, #0
+; CHECK-GI-NEXT:    mla v0.4s, v26.4s, v17.4s
+; CHECK-GI-NEXT:    mul v3.4s, v10.4s, v3.4s
+; CHECK-GI-NEXT:    mul v1.4s, v15.4s, v1.4s
+; CHECK-GI-NEXT:    mla v16.4s, v18.4s, v9.4s
+; CHECK-GI-NEXT:    ldp q18, q17, [sp, #192] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    mul v4.4s, v11.4s, v4.4s
+; CHECK-GI-NEXT:    mul v6.4s, v13.4s, v6.4s
+; CHECK-GI-NEXT:    ushll v28.4s, v28.4h, #0
+; CHECK-GI-NEXT:    ldp d13, d12, [sp, #240] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    sshll v20.4s, v20.4h, #0
+; CHECK-GI-NEXT:    ushll v10.4s, v14.4h, #0
+; CHECK-GI-NEXT:    ldp d15, d14, [sp, #224] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    ushll v8.4s, v8.4h, #0
+; CHECK-GI-NEXT:    ushll v31.4s, v31.4h, #0
+; CHECK-GI-NEXT:    ushll v29.4s, v29.4h, #0
+; CHECK-GI-NEXT:    ushll v30.4s, v30.4h, #0
+; CHECK-GI-NEXT:    sshll v25.4s, v25.4h, #0
+; CHECK-GI-NEXT:    sshll v24.4s, v24.4h, #0
+; CHECK-GI-NEXT:    sshll v22.4s, v22.4h, #0
+; CHECK-GI-NEXT:    sshll v21.4s, v21.4h, #0
+; CHECK-GI-NEXT:    sshll v23.4s, v23.4h, #0
+; CHECK-GI-NEXT:    mla v5.4s, v27.4s, v19.4s
+; CHECK-GI-NEXT:    ldr q19, [sp, #144] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    add v17.4s, v17.4s, v18.4s
+; CHECK-GI-NEXT:    ldr q18, [sp, #176] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mla v3.4s, v28.4s, v20.4s
+; CHECK-GI-NEXT:    mla v7.4s, v10.4s, v25.4s
+; CHECK-GI-NEXT:    ldp d11, d10, [sp, #256] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    mla v1.4s, v8.4s, v24.4s
+; CHECK-GI-NEXT:    ldp d9, d8, [sp, #272] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    add v18.4s, v18.4s, v19.4s
+; CHECK-GI-NEXT:    ldp q20, q19, [sp, #64] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    mla v2.4s, v31.4s, v22.4s
+; CHECK-GI-NEXT:    mla v4.4s, v29.4s, v21.4s
+; CHECK-GI-NEXT:    mla v6.4s, v30.4s, v23.4s
+; CHECK-GI-NEXT:    add v1.4s, v7.4s, v1.4s
+; CHECK-GI-NEXT:    add v19.4s, v19.4s, v20.4s
+; CHECK-GI-NEXT:    ldr q20, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT:    add v2.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT:    add v16.4s, v20.4s, v16.4s
+; CHECK-GI-NEXT:    add v3.4s, v4.4s, v5.4s
+; CHECK-GI-NEXT:    add v0.4s, v6.4s, v0.4s
+; CHECK-GI-NEXT:    add v4.4s, v17.4s, v18.4s
+; CHECK-GI-NEXT:    add v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT:    add v5.4s, v19.4s, v16.4s
+; CHECK-GI-NEXT:    add v0.4s, v3.4s, v0.4s
+; CHECK-GI-NEXT:    add v2.4s, v4.4s, v5.4s
+; CHECK-GI-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    addv s1, v2.4s
+; CHECK-GI-NEXT:    addv s0, v0.4s
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    add w0, w8, w9
+; CHECK-GI-NEXT:    add sp, sp, #304
+; CHECK-GI-NEXT:    ret
+entry:
+  %az = zext <64 x i8> %a to <64 x i32>
+  %bz = sext <64 x i8> %b to <64 x i32>
+  %m1 = mul nuw nsw <64 x i32> %az, %bz
+  %r1 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %m1)
+  %cz = zext <64 x i8> %c to <64 x i32>
+  %dz = sext <64 x i8> %d to <64 x i32>
+  %m2 = mul nuw nsw <64 x i32> %cz, %dz
+  %r2 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %m2)
+  %x = add i32 %r1, %r2
+  ret i32 %x
+}


        


More information about the llvm-commits mailing list