[llvm] [AArch64][GlobalISel] Legalize more G_VECREDUCE_ADD operations. (PR #123392)
David Green via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 17 12:00:48 PST 2025
https://github.com/davemgreen created https://github.com/llvm/llvm-project/pull/123392
Non-power-of-2 vectors are now padded with zero elements. Smaller vectors are widened using anyext, which I believe will be better in many situations than padding with zeros, although some small types may prefer being scalarized depending on the code. Padding with zeros may not be best for all sizes (v5i8 being the worst); we can hopefully improve that in the future, but these cases no longer take the instruction-selection fallback path. Other types, such as i128, are scalarized.
>From 35d8b6b2292bd2a8cdf49ad9ee3181d0e7c0f7f9 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Fri, 17 Jan 2025 18:28:16 +0000
Subject: [PATCH] [AArch64][GlobalISel] Legalize more G_VECREDUCE_ADD
operations.
Non-power-of-2 vectors are now padded with zero elements. Smaller vectors are
widened using anyext, which I believe will be better in many situations than
padding with zeros, although some small types may prefer being scalarized
depending on the code. Padding with zeros may not be best for all sizes (v5i8
being the worst); we can hopefully improve that in the future, but these cases
no longer take the instruction-selection fallback path. Other types, such as
i128, are scalarized.
---
.../CodeGen/GlobalISel/LegalizerHelper.cpp | 9 +
.../AArch64/GISel/AArch64LegalizerInfo.cpp | 4 +-
.../GlobalISel/legalize-reduce-add.mir | 17 +-
.../GlobalISel/legalizer-info-validation.mir | 4 +-
llvm/test/CodeGen/AArch64/aarch64-addv.ll | 99 +-
llvm/test/CodeGen/AArch64/neon-dotreduce.ll | 5665 ++++++++++++-----
llvm/test/CodeGen/AArch64/vecreduce-add.ll | 354 +-
7 files changed, 4141 insertions(+), 2011 deletions(-)
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index d0a62340a5f322..5a16d72594363d 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -3342,6 +3342,15 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
Observer.changedInstr(MI);
return Legalized;
}
+ case TargetOpcode::G_VECREDUCE_ADD: {
+ if (TypeIdx != 1)
+ return UnableToLegalize;
+ Observer.changingInstr(MI);
+ widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
+ widenScalarDst(MI, WideTy.getScalarType(), 0, TargetOpcode::G_TRUNC);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
case TargetOpcode::G_VECREDUCE_FADD:
case TargetOpcode::G_VECREDUCE_FMUL:
case TargetOpcode::G_VECREDUCE_FMIN:
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 93461e39f95597..fdedf44e0ba1bf 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1215,11 +1215,13 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
{s32, v4s32},
{s32, v2s32},
{s64, v2s64}})
+ .moreElementsToNextPow2(1)
.clampMaxNumElements(1, s64, 2)
.clampMaxNumElements(1, s32, 4)
.clampMaxNumElements(1, s16, 8)
.clampMaxNumElements(1, s8, 16)
- .lower();
+ .widenVectorEltsToVectorMinSize(1, 64)
+ .scalarize(1);
getActionDefinitionsBuilder({G_VECREDUCE_FMIN, G_VECREDUCE_FMAX,
G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM})
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-reduce-add.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-reduce-add.mir
index 76fdfd0c301f6d..a18bd0f2f90eb7 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-reduce-add.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-reduce-add.mir
@@ -157,12 +157,17 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2
- ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[COPY]](<2 x s64>)
- ; CHECK-NEXT: [[VECREDUCE_ADD1:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[COPY1]](<2 x s64>)
- ; CHECK-NEXT: [[VECREDUCE_ADD2:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[COPY2]](<2 x s64>)
- ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[VECREDUCE_ADD]], [[VECREDUCE_ADD1]]
- ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[ADD]], [[VECREDUCE_ADD2]]
- ; CHECK-NEXT: $x0 = COPY [[ADD1]](s64)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[DEF]](s64), [[DEF]](s64)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<2 x s64>) = G_INSERT_VECTOR_ELT [[BUILD_VECTOR]], [[C]](s64), [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; CHECK-NEXT: [[IVEC1:%[0-9]+]]:_(<2 x s64>) = G_INSERT_VECTOR_ELT [[IVEC]], [[C]](s64), [[C1]](s64)
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<2 x s64>) = G_ADD [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(<2 x s64>) = G_ADD [[COPY2]], [[IVEC1]]
+ ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(<2 x s64>) = G_ADD [[ADD]], [[ADD1]]
+ ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s64) = G_VECREDUCE_ADD [[ADD2]](<2 x s64>)
+ ; CHECK-NEXT: $x0 = COPY [[VECREDUCE_ADD]](s64)
; CHECK-NEXT: RET_ReallyLR implicit $x0
%0:_(<2 x s64>) = COPY $q0
%1:_(<2 x s64>) = COPY $q1
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index c2c77b9326cb64..0260e65520774e 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -867,8 +867,8 @@
# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT: G_VECREDUCE_ADD (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 2, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: G_VECREDUCE_MUL (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
diff --git a/llvm/test/CodeGen/AArch64/aarch64-addv.ll b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
index aba284b4e0d292..104b6d1236d715 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-addv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64 -aarch64-neon-syntax=generic | FileCheck %s -check-prefixes=CHECK,CHECK-SD
-; RUN: llc < %s -mtriple=aarch64 -global-isel=1 -global-isel-abort=2 -aarch64-neon-syntax=generic 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; RUN: llc < %s -mtriple=aarch64 -aarch64-neon-syntax=generic | FileCheck %s -check-prefixes=CHECK,SDAG
+; RUN: llc < %s -mtriple=aarch64 -global-isel=1 -aarch64-neon-syntax=generic | FileCheck %s --check-prefixes=CHECK,GISEL
declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>)
declare i8 @llvm.vector.reduce.add.v3i8(<3 x i8>)
@@ -22,15 +22,6 @@ declare i64 @llvm.vector.reduce.add.v3i64(<3 x i64>)
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i128 @llvm.vector.reduce.add.v2i128(<2 x i128>)
-; CHECK-GI: warning: Instruction selection used fallback path for addv_v2i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v3i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v4i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v2i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v3i16
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v3i32
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v3i64
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for addv_v2i128
-
define i8 @add_B(ptr %arr) {
; CHECK-LABEL: add_B:
@@ -256,15 +247,26 @@ entry:
}
define i8 @addv_v3i8(<3 x i8> %a) {
-; CHECK-LABEL: addv_v3i8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: mov v0.h[0], w0
-; CHECK-NEXT: mov v0.h[1], w1
-; CHECK-NEXT: mov v0.h[2], w2
-; CHECK-NEXT: addv h0, v0.4h
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; SDAG-LABEL: addv_v3i8:
+; SDAG: // %bb.0: // %entry
+; SDAG-NEXT: movi v0.2d, #0000000000000000
+; SDAG-NEXT: mov v0.h[0], w0
+; SDAG-NEXT: mov v0.h[1], w1
+; SDAG-NEXT: mov v0.h[2], w2
+; SDAG-NEXT: addv h0, v0.4h
+; SDAG-NEXT: fmov w0, s0
+; SDAG-NEXT: ret
+;
+; GISEL-LABEL: addv_v3i8:
+; GISEL: // %bb.0: // %entry
+; GISEL-NEXT: fmov s0, w0
+; GISEL-NEXT: mov w8, #0 // =0x0
+; GISEL-NEXT: mov v0.h[1], w1
+; GISEL-NEXT: mov v0.h[2], w2
+; GISEL-NEXT: mov v0.h[3], w8
+; GISEL-NEXT: addv h0, v0.4h
+; GISEL-NEXT: fmov w0, s0
+; GISEL-NEXT: ret
entry:
%arg1 = call i8 @llvm.vector.reduce.add.v3i8(<3 x i8> %a)
ret i8 %arg1
@@ -327,13 +329,22 @@ entry:
}
define i16 @addv_v3i16(<3 x i16> %a) {
-; CHECK-LABEL: addv_v3i16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov v0.h[3], wzr
-; CHECK-NEXT: addv h0, v0.4h
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; SDAG-LABEL: addv_v3i16:
+; SDAG: // %bb.0: // %entry
+; SDAG-NEXT: // kill: def $d0 killed $d0 def $q0
+; SDAG-NEXT: mov v0.h[3], wzr
+; SDAG-NEXT: addv h0, v0.4h
+; SDAG-NEXT: fmov w0, s0
+; SDAG-NEXT: ret
+;
+; GISEL-LABEL: addv_v3i16:
+; GISEL: // %bb.0: // %entry
+; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
+; GISEL-NEXT: mov w8, #0 // =0x0
+; GISEL-NEXT: mov v0.h[3], w8
+; GISEL-NEXT: addv h0, v0.4h
+; GISEL-NEXT: fmov w0, s0
+; GISEL-NEXT: ret
entry:
%arg1 = call i16 @llvm.vector.reduce.add.v3i16(<3 x i16> %a)
ret i16 %arg1
@@ -431,17 +442,29 @@ entry:
}
define i64 @addv_v3i64(<3 x i64> %a) {
-; CHECK-LABEL: addv_v3i64:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: mov v2.d[1], xzr
-; CHECK-NEXT: add v0.2d, v0.2d, v2.2d
-; CHECK-NEXT: addp d0, v0.2d
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: ret
+; SDAG-LABEL: addv_v3i64:
+; SDAG: // %bb.0: // %entry
+; SDAG-NEXT: // kill: def $d2 killed $d2 def $q2
+; SDAG-NEXT: // kill: def $d0 killed $d0 def $q0
+; SDAG-NEXT: // kill: def $d1 killed $d1 def $q1
+; SDAG-NEXT: mov v0.d[1], v1.d[0]
+; SDAG-NEXT: mov v2.d[1], xzr
+; SDAG-NEXT: add v0.2d, v0.2d, v2.2d
+; SDAG-NEXT: addp d0, v0.2d
+; SDAG-NEXT: fmov x0, d0
+; SDAG-NEXT: ret
+;
+; GISEL-LABEL: addv_v3i64:
+; GISEL: // %bb.0: // %entry
+; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
+; GISEL-NEXT: // kill: def $d2 killed $d2 def $q2
+; GISEL-NEXT: // kill: def $d1 killed $d1 def $q1
+; GISEL-NEXT: mov v0.d[1], v1.d[0]
+; GISEL-NEXT: mov v2.d[1], xzr
+; GISEL-NEXT: add v0.2d, v0.2d, v2.2d
+; GISEL-NEXT: addp d0, v0.2d
+; GISEL-NEXT: fmov x0, d0
+; GISEL-NEXT: ret
entry:
%arg1 = call i64 @llvm.vector.reduce.add.v3i64(<3 x i64> %a)
ret i64 %arg1
diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
index 748555d7bdfa15..8e12446164e89e 100644
--- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
+++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
@@ -1,22 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple aarch64-linux-gnu -mattr=+dotprod,+i8mm < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc -mtriple aarch64-linux-gnu -mattr=+dotprod,+i8mm -global-isel -global-isel-abort=2 < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
-
-; CHECK-GI: warning: Instruction selection used fallback path for test_udot_v5i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_udot_v5i8_nomla
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_sdot_v5i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_sdot_v5i8_double
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_sdot_v5i8_double_nomla
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_udot_v25i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_udot_v25i8_nomla
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_sdot_v25i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_sdot_v25i8_double
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_sdot_v25i8_double_nomla
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_udot_v33i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_udot_v33i8_nomla
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_sdot_v33i8
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_sdot_v33i8_double
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_sdot_v33i8_double_nomla
+; RUN: llc -mtriple aarch64-linux-gnu -mattr=+dotprod,+i8mm -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v5i32(<5 x i32>)
@@ -413,19 +397,50 @@ entry:
}
define i32 @test_udot_v5i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
-; CHECK-LABEL: test_udot_v5i8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: umull v0.8h, v1.8b, v0.8b
-; CHECK-NEXT: movi v1.2d, #0000000000000000
-; CHECK-NEXT: ushll2 v2.4s, v0.8h, #0
-; CHECK-NEXT: mov v1.s[0], v2.s[0]
-; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h
-; CHECK-NEXT: addv s0, v0.4s
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: add w0, w8, w2
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_udot_v5i8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: ldr d1, [x1]
+; CHECK-SD-NEXT: umull v0.8h, v1.8b, v0.8b
+; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
+; CHECK-SD-NEXT: ushll2 v2.4s, v0.8h, #0
+; CHECK-SD-NEXT: mov v1.s[0], v2.s[0]
+; CHECK-SD-NEXT: uaddw v0.4s, v1.4s, v0.4h
+; CHECK-SD-NEXT: addv s0, v0.4s
+; CHECK-SD-NEXT: fmov w8, s0
+; CHECK-SD-NEXT: add w0, w8, w2
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_udot_v5i8:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ldrb w8, [x0, #4]
+; CHECK-GI-NEXT: ldrb w9, [x1, #4]
+; CHECK-GI-NEXT: ldrb w10, [x1]
+; CHECK-GI-NEXT: mul w8, w9, w8
+; CHECK-GI-NEXT: ldrb w9, [x0]
+; CHECK-GI-NEXT: mov v0.s[0], w10
+; CHECK-GI-NEXT: mov v1.s[0], w9
+; CHECK-GI-NEXT: ldrb w9, [x1, #1]
+; CHECK-GI-NEXT: mov v2.s[0], w8
+; CHECK-GI-NEXT: ldrb w8, [x0, #1]
+; CHECK-GI-NEXT: mov v0.s[1], w9
+; CHECK-GI-NEXT: ldrb w9, [x1, #2]
+; CHECK-GI-NEXT: mov v1.s[1], w8
+; CHECK-GI-NEXT: ldrb w8, [x0, #2]
+; CHECK-GI-NEXT: mov v2.s[1], wzr
+; CHECK-GI-NEXT: mov v0.s[2], w9
+; CHECK-GI-NEXT: ldrb w9, [x1, #3]
+; CHECK-GI-NEXT: mov v1.s[2], w8
+; CHECK-GI-NEXT: ldrb w8, [x0, #3]
+; CHECK-GI-NEXT: mov v2.s[2], wzr
+; CHECK-GI-NEXT: mov v0.s[3], w9
+; CHECK-GI-NEXT: mov v1.s[3], w8
+; CHECK-GI-NEXT: mov v2.s[3], wzr
+; CHECK-GI-NEXT: mla v2.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: addv s0, v2.4s
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: add w0, w8, w2
+; CHECK-GI-NEXT: ret
entry:
%0 = load <5 x i8>, ptr %a
%1 = zext <5 x i8> %0 to <5 x i32>
@@ -438,17 +453,37 @@ entry:
}
define i32 @test_udot_v5i8_nomla(ptr nocapture readonly %a1) {
-; CHECK-LABEL: test_udot_v5i8_nomla:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: movi v1.2d, #0000000000000000
-; CHECK-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-NEXT: ushll2 v2.4s, v0.8h, #0
-; CHECK-NEXT: mov v1.s[0], v2.s[0]
-; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h
-; CHECK-NEXT: addv s0, v0.4s
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_udot_v5i8_nomla:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll2 v2.4s, v0.8h, #0
+; CHECK-SD-NEXT: mov v1.s[0], v2.s[0]
+; CHECK-SD-NEXT: uaddw v0.4s, v1.4s, v0.4h
+; CHECK-SD-NEXT: addv s0, v0.4s
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_udot_v5i8_nomla:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ldrb w8, [x0]
+; CHECK-GI-NEXT: ldrb w9, [x0, #4]
+; CHECK-GI-NEXT: mov v0.s[0], w8
+; CHECK-GI-NEXT: mov v1.s[0], w9
+; CHECK-GI-NEXT: ldrb w8, [x0, #1]
+; CHECK-GI-NEXT: mov v0.s[1], w8
+; CHECK-GI-NEXT: mov v1.s[1], wzr
+; CHECK-GI-NEXT: ldrb w8, [x0, #2]
+; CHECK-GI-NEXT: mov v0.s[2], w8
+; CHECK-GI-NEXT: mov v1.s[2], wzr
+; CHECK-GI-NEXT: ldrb w8, [x0, #3]
+; CHECK-GI-NEXT: mov v0.s[3], w8
+; CHECK-GI-NEXT: mov v1.s[3], wzr
+; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: addv s0, v0.4s
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
entry:
%0 = load <5 x i8>, ptr %a1
%1 = zext <5 x i8> %0 to <5 x i32>
@@ -456,19 +491,50 @@ entry:
ret i32 %2
}
define i32 @test_sdot_v5i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
-; CHECK-LABEL: test_sdot_v5i8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: smull v0.8h, v1.8b, v0.8b
-; CHECK-NEXT: movi v1.2d, #0000000000000000
-; CHECK-NEXT: sshll2 v2.4s, v0.8h, #0
-; CHECK-NEXT: mov v1.s[0], v2.s[0]
-; CHECK-NEXT: saddw v0.4s, v1.4s, v0.4h
-; CHECK-NEXT: addv s0, v0.4s
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: add w0, w8, w2
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_sdot_v5i8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ldr d0, [x0]
+; CHECK-SD-NEXT: ldr d1, [x1]
+; CHECK-SD-NEXT: smull v0.8h, v1.8b, v0.8b
+; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
+; CHECK-SD-NEXT: sshll2 v2.4s, v0.8h, #0
+; CHECK-SD-NEXT: mov v1.s[0], v2.s[0]
+; CHECK-SD-NEXT: saddw v0.4s, v1.4s, v0.4h
+; CHECK-SD-NEXT: addv s0, v0.4s
+; CHECK-SD-NEXT: fmov w8, s0
+; CHECK-SD-NEXT: add w0, w8, w2
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_sdot_v5i8:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ldrsb w8, [x0, #4]
+; CHECK-GI-NEXT: ldrsb w9, [x1, #4]
+; CHECK-GI-NEXT: ldrsb w10, [x1]
+; CHECK-GI-NEXT: mul w8, w9, w8
+; CHECK-GI-NEXT: ldrsb w9, [x0]
+; CHECK-GI-NEXT: mov v0.s[0], w10
+; CHECK-GI-NEXT: mov v1.s[0], w9
+; CHECK-GI-NEXT: ldrsb w9, [x1, #1]
+; CHECK-GI-NEXT: mov v2.s[0], w8
+; CHECK-GI-NEXT: ldrsb w8, [x0, #1]
+; CHECK-GI-NEXT: mov v0.s[1], w9
+; CHECK-GI-NEXT: ldrsb w9, [x1, #2]
+; CHECK-GI-NEXT: mov v1.s[1], w8
+; CHECK-GI-NEXT: ldrsb w8, [x0, #2]
+; CHECK-GI-NEXT: mov v2.s[1], wzr
+; CHECK-GI-NEXT: mov v0.s[2], w9
+; CHECK-GI-NEXT: ldrsb w9, [x1, #3]
+; CHECK-GI-NEXT: mov v1.s[2], w8
+; CHECK-GI-NEXT: ldrsb w8, [x0, #3]
+; CHECK-GI-NEXT: mov v2.s[2], wzr
+; CHECK-GI-NEXT: mov v0.s[3], w9
+; CHECK-GI-NEXT: mov v1.s[3], w8
+; CHECK-GI-NEXT: mov v2.s[3], wzr
+; CHECK-GI-NEXT: mla v2.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: addv s0, v2.4s
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: add w0, w8, w2
+; CHECK-GI-NEXT: ret
entry:
%0 = load <5 x i8>, ptr %a
%1 = sext <5 x i8> %0 to <5 x i32>
@@ -481,22 +547,83 @@ entry:
}
define i32 @test_sdot_v5i8_double(<5 x i8> %a, <5 x i8> %b, <5 x i8> %c, <5 x i8> %d) {
-; CHECK-LABEL: test_sdot_v5i8_double:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: smull v2.8h, v2.8b, v3.8b
-; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT: movi v1.2d, #0000000000000000
-; CHECK-NEXT: movi v3.2d, #0000000000000000
-; CHECK-NEXT: sshll2 v4.4s, v0.8h, #0
-; CHECK-NEXT: sshll2 v5.4s, v2.8h, #0
-; CHECK-NEXT: mov v3.s[0], v4.s[0]
-; CHECK-NEXT: mov v1.s[0], v5.s[0]
-; CHECK-NEXT: saddw v0.4s, v3.4s, v0.4h
-; CHECK-NEXT: saddw v1.4s, v1.4s, v2.4h
-; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: addv s0, v0.4s
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_sdot_v5i8_double:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: smull v2.8h, v2.8b, v3.8b
+; CHECK-SD-NEXT: smull v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v3.2d, #0000000000000000
+; CHECK-SD-NEXT: sshll2 v4.4s, v0.8h, #0
+; CHECK-SD-NEXT: sshll2 v5.4s, v2.8h, #0
+; CHECK-SD-NEXT: mov v3.s[0], v4.s[0]
+; CHECK-SD-NEXT: mov v1.s[0], v5.s[0]
+; CHECK-SD-NEXT: saddw v0.4s, v3.4s, v0.4h
+; CHECK-SD-NEXT: saddw v1.4s, v1.4s, v2.4h
+; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: addv s0, v0.4s
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_sdot_v5i8_double:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-GI-NEXT: // kill: def $d3 killed $d3 def $q3
+; CHECK-GI-NEXT: smov w8, v0.b[4]
+; CHECK-GI-NEXT: smov w9, v1.b[4]
+; CHECK-GI-NEXT: smov w10, v2.b[4]
+; CHECK-GI-NEXT: smov w11, v3.b[4]
+; CHECK-GI-NEXT: smov w12, v0.b[0]
+; CHECK-GI-NEXT: smov w13, v1.b[0]
+; CHECK-GI-NEXT: smov w14, v2.b[0]
+; CHECK-GI-NEXT: smov w15, v3.b[0]
+; CHECK-GI-NEXT: mul w8, w8, w9
+; CHECK-GI-NEXT: smov w9, v0.b[1]
+; CHECK-GI-NEXT: mul w10, w10, w11
+; CHECK-GI-NEXT: smov w11, v1.b[1]
+; CHECK-GI-NEXT: mov v4.s[0], w12
+; CHECK-GI-NEXT: smov w12, v2.b[1]
+; CHECK-GI-NEXT: mov v5.s[0], w13
+; CHECK-GI-NEXT: smov w13, v3.b[1]
+; CHECK-GI-NEXT: mov v6.s[0], w8
+; CHECK-GI-NEXT: mov v7.s[0], w14
+; CHECK-GI-NEXT: mov v16.s[0], w15
+; CHECK-GI-NEXT: mov v17.s[0], w10
+; CHECK-GI-NEXT: smov w8, v0.b[2]
+; CHECK-GI-NEXT: smov w10, v1.b[2]
+; CHECK-GI-NEXT: smov w14, v2.b[2]
+; CHECK-GI-NEXT: smov w15, v3.b[2]
+; CHECK-GI-NEXT: mov v4.s[1], w9
+; CHECK-GI-NEXT: mov v5.s[1], w11
+; CHECK-GI-NEXT: smov w9, v0.b[3]
+; CHECK-GI-NEXT: smov w11, v1.b[3]
+; CHECK-GI-NEXT: mov v6.s[1], wzr
+; CHECK-GI-NEXT: mov v7.s[1], w12
+; CHECK-GI-NEXT: mov v16.s[1], w13
+; CHECK-GI-NEXT: mov v17.s[1], wzr
+; CHECK-GI-NEXT: smov w12, v2.b[3]
+; CHECK-GI-NEXT: smov w13, v3.b[3]
+; CHECK-GI-NEXT: mov v4.s[2], w8
+; CHECK-GI-NEXT: mov v5.s[2], w10
+; CHECK-GI-NEXT: mov v6.s[2], wzr
+; CHECK-GI-NEXT: mov v7.s[2], w14
+; CHECK-GI-NEXT: mov v16.s[2], w15
+; CHECK-GI-NEXT: mov v17.s[2], wzr
+; CHECK-GI-NEXT: mov v4.s[3], w9
+; CHECK-GI-NEXT: mov v5.s[3], w11
+; CHECK-GI-NEXT: mov v6.s[3], wzr
+; CHECK-GI-NEXT: mov v7.s[3], w12
+; CHECK-GI-NEXT: mov v16.s[3], w13
+; CHECK-GI-NEXT: mov v17.s[3], wzr
+; CHECK-GI-NEXT: mla v6.4s, v4.4s, v5.4s
+; CHECK-GI-NEXT: mla v17.4s, v7.4s, v16.4s
+; CHECK-GI-NEXT: addv s0, v6.4s
+; CHECK-GI-NEXT: addv s1, v17.4s
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: fmov w9, s1
+; CHECK-GI-NEXT: add w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%az = sext <5 x i8> %a to <5 x i32>
%bz = sext <5 x i8> %b to <5 x i32>
@@ -511,22 +638,61 @@ entry:
}
define i32 @test_sdot_v5i8_double_nomla(<5 x i8> %a, <5 x i8> %b, <5 x i8> %c, <5 x i8> %d) {
-; CHECK-LABEL: test_sdot_v5i8_double_nomla:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: sshll v1.8h, v2.8b, #0
-; CHECK-NEXT: movi v2.2d, #0000000000000000
-; CHECK-NEXT: movi v3.2d, #0000000000000000
-; CHECK-NEXT: sshll2 v4.4s, v0.8h, #0
-; CHECK-NEXT: sshll2 v5.4s, v1.8h, #0
-; CHECK-NEXT: mov v3.s[0], v4.s[0]
-; CHECK-NEXT: mov v2.s[0], v5.s[0]
-; CHECK-NEXT: saddw v0.4s, v3.4s, v0.4h
-; CHECK-NEXT: saddw v1.4s, v2.4s, v1.4h
-; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: addv s0, v0.4s
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_sdot_v5i8_double_nomla:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: sshll v1.8h, v2.8b, #0
+; CHECK-SD-NEXT: movi v2.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v3.2d, #0000000000000000
+; CHECK-SD-NEXT: sshll2 v4.4s, v0.8h, #0
+; CHECK-SD-NEXT: sshll2 v5.4s, v1.8h, #0
+; CHECK-SD-NEXT: mov v3.s[0], v4.s[0]
+; CHECK-SD-NEXT: mov v2.s[0], v5.s[0]
+; CHECK-SD-NEXT: saddw v0.4s, v3.4s, v0.4h
+; CHECK-SD-NEXT: saddw v1.4s, v2.4s, v1.4h
+; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: addv s0, v0.4s
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_sdot_v5i8_double_nomla:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-GI-NEXT: smov w8, v0.b[0]
+; CHECK-GI-NEXT: smov w9, v0.b[4]
+; CHECK-GI-NEXT: smov w10, v2.b[0]
+; CHECK-GI-NEXT: smov w11, v2.b[4]
+; CHECK-GI-NEXT: smov w12, v0.b[1]
+; CHECK-GI-NEXT: mov v1.s[0], w8
+; CHECK-GI-NEXT: smov w8, v2.b[1]
+; CHECK-GI-NEXT: mov v3.s[0], w9
+; CHECK-GI-NEXT: mov v4.s[0], w10
+; CHECK-GI-NEXT: mov v5.s[0], w11
+; CHECK-GI-NEXT: smov w9, v0.b[2]
+; CHECK-GI-NEXT: smov w10, v2.b[2]
+; CHECK-GI-NEXT: smov w11, v2.b[3]
+; CHECK-GI-NEXT: mov v1.s[1], w12
+; CHECK-GI-NEXT: mov v3.s[1], wzr
+; CHECK-GI-NEXT: mov v4.s[1], w8
+; CHECK-GI-NEXT: mov v5.s[1], wzr
+; CHECK-GI-NEXT: smov w8, v0.b[3]
+; CHECK-GI-NEXT: mov v1.s[2], w9
+; CHECK-GI-NEXT: mov v3.s[2], wzr
+; CHECK-GI-NEXT: mov v4.s[2], w10
+; CHECK-GI-NEXT: mov v5.s[2], wzr
+; CHECK-GI-NEXT: mov v1.s[3], w8
+; CHECK-GI-NEXT: mov v3.s[3], wzr
+; CHECK-GI-NEXT: mov v4.s[3], w11
+; CHECK-GI-NEXT: mov v5.s[3], wzr
+; CHECK-GI-NEXT: add v0.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT: add v1.4s, v4.4s, v5.4s
+; CHECK-GI-NEXT: addv s0, v0.4s
+; CHECK-GI-NEXT: addv s1, v1.4s
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: fmov w9, s1
+; CHECK-GI-NEXT: add w0, w8, w9
+; CHECK-GI-NEXT: ret
entry:
%az = sext <5 x i8> %a to <5 x i32>
%r1 = call i32 @llvm.vector.reduce.add.v5i32(<5 x i32> %az)
@@ -2181,27 +2347,153 @@ entry:
}
define i32 @test_udot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
-; CHECK-LABEL: test_udot_v25i8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldp q3, q0, [x1]
-; CHECK-NEXT: movi v5.2d, #0000000000000000
-; CHECK-NEXT: ldp q2, q1, [x0]
-; CHECK-NEXT: umull2 v4.8h, v0.16b, v1.16b
-; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT: umull v1.8h, v3.8b, v2.8b
-; CHECK-NEXT: umull2 v2.8h, v3.16b, v2.16b
-; CHECK-NEXT: ushll v3.4s, v4.4h, #0
-; CHECK-NEXT: uaddl2 v4.4s, v1.8h, v0.8h
-; CHECK-NEXT: uaddl v0.4s, v1.4h, v0.4h
-; CHECK-NEXT: mov v5.s[0], v3.s[0]
-; CHECK-NEXT: uaddw2 v1.4s, v4.4s, v2.8h
-; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: uaddw v2.4s, v5.4s, v2.4h
-; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: addv s0, v0.4s
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: add w0, w8, w2
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_udot_v25i8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ldp q3, q0, [x1]
+; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
+; CHECK-SD-NEXT: ldp q2, q1, [x0]
+; CHECK-SD-NEXT: umull2 v4.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT: umull v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT: umull v1.8h, v3.8b, v2.8b
+; CHECK-SD-NEXT: umull2 v2.8h, v3.16b, v2.16b
+; CHECK-SD-NEXT: ushll v3.4s, v4.4h, #0
+; CHECK-SD-NEXT: uaddl2 v4.4s, v1.8h, v0.8h
+; CHECK-SD-NEXT: uaddl v0.4s, v1.4h, v0.4h
+; CHECK-SD-NEXT: mov v5.s[0], v3.s[0]
+; CHECK-SD-NEXT: uaddw2 v1.4s, v4.4s, v2.8h
+; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: uaddw v2.4s, v5.4s, v2.4h
+; CHECK-SD-NEXT: add v0.4s, v0.4s, v2.4s
+; CHECK-SD-NEXT: addv s0, v0.4s
+; CHECK-SD-NEXT: fmov w8, s0
+; CHECK-SD-NEXT: add w0, w8, w2
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_udot_v25i8:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: ldrb w11, [x1, #16]!
+; CHECK-GI-NEXT: ldrb w12, [x1, #4]
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: mov v23.s[0], wzr
+; CHECK-GI-NEXT: umov w9, v1.b[4]
+; CHECK-GI-NEXT: umov w10, v1.b[12]
+; CHECK-GI-NEXT: umov w13, v1.b[0]
+; CHECK-GI-NEXT: umov w14, v1.b[5]
+; CHECK-GI-NEXT: mov v5.s[0], w12
+; CHECK-GI-NEXT: mov v3.s[0], w11
+; CHECK-GI-NEXT: umov w11, v0.b[0]
+; CHECK-GI-NEXT: umov w12, v1.b[1]
+; CHECK-GI-NEXT: umov w15, v1.b[8]
+; CHECK-GI-NEXT: ldrb w8, [x0, #16]!
+; CHECK-GI-NEXT: mov v23.s[1], wzr
+; CHECK-GI-NEXT: mov v2.s[0], w9
+; CHECK-GI-NEXT: mov v4.s[0], w10
+; CHECK-GI-NEXT: umov w10, v1.b[13]
+; CHECK-GI-NEXT: ldrb w9, [x1, #5]
+; CHECK-GI-NEXT: mov v6.s[0], w13
+; CHECK-GI-NEXT: umov w13, v1.b[6]
+; CHECK-GI-NEXT: mov v16.s[0], w11
+; CHECK-GI-NEXT: umov w11, v1.b[2]
+; CHECK-GI-NEXT: mov v7.s[0], w15
+; CHECK-GI-NEXT: mov v5.s[1], w9
+; CHECK-GI-NEXT: ldrb w9, [x1, #6]
+; CHECK-GI-NEXT: umov w15, v1.b[9]
+; CHECK-GI-NEXT: mov v2.s[1], w14
+; CHECK-GI-NEXT: ldrb w14, [x1, #1]
+; CHECK-GI-NEXT: mov v4.s[1], w10
+; CHECK-GI-NEXT: umov w10, v1.b[14]
+; CHECK-GI-NEXT: mov v6.s[1], w12
+; CHECK-GI-NEXT: umov w12, v0.b[1]
+; CHECK-GI-NEXT: mov v3.s[1], w14
+; CHECK-GI-NEXT: umov w14, v0.b[12]
+; CHECK-GI-NEXT: mov v21.s[0], w8
+; CHECK-GI-NEXT: ldrb w8, [x0, #1]
+; CHECK-GI-NEXT: mov v5.s[2], w9
+; CHECK-GI-NEXT: umov w9, v0.b[4]
+; CHECK-GI-NEXT: mov v2.s[2], w13
+; CHECK-GI-NEXT: umov w13, v1.b[7]
+; CHECK-GI-NEXT: mov v7.s[1], w15
+; CHECK-GI-NEXT: mov v4.s[2], w10
+; CHECK-GI-NEXT: umov w10, v1.b[15]
+; CHECK-GI-NEXT: mov v16.s[1], w12
+; CHECK-GI-NEXT: ldrb w12, [x1, #2]
+; CHECK-GI-NEXT: mov v6.s[2], w11
+; CHECK-GI-NEXT: umov w11, v0.b[2]
+; CHECK-GI-NEXT: mov v17.s[0], w9
+; CHECK-GI-NEXT: umov w9, v0.b[8]
+; CHECK-GI-NEXT: mov v18.s[0], w14
+; CHECK-GI-NEXT: mov v2.s[3], w13
+; CHECK-GI-NEXT: ldrb w13, [x1, #7]
+; CHECK-GI-NEXT: mov v3.s[2], w12
+; CHECK-GI-NEXT: ldrb w12, [x0, #4]
+; CHECK-GI-NEXT: mov v4.s[3], w10
+; CHECK-GI-NEXT: umov w10, v0.b[5]
+; CHECK-GI-NEXT: mov v5.s[3], w13
+; CHECK-GI-NEXT: ldrb w13, [x0, #8]
+; CHECK-GI-NEXT: mov v16.s[2], w11
+; CHECK-GI-NEXT: umov w11, v0.b[13]
+; CHECK-GI-NEXT: mov v20.s[0], w12
+; CHECK-GI-NEXT: ldrb w12, [x1, #8]
+; CHECK-GI-NEXT: mov v19.s[0], w9
+; CHECK-GI-NEXT: umov w9, v0.b[6]
+; CHECK-GI-NEXT: umov w15, v1.b[10]
+; CHECK-GI-NEXT: mul w12, w12, w13
+; CHECK-GI-NEXT: mov v17.s[1], w10
+; CHECK-GI-NEXT: ldrb w10, [x0, #5]
+; CHECK-GI-NEXT: umov w13, v0.b[9]
+; CHECK-GI-NEXT: mov v21.s[1], w8
+; CHECK-GI-NEXT: umov w8, v1.b[11]
+; CHECK-GI-NEXT: mov v18.s[1], w11
+; CHECK-GI-NEXT: umov w11, v0.b[14]
+; CHECK-GI-NEXT: mov v20.s[1], w10
+; CHECK-GI-NEXT: ldrb w10, [x0, #6]
+; CHECK-GI-NEXT: mov v22.s[0], w12
+; CHECK-GI-NEXT: umov w12, v0.b[7]
+; CHECK-GI-NEXT: mov v17.s[2], w9
+; CHECK-GI-NEXT: umov w9, v0.b[10]
+; CHECK-GI-NEXT: mov v7.s[2], w15
+; CHECK-GI-NEXT: mov v19.s[1], w13
+; CHECK-GI-NEXT: umov w13, v1.b[3]
+; CHECK-GI-NEXT: mov v23.s[2], wzr
+; CHECK-GI-NEXT: mov v18.s[2], w11
+; CHECK-GI-NEXT: umov w11, v0.b[15]
+; CHECK-GI-NEXT: mov v20.s[2], w10
+; CHECK-GI-NEXT: ldrb w10, [x0, #2]
+; CHECK-GI-NEXT: mov v22.s[1], wzr
+; CHECK-GI-NEXT: mov v17.s[3], w12
+; CHECK-GI-NEXT: ldrb w12, [x0, #7]
+; CHECK-GI-NEXT: mov v7.s[3], w8
+; CHECK-GI-NEXT: ldrb w8, [x0, #3]
+; CHECK-GI-NEXT: mov v19.s[2], w9
+; CHECK-GI-NEXT: umov w9, v0.b[3]
+; CHECK-GI-NEXT: mov v18.s[3], w11
+; CHECK-GI-NEXT: umov w11, v0.b[11]
+; CHECK-GI-NEXT: mov v21.s[2], w10
+; CHECK-GI-NEXT: ldrb w10, [x1, #3]
+; CHECK-GI-NEXT: mov v20.s[3], w12
+; CHECK-GI-NEXT: mov v22.s[2], wzr
+; CHECK-GI-NEXT: mov v6.s[3], w13
+; CHECK-GI-NEXT: mul v0.4s, v2.4s, v17.4s
+; CHECK-GI-NEXT: mov v23.s[3], wzr
+; CHECK-GI-NEXT: mov v3.s[3], w10
+; CHECK-GI-NEXT: mov v16.s[3], w9
+; CHECK-GI-NEXT: mov v19.s[3], w11
+; CHECK-GI-NEXT: mul v1.4s, v4.4s, v18.4s
+; CHECK-GI-NEXT: mov v21.s[3], w8
+; CHECK-GI-NEXT: mul v2.4s, v5.4s, v20.4s
+; CHECK-GI-NEXT: mov v22.s[3], wzr
+; CHECK-GI-NEXT: mla v0.4s, v6.4s, v16.4s
+; CHECK-GI-NEXT: mla v1.4s, v7.4s, v19.4s
+; CHECK-GI-NEXT: mla v2.4s, v3.4s, v21.4s
+; CHECK-GI-NEXT: add v3.4s, v22.4s, v23.4s
+; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: addv s0, v0.4s
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: add w0, w8, w2
+; CHECK-GI-NEXT: ret
entry:
%0 = load <25 x i8>, ptr %a
%1 = zext <25 x i8> %0 to <25 x i32>
@@ -2214,25 +2506,96 @@ entry:
}
define i32 @test_udot_v25i8_nomla(ptr nocapture readonly %a1) {
-; CHECK-LABEL: test_udot_v25i8_nomla:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldp q2, q1, [x0]
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: ushll2 v3.8h, v1.16b, #0
-; CHECK-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-NEXT: ushll v4.8h, v2.8b, #0
-; CHECK-NEXT: ushll2 v2.8h, v2.16b, #0
-; CHECK-NEXT: ushll v3.4s, v3.4h, #0
-; CHECK-NEXT: uaddl2 v5.4s, v4.8h, v1.8h
-; CHECK-NEXT: uaddl v1.4s, v4.4h, v1.4h
-; CHECK-NEXT: mov v0.s[0], v3.s[0]
-; CHECK-NEXT: uaddw2 v3.4s, v5.4s, v2.8h
-; CHECK-NEXT: add v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h
-; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: addv s0, v0.4s
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_udot_v25i8_nomla:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ldp q2, q1, [x0]
+; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
+; CHECK-SD-NEXT: ushll2 v3.8h, v1.16b, #0
+; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: ushll v4.8h, v2.8b, #0
+; CHECK-SD-NEXT: ushll2 v2.8h, v2.16b, #0
+; CHECK-SD-NEXT: ushll v3.4s, v3.4h, #0
+; CHECK-SD-NEXT: uaddl2 v5.4s, v4.8h, v1.8h
+; CHECK-SD-NEXT: uaddl v1.4s, v4.4h, v1.4h
+; CHECK-SD-NEXT: mov v0.s[0], v3.s[0]
+; CHECK-SD-NEXT: uaddw2 v3.4s, v5.4s, v2.8h
+; CHECK-SD-NEXT: add v1.4s, v1.4s, v3.4s
+; CHECK-SD-NEXT: uaddw v0.4s, v0.4s, v2.4h
+; CHECK-SD-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT: addv s0, v0.4s
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_udot_v25i8_nomla:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ldr q1, [x0]
+; CHECK-GI-NEXT: ldrb w11, [x0, #16]!
+; CHECK-GI-NEXT: ldrb w14, [x0, #4]
+; CHECK-GI-NEXT: ldrb w17, [x0, #8]
+; CHECK-GI-NEXT: mov v0.s[0], wzr
+; CHECK-GI-NEXT: umov w12, v1.b[0]
+; CHECK-GI-NEXT: umov w13, v1.b[4]
+; CHECK-GI-NEXT: umov w15, v1.b[8]
+; CHECK-GI-NEXT: umov w16, v1.b[12]
+; CHECK-GI-NEXT: umov w18, v1.b[1]
+; CHECK-GI-NEXT: umov w1, v1.b[5]
+; CHECK-GI-NEXT: umov w2, v1.b[9]
+; CHECK-GI-NEXT: umov w3, v1.b[13]
+; CHECK-GI-NEXT: mov v4.s[0], w11
+; CHECK-GI-NEXT: mov v7.s[0], w14
+; CHECK-GI-NEXT: mov v16.s[0], w17
+; CHECK-GI-NEXT: ldrb w8, [x0, #1]
+; CHECK-GI-NEXT: mov v2.s[0], w12
+; CHECK-GI-NEXT: mov v3.s[0], w13
+; CHECK-GI-NEXT: ldrb w10, [x0, #5]
+; CHECK-GI-NEXT: mov v5.s[0], w15
+; CHECK-GI-NEXT: mov v6.s[0], w16
+; CHECK-GI-NEXT: ldrb w16, [x0, #2]
+; CHECK-GI-NEXT: umov w9, v1.b[2]
+; CHECK-GI-NEXT: umov w12, v1.b[6]
+; CHECK-GI-NEXT: ldrb w17, [x0, #6]
+; CHECK-GI-NEXT: umov w14, v1.b[10]
+; CHECK-GI-NEXT: umov w15, v1.b[14]
+; CHECK-GI-NEXT: mov v4.s[1], w8
+; CHECK-GI-NEXT: mov v2.s[1], w18
+; CHECK-GI-NEXT: mov v3.s[1], w1
+; CHECK-GI-NEXT: mov v7.s[1], w10
+; CHECK-GI-NEXT: mov v5.s[1], w2
+; CHECK-GI-NEXT: mov v6.s[1], w3
+; CHECK-GI-NEXT: mov v16.s[1], wzr
+; CHECK-GI-NEXT: mov v0.s[1], wzr
+; CHECK-GI-NEXT: umov w11, v1.b[3]
+; CHECK-GI-NEXT: umov w13, v1.b[7]
+; CHECK-GI-NEXT: umov w8, v1.b[11]
+; CHECK-GI-NEXT: umov w10, v1.b[15]
+; CHECK-GI-NEXT: mov v4.s[2], w16
+; CHECK-GI-NEXT: mov v2.s[2], w9
+; CHECK-GI-NEXT: ldrb w9, [x0, #3]
+; CHECK-GI-NEXT: mov v3.s[2], w12
+; CHECK-GI-NEXT: ldrb w12, [x0, #7]
+; CHECK-GI-NEXT: mov v5.s[2], w14
+; CHECK-GI-NEXT: mov v6.s[2], w15
+; CHECK-GI-NEXT: mov v7.s[2], w17
+; CHECK-GI-NEXT: mov v16.s[2], wzr
+; CHECK-GI-NEXT: mov v0.s[2], wzr
+; CHECK-GI-NEXT: mov v4.s[3], w9
+; CHECK-GI-NEXT: mov v2.s[3], w11
+; CHECK-GI-NEXT: mov v3.s[3], w13
+; CHECK-GI-NEXT: mov v5.s[3], w8
+; CHECK-GI-NEXT: mov v6.s[3], w10
+; CHECK-GI-NEXT: mov v7.s[3], w12
+; CHECK-GI-NEXT: mov v16.s[3], wzr
+; CHECK-GI-NEXT: mov v0.s[3], wzr
+; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: add v2.4s, v5.4s, v6.4s
+; CHECK-GI-NEXT: add v3.4s, v4.4s, v7.4s
+; CHECK-GI-NEXT: add v0.4s, v16.4s, v0.4s
+; CHECK-GI-NEXT: add v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT: add v0.4s, v3.4s, v0.4s
+; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: addv s0, v0.4s
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
entry:
%0 = load <25 x i8>, ptr %a1
%1 = zext <25 x i8> %0 to <25 x i32>
@@ -2240,467 +2603,1114 @@ entry:
ret i32 %2
}
define i32 @test_sdot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
-; CHECK-LABEL: test_sdot_v25i8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldp q3, q0, [x1]
-; CHECK-NEXT: movi v5.2d, #0000000000000000
-; CHECK-NEXT: ldp q2, q1, [x0]
-; CHECK-NEXT: smull2 v4.8h, v0.16b, v1.16b
-; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT: smull v1.8h, v3.8b, v2.8b
-; CHECK-NEXT: smull2 v2.8h, v3.16b, v2.16b
-; CHECK-NEXT: sshll v3.4s, v4.4h, #0
-; CHECK-NEXT: saddl2 v4.4s, v1.8h, v0.8h
-; CHECK-NEXT: saddl v0.4s, v1.4h, v0.4h
-; CHECK-NEXT: mov v5.s[0], v3.s[0]
-; CHECK-NEXT: saddw2 v1.4s, v4.4s, v2.8h
-; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: saddw v2.4s, v5.4s, v2.4h
-; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: addv s0, v0.4s
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: add w0, w8, w2
-; CHECK-NEXT: ret
-entry:
- %0 = load <25 x i8>, ptr %a
- %1 = sext <25 x i8> %0 to <25 x i32>
- %2 = load <25 x i8>, ptr %b
- %3 = sext <25 x i8> %2 to <25 x i32>
- %4 = mul nsw <25 x i32> %3, %1
- %5 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %4)
- %op.extra = add nsw i32 %5, %sum
- ret i32 %op.extra
-}
-
-define i32 @test_sdot_v25i8_double(<25 x i8> %a, <25 x i8> %b, <25 x i8> %c, <25 x i8> %d) {
-; CHECK-LABEL: test_sdot_v25i8_double:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: ldr b0, [sp, #216]
-; CHECK-NEXT: add x8, sp, #224
-; CHECK-NEXT: ldr b1, [sp, #16]
-; CHECK-NEXT: ldr b2, [sp, #280]
-; CHECK-NEXT: add x9, sp, #240
-; CHECK-NEXT: ldr b4, [sp, #80]
-; CHECK-NEXT: ld1 { v0.b }[1], [x8]
-; CHECK-NEXT: add x8, sp, #24
-; CHECK-NEXT: add x10, sp, #48
-; CHECK-NEXT: ld1 { v1.b }[1], [x8]
-; CHECK-NEXT: add x8, sp, #232
-; CHECK-NEXT: add x11, sp, #96
-; CHECK-NEXT: ldr b5, [sp, #152]
-; CHECK-NEXT: add x12, sp, #168
-; CHECK-NEXT: ldr b6, [sp, #616]
-; CHECK-NEXT: ld1 { v0.b }[2], [x8]
-; CHECK-NEXT: add x8, sp, #32
-; CHECK-NEXT: fmov s3, w0
-; CHECK-NEXT: ld1 { v1.b }[2], [x8]
-; CHECK-NEXT: add x8, sp, #288
-; CHECK-NEXT: ldr b7, [sp, #416]
-; CHECK-NEXT: ld1 { v2.b }[1], [x8]
-; CHECK-NEXT: add x8, sp, #40
-; CHECK-NEXT: ldr b22, [sp, #744]
-; CHECK-NEXT: ld1 { v0.b }[3], [x9]
-; CHECK-NEXT: add x9, sp, #248
-; CHECK-NEXT: mov v3.b[1], w1
-; CHECK-NEXT: ld1 { v1.b }[3], [x8]
-; CHECK-NEXT: add x8, sp, #88
-; CHECK-NEXT: ldr b23, [sp, #544]
-; CHECK-NEXT: ld1 { v4.b }[1], [x8]
-; CHECK-NEXT: add x8, sp, #256
-; CHECK-NEXT: ldr b19, [sp, #680]
-; CHECK-NEXT: ld1 { v0.b }[4], [x9]
-; CHECK-NEXT: add x9, sp, #296
-; CHECK-NEXT: ldr b20, [sp, #480]
-; CHECK-NEXT: ld1 { v1.b }[4], [x10]
-; CHECK-NEXT: ld1 { v2.b }[2], [x9]
-; CHECK-NEXT: add x10, sp, #160
-; CHECK-NEXT: ld1 { v4.b }[2], [x11]
-; CHECK-NEXT: add x11, sp, #304
-; CHECK-NEXT: ld1 { v5.b }[1], [x10]
-; CHECK-NEXT: ld1 { v0.b }[5], [x8]
-; CHECK-NEXT: add x8, sp, #56
-; CHECK-NEXT: add x10, sp, #264
-; CHECK-NEXT: ld1 { v1.b }[5], [x8]
-; CHECK-NEXT: add x8, sp, #64
-; CHECK-NEXT: ld1 { v2.b }[3], [x11]
-; CHECK-NEXT: add x9, sp, #272
-; CHECK-NEXT: ld1 { v5.b }[2], [x12]
-; CHECK-NEXT: add x11, sp, #72
-; CHECK-NEXT: ld1 { v0.b }[6], [x10]
-; CHECK-NEXT: add x10, sp, #312
-; CHECK-NEXT: mov v3.b[2], w2
-; CHECK-NEXT: ld1 { v1.b }[6], [x8]
-; CHECK-NEXT: add x8, sp, #104
-; CHECK-NEXT: ld1 { v2.b }[4], [x10]
-; CHECK-NEXT: ld1 { v4.b }[3], [x8]
-; CHECK-NEXT: add x8, sp, #112
-; CHECK-NEXT: add x10, sp, #128
-; CHECK-NEXT: ld1 { v0.b }[7], [x9]
-; CHECK-NEXT: add x9, sp, #320
-; CHECK-NEXT: ldr b21, [sp, #552]
-; CHECK-NEXT: ld1 { v2.b }[5], [x9]
-; CHECK-NEXT: add x9, sp, #176
-; CHECK-NEXT: ld1 { v1.b }[7], [x11]
-; CHECK-NEXT: ld1 { v4.b }[4], [x8]
-; CHECK-NEXT: add x8, sp, #624
-; CHECK-NEXT: ld1 { v5.b }[3], [x9]
-; CHECK-NEXT: ld1 { v6.b }[1], [x8]
-; CHECK-NEXT: add x8, sp, #120
-; CHECK-NEXT: add x9, sp, #328
-; CHECK-NEXT: ld1 { v2.b }[6], [x9]
-; CHECK-NEXT: add x9, sp, #184
-; CHECK-NEXT: add x11, sp, #192
-; CHECK-NEXT: ld1 { v4.b }[5], [x8]
-; CHECK-NEXT: add x8, sp, #632
-; CHECK-NEXT: ld1 { v5.b }[4], [x9]
-; CHECK-NEXT: ld1 { v6.b }[2], [x8]
-; CHECK-NEXT: add x9, sp, #640
-; CHECK-NEXT: add x8, sp, #336
-; CHECK-NEXT: ld1 { v2.b }[7], [x8]
-; CHECK-NEXT: add x8, sp, #656
-; CHECK-NEXT: smull v23.8h, v23.8b, v22.8b
-; CHECK-NEXT: ld1 { v5.b }[5], [x11]
-; CHECK-NEXT: add x11, sp, #648
-; CHECK-NEXT: ld1 { v4.b }[6], [x10]
-; CHECK-NEXT: ld1 { v6.b }[3], [x9]
-; CHECK-NEXT: add x9, sp, #200
-; CHECK-NEXT: add x10, sp, #136
-; CHECK-NEXT: ldr b22, [sp, #352]
-; CHECK-NEXT: add x12, sp, #360
-; CHECK-NEXT: mov v3.b[3], w3
-; CHECK-NEXT: ld1 { v5.b }[6], [x9]
-; CHECK-NEXT: add x9, sp, #208
-; CHECK-NEXT: ld1 { v4.b }[7], [x10]
-; CHECK-NEXT: ld1 { v6.b }[4], [x11]
-; CHECK-NEXT: add x11, sp, #424
-; CHECK-NEXT: add x10, sp, #488
-; CHECK-NEXT: ld1 { v7.b }[1], [x11]
-; CHECK-NEXT: add x11, sp, #560
-; CHECK-NEXT: ld1 { v20.b }[1], [x10]
-; CHECK-NEXT: ld1 { v5.b }[7], [x9]
-; CHECK-NEXT: add x9, sp, #440
-; CHECK-NEXT: ld1 { v21.b }[1], [x11]
-; CHECK-NEXT: ld1 { v6.b }[5], [x8]
-; CHECK-NEXT: add x8, sp, #432
-; CHECK-NEXT: ld1 { v22.b }[1], [x12]
-; CHECK-NEXT: ld1 { v7.b }[2], [x8]
-; CHECK-NEXT: add x11, sp, #496
-; CHECK-NEXT: add x12, sp, #568
-; CHECK-NEXT: add x13, sp, #368
-; CHECK-NEXT: ld1 { v20.b }[2], [x11]
-; CHECK-NEXT: ld1 { v21.b }[2], [x12]
-; CHECK-NEXT: ld1 { v22.b }[2], [x13]
-; CHECK-NEXT: add x10, sp, #448
-; CHECK-NEXT: mov v3.b[4], w4
-; CHECK-NEXT: ld1 { v7.b }[3], [x9]
-; CHECK-NEXT: add x9, sp, #688
-; CHECK-NEXT: add x11, sp, #576
-; CHECK-NEXT: ld1 { v19.b }[1], [x9]
-; CHECK-NEXT: add x9, sp, #696
-; CHECK-NEXT: add x12, sp, #376
-; CHECK-NEXT: ld1 { v21.b }[3], [x11]
-; CHECK-NEXT: ld1 { v22.b }[3], [x12]
-; CHECK-NEXT: add x11, sp, #512
-; CHECK-NEXT: ld1 { v7.b }[4], [x10]
-; CHECK-NEXT: add x10, sp, #504
-; CHECK-NEXT: add x12, sp, #584
-; CHECK-NEXT: ld1 { v19.b }[2], [x9]
-; CHECK-NEXT: add x9, sp, #704
-; CHECK-NEXT: ld1 { v20.b }[3], [x10]
-; CHECK-NEXT: add x13, sp, #384
-; CHECK-NEXT: mov v3.b[5], w5
-; CHECK-NEXT: ld1 { v21.b }[4], [x12]
-; CHECK-NEXT: ld1 { v22.b }[4], [x13]
-; CHECK-NEXT: add x10, sp, #456
-; CHECK-NEXT: ldr b16, [sp, #344]
-; CHECK-NEXT: ld1 { v19.b }[3], [x9]
-; CHECK-NEXT: add x9, sp, #712
-; CHECK-NEXT: ld1 { v20.b }[4], [x11]
-; CHECK-NEXT: ldr b17, [sp, #144]
-; CHECK-NEXT: ld1 { v7.b }[5], [x10]
-; CHECK-NEXT: add x10, sp, #520
-; CHECK-NEXT: add x11, sp, #592
-; CHECK-NEXT: add x12, sp, #392
-; CHECK-NEXT: mov v3.b[6], w6
-; CHECK-NEXT: ld1 { v19.b }[4], [x9]
-; CHECK-NEXT: add x9, sp, #720
-; CHECK-NEXT: ld1 { v20.b }[5], [x10]
-; CHECK-NEXT: ld1 { v21.b }[5], [x11]
-; CHECK-NEXT: ld1 { v22.b }[5], [x12]
-; CHECK-NEXT: smull v16.8h, v17.8b, v16.8b
-; CHECK-NEXT: add x8, sp, #664
-; CHECK-NEXT: add x10, sp, #464
-; CHECK-NEXT: add x11, sp, #528
-; CHECK-NEXT: ld1 { v19.b }[5], [x9]
-; CHECK-NEXT: add x9, sp, #728
-; CHECK-NEXT: add x12, sp, #600
-; CHECK-NEXT: add x13, sp, #400
-; CHECK-NEXT: ld1 { v6.b }[6], [x8]
-; CHECK-NEXT: ld1 { v20.b }[6], [x11]
-; CHECK-NEXT: ld1 { v21.b }[6], [x12]
-; CHECK-NEXT: ld1 { v22.b }[6], [x13]
-; CHECK-NEXT: ld1 { v7.b }[6], [x10]
-; CHECK-NEXT: ld1 { v19.b }[6], [x9]
-; CHECK-NEXT: add x9, sp, #736
-; CHECK-NEXT: mov v3.b[7], w7
-; CHECK-NEXT: sshll v18.4s, v16.4h, #0
-; CHECK-NEXT: movi v16.2d, #0000000000000000
-; CHECK-NEXT: movi v17.2d, #0000000000000000
-; CHECK-NEXT: add x8, sp, #672
-; CHECK-NEXT: add x10, sp, #472
-; CHECK-NEXT: add x11, sp, #608
-; CHECK-NEXT: ld1 { v19.b }[7], [x9]
-; CHECK-NEXT: add x9, sp, #536
-; CHECK-NEXT: add x12, sp, #408
-; CHECK-NEXT: ld1 { v20.b }[7], [x9]
-; CHECK-NEXT: ld1 { v21.b }[7], [x11]
-; CHECK-NEXT: ld1 { v22.b }[7], [x12]
-; CHECK-NEXT: ld1 { v6.b }[7], [x8]
-; CHECK-NEXT: ld1 { v7.b }[7], [x10]
-; CHECK-NEXT: sshll v23.4s, v23.4h, #0
-; CHECK-NEXT: smull v0.8h, v1.8b, v0.8b
-; CHECK-NEXT: smull v1.8h, v4.8b, v2.8b
-; CHECK-NEXT: smull v2.8h, v3.8b, v5.8b
-; CHECK-NEXT: smull v3.8h, v20.8b, v19.8b
-; CHECK-NEXT: smull v4.8h, v22.8b, v21.8b
-; CHECK-NEXT: mov v17.s[0], v18.s[0]
-; CHECK-NEXT: smull v5.8h, v7.8b, v6.8b
-; CHECK-NEXT: mov v16.s[0], v23.s[0]
-; CHECK-NEXT: saddl2 v6.4s, v2.8h, v1.8h
-; CHECK-NEXT: saddl v1.4s, v2.4h, v1.4h
-; CHECK-NEXT: saddl2 v2.4s, v4.8h, v3.8h
-; CHECK-NEXT: saddl v3.4s, v4.4h, v3.4h
-; CHECK-NEXT: saddw v4.4s, v17.4s, v0.4h
-; CHECK-NEXT: saddw v7.4s, v16.4s, v5.4h
-; CHECK-NEXT: saddw2 v0.4s, v6.4s, v0.8h
-; CHECK-NEXT: add v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: saddw2 v2.4s, v2.4s, v5.8h
-; CHECK-NEXT: add v3.4s, v3.4s, v7.4s
-; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: add v1.4s, v3.4s, v2.4s
-; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: addv s0, v0.4s
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT: ret
-entry:
- %az = sext <25 x i8> %a to <25 x i32>
- %bz = sext <25 x i8> %b to <25 x i32>
- %m1 = mul nuw nsw <25 x i32> %az, %bz
- %r1 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %m1)
- %cz = sext <25 x i8> %c to <25 x i32>
- %dz = sext <25 x i8> %d to <25 x i32>
- %m2 = mul nuw nsw <25 x i32> %cz, %dz
- %r2 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %m2)
- %x = add i32 %r1, %r2
- ret i32 %x
-}
-
-define i32 @test_sdot_v25i8_double_nomla(<25 x i8> %a, <25 x i8> %b, <25 x i8> %c, <25 x i8> %d) {
-; CHECK-LABEL: test_sdot_v25i8_double_nomla:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: fmov s0, w0
-; CHECK-NEXT: ldr b1, [sp, #80]
-; CHECK-NEXT: add x10, sp, #88
-; CHECK-NEXT: ldr b2, [sp, #16]
-; CHECK-NEXT: add x9, sp, #96
-; CHECK-NEXT: ldr b3, [sp, #480]
-; CHECK-NEXT: ld1 { v1.b }[1], [x10]
-; CHECK-NEXT: add x10, sp, #24
-; CHECK-NEXT: ldr b4, [sp, #352]
-; CHECK-NEXT: mov v0.b[1], w1
-; CHECK-NEXT: ld1 { v2.b }[1], [x10]
-; CHECK-NEXT: add x11, sp, #488
-; CHECK-NEXT: add x10, sp, #360
-; CHECK-NEXT: ldr b5, [sp, #416]
-; CHECK-NEXT: add x8, sp, #104
-; CHECK-NEXT: ld1 { v1.b }[2], [x9]
-; CHECK-NEXT: add x9, sp, #32
-; CHECK-NEXT: ld1 { v3.b }[1], [x11]
-; CHECK-NEXT: ld1 { v2.b }[2], [x9]
-; CHECK-NEXT: add x11, sp, #424
-; CHECK-NEXT: ld1 { v4.b }[1], [x10]
-; CHECK-NEXT: mov v0.b[2], w2
-; CHECK-NEXT: ld1 { v5.b }[1], [x11]
-; CHECK-NEXT: add x9, sp, #368
-; CHECK-NEXT: ld1 { v1.b }[3], [x8]
-; CHECK-NEXT: add x8, sp, #40
-; CHECK-NEXT: add x12, sp, #496
-; CHECK-NEXT: ld1 { v2.b }[3], [x8]
-; CHECK-NEXT: ld1 { v4.b }[2], [x9]
-; CHECK-NEXT: add x8, sp, #432
-; CHECK-NEXT: ld1 { v3.b }[2], [x12]
-; CHECK-NEXT: add x13, sp, #48
-; CHECK-NEXT: ld1 { v5.b }[2], [x8]
-; CHECK-NEXT: mov v0.b[3], w3
-; CHECK-NEXT: add x10, sp, #112
-; CHECK-NEXT: add x8, sp, #504
-; CHECK-NEXT: ld1 { v2.b }[4], [x13]
-; CHECK-NEXT: add x13, sp, #376
-; CHECK-NEXT: ld1 { v1.b }[4], [x10]
-; CHECK-NEXT: ld1 { v4.b }[3], [x13]
-; CHECK-NEXT: add x13, sp, #440
-; CHECK-NEXT: ld1 { v3.b }[3], [x8]
-; CHECK-NEXT: ld1 { v5.b }[3], [x13]
-; CHECK-NEXT: add x11, sp, #120
-; CHECK-NEXT: add x8, sp, #56
-; CHECK-NEXT: mov v0.b[4], w4
-; CHECK-NEXT: add x13, sp, #512
-; CHECK-NEXT: ld1 { v1.b }[5], [x11]
-; CHECK-NEXT: ld1 { v2.b }[5], [x8]
-; CHECK-NEXT: add x8, sp, #384
-; CHECK-NEXT: add x11, sp, #448
-; CHECK-NEXT: ld1 { v3.b }[4], [x13]
-; CHECK-NEXT: ld1 { v4.b }[4], [x8]
-; CHECK-NEXT: ld1 { v5.b }[4], [x11]
-; CHECK-NEXT: add x12, sp, #128
-; CHECK-NEXT: add x10, sp, #64
-; CHECK-NEXT: add x8, sp, #520
-; CHECK-NEXT: mov v0.b[5], w5
-; CHECK-NEXT: ld1 { v1.b }[6], [x12]
-; CHECK-NEXT: ld1 { v2.b }[6], [x10]
-; CHECK-NEXT: add x10, sp, #392
-; CHECK-NEXT: add x11, sp, #456
-; CHECK-NEXT: ldr b6, [sp, #144]
-; CHECK-NEXT: ldr b7, [sp, #544]
-; CHECK-NEXT: ld1 { v3.b }[5], [x8]
-; CHECK-NEXT: ld1 { v4.b }[5], [x10]
-; CHECK-NEXT: ld1 { v5.b }[5], [x11]
-; CHECK-NEXT: add x9, sp, #136
-; CHECK-NEXT: sshll v6.8h, v6.8b, #0
-; CHECK-NEXT: mov v0.b[6], w6
-; CHECK-NEXT: ld1 { v1.b }[7], [x9]
-; CHECK-NEXT: add x8, sp, #528
-; CHECK-NEXT: add x9, sp, #400
-; CHECK-NEXT: add x10, sp, #464
-; CHECK-NEXT: sshll v7.8h, v7.8b, #0
-; CHECK-NEXT: ld1 { v3.b }[6], [x8]
-; CHECK-NEXT: ld1 { v4.b }[6], [x9]
-; CHECK-NEXT: ld1 { v5.b }[6], [x10]
-; CHECK-NEXT: movi v16.2d, #0000000000000000
-; CHECK-NEXT: movi v17.2d, #0000000000000000
-; CHECK-NEXT: add x14, sp, #72
-; CHECK-NEXT: mov v0.b[7], w7
-; CHECK-NEXT: sshll v6.4s, v6.4h, #0
-; CHECK-NEXT: add x8, sp, #536
-; CHECK-NEXT: add x9, sp, #408
-; CHECK-NEXT: add x10, sp, #472
-; CHECK-NEXT: sshll v7.4s, v7.4h, #0
-; CHECK-NEXT: ld1 { v2.b }[7], [x14]
-; CHECK-NEXT: ld1 { v3.b }[7], [x8]
-; CHECK-NEXT: ld1 { v4.b }[7], [x9]
-; CHECK-NEXT: ld1 { v5.b }[7], [x10]
-; CHECK-NEXT: mov v16.s[0], v6.s[0]
-; CHECK-NEXT: sshll v1.8h, v1.8b, #0
-; CHECK-NEXT: mov v17.s[0], v7.s[0]
-; CHECK-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: sshll v2.8h, v2.8b, #0
-; CHECK-NEXT: sshll v3.8h, v3.8b, #0
-; CHECK-NEXT: sshll v4.8h, v4.8b, #0
-; CHECK-NEXT: sshll v5.8h, v5.8b, #0
-; CHECK-NEXT: saddl v7.4s, v0.4h, v1.4h
-; CHECK-NEXT: saddl2 v0.4s, v0.8h, v1.8h
-; CHECK-NEXT: saddw v6.4s, v16.4s, v2.4h
-; CHECK-NEXT: saddl v1.4s, v4.4h, v3.4h
-; CHECK-NEXT: saddl2 v3.4s, v4.8h, v3.8h
-; CHECK-NEXT: saddw v4.4s, v17.4s, v5.4h
-; CHECK-NEXT: saddw2 v0.4s, v0.4s, v2.8h
-; CHECK-NEXT: add v6.4s, v7.4s, v6.4s
-; CHECK-NEXT: saddw2 v2.4s, v3.4s, v5.8h
-; CHECK-NEXT: add v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: add v0.4s, v6.4s, v0.4s
-; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: addv s0, v0.4s
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT: ret
-entry:
- %az = sext <25 x i8> %a to <25 x i32>
- %r1 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %az)
- %cz = sext <25 x i8> %c to <25 x i32>
- %r2 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %cz)
- %x = add i32 %r1, %r2
- ret i32 %x
-}
-
-define i32 @test_udot_v32i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
-; CHECK-SD-LABEL: test_udot_v32i8:
+; CHECK-SD-LABEL: test_sdot_v25i8:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
-; CHECK-SD-NEXT: ldp q1, q3, [x0]
-; CHECK-SD-NEXT: ldp q2, q4, [x1]
-; CHECK-SD-NEXT: udot v0.4s, v4.16b, v3.16b
-; CHECK-SD-NEXT: udot v0.4s, v2.16b, v1.16b
+; CHECK-SD-NEXT: ldp q3, q0, [x1]
+; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
+; CHECK-SD-NEXT: ldp q2, q1, [x0]
+; CHECK-SD-NEXT: smull2 v4.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT: smull v0.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT: smull v1.8h, v3.8b, v2.8b
+; CHECK-SD-NEXT: smull2 v2.8h, v3.16b, v2.16b
+; CHECK-SD-NEXT: sshll v3.4s, v4.4h, #0
+; CHECK-SD-NEXT: saddl2 v4.4s, v1.8h, v0.8h
+; CHECK-SD-NEXT: saddl v0.4s, v1.4h, v0.4h
+; CHECK-SD-NEXT: mov v5.s[0], v3.s[0]
+; CHECK-SD-NEXT: saddw2 v1.4s, v4.4s, v2.8h
+; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: saddw v2.4s, v5.4s, v2.4h
+; CHECK-SD-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w8, s0
; CHECK-SD-NEXT: add w0, w8, w2
; CHECK-SD-NEXT: ret
;
-; CHECK-GI-LABEL: test_udot_v32i8:
+; CHECK-GI-LABEL: test_sdot_v25i8:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: movi v0.2d, #0000000000000000
-; CHECK-GI-NEXT: movi v1.2d, #0000000000000000
-; CHECK-GI-NEXT: ldp q2, q3, [x0]
-; CHECK-GI-NEXT: ldp q4, q5, [x1]
-; CHECK-GI-NEXT: udot v1.4s, v4.16b, v2.16b
-; CHECK-GI-NEXT: udot v0.4s, v5.16b, v3.16b
-; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: ldrsb w11, [x1, #16]!
+; CHECK-GI-NEXT: ldrsb w12, [x1, #4]
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: mov v23.s[0], wzr
+; CHECK-GI-NEXT: smov w9, v1.b[4]
+; CHECK-GI-NEXT: smov w10, v1.b[12]
+; CHECK-GI-NEXT: smov w13, v1.b[0]
+; CHECK-GI-NEXT: smov w14, v1.b[5]
+; CHECK-GI-NEXT: mov v5.s[0], w12
+; CHECK-GI-NEXT: mov v3.s[0], w11
+; CHECK-GI-NEXT: smov w11, v0.b[0]
+; CHECK-GI-NEXT: smov w12, v1.b[1]
+; CHECK-GI-NEXT: smov w15, v1.b[8]
+; CHECK-GI-NEXT: ldrsb w8, [x0, #16]!
+; CHECK-GI-NEXT: mov v23.s[1], wzr
+; CHECK-GI-NEXT: mov v2.s[0], w9
+; CHECK-GI-NEXT: mov v4.s[0], w10
+; CHECK-GI-NEXT: smov w10, v1.b[13]
+; CHECK-GI-NEXT: ldrsb w9, [x1, #5]
+; CHECK-GI-NEXT: mov v6.s[0], w13
+; CHECK-GI-NEXT: smov w13, v1.b[6]
+; CHECK-GI-NEXT: mov v16.s[0], w11
+; CHECK-GI-NEXT: smov w11, v1.b[2]
+; CHECK-GI-NEXT: mov v7.s[0], w15
+; CHECK-GI-NEXT: mov v5.s[1], w9
+; CHECK-GI-NEXT: ldrsb w9, [x1, #6]
+; CHECK-GI-NEXT: smov w15, v1.b[9]
+; CHECK-GI-NEXT: mov v2.s[1], w14
+; CHECK-GI-NEXT: ldrsb w14, [x1, #1]
+; CHECK-GI-NEXT: mov v4.s[1], w10
+; CHECK-GI-NEXT: smov w10, v1.b[14]
+; CHECK-GI-NEXT: mov v6.s[1], w12
+; CHECK-GI-NEXT: smov w12, v0.b[1]
+; CHECK-GI-NEXT: mov v3.s[1], w14
+; CHECK-GI-NEXT: smov w14, v0.b[12]
+; CHECK-GI-NEXT: mov v21.s[0], w8
+; CHECK-GI-NEXT: ldrsb w8, [x0, #1]
+; CHECK-GI-NEXT: mov v5.s[2], w9
+; CHECK-GI-NEXT: smov w9, v0.b[4]
+; CHECK-GI-NEXT: mov v2.s[2], w13
+; CHECK-GI-NEXT: smov w13, v1.b[7]
+; CHECK-GI-NEXT: mov v7.s[1], w15
+; CHECK-GI-NEXT: mov v4.s[2], w10
+; CHECK-GI-NEXT: smov w10, v1.b[15]
+; CHECK-GI-NEXT: mov v16.s[1], w12
+; CHECK-GI-NEXT: ldrsb w12, [x1, #2]
+; CHECK-GI-NEXT: mov v6.s[2], w11
+; CHECK-GI-NEXT: smov w11, v0.b[2]
+; CHECK-GI-NEXT: mov v17.s[0], w9
+; CHECK-GI-NEXT: smov w9, v0.b[8]
+; CHECK-GI-NEXT: mov v18.s[0], w14
+; CHECK-GI-NEXT: mov v2.s[3], w13
+; CHECK-GI-NEXT: ldrsb w13, [x1, #7]
+; CHECK-GI-NEXT: mov v3.s[2], w12
+; CHECK-GI-NEXT: ldrsb w12, [x0, #4]
+; CHECK-GI-NEXT: mov v4.s[3], w10
+; CHECK-GI-NEXT: smov w10, v0.b[5]
+; CHECK-GI-NEXT: mov v5.s[3], w13
+; CHECK-GI-NEXT: ldrsb w13, [x0, #8]
+; CHECK-GI-NEXT: mov v16.s[2], w11
+; CHECK-GI-NEXT: smov w11, v0.b[13]
+; CHECK-GI-NEXT: mov v20.s[0], w12
+; CHECK-GI-NEXT: ldrsb w12, [x1, #8]
+; CHECK-GI-NEXT: mov v19.s[0], w9
+; CHECK-GI-NEXT: smov w9, v0.b[6]
+; CHECK-GI-NEXT: smov w15, v1.b[10]
+; CHECK-GI-NEXT: mul w12, w12, w13
+; CHECK-GI-NEXT: mov v17.s[1], w10
+; CHECK-GI-NEXT: ldrsb w10, [x0, #5]
+; CHECK-GI-NEXT: smov w13, v0.b[9]
+; CHECK-GI-NEXT: mov v21.s[1], w8
+; CHECK-GI-NEXT: smov w8, v1.b[11]
+; CHECK-GI-NEXT: mov v18.s[1], w11
+; CHECK-GI-NEXT: smov w11, v0.b[14]
+; CHECK-GI-NEXT: mov v20.s[1], w10
+; CHECK-GI-NEXT: ldrsb w10, [x0, #6]
+; CHECK-GI-NEXT: mov v22.s[0], w12
+; CHECK-GI-NEXT: smov w12, v0.b[7]
+; CHECK-GI-NEXT: mov v17.s[2], w9
+; CHECK-GI-NEXT: smov w9, v0.b[10]
+; CHECK-GI-NEXT: mov v7.s[2], w15
+; CHECK-GI-NEXT: mov v19.s[1], w13
+; CHECK-GI-NEXT: smov w13, v1.b[3]
+; CHECK-GI-NEXT: mov v23.s[2], wzr
+; CHECK-GI-NEXT: mov v18.s[2], w11
+; CHECK-GI-NEXT: smov w11, v0.b[15]
+; CHECK-GI-NEXT: mov v20.s[2], w10
+; CHECK-GI-NEXT: ldrsb w10, [x0, #2]
+; CHECK-GI-NEXT: mov v22.s[1], wzr
+; CHECK-GI-NEXT: mov v17.s[3], w12
+; CHECK-GI-NEXT: ldrsb w12, [x0, #7]
+; CHECK-GI-NEXT: mov v7.s[3], w8
+; CHECK-GI-NEXT: ldrsb w8, [x0, #3]
+; CHECK-GI-NEXT: mov v19.s[2], w9
+; CHECK-GI-NEXT: smov w9, v0.b[3]
+; CHECK-GI-NEXT: mov v18.s[3], w11
+; CHECK-GI-NEXT: smov w11, v0.b[11]
+; CHECK-GI-NEXT: mov v21.s[2], w10
+; CHECK-GI-NEXT: ldrsb w10, [x1, #3]
+; CHECK-GI-NEXT: mov v20.s[3], w12
+; CHECK-GI-NEXT: mov v22.s[2], wzr
+; CHECK-GI-NEXT: mov v6.s[3], w13
+; CHECK-GI-NEXT: mul v0.4s, v2.4s, v17.4s
+; CHECK-GI-NEXT: mov v23.s[3], wzr
+; CHECK-GI-NEXT: mov v3.s[3], w10
+; CHECK-GI-NEXT: mov v16.s[3], w9
+; CHECK-GI-NEXT: mov v19.s[3], w11
+; CHECK-GI-NEXT: mul v1.4s, v4.4s, v18.4s
+; CHECK-GI-NEXT: mov v21.s[3], w8
+; CHECK-GI-NEXT: mul v2.4s, v5.4s, v20.4s
+; CHECK-GI-NEXT: mov v22.s[3], wzr
+; CHECK-GI-NEXT: mla v0.4s, v6.4s, v16.4s
+; CHECK-GI-NEXT: mla v1.4s, v7.4s, v19.4s
+; CHECK-GI-NEXT: mla v2.4s, v3.4s, v21.4s
+; CHECK-GI-NEXT: add v3.4s, v22.4s, v23.4s
+; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: addv s0, v0.4s
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: add w0, w8, w2
; CHECK-GI-NEXT: ret
entry:
- %0 = load <32 x i8>, ptr %a
- %1 = zext <32 x i8> %0 to <32 x i32>
- %2 = load <32 x i8>, ptr %b
- %3 = zext <32 x i8> %2 to <32 x i32>
- %4 = mul nuw nsw <32 x i32> %3, %1
- %5 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %4)
- %op.extra = add i32 %5, %sum
+ %0 = load <25 x i8>, ptr %a
+ %1 = sext <25 x i8> %0 to <25 x i32>
+ %2 = load <25 x i8>, ptr %b
+ %3 = sext <25 x i8> %2 to <25 x i32>
+ %4 = mul nsw <25 x i32> %3, %1
+ %5 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %4)
+ %op.extra = add nsw i32 %5, %sum
ret i32 %op.extra
}
-define i32 @test_udot_v32i8_nomla(ptr nocapture readonly %a1) {
-; CHECK-SD-LABEL: test_udot_v32i8_nomla:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: movi v0.16b, #1
-; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
-; CHECK-SD-NEXT: ldp q2, q3, [x0]
-; CHECK-SD-NEXT: udot v1.4s, v3.16b, v0.16b
-; CHECK-SD-NEXT: udot v1.4s, v2.16b, v0.16b
-; CHECK-SD-NEXT: addv s0, v1.4s
-; CHECK-SD-NEXT: fmov w0, s0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_udot_v32i8_nomla:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: movi v0.16b, #1
-; CHECK-GI-NEXT: movi v1.2d, #0000000000000000
-; CHECK-GI-NEXT: movi v2.2d, #0000000000000000
-; CHECK-GI-NEXT: ldp q3, q4, [x0]
-; CHECK-GI-NEXT: udot v2.4s, v3.16b, v0.16b
-; CHECK-GI-NEXT: udot v1.4s, v4.16b, v0.16b
-; CHECK-GI-NEXT: add v0.4s, v2.4s, v1.4s
-; CHECK-GI-NEXT: addv s0, v0.4s
-; CHECK-GI-NEXT: fmov w0, s0
-; CHECK-GI-NEXT: ret
-entry:
- %0 = load <32 x i8>, ptr %a1
- %1 = zext <32 x i8> %0 to <32 x i32>
- %2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
- ret i32 %2
-}
-define i32 @test_sdot_v32i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
-; CHECK-SD-LABEL: test_sdot_v32i8:
+define i32 @test_sdot_v25i8_double(<25 x i8> %a, <25 x i8> %b, <25 x i8> %c, <25 x i8> %d) {
+; CHECK-SD-LABEL: test_sdot_v25i8_double:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
+; CHECK-SD-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: .cfi_offset w29, -16
+; CHECK-SD-NEXT: ldr b0, [sp, #216]
+; CHECK-SD-NEXT: add x8, sp, #224
+; CHECK-SD-NEXT: ldr b1, [sp, #16]
+; CHECK-SD-NEXT: ldr b2, [sp, #280]
+; CHECK-SD-NEXT: add x9, sp, #240
+; CHECK-SD-NEXT: ldr b4, [sp, #80]
+; CHECK-SD-NEXT: ld1 { v0.b }[1], [x8]
+; CHECK-SD-NEXT: add x8, sp, #24
+; CHECK-SD-NEXT: add x10, sp, #48
+; CHECK-SD-NEXT: ld1 { v1.b }[1], [x8]
+; CHECK-SD-NEXT: add x8, sp, #232
+; CHECK-SD-NEXT: add x11, sp, #96
+; CHECK-SD-NEXT: ldr b5, [sp, #152]
+; CHECK-SD-NEXT: add x12, sp, #168
+; CHECK-SD-NEXT: ldr b6, [sp, #616]
+; CHECK-SD-NEXT: ld1 { v0.b }[2], [x8]
+; CHECK-SD-NEXT: add x8, sp, #32
+; CHECK-SD-NEXT: fmov s3, w0
+; CHECK-SD-NEXT: ld1 { v1.b }[2], [x8]
+; CHECK-SD-NEXT: add x8, sp, #288
+; CHECK-SD-NEXT: ldr b7, [sp, #416]
+; CHECK-SD-NEXT: ld1 { v2.b }[1], [x8]
+; CHECK-SD-NEXT: add x8, sp, #40
+; CHECK-SD-NEXT: ldr b22, [sp, #744]
+; CHECK-SD-NEXT: ld1 { v0.b }[3], [x9]
+; CHECK-SD-NEXT: add x9, sp, #248
+; CHECK-SD-NEXT: mov v3.b[1], w1
+; CHECK-SD-NEXT: ld1 { v1.b }[3], [x8]
+; CHECK-SD-NEXT: add x8, sp, #88
+; CHECK-SD-NEXT: ldr b23, [sp, #544]
+; CHECK-SD-NEXT: ld1 { v4.b }[1], [x8]
+; CHECK-SD-NEXT: add x8, sp, #256
+; CHECK-SD-NEXT: ldr b19, [sp, #680]
+; CHECK-SD-NEXT: ld1 { v0.b }[4], [x9]
+; CHECK-SD-NEXT: add x9, sp, #296
+; CHECK-SD-NEXT: ldr b20, [sp, #480]
+; CHECK-SD-NEXT: ld1 { v1.b }[4], [x10]
+; CHECK-SD-NEXT: ld1 { v2.b }[2], [x9]
+; CHECK-SD-NEXT: add x10, sp, #160
+; CHECK-SD-NEXT: ld1 { v4.b }[2], [x11]
+; CHECK-SD-NEXT: add x11, sp, #304
+; CHECK-SD-NEXT: ld1 { v5.b }[1], [x10]
+; CHECK-SD-NEXT: ld1 { v0.b }[5], [x8]
+; CHECK-SD-NEXT: add x8, sp, #56
+; CHECK-SD-NEXT: add x10, sp, #264
+; CHECK-SD-NEXT: ld1 { v1.b }[5], [x8]
+; CHECK-SD-NEXT: add x8, sp, #64
+; CHECK-SD-NEXT: ld1 { v2.b }[3], [x11]
+; CHECK-SD-NEXT: add x9, sp, #272
+; CHECK-SD-NEXT: ld1 { v5.b }[2], [x12]
+; CHECK-SD-NEXT: add x11, sp, #72
+; CHECK-SD-NEXT: ld1 { v0.b }[6], [x10]
+; CHECK-SD-NEXT: add x10, sp, #312
+; CHECK-SD-NEXT: mov v3.b[2], w2
+; CHECK-SD-NEXT: ld1 { v1.b }[6], [x8]
+; CHECK-SD-NEXT: add x8, sp, #104
+; CHECK-SD-NEXT: ld1 { v2.b }[4], [x10]
+; CHECK-SD-NEXT: ld1 { v4.b }[3], [x8]
+; CHECK-SD-NEXT: add x8, sp, #112
+; CHECK-SD-NEXT: add x10, sp, #128
+; CHECK-SD-NEXT: ld1 { v0.b }[7], [x9]
+; CHECK-SD-NEXT: add x9, sp, #320
+; CHECK-SD-NEXT: ldr b21, [sp, #552]
+; CHECK-SD-NEXT: ld1 { v2.b }[5], [x9]
+; CHECK-SD-NEXT: add x9, sp, #176
+; CHECK-SD-NEXT: ld1 { v1.b }[7], [x11]
+; CHECK-SD-NEXT: ld1 { v4.b }[4], [x8]
+; CHECK-SD-NEXT: add x8, sp, #624
+; CHECK-SD-NEXT: ld1 { v5.b }[3], [x9]
+; CHECK-SD-NEXT: ld1 { v6.b }[1], [x8]
+; CHECK-SD-NEXT: add x8, sp, #120
+; CHECK-SD-NEXT: add x9, sp, #328
+; CHECK-SD-NEXT: ld1 { v2.b }[6], [x9]
+; CHECK-SD-NEXT: add x9, sp, #184
+; CHECK-SD-NEXT: add x11, sp, #192
+; CHECK-SD-NEXT: ld1 { v4.b }[5], [x8]
+; CHECK-SD-NEXT: add x8, sp, #632
+; CHECK-SD-NEXT: ld1 { v5.b }[4], [x9]
+; CHECK-SD-NEXT: ld1 { v6.b }[2], [x8]
+; CHECK-SD-NEXT: add x9, sp, #640
+; CHECK-SD-NEXT: add x8, sp, #336
+; CHECK-SD-NEXT: ld1 { v2.b }[7], [x8]
+; CHECK-SD-NEXT: add x8, sp, #656
+; CHECK-SD-NEXT: smull v23.8h, v23.8b, v22.8b
+; CHECK-SD-NEXT: ld1 { v5.b }[5], [x11]
+; CHECK-SD-NEXT: add x11, sp, #648
+; CHECK-SD-NEXT: ld1 { v4.b }[6], [x10]
+; CHECK-SD-NEXT: ld1 { v6.b }[3], [x9]
+; CHECK-SD-NEXT: add x9, sp, #200
+; CHECK-SD-NEXT: add x10, sp, #136
+; CHECK-SD-NEXT: ldr b22, [sp, #352]
+; CHECK-SD-NEXT: add x12, sp, #360
+; CHECK-SD-NEXT: mov v3.b[3], w3
+; CHECK-SD-NEXT: ld1 { v5.b }[6], [x9]
+; CHECK-SD-NEXT: add x9, sp, #208
+; CHECK-SD-NEXT: ld1 { v4.b }[7], [x10]
+; CHECK-SD-NEXT: ld1 { v6.b }[4], [x11]
+; CHECK-SD-NEXT: add x11, sp, #424
+; CHECK-SD-NEXT: add x10, sp, #488
+; CHECK-SD-NEXT: ld1 { v7.b }[1], [x11]
+; CHECK-SD-NEXT: add x11, sp, #560
+; CHECK-SD-NEXT: ld1 { v20.b }[1], [x10]
+; CHECK-SD-NEXT: ld1 { v5.b }[7], [x9]
+; CHECK-SD-NEXT: add x9, sp, #440
+; CHECK-SD-NEXT: ld1 { v21.b }[1], [x11]
+; CHECK-SD-NEXT: ld1 { v6.b }[5], [x8]
+; CHECK-SD-NEXT: add x8, sp, #432
+; CHECK-SD-NEXT: ld1 { v22.b }[1], [x12]
+; CHECK-SD-NEXT: ld1 { v7.b }[2], [x8]
+; CHECK-SD-NEXT: add x11, sp, #496
+; CHECK-SD-NEXT: add x12, sp, #568
+; CHECK-SD-NEXT: add x13, sp, #368
+; CHECK-SD-NEXT: ld1 { v20.b }[2], [x11]
+; CHECK-SD-NEXT: ld1 { v21.b }[2], [x12]
+; CHECK-SD-NEXT: ld1 { v22.b }[2], [x13]
+; CHECK-SD-NEXT: add x10, sp, #448
+; CHECK-SD-NEXT: mov v3.b[4], w4
+; CHECK-SD-NEXT: ld1 { v7.b }[3], [x9]
+; CHECK-SD-NEXT: add x9, sp, #688
+; CHECK-SD-NEXT: add x11, sp, #576
+; CHECK-SD-NEXT: ld1 { v19.b }[1], [x9]
+; CHECK-SD-NEXT: add x9, sp, #696
+; CHECK-SD-NEXT: add x12, sp, #376
+; CHECK-SD-NEXT: ld1 { v21.b }[3], [x11]
+; CHECK-SD-NEXT: ld1 { v22.b }[3], [x12]
+; CHECK-SD-NEXT: add x11, sp, #512
+; CHECK-SD-NEXT: ld1 { v7.b }[4], [x10]
+; CHECK-SD-NEXT: add x10, sp, #504
+; CHECK-SD-NEXT: add x12, sp, #584
+; CHECK-SD-NEXT: ld1 { v19.b }[2], [x9]
+; CHECK-SD-NEXT: add x9, sp, #704
+; CHECK-SD-NEXT: ld1 { v20.b }[3], [x10]
+; CHECK-SD-NEXT: add x13, sp, #384
+; CHECK-SD-NEXT: mov v3.b[5], w5
+; CHECK-SD-NEXT: ld1 { v21.b }[4], [x12]
+; CHECK-SD-NEXT: ld1 { v22.b }[4], [x13]
+; CHECK-SD-NEXT: add x10, sp, #456
+; CHECK-SD-NEXT: ldr b16, [sp, #344]
+; CHECK-SD-NEXT: ld1 { v19.b }[3], [x9]
+; CHECK-SD-NEXT: add x9, sp, #712
+; CHECK-SD-NEXT: ld1 { v20.b }[4], [x11]
+; CHECK-SD-NEXT: ldr b17, [sp, #144]
+; CHECK-SD-NEXT: ld1 { v7.b }[5], [x10]
+; CHECK-SD-NEXT: add x10, sp, #520
+; CHECK-SD-NEXT: add x11, sp, #592
+; CHECK-SD-NEXT: add x12, sp, #392
+; CHECK-SD-NEXT: mov v3.b[6], w6
+; CHECK-SD-NEXT: ld1 { v19.b }[4], [x9]
+; CHECK-SD-NEXT: add x9, sp, #720
+; CHECK-SD-NEXT: ld1 { v20.b }[5], [x10]
+; CHECK-SD-NEXT: ld1 { v21.b }[5], [x11]
+; CHECK-SD-NEXT: ld1 { v22.b }[5], [x12]
+; CHECK-SD-NEXT: smull v16.8h, v17.8b, v16.8b
+; CHECK-SD-NEXT: add x8, sp, #664
+; CHECK-SD-NEXT: add x10, sp, #464
+; CHECK-SD-NEXT: add x11, sp, #528
+; CHECK-SD-NEXT: ld1 { v19.b }[5], [x9]
+; CHECK-SD-NEXT: add x9, sp, #728
+; CHECK-SD-NEXT: add x12, sp, #600
+; CHECK-SD-NEXT: add x13, sp, #400
+; CHECK-SD-NEXT: ld1 { v6.b }[6], [x8]
+; CHECK-SD-NEXT: ld1 { v20.b }[6], [x11]
+; CHECK-SD-NEXT: ld1 { v21.b }[6], [x12]
+; CHECK-SD-NEXT: ld1 { v22.b }[6], [x13]
+; CHECK-SD-NEXT: ld1 { v7.b }[6], [x10]
+; CHECK-SD-NEXT: ld1 { v19.b }[6], [x9]
+; CHECK-SD-NEXT: add x9, sp, #736
+; CHECK-SD-NEXT: mov v3.b[7], w7
+; CHECK-SD-NEXT: sshll v18.4s, v16.4h, #0
+; CHECK-SD-NEXT: movi v16.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v17.2d, #0000000000000000
+; CHECK-SD-NEXT: add x8, sp, #672
+; CHECK-SD-NEXT: add x10, sp, #472
+; CHECK-SD-NEXT: add x11, sp, #608
+; CHECK-SD-NEXT: ld1 { v19.b }[7], [x9]
+; CHECK-SD-NEXT: add x9, sp, #536
+; CHECK-SD-NEXT: add x12, sp, #408
+; CHECK-SD-NEXT: ld1 { v20.b }[7], [x9]
+; CHECK-SD-NEXT: ld1 { v21.b }[7], [x11]
+; CHECK-SD-NEXT: ld1 { v22.b }[7], [x12]
+; CHECK-SD-NEXT: ld1 { v6.b }[7], [x8]
+; CHECK-SD-NEXT: ld1 { v7.b }[7], [x10]
+; CHECK-SD-NEXT: sshll v23.4s, v23.4h, #0
+; CHECK-SD-NEXT: smull v0.8h, v1.8b, v0.8b
+; CHECK-SD-NEXT: smull v1.8h, v4.8b, v2.8b
+; CHECK-SD-NEXT: smull v2.8h, v3.8b, v5.8b
+; CHECK-SD-NEXT: smull v3.8h, v20.8b, v19.8b
+; CHECK-SD-NEXT: smull v4.8h, v22.8b, v21.8b
+; CHECK-SD-NEXT: mov v17.s[0], v18.s[0]
+; CHECK-SD-NEXT: smull v5.8h, v7.8b, v6.8b
+; CHECK-SD-NEXT: mov v16.s[0], v23.s[0]
+; CHECK-SD-NEXT: saddl2 v6.4s, v2.8h, v1.8h
+; CHECK-SD-NEXT: saddl v1.4s, v2.4h, v1.4h
+; CHECK-SD-NEXT: saddl2 v2.4s, v4.8h, v3.8h
+; CHECK-SD-NEXT: saddl v3.4s, v4.4h, v3.4h
+; CHECK-SD-NEXT: saddw v4.4s, v17.4s, v0.4h
+; CHECK-SD-NEXT: saddw v7.4s, v16.4s, v5.4h
+; CHECK-SD-NEXT: saddw2 v0.4s, v6.4s, v0.8h
+; CHECK-SD-NEXT: add v1.4s, v1.4s, v4.4s
+; CHECK-SD-NEXT: saddw2 v2.4s, v2.4s, v5.8h
+; CHECK-SD-NEXT: add v3.4s, v3.4s, v7.4s
+; CHECK-SD-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT: add v1.4s, v3.4s, v2.4s
+; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: addv s0, v0.4s
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_sdot_v25i8_double:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: str d10, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-GI-NEXT: stp d9, d8, [sp, #8] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str x29, [sp, #24] // 8-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 32
+; CHECK-GI-NEXT: .cfi_offset w29, -8
+; CHECK-GI-NEXT: .cfi_offset b8, -16
+; CHECK-GI-NEXT: .cfi_offset b9, -24
+; CHECK-GI-NEXT: .cfi_offset b10, -32
+; CHECK-GI-NEXT: ldr w8, [sp, #32]
+; CHECK-GI-NEXT: sxtb w9, w0
+; CHECK-GI-NEXT: sxtb w10, w4
+; CHECK-GI-NEXT: sxtb w11, w5
+; CHECK-GI-NEXT: sxtb w12, w3
+; CHECK-GI-NEXT: sxtb w13, w7
+; CHECK-GI-NEXT: mov v0.s[0], w9
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: mov v2.s[0], w10
+; CHECK-GI-NEXT: ldr w9, [sp, #40]
+; CHECK-GI-NEXT: sxtb w10, w1
+; CHECK-GI-NEXT: ldr w14, [sp, #152]
+; CHECK-GI-NEXT: mov v1.s[0], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #64]
+; CHECK-GI-NEXT: mov v9.s[0], wzr
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: ldr x29, [sp, #24] // 8-byte Folded Reload
+; CHECK-GI-NEXT: mov v0.s[1], w10
+; CHECK-GI-NEXT: mov v2.s[1], w11
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: sxtb w10, w2
+; CHECK-GI-NEXT: sxtb w11, w6
+; CHECK-GI-NEXT: mov v1.s[1], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #48]
+; CHECK-GI-NEXT: mov v3.s[0], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #72]
+; CHECK-GI-NEXT: mov v9.s[1], wzr
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: mov v0.s[2], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #56]
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: mov v2.s[2], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #80]
+; CHECK-GI-NEXT: mov v1.s[2], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #96]
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: mov v3.s[1], w8
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: ldr w8, [sp, #88]
+; CHECK-GI-NEXT: mov v0.s[3], w12
+; CHECK-GI-NEXT: ldr w12, [sp, #128]
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: mov v2.s[3], w13
+; CHECK-GI-NEXT: ldr w13, [sp, #168]
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: mov v1.s[3], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #104]
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: mov v4.s[0], w9
+; CHECK-GI-NEXT: mov v3.s[2], w11
+; CHECK-GI-NEXT: sxtb w11, w13
+; CHECK-GI-NEXT: ldr w13, [sp, #136]
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: mov v5.s[0], w12
+; CHECK-GI-NEXT: ldr w9, [sp, #112]
+; CHECK-GI-NEXT: ldr w12, [sp, #176]
+; CHECK-GI-NEXT: mov v6.s[0], w11
+; CHECK-GI-NEXT: sxtb w13, w13
+; CHECK-GI-NEXT: ldr w11, [sp, #120]
+; CHECK-GI-NEXT: mov v9.s[2], wzr
+; CHECK-GI-NEXT: mov v4.s[1], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #200]
+; CHECK-GI-NEXT: mov v3.s[3], w8
+; CHECK-GI-NEXT: sxtb w8, w12
+; CHECK-GI-NEXT: ldr w12, [sp, #144]
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: mov v5.s[1], w13
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: ldr w13, [sp, #184]
+; CHECK-GI-NEXT: mov v6.s[1], w8
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: ldr w8, [sp, #160]
+; CHECK-GI-NEXT: mov v4.s[2], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #208]
+; CHECK-GI-NEXT: mov v7.s[0], w10
+; CHECK-GI-NEXT: sxtb w13, w13
+; CHECK-GI-NEXT: sxtb w10, w11
+; CHECK-GI-NEXT: ldr w11, [sp, #192]
+; CHECK-GI-NEXT: mov v5.s[2], w12
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: ldr w12, [sp, #232]
+; CHECK-GI-NEXT: mov v6.s[2], w13
+; CHECK-GI-NEXT: sxtb w13, w14
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: mov v4.s[3], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #216]
+; CHECK-GI-NEXT: mov v7.s[1], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #264]
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: ldr w14, [sp, #368]
+; CHECK-GI-NEXT: mov v5.s[3], w13
+; CHECK-GI-NEXT: ldr w13, [sp, #296]
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: mov v6.s[3], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #240]
+; CHECK-GI-NEXT: mov v16.s[0], w12
+; CHECK-GI-NEXT: ldr w12, [sp, #224]
+; CHECK-GI-NEXT: sxtb w13, w13
+; CHECK-GI-NEXT: mov v7.s[2], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #272]
+; CHECK-GI-NEXT: mov v18.s[0], w9
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: ldr w9, [sp, #304]
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: mov v17.s[0], w13
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: ldr w13, [sp, #248]
+; CHECK-GI-NEXT: mov v16.s[1], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #328]
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: mov v7.s[3], w12
+; CHECK-GI-NEXT: ldr w12, [sp, #280]
+; CHECK-GI-NEXT: mov v18.s[1], w10
+; CHECK-GI-NEXT: sxtb w13, w13
+; CHECK-GI-NEXT: ldr w10, [sp, #312]
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: mov v17.s[1], w9
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: mov v16.s[2], w13
+; CHECK-GI-NEXT: ldr w13, [sp, #336]
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: mov v19.s[0], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #288]
+; CHECK-GI-NEXT: mov v18.s[2], w12
+; CHECK-GI-NEXT: ldr w12, [sp, #320]
+; CHECK-GI-NEXT: sxtb w13, w13
+; CHECK-GI-NEXT: ldr w9, [sp, #256]
+; CHECK-GI-NEXT: mov v17.s[2], w10
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: ldr w10, [sp, #344]
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: mul v2.4s, v2.4s, v7.4s
+; CHECK-GI-NEXT: mov v19.s[1], w13
+; CHECK-GI-NEXT: mov v18.s[3], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #360]
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: ldr w13, [sp, #400]
+; CHECK-GI-NEXT: mov v16.s[3], w9
+; CHECK-GI-NEXT: mov v17.s[3], w12
+; CHECK-GI-NEXT: sxtb w12, w14
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: ldr w9, [sp, #352]
+; CHECK-GI-NEXT: mov v9.s[3], wzr
+; CHECK-GI-NEXT: mla v2.4s, v0.4s, v6.4s
+; CHECK-GI-NEXT: mov v19.s[2], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #376]
+; CHECK-GI-NEXT: mov v20.s[0], w12
+; CHECK-GI-NEXT: ldr w12, [sp, #432]
+; CHECK-GI-NEXT: mul w8, w8, w11
+; CHECK-GI-NEXT: sxtb w11, w13
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: ldr w13, [sp, #384]
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: mov v23.s[0], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #408]
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: mov v20.s[1], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #440]
+; CHECK-GI-NEXT: mov v21.s[0], w8
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: mov v22.s[0], w12
+; CHECK-GI-NEXT: ldr w12, [sp, #464]
+; CHECK-GI-NEXT: sxtb w8, w13
+; CHECK-GI-NEXT: ldr w13, [sp, #416]
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: mov v23.s[1], w11
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: mov v19.s[3], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #392]
+; CHECK-GI-NEXT: mov v20.s[2], w8
+; CHECK-GI-NEXT: sxtb w13, w13
+; CHECK-GI-NEXT: mov v22.s[1], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #472]
+; CHECK-GI-NEXT: mov v24.s[0], w12
+; CHECK-GI-NEXT: ldr w11, [sp, #424]
+; CHECK-GI-NEXT: ldr w8, [sp, #448]
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: mov v23.s[2], w13
+; CHECK-GI-NEXT: ldr w13, [sp, #496]
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: mov v20.s[3], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #480]
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: sxtb w13, w13
+; CHECK-GI-NEXT: mov v24.s[1], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #528]
+; CHECK-GI-NEXT: ldr w12, [sp, #456]
+; CHECK-GI-NEXT: mov v22.s[2], w8
+; CHECK-GI-NEXT: mov v23.s[3], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #504]
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: mov v25.s[0], w13
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: ldr w8, [sp, #488]
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: mov v24.s[2], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #536]
+; CHECK-GI-NEXT: mov v26.s[0], w10
+; CHECK-GI-NEXT: ldr w13, [sp, #568]
+; CHECK-GI-NEXT: mov v22.s[3], w12
+; CHECK-GI-NEXT: ldr w12, [sp, #512]
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: mov v25.s[1], w11
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: sxtb w10, w13
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: mov v24.s[3], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #544]
+; CHECK-GI-NEXT: mov v26.s[1], w9
+; CHECK-GI-NEXT: ldr w13, [sp, #520]
+; CHECK-GI-NEXT: ldr w11, [sp, #576]
+; CHECK-GI-NEXT: mov v27.s[0], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #600]
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: mov v25.s[2], w12
+; CHECK-GI-NEXT: ldr w12, [sp, #584]
+; CHECK-GI-NEXT: sxtb w9, w11
+; CHECK-GI-NEXT: ldr w11, [sp, #552]
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: sxtb w13, w13
+; CHECK-GI-NEXT: mov v26.s[2], w8
+; CHECK-GI-NEXT: sxtb w8, w12
+; CHECK-GI-NEXT: ldr w12, [sp, #664]
+; CHECK-GI-NEXT: mov v27.s[1], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #608]
+; CHECK-GI-NEXT: mov v28.s[0], w10
+; CHECK-GI-NEXT: mov v25.s[3], w13
+; CHECK-GI-NEXT: ldr w13, [sp, #592]
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: ldr w10, [sp, #560]
+; CHECK-GI-NEXT: mov v26.s[3], w11
+; CHECK-GI-NEXT: sxtb w11, w13
+; CHECK-GI-NEXT: ldr w13, [sp, #672]
+; CHECK-GI-NEXT: mov v30.s[0], w12
+; CHECK-GI-NEXT: mov v27.s[2], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #616]
+; CHECK-GI-NEXT: mov v28.s[1], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #632]
+; CHECK-GI-NEXT: ldr w12, [sp, #728]
+; CHECK-GI-NEXT: sxtb w13, w13
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: mov v21.s[1], wzr
+; CHECK-GI-NEXT: mov v30.s[1], w13
+; CHECK-GI-NEXT: ldr w13, [sp, #760]
+; CHECK-GI-NEXT: mov v27.s[3], w11
+; CHECK-GI-NEXT: mov v28.s[2], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #696]
+; CHECK-GI-NEXT: mov v29.s[0], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #736]
+; CHECK-GI-NEXT: mov v8.s[0], w12
+; CHECK-GI-NEXT: sxtb w13, w13
+; CHECK-GI-NEXT: ldr w11, [sp, #640]
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: ldr w12, [sp, #680]
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: mul w10, w10, w13
+; CHECK-GI-NEXT: mov v21.s[2], wzr
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: mov v31.s[0], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #704]
+; CHECK-GI-NEXT: mov v8.s[1], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #744]
+; CHECK-GI-NEXT: mul v3.4s, v3.4s, v18.4s
+; CHECK-GI-NEXT: mov v10.s[0], w10
+; CHECK-GI-NEXT: mov v29.s[1], w11
+; CHECK-GI-NEXT: sxtb w11, w12
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: ldr w12, [sp, #624]
+; CHECK-GI-NEXT: mov v30.s[2], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #648]
+; CHECK-GI-NEXT: ldr w10, [sp, #688]
+; CHECK-GI-NEXT: mov v31.s[1], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #712]
+; CHECK-GI-NEXT: mov v8.s[2], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #752]
+; CHECK-GI-NEXT: mov v10.s[1], wzr
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: mov v28.s[3], w12
+; CHECK-GI-NEXT: mul v5.4s, v5.4s, v19.4s
+; CHECK-GI-NEXT: mov v29.s[2], w11
+; CHECK-GI-NEXT: mov v30.s[3], w10
+; CHECK-GI-NEXT: mov v31.s[2], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #656]
+; CHECK-GI-NEXT: mov v8.s[3], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #720]
+; CHECK-GI-NEXT: mov v10.s[2], wzr
+; CHECK-GI-NEXT: mov v21.s[3], wzr
+; CHECK-GI-NEXT: mla v3.4s, v1.4s, v16.4s
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: mul v7.4s, v23.4s, v28.4s
+; CHECK-GI-NEXT: mul v18.4s, v24.4s, v30.4s
+; CHECK-GI-NEXT: mla v5.4s, v4.4s, v17.4s
+; CHECK-GI-NEXT: mul v19.4s, v26.4s, v8.4s
+; CHECK-GI-NEXT: mov v29.s[3], w8
+; CHECK-GI-NEXT: mov v31.s[3], w9
+; CHECK-GI-NEXT: mov v10.s[3], wzr
+; CHECK-GI-NEXT: add v0.4s, v21.4s, v9.4s
+; CHECK-GI-NEXT: add v2.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: mla v7.4s, v20.4s, v27.4s
+; CHECK-GI-NEXT: mla v18.4s, v22.4s, v29.4s
+; CHECK-GI-NEXT: mla v19.4s, v25.4s, v31.4s
+; CHECK-GI-NEXT: add v0.4s, v5.4s, v0.4s
+; CHECK-GI-NEXT: add v1.4s, v10.4s, v9.4s
+; CHECK-GI-NEXT: ldp d9, d8, [sp, #8] // 16-byte Folded Reload
+; CHECK-GI-NEXT: add v0.4s, v2.4s, v0.4s
+; CHECK-GI-NEXT: add v3.4s, v7.4s, v18.4s
+; CHECK-GI-NEXT: add v1.4s, v19.4s, v1.4s
+; CHECK-GI-NEXT: addv s0, v0.4s
+; CHECK-GI-NEXT: add v1.4s, v3.4s, v1.4s
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: addv s1, v1.4s
+; CHECK-GI-NEXT: fmov w9, s1
+; CHECK-GI-NEXT: add w0, w8, w9
+; CHECK-GI-NEXT: ldr d10, [sp], #32 // 8-byte Folded Reload
+; CHECK-GI-NEXT: ret
+entry:
+ %az = sext <25 x i8> %a to <25 x i32>
+ %bz = sext <25 x i8> %b to <25 x i32>
+ %m1 = mul nuw nsw <25 x i32> %az, %bz
+ %r1 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %m1)
+ %cz = sext <25 x i8> %c to <25 x i32>
+ %dz = sext <25 x i8> %d to <25 x i32>
+ %m2 = mul nuw nsw <25 x i32> %cz, %dz
+ %r2 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %m2)
+ %x = add i32 %r1, %r2
+ ret i32 %x
+}
+
+define i32 @test_sdot_v25i8_double_nomla(<25 x i8> %a, <25 x i8> %b, <25 x i8> %c, <25 x i8> %d) {
+; CHECK-SD-LABEL: test_sdot_v25i8_double_nomla:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: .cfi_offset w29, -16
+; CHECK-SD-NEXT: fmov s0, w0
+; CHECK-SD-NEXT: ldr b1, [sp, #80]
+; CHECK-SD-NEXT: add x10, sp, #88
+; CHECK-SD-NEXT: ldr b2, [sp, #16]
+; CHECK-SD-NEXT: add x9, sp, #96
+; CHECK-SD-NEXT: ldr b3, [sp, #480]
+; CHECK-SD-NEXT: ld1 { v1.b }[1], [x10]
+; CHECK-SD-NEXT: add x10, sp, #24
+; CHECK-SD-NEXT: ldr b4, [sp, #352]
+; CHECK-SD-NEXT: mov v0.b[1], w1
+; CHECK-SD-NEXT: ld1 { v2.b }[1], [x10]
+; CHECK-SD-NEXT: add x11, sp, #488
+; CHECK-SD-NEXT: add x10, sp, #360
+; CHECK-SD-NEXT: ldr b5, [sp, #416]
+; CHECK-SD-NEXT: add x8, sp, #104
+; CHECK-SD-NEXT: ld1 { v1.b }[2], [x9]
+; CHECK-SD-NEXT: add x9, sp, #32
+; CHECK-SD-NEXT: ld1 { v3.b }[1], [x11]
+; CHECK-SD-NEXT: ld1 { v2.b }[2], [x9]
+; CHECK-SD-NEXT: add x11, sp, #424
+; CHECK-SD-NEXT: ld1 { v4.b }[1], [x10]
+; CHECK-SD-NEXT: mov v0.b[2], w2
+; CHECK-SD-NEXT: ld1 { v5.b }[1], [x11]
+; CHECK-SD-NEXT: add x9, sp, #368
+; CHECK-SD-NEXT: ld1 { v1.b }[3], [x8]
+; CHECK-SD-NEXT: add x8, sp, #40
+; CHECK-SD-NEXT: add x12, sp, #496
+; CHECK-SD-NEXT: ld1 { v2.b }[3], [x8]
+; CHECK-SD-NEXT: ld1 { v4.b }[2], [x9]
+; CHECK-SD-NEXT: add x8, sp, #432
+; CHECK-SD-NEXT: ld1 { v3.b }[2], [x12]
+; CHECK-SD-NEXT: add x13, sp, #48
+; CHECK-SD-NEXT: ld1 { v5.b }[2], [x8]
+; CHECK-SD-NEXT: mov v0.b[3], w3
+; CHECK-SD-NEXT: add x10, sp, #112
+; CHECK-SD-NEXT: add x8, sp, #504
+; CHECK-SD-NEXT: ld1 { v2.b }[4], [x13]
+; CHECK-SD-NEXT: add x13, sp, #376
+; CHECK-SD-NEXT: ld1 { v1.b }[4], [x10]
+; CHECK-SD-NEXT: ld1 { v4.b }[3], [x13]
+; CHECK-SD-NEXT: add x13, sp, #440
+; CHECK-SD-NEXT: ld1 { v3.b }[3], [x8]
+; CHECK-SD-NEXT: ld1 { v5.b }[3], [x13]
+; CHECK-SD-NEXT: add x11, sp, #120
+; CHECK-SD-NEXT: add x8, sp, #56
+; CHECK-SD-NEXT: mov v0.b[4], w4
+; CHECK-SD-NEXT: add x13, sp, #512
+; CHECK-SD-NEXT: ld1 { v1.b }[5], [x11]
+; CHECK-SD-NEXT: ld1 { v2.b }[5], [x8]
+; CHECK-SD-NEXT: add x8, sp, #384
+; CHECK-SD-NEXT: add x11, sp, #448
+; CHECK-SD-NEXT: ld1 { v3.b }[4], [x13]
+; CHECK-SD-NEXT: ld1 { v4.b }[4], [x8]
+; CHECK-SD-NEXT: ld1 { v5.b }[4], [x11]
+; CHECK-SD-NEXT: add x12, sp, #128
+; CHECK-SD-NEXT: add x10, sp, #64
+; CHECK-SD-NEXT: add x8, sp, #520
+; CHECK-SD-NEXT: mov v0.b[5], w5
+; CHECK-SD-NEXT: ld1 { v1.b }[6], [x12]
+; CHECK-SD-NEXT: ld1 { v2.b }[6], [x10]
+; CHECK-SD-NEXT: add x10, sp, #392
+; CHECK-SD-NEXT: add x11, sp, #456
+; CHECK-SD-NEXT: ldr b6, [sp, #144]
+; CHECK-SD-NEXT: ldr b7, [sp, #544]
+; CHECK-SD-NEXT: ld1 { v3.b }[5], [x8]
+; CHECK-SD-NEXT: ld1 { v4.b }[5], [x10]
+; CHECK-SD-NEXT: ld1 { v5.b }[5], [x11]
+; CHECK-SD-NEXT: add x9, sp, #136
+; CHECK-SD-NEXT: sshll v6.8h, v6.8b, #0
+; CHECK-SD-NEXT: mov v0.b[6], w6
+; CHECK-SD-NEXT: ld1 { v1.b }[7], [x9]
+; CHECK-SD-NEXT: add x8, sp, #528
+; CHECK-SD-NEXT: add x9, sp, #400
+; CHECK-SD-NEXT: add x10, sp, #464
+; CHECK-SD-NEXT: sshll v7.8h, v7.8b, #0
+; CHECK-SD-NEXT: ld1 { v3.b }[6], [x8]
+; CHECK-SD-NEXT: ld1 { v4.b }[6], [x9]
+; CHECK-SD-NEXT: ld1 { v5.b }[6], [x10]
+; CHECK-SD-NEXT: movi v16.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v17.2d, #0000000000000000
+; CHECK-SD-NEXT: add x14, sp, #72
+; CHECK-SD-NEXT: mov v0.b[7], w7
+; CHECK-SD-NEXT: sshll v6.4s, v6.4h, #0
+; CHECK-SD-NEXT: add x8, sp, #536
+; CHECK-SD-NEXT: add x9, sp, #408
+; CHECK-SD-NEXT: add x10, sp, #472
+; CHECK-SD-NEXT: sshll v7.4s, v7.4h, #0
+; CHECK-SD-NEXT: ld1 { v2.b }[7], [x14]
+; CHECK-SD-NEXT: ld1 { v3.b }[7], [x8]
+; CHECK-SD-NEXT: ld1 { v4.b }[7], [x9]
+; CHECK-SD-NEXT: ld1 { v5.b }[7], [x10]
+; CHECK-SD-NEXT: mov v16.s[0], v6.s[0]
+; CHECK-SD-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: mov v17.s[0], v7.s[0]
+; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: sshll v2.8h, v2.8b, #0
+; CHECK-SD-NEXT: sshll v3.8h, v3.8b, #0
+; CHECK-SD-NEXT: sshll v4.8h, v4.8b, #0
+; CHECK-SD-NEXT: sshll v5.8h, v5.8b, #0
+; CHECK-SD-NEXT: saddl v7.4s, v0.4h, v1.4h
+; CHECK-SD-NEXT: saddl2 v0.4s, v0.8h, v1.8h
+; CHECK-SD-NEXT: saddw v6.4s, v16.4s, v2.4h
+; CHECK-SD-NEXT: saddl v1.4s, v4.4h, v3.4h
+; CHECK-SD-NEXT: saddl2 v3.4s, v4.8h, v3.8h
+; CHECK-SD-NEXT: saddw v4.4s, v17.4s, v5.4h
+; CHECK-SD-NEXT: saddw2 v0.4s, v0.4s, v2.8h
+; CHECK-SD-NEXT: add v6.4s, v7.4s, v6.4s
+; CHECK-SD-NEXT: saddw2 v2.4s, v3.4s, v5.8h
+; CHECK-SD-NEXT: add v1.4s, v1.4s, v4.4s
+; CHECK-SD-NEXT: add v0.4s, v6.4s, v0.4s
+; CHECK-SD-NEXT: add v1.4s, v1.4s, v2.4s
+; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: addv s0, v0.4s
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_sdot_v25i8_double_nomla:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: .cfi_offset w29, -16
+; CHECK-GI-NEXT: sxtb w8, w0
+; CHECK-GI-NEXT: sxtb w9, w4
+; CHECK-GI-NEXT: ldr w10, [sp, #48]
+; CHECK-GI-NEXT: sxtb w11, w5
+; CHECK-GI-NEXT: sxtb w12, w6
+; CHECK-GI-NEXT: sxtb w13, w7
+; CHECK-GI-NEXT: mov v0.s[0], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #16]
+; CHECK-GI-NEXT: mov v1.s[0], w9
+; CHECK-GI-NEXT: sxtb w9, w1
+; CHECK-GI-NEXT: ldr w14, [sp, #104]
+; CHECK-GI-NEXT: mov v20.s[0], wzr
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: mov v0.s[1], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #80]
+; CHECK-GI-NEXT: mov v2.s[0], w8
+; CHECK-GI-NEXT: sxtb w8, w10
+; CHECK-GI-NEXT: mov v1.s[1], w11
+; CHECK-GI-NEXT: ldr w10, [sp, #24]
+; CHECK-GI-NEXT: sxtb w11, w2
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: mov v20.s[1], wzr
+; CHECK-GI-NEXT: mov v3.s[0], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #56]
+; CHECK-GI-NEXT: mov v4.s[0], w9
+; CHECK-GI-NEXT: sxtb w9, w10
+; CHECK-GI-NEXT: mov v0.s[2], w11
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: ldr w10, [sp, #32]
+; CHECK-GI-NEXT: mov v1.s[2], w12
+; CHECK-GI-NEXT: sxtb w12, w3
+; CHECK-GI-NEXT: mov v2.s[1], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #112]
+; CHECK-GI-NEXT: mov v3.s[1], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #88]
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: mov v0.s[3], w12
+; CHECK-GI-NEXT: ldr w12, [sp, #64]
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: ldr w11, [sp, #40]
+; CHECK-GI-NEXT: mov v1.s[3], w13
+; CHECK-GI-NEXT: mov v2.s[2], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #120]
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: mov v5.s[0], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #96]
+; CHECK-GI-NEXT: mov v4.s[1], w8
+; CHECK-GI-NEXT: ldr w13, [sp, #72]
+; CHECK-GI-NEXT: sxtb w8, w10
+; CHECK-GI-NEXT: mov v3.s[2], w12
+; CHECK-GI-NEXT: ldr w10, [sp, #352]
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: sxtb w12, w13
+; CHECK-GI-NEXT: sxtb w13, w14
+; CHECK-GI-NEXT: mov v20.s[2], wzr
+; CHECK-GI-NEXT: mov v4.s[2], w9
+; CHECK-GI-NEXT: sxtb w9, w10
+; CHECK-GI-NEXT: mov v2.s[3], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #128]
+; CHECK-GI-NEXT: mov v5.s[1], w8
+; CHECK-GI-NEXT: ldr w10, [sp, #384]
+; CHECK-GI-NEXT: mov v3.s[3], w12
+; CHECK-GI-NEXT: ldr w12, [sp, #360]
+; CHECK-GI-NEXT: mov v6.s[0], w9
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: ldr w9, [sp, #144]
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: mov v4.s[3], w13
+; CHECK-GI-NEXT: ldr w13, [sp, #392]
+; CHECK-GI-NEXT: mov v5.s[2], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #368]
+; CHECK-GI-NEXT: mov v7.s[0], w10
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: mov v6.s[1], w12
+; CHECK-GI-NEXT: ldr w10, [sp, #416]
+; CHECK-GI-NEXT: sxtb w12, w13
+; CHECK-GI-NEXT: ldr w13, [sp, #448]
+; CHECK-GI-NEXT: ldr w8, [sp, #136]
+; CHECK-GI-NEXT: mov v16.s[0], w9
+; CHECK-GI-NEXT: sxtb w9, w11
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: ldr w11, [sp, #400]
+; CHECK-GI-NEXT: mov v7.s[1], w12
+; CHECK-GI-NEXT: ldr w12, [sp, #480]
+; CHECK-GI-NEXT: mov v6.s[2], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #424]
+; CHECK-GI-NEXT: mov v17.s[0], w10
+; CHECK-GI-NEXT: sxtb w13, w13
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: ldr w10, [sp, #456]
+; CHECK-GI-NEXT: mov v16.s[1], wzr
+; CHECK-GI-NEXT: mov v18.s[0], w13
+; CHECK-GI-NEXT: ldr w13, [sp, #488]
+; CHECK-GI-NEXT: mov v7.s[2], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #512]
+; CHECK-GI-NEXT: mov v19.s[0], w12
+; CHECK-GI-NEXT: mov v17.s[1], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #544]
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: sxtb w13, w13
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: sxtb w14, w8
+; CHECK-GI-NEXT: ldr w8, [sp, #376]
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: mov v18.s[1], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #432]
+; CHECK-GI-NEXT: mov v19.s[1], w13
+; CHECK-GI-NEXT: ldr w13, [sp, #520]
+; CHECK-GI-NEXT: mov v21.s[0], w11
+; CHECK-GI-NEXT: mov v22.s[0], w9
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: ldr w11, [sp, #464]
+; CHECK-GI-NEXT: sxtb w13, w13
+; CHECK-GI-NEXT: ldr w9, [sp, #496]
+; CHECK-GI-NEXT: ldr w12, [sp, #408]
+; CHECK-GI-NEXT: mov v17.s[2], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #528]
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: mov v21.s[1], w13
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: ldr w13, [sp, #440]
+; CHECK-GI-NEXT: mov v22.s[1], wzr
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: mov v18.s[2], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #472]
+; CHECK-GI-NEXT: mov v19.s[2], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #504]
+; CHECK-GI-NEXT: mov v16.s[2], wzr
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: mov v21.s[2], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #536]
+; CHECK-GI-NEXT: sxtb w13, w13
+; CHECK-GI-NEXT: mov v22.s[2], wzr
+; CHECK-GI-NEXT: mov v6.s[3], w8
+; CHECK-GI-NEXT: sxtb w8, w11
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: mov v5.s[3], w14
+; CHECK-GI-NEXT: mov v7.s[3], w12
+; CHECK-GI-NEXT: mov v17.s[3], w13
+; CHECK-GI-NEXT: mov v18.s[3], w8
+; CHECK-GI-NEXT: mov v16.s[3], wzr
+; CHECK-GI-NEXT: mov v20.s[3], wzr
+; CHECK-GI-NEXT: mov v19.s[3], w9
+; CHECK-GI-NEXT: mov v21.s[3], w10
+; CHECK-GI-NEXT: mov v22.s[3], wzr
+; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: add v2.4s, v4.4s, v5.4s
+; CHECK-GI-NEXT: add v3.4s, v6.4s, v7.4s
+; CHECK-GI-NEXT: add v4.4s, v17.4s, v18.4s
+; CHECK-GI-NEXT: add v6.4s, v16.4s, v20.4s
+; CHECK-GI-NEXT: add v5.4s, v19.4s, v21.4s
+; CHECK-GI-NEXT: add v7.4s, v22.4s, v20.4s
+; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: add v1.4s, v2.4s, v6.4s
+; CHECK-GI-NEXT: add v2.4s, v3.4s, v4.4s
+; CHECK-GI-NEXT: add v3.4s, v5.4s, v7.4s
+; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: addv s0, v0.4s
+; CHECK-GI-NEXT: addv s1, v1.4s
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: fmov w9, s1
+; CHECK-GI-NEXT: add w0, w8, w9
+; CHECK-GI-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-GI-NEXT: ret
+entry:
+ %az = sext <25 x i8> %a to <25 x i32>
+ %r1 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %az)
+ %cz = sext <25 x i8> %c to <25 x i32>
+ %r2 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %cz)
+ %x = add i32 %r1, %r2
+ ret i32 %x
+}
+
+define i32 @test_udot_v32i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
+; CHECK-SD-LABEL: test_udot_v32i8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
+; CHECK-SD-NEXT: ldp q1, q3, [x0]
+; CHECK-SD-NEXT: ldp q2, q4, [x1]
+; CHECK-SD-NEXT: udot v0.4s, v4.16b, v3.16b
+; CHECK-SD-NEXT: udot v0.4s, v2.16b, v1.16b
+; CHECK-SD-NEXT: addv s0, v0.4s
+; CHECK-SD-NEXT: fmov w8, s0
+; CHECK-SD-NEXT: add w0, w8, w2
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_udot_v32i8:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: movi v0.2d, #0000000000000000
+; CHECK-GI-NEXT: movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT: ldp q2, q3, [x0]
+; CHECK-GI-NEXT: ldp q4, q5, [x1]
+; CHECK-GI-NEXT: udot v1.4s, v4.16b, v2.16b
+; CHECK-GI-NEXT: udot v0.4s, v5.16b, v3.16b
+; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: addv s0, v0.4s
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: add w0, w8, w2
+; CHECK-GI-NEXT: ret
+entry:
+ %0 = load <32 x i8>, ptr %a
+ %1 = zext <32 x i8> %0 to <32 x i32>
+ %2 = load <32 x i8>, ptr %b
+ %3 = zext <32 x i8> %2 to <32 x i32>
+ %4 = mul nuw nsw <32 x i32> %3, %1
+ %5 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %4)
+ %op.extra = add i32 %5, %sum
+ ret i32 %op.extra
+}
+
+define i32 @test_udot_v32i8_nomla(ptr nocapture readonly %a1) {
+; CHECK-SD-LABEL: test_udot_v32i8_nomla:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v0.16b, #1
+; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
+; CHECK-SD-NEXT: ldp q2, q3, [x0]
+; CHECK-SD-NEXT: udot v1.4s, v3.16b, v0.16b
+; CHECK-SD-NEXT: udot v1.4s, v2.16b, v0.16b
+; CHECK-SD-NEXT: addv s0, v1.4s
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_udot_v32i8_nomla:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: movi v0.16b, #1
+; CHECK-GI-NEXT: movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT: movi v2.2d, #0000000000000000
+; CHECK-GI-NEXT: ldp q3, q4, [x0]
+; CHECK-GI-NEXT: udot v2.4s, v3.16b, v0.16b
+; CHECK-GI-NEXT: udot v1.4s, v4.16b, v0.16b
+; CHECK-GI-NEXT: add v0.4s, v2.4s, v1.4s
+; CHECK-GI-NEXT: addv s0, v0.4s
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
+entry:
+ %0 = load <32 x i8>, ptr %a1
+ %1 = zext <32 x i8> %0 to <32 x i32>
+ %2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
+ ret i32 %2
+}
+define i32 @test_sdot_v32i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
+; CHECK-SD-LABEL: test_sdot_v32i8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
; CHECK-SD-NEXT: ldp q1, q3, [x0]
; CHECK-SD-NEXT: ldp q2, q4, [x1]
; CHECK-SD-NEXT: sdot v0.4s, v4.16b, v3.16b
@@ -2881,529 +3891,1429 @@ define i32 @test_usdot_v32i8(ptr nocapture readonly %a, ptr nocapture readonly %
; CHECK-GI-NEXT: add w0, w8, w2
; CHECK-GI-NEXT: ret
entry:
- %0 = load <32 x i8>, ptr %a
- %1 = zext <32 x i8> %0 to <32 x i32>
- %2 = load <32 x i8>, ptr %b
- %3 = sext <32 x i8> %2 to <32 x i32>
- %4 = mul nsw <32 x i32> %3, %1
- %5 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %4)
+ %0 = load <32 x i8>, ptr %a
+ %1 = zext <32 x i8> %0 to <32 x i32>
+ %2 = load <32 x i8>, ptr %b
+ %3 = sext <32 x i8> %2 to <32 x i32>
+ %4 = mul nsw <32 x i32> %3, %1
+ %5 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %4)
+ %op.extra = add nsw i32 %5, %sum
+ ret i32 %op.extra
+}
+
+define i32 @test_usdot_v32i8_double(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) {
+; CHECK-SD-LABEL: test_usdot_v32i8_double:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v16.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v17.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v18.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v19.2d, #0000000000000000
+; CHECK-SD-NEXT: usdot v16.4s, v1.16b, v3.16b
+; CHECK-SD-NEXT: usdot v18.4s, v0.16b, v2.16b
+; CHECK-SD-NEXT: usdot v17.4s, v4.16b, v6.16b
+; CHECK-SD-NEXT: usdot v19.4s, v5.16b, v7.16b
+; CHECK-SD-NEXT: add v0.4s, v18.4s, v16.4s
+; CHECK-SD-NEXT: add v1.4s, v17.4s, v19.4s
+; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: addv s0, v0.4s
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_usdot_v32i8_double:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-GI-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT: .cfi_offset b8, -8
+; CHECK-GI-NEXT: .cfi_offset b9, -16
+; CHECK-GI-NEXT: .cfi_offset b10, -24
+; CHECK-GI-NEXT: .cfi_offset b11, -32
+; CHECK-GI-NEXT: .cfi_offset b12, -40
+; CHECK-GI-NEXT: .cfi_offset b13, -48
+; CHECK-GI-NEXT: .cfi_offset b14, -56
+; CHECK-GI-NEXT: .cfi_offset b15, -64
+; CHECK-GI-NEXT: ushll v16.8h, v0.8b, #0
+; CHECK-GI-NEXT: ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT: ushll v17.8h, v1.8b, #0
+; CHECK-GI-NEXT: ushll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT: sshll v18.8h, v2.8b, #0
+; CHECK-GI-NEXT: sshll2 v2.8h, v2.16b, #0
+; CHECK-GI-NEXT: sshll v19.8h, v3.8b, #0
+; CHECK-GI-NEXT: sshll2 v3.8h, v3.16b, #0
+; CHECK-GI-NEXT: ushll v27.8h, v4.8b, #0
+; CHECK-GI-NEXT: ushll2 v4.8h, v4.16b, #0
+; CHECK-GI-NEXT: ushll v28.8h, v5.8b, #0
+; CHECK-GI-NEXT: sshll v29.8h, v6.8b, #0
+; CHECK-GI-NEXT: sshll2 v6.8h, v6.16b, #0
+; CHECK-GI-NEXT: ushll2 v5.8h, v5.16b, #0
+; CHECK-GI-NEXT: sshll v30.8h, v7.8b, #0
+; CHECK-GI-NEXT: sshll2 v7.8h, v7.16b, #0
+; CHECK-GI-NEXT: ushll2 v20.4s, v16.8h, #0
+; CHECK-GI-NEXT: ushll2 v21.4s, v0.8h, #0
+; CHECK-GI-NEXT: ushll2 v22.4s, v17.8h, #0
+; CHECK-GI-NEXT: ushll2 v23.4s, v1.8h, #0
+; CHECK-GI-NEXT: sshll2 v24.4s, v18.8h, #0
+; CHECK-GI-NEXT: sshll2 v25.4s, v2.8h, #0
+; CHECK-GI-NEXT: sshll2 v26.4s, v19.8h, #0
+; CHECK-GI-NEXT: sshll2 v31.4s, v3.8h, #0
+; CHECK-GI-NEXT: ushll2 v8.4s, v27.8h, #0
+; CHECK-GI-NEXT: ushll2 v9.4s, v4.8h, #0
+; CHECK-GI-NEXT: ushll2 v10.4s, v28.8h, #0
+; CHECK-GI-NEXT: sshll2 v11.4s, v29.8h, #0
+; CHECK-GI-NEXT: sshll2 v12.4s, v6.8h, #0
+; CHECK-GI-NEXT: ushll2 v13.4s, v5.8h, #0
+; CHECK-GI-NEXT: sshll2 v14.4s, v30.8h, #0
+; CHECK-GI-NEXT: sshll2 v15.4s, v7.8h, #0
+; CHECK-GI-NEXT: mul v20.4s, v20.4s, v24.4s
+; CHECK-GI-NEXT: mul v21.4s, v21.4s, v25.4s
+; CHECK-GI-NEXT: mul v22.4s, v22.4s, v26.4s
+; CHECK-GI-NEXT: mul v23.4s, v23.4s, v31.4s
+; CHECK-GI-NEXT: mul v24.4s, v8.4s, v11.4s
+; CHECK-GI-NEXT: mul v25.4s, v9.4s, v12.4s
+; CHECK-GI-NEXT: ushll v16.4s, v16.4h, #0
+; CHECK-GI-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mul v26.4s, v10.4s, v14.4s
+; CHECK-GI-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mul v31.4s, v13.4s, v15.4s
+; CHECK-GI-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: ushll v17.4s, v17.4h, #0
+; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT: sshll v18.4s, v18.4h, #0
+; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT: sshll v19.4s, v19.4h, #0
+; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT: ushll v27.4s, v27.4h, #0
+; CHECK-GI-NEXT: ushll v4.4s, v4.4h, #0
+; CHECK-GI-NEXT: ushll v28.4s, v28.4h, #0
+; CHECK-GI-NEXT: ushll v5.4s, v5.4h, #0
+; CHECK-GI-NEXT: sshll v29.4s, v29.4h, #0
+; CHECK-GI-NEXT: sshll v6.4s, v6.4h, #0
+; CHECK-GI-NEXT: sshll v30.4s, v30.4h, #0
+; CHECK-GI-NEXT: sshll v7.4s, v7.4h, #0
+; CHECK-GI-NEXT: mla v20.4s, v16.4s, v18.4s
+; CHECK-GI-NEXT: mla v21.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT: mla v22.4s, v17.4s, v19.4s
+; CHECK-GI-NEXT: mla v23.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT: mla v24.4s, v27.4s, v29.4s
+; CHECK-GI-NEXT: mla v25.4s, v4.4s, v6.4s
+; CHECK-GI-NEXT: mla v26.4s, v28.4s, v30.4s
+; CHECK-GI-NEXT: mla v31.4s, v5.4s, v7.4s
+; CHECK-GI-NEXT: add v0.4s, v20.4s, v21.4s
+; CHECK-GI-NEXT: add v1.4s, v22.4s, v23.4s
+; CHECK-GI-NEXT: add v2.4s, v24.4s, v25.4s
+; CHECK-GI-NEXT: add v3.4s, v26.4s, v31.4s
+; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: addv s0, v0.4s
+; CHECK-GI-NEXT: addv s1, v1.4s
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: fmov w9, s1
+; CHECK-GI-NEXT: add w0, w8, w9
+; CHECK-GI-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-GI-NEXT: ret
+entry:
+ %az = zext <32 x i8> %a to <32 x i32>
+ %bz = sext <32 x i8> %b to <32 x i32>
+ %m1 = mul nuw nsw <32 x i32> %az, %bz
+ %r1 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %m1)
+ %cz = zext <32 x i8> %c to <32 x i32>
+ %dz = sext <32 x i8> %d to <32 x i32>
+ %m2 = mul nuw nsw <32 x i32> %cz, %dz
+ %r2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %m2)
+ %x = add i32 %r1, %r2
+ ret i32 %x
+}
+
+
+define i32 @test_udot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
+; CHECK-SD-LABEL: test_udot_v33i8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ldr b0, [x0, #32]
+; CHECK-SD-NEXT: ldr b1, [x1, #32]
+; CHECK-SD-NEXT: movi v7.2d, #0000000000000000
+; CHECK-SD-NEXT: ldp q3, q4, [x1]
+; CHECK-SD-NEXT: umull v0.8h, v1.8b, v0.8b
+; CHECK-SD-NEXT: ldp q1, q2, [x0]
+; CHECK-SD-NEXT: umull v5.8h, v4.8b, v2.8b
+; CHECK-SD-NEXT: umull v6.8h, v3.8b, v1.8b
+; CHECK-SD-NEXT: umull2 v2.8h, v4.16b, v2.16b
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: umull2 v1.8h, v3.16b, v1.16b
+; CHECK-SD-NEXT: mov v7.s[0], v0.s[0]
+; CHECK-SD-NEXT: uaddl2 v3.4s, v6.8h, v5.8h
+; CHECK-SD-NEXT: uaddl2 v0.4s, v1.8h, v2.8h
+; CHECK-SD-NEXT: uaddl v1.4s, v1.4h, v2.4h
+; CHECK-SD-NEXT: add v0.4s, v3.4s, v0.4s
+; CHECK-SD-NEXT: uaddw v2.4s, v7.4s, v6.4h
+; CHECK-SD-NEXT: uaddw v2.4s, v2.4s, v5.4h
+; CHECK-SD-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT: add v0.4s, v2.4s, v0.4s
+; CHECK-SD-NEXT: addv s0, v0.4s
+; CHECK-SD-NEXT: fmov w8, s0
+; CHECK-SD-NEXT: add w0, w8, w2
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_udot_v33i8:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ldp q19, q4, [x1]
+; CHECK-GI-NEXT: mov v25.s[0], wzr
+; CHECK-GI-NEXT: ldp q7, q5, [x0]
+; CHECK-GI-NEXT: umov w8, v19.b[0]
+; CHECK-GI-NEXT: umov w9, v19.b[4]
+; CHECK-GI-NEXT: umov w10, v19.b[8]
+; CHECK-GI-NEXT: umov w11, v19.b[1]
+; CHECK-GI-NEXT: umov w12, v19.b[6]
+; CHECK-GI-NEXT: umov w13, v19.b[12]
+; CHECK-GI-NEXT: umov w14, v4.b[0]
+; CHECK-GI-NEXT: umov w15, v4.b[4]
+; CHECK-GI-NEXT: umov w16, v4.b[12]
+; CHECK-GI-NEXT: mov v25.s[1], wzr
+; CHECK-GI-NEXT: mov v0.s[0], w8
+; CHECK-GI-NEXT: umov w8, v19.b[5]
+; CHECK-GI-NEXT: mov v2.s[0], w9
+; CHECK-GI-NEXT: umov w9, v19.b[9]
+; CHECK-GI-NEXT: mov v1.s[0], w10
+; CHECK-GI-NEXT: umov w10, v19.b[2]
+; CHECK-GI-NEXT: mov v6.s[0], w13
+; CHECK-GI-NEXT: umov w13, v19.b[3]
+; CHECK-GI-NEXT: mov v3.s[0], w14
+; CHECK-GI-NEXT: umov w14, v19.b[13]
+; CHECK-GI-NEXT: mov v16.s[0], w15
+; CHECK-GI-NEXT: umov w15, v4.b[8]
+; CHECK-GI-NEXT: mov v0.s[1], w11
+; CHECK-GI-NEXT: umov w11, v19.b[10]
+; CHECK-GI-NEXT: mov v2.s[1], w8
+; CHECK-GI-NEXT: ldrb w8, [x0, #32]
+; CHECK-GI-NEXT: mov v1.s[1], w9
+; CHECK-GI-NEXT: ldrb w9, [x1, #32]
+; CHECK-GI-NEXT: mov v17.s[0], w16
+; CHECK-GI-NEXT: umov w16, v19.b[14]
+; CHECK-GI-NEXT: mov v25.s[2], wzr
+; CHECK-GI-NEXT: mul w8, w9, w8
+; CHECK-GI-NEXT: mov v6.s[1], w14
+; CHECK-GI-NEXT: umov w14, v4.b[1]
+; CHECK-GI-NEXT: mov v0.s[2], w10
+; CHECK-GI-NEXT: umov w10, v19.b[7]
+; CHECK-GI-NEXT: mov v2.s[2], w12
+; CHECK-GI-NEXT: umov w12, v19.b[11]
+; CHECK-GI-NEXT: mov v1.s[2], w11
+; CHECK-GI-NEXT: umov w11, v4.b[5]
+; CHECK-GI-NEXT: mov v18.s[0], w15
+; CHECK-GI-NEXT: umov w15, v19.b[15]
+; CHECK-GI-NEXT: umov w9, v5.b[2]
+; CHECK-GI-NEXT: mov v6.s[2], w16
+; CHECK-GI-NEXT: umov w16, v7.b[0]
+; CHECK-GI-NEXT: mov v3.s[1], w14
+; CHECK-GI-NEXT: mov v0.s[3], w13
+; CHECK-GI-NEXT: umov w13, v7.b[4]
+; CHECK-GI-NEXT: mov v2.s[3], w10
+; CHECK-GI-NEXT: umov w10, v4.b[6]
+; CHECK-GI-NEXT: mov v1.s[3], w12
+; CHECK-GI-NEXT: umov w12, v4.b[13]
+; CHECK-GI-NEXT: mov v16.s[1], w11
+; CHECK-GI-NEXT: umov w11, v4.b[9]
+; CHECK-GI-NEXT: umov w14, v7.b[5]
+; CHECK-GI-NEXT: mov v19.s[0], w16
+; CHECK-GI-NEXT: umov w16, v7.b[1]
+; CHECK-GI-NEXT: mov v6.s[3], w15
+; CHECK-GI-NEXT: mov v20.s[0], w13
+; CHECK-GI-NEXT: umov w13, v4.b[2]
+; CHECK-GI-NEXT: umov w15, v7.b[6]
+; CHECK-GI-NEXT: mov v17.s[1], w12
+; CHECK-GI-NEXT: umov w12, v4.b[14]
+; CHECK-GI-NEXT: mov v27.s[0], w8
+; CHECK-GI-NEXT: mov v16.s[2], w10
+; CHECK-GI-NEXT: umov w10, v4.b[7]
+; CHECK-GI-NEXT: mov v18.s[1], w11
+; CHECK-GI-NEXT: umov w11, v4.b[10]
+; CHECK-GI-NEXT: mov v19.s[1], w16
+; CHECK-GI-NEXT: umov w16, v5.b[4]
+; CHECK-GI-NEXT: mov v20.s[1], w14
+; CHECK-GI-NEXT: umov w14, v4.b[15]
+; CHECK-GI-NEXT: mov v3.s[2], w13
+; CHECK-GI-NEXT: mov v17.s[2], w12
+; CHECK-GI-NEXT: umov w12, v7.b[12]
+; CHECK-GI-NEXT: umov w13, v7.b[7]
+; CHECK-GI-NEXT: mov v16.s[3], w10
+; CHECK-GI-NEXT: umov w10, v7.b[8]
+; CHECK-GI-NEXT: umov w8, v7.b[3]
+; CHECK-GI-NEXT: mov v18.s[2], w11
+; CHECK-GI-NEXT: umov w11, v7.b[2]
+; CHECK-GI-NEXT: mov v23.s[0], w16
+; CHECK-GI-NEXT: mov v20.s[2], w15
+; CHECK-GI-NEXT: umov w15, v5.b[12]
+; CHECK-GI-NEXT: umov w16, v7.b[14]
+; CHECK-GI-NEXT: mov v17.s[3], w14
+; CHECK-GI-NEXT: umov w14, v7.b[13]
+; CHECK-GI-NEXT: mov v22.s[0], w12
+; CHECK-GI-NEXT: umov w12, v7.b[9]
+; CHECK-GI-NEXT: mov v21.s[0], w10
+; CHECK-GI-NEXT: umov w10, v4.b[3]
+; CHECK-GI-NEXT: mov v19.s[2], w11
+; CHECK-GI-NEXT: umov w11, v5.b[0]
+; CHECK-GI-NEXT: mov v27.s[1], wzr
+; CHECK-GI-NEXT: mov v20.s[3], w13
+; CHECK-GI-NEXT: umov w13, v5.b[5]
+; CHECK-GI-NEXT: mov v24.s[0], w15
+; CHECK-GI-NEXT: mov v22.s[1], w14
+; CHECK-GI-NEXT: umov w14, v5.b[8]
+; CHECK-GI-NEXT: umov w15, v4.b[11]
+; CHECK-GI-NEXT: mov v21.s[1], w12
+; CHECK-GI-NEXT: umov w12, v5.b[13]
+; CHECK-GI-NEXT: mov v25.s[3], wzr
+; CHECK-GI-NEXT: mov v4.s[0], w11
+; CHECK-GI-NEXT: umov w11, v5.b[1]
+; CHECK-GI-NEXT: mov v3.s[3], w10
+; CHECK-GI-NEXT: mov v23.s[1], w13
+; CHECK-GI-NEXT: umov w13, v5.b[6]
+; CHECK-GI-NEXT: mov v19.s[3], w8
+; CHECK-GI-NEXT: mov v22.s[2], w16
+; CHECK-GI-NEXT: umov w16, v5.b[9]
+; CHECK-GI-NEXT: mov v26.s[0], w14
+; CHECK-GI-NEXT: mov v24.s[1], w12
+; CHECK-GI-NEXT: umov w12, v5.b[14]
+; CHECK-GI-NEXT: umov w14, v7.b[10]
+; CHECK-GI-NEXT: mov v4.s[1], w11
+; CHECK-GI-NEXT: umov w11, v7.b[15]
+; CHECK-GI-NEXT: mov v18.s[3], w15
+; CHECK-GI-NEXT: mov v23.s[2], w13
+; CHECK-GI-NEXT: umov w13, v5.b[7]
+; CHECK-GI-NEXT: mul v2.4s, v2.4s, v20.4s
+; CHECK-GI-NEXT: mov v26.s[1], w16
+; CHECK-GI-NEXT: umov w16, v5.b[10]
+; CHECK-GI-NEXT: mov v27.s[2], wzr
+; CHECK-GI-NEXT: mov v24.s[2], w12
+; CHECK-GI-NEXT: umov w12, v5.b[15]
+; CHECK-GI-NEXT: mov v21.s[2], w14
+; CHECK-GI-NEXT: umov w14, v7.b[11]
+; CHECK-GI-NEXT: mov v4.s[2], w9
+; CHECK-GI-NEXT: umov w9, v5.b[3]
+; CHECK-GI-NEXT: mov v22.s[3], w11
+; CHECK-GI-NEXT: umov w11, v5.b[11]
+; CHECK-GI-NEXT: mov v23.s[3], w13
+; CHECK-GI-NEXT: mov v26.s[2], w16
+; CHECK-GI-NEXT: mla v2.4s, v0.4s, v19.4s
+; CHECK-GI-NEXT: mov v27.s[3], wzr
+; CHECK-GI-NEXT: mov v24.s[3], w12
+; CHECK-GI-NEXT: mov v21.s[3], w14
+; CHECK-GI-NEXT: mov v4.s[3], w9
+; CHECK-GI-NEXT: mul v5.4s, v6.4s, v22.4s
+; CHECK-GI-NEXT: mul v6.4s, v16.4s, v23.4s
+; CHECK-GI-NEXT: add v16.4s, v25.4s, v25.4s
+; CHECK-GI-NEXT: mov v26.s[3], w11
+; CHECK-GI-NEXT: mul v7.4s, v17.4s, v24.4s
+; CHECK-GI-NEXT: add v0.4s, v25.4s, v16.4s
+; CHECK-GI-NEXT: mla v5.4s, v1.4s, v21.4s
+; CHECK-GI-NEXT: mla v6.4s, v3.4s, v4.4s
+; CHECK-GI-NEXT: add v3.4s, v16.4s, v16.4s
+; CHECK-GI-NEXT: mla v7.4s, v18.4s, v26.4s
+; CHECK-GI-NEXT: add v0.4s, v27.4s, v0.4s
+; CHECK-GI-NEXT: add v1.4s, v2.4s, v5.4s
+; CHECK-GI-NEXT: add v0.4s, v0.4s, v3.4s
+; CHECK-GI-NEXT: add v2.4s, v6.4s, v7.4s
+; CHECK-GI-NEXT: add v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: addv s0, v0.4s
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: add w0, w8, w2
+; CHECK-GI-NEXT: ret
+entry:
+ %0 = load <33 x i8>, ptr %a
+ %1 = zext <33 x i8> %0 to <33 x i32>
+ %2 = load <33 x i8>, ptr %b
+ %3 = zext <33 x i8> %2 to <33 x i32>
+ %4 = mul nuw nsw <33 x i32> %3, %1
+ %5 = call i32 @llvm.vector.reduce.add.v33i32(<33 x i32> %4)
+ %op.extra = add i32 %5, %sum
+ ret i32 %op.extra
+}
+
+define i32 @test_udot_v33i8_nomla(ptr nocapture readonly %a1) {
+; CHECK-SD-LABEL: test_udot_v33i8_nomla:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ldr b1, [x0, #32]
+; CHECK-SD-NEXT: ldp q3, q2, [x0]
+; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
+; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: ushll v4.8h, v2.8b, #0
+; CHECK-SD-NEXT: ushll v5.8h, v3.8b, #0
+; CHECK-SD-NEXT: ushll2 v2.8h, v2.16b, #0
+; CHECK-SD-NEXT: ushll2 v3.8h, v3.16b, #0
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-SD-NEXT: uaddl2 v6.4s, v5.8h, v4.8h
+; CHECK-SD-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-SD-NEXT: uaddl2 v1.4s, v3.8h, v2.8h
+; CHECK-SD-NEXT: uaddl v2.4s, v3.4h, v2.4h
+; CHECK-SD-NEXT: add v1.4s, v6.4s, v1.4s
+; CHECK-SD-NEXT: uaddw v0.4s, v0.4s, v5.4h
+; CHECK-SD-NEXT: add v1.4s, v2.4s, v1.4s
+; CHECK-SD-NEXT: uaddw v0.4s, v0.4s, v4.4h
+; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: addv s0, v0.4s
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_udot_v33i8_nomla:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: str x27, [sp, #-80]! // 8-byte Folded Spill
+; CHECK-GI-NEXT: stp x26, x25, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT: stp x24, x23, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 80
+; CHECK-GI-NEXT: .cfi_offset w19, -8
+; CHECK-GI-NEXT: .cfi_offset w20, -16
+; CHECK-GI-NEXT: .cfi_offset w21, -24
+; CHECK-GI-NEXT: .cfi_offset w22, -32
+; CHECK-GI-NEXT: .cfi_offset w23, -40
+; CHECK-GI-NEXT: .cfi_offset w24, -48
+; CHECK-GI-NEXT: .cfi_offset w25, -56
+; CHECK-GI-NEXT: .cfi_offset w26, -64
+; CHECK-GI-NEXT: .cfi_offset w27, -80
+; CHECK-GI-NEXT: ldp q2, q1, [x0]
+; CHECK-GI-NEXT: mov v0.s[0], wzr
+; CHECK-GI-NEXT: ldrb w2, [x0, #32]
+; CHECK-GI-NEXT: umov w20, v2.b[0]
+; CHECK-GI-NEXT: umov w21, v2.b[4]
+; CHECK-GI-NEXT: umov w22, v2.b[8]
+; CHECK-GI-NEXT: umov w23, v2.b[12]
+; CHECK-GI-NEXT: umov w24, v1.b[0]
+; CHECK-GI-NEXT: umov w25, v1.b[4]
+; CHECK-GI-NEXT: umov w26, v1.b[8]
+; CHECK-GI-NEXT: umov w27, v1.b[12]
+; CHECK-GI-NEXT: umov w0, v2.b[1]
+; CHECK-GI-NEXT: umov w12, v2.b[2]
+; CHECK-GI-NEXT: umov w8, v2.b[3]
+; CHECK-GI-NEXT: umov w3, v2.b[5]
+; CHECK-GI-NEXT: umov w14, v2.b[6]
+; CHECK-GI-NEXT: umov w9, v2.b[7]
+; CHECK-GI-NEXT: umov w4, v2.b[9]
+; CHECK-GI-NEXT: umov w15, v2.b[10]
+; CHECK-GI-NEXT: umov w10, v2.b[11]
+; CHECK-GI-NEXT: umov w5, v2.b[13]
+; CHECK-GI-NEXT: umov w16, v2.b[14]
+; CHECK-GI-NEXT: umov w11, v2.b[15]
+; CHECK-GI-NEXT: umov w6, v1.b[1]
+; CHECK-GI-NEXT: umov w7, v1.b[5]
+; CHECK-GI-NEXT: umov w19, v1.b[9]
+; CHECK-GI-NEXT: mov v2.s[0], w20
+; CHECK-GI-NEXT: mov v3.s[0], w21
+; CHECK-GI-NEXT: mov v4.s[0], w22
+; CHECK-GI-NEXT: mov v5.s[0], w23
+; CHECK-GI-NEXT: mov v6.s[0], w24
+; CHECK-GI-NEXT: ldp x24, x23, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v7.s[0], w25
+; CHECK-GI-NEXT: mov v16.s[0], w26
+; CHECK-GI-NEXT: umov w20, v1.b[13]
+; CHECK-GI-NEXT: ldp x26, x25, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v17.s[0], w27
+; CHECK-GI-NEXT: mov v18.s[0], w2
+; CHECK-GI-NEXT: umov w17, v1.b[2]
+; CHECK-GI-NEXT: umov w1, v1.b[6]
+; CHECK-GI-NEXT: umov w2, v1.b[10]
+; CHECK-GI-NEXT: umov w21, v1.b[14]
+; CHECK-GI-NEXT: mov v2.s[1], w0
+; CHECK-GI-NEXT: mov v3.s[1], w3
+; CHECK-GI-NEXT: mov v4.s[1], w4
+; CHECK-GI-NEXT: mov v5.s[1], w5
+; CHECK-GI-NEXT: mov v6.s[1], w6
+; CHECK-GI-NEXT: mov v7.s[1], w7
+; CHECK-GI-NEXT: mov v16.s[1], w19
+; CHECK-GI-NEXT: mov v17.s[1], w20
+; CHECK-GI-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v18.s[1], wzr
+; CHECK-GI-NEXT: mov v0.s[1], wzr
+; CHECK-GI-NEXT: umov w13, v1.b[3]
+; CHECK-GI-NEXT: umov w18, v1.b[7]
+; CHECK-GI-NEXT: umov w0, v1.b[11]
+; CHECK-GI-NEXT: umov w3, v1.b[15]
+; CHECK-GI-NEXT: mov v2.s[2], w12
+; CHECK-GI-NEXT: mov v3.s[2], w14
+; CHECK-GI-NEXT: mov v4.s[2], w15
+; CHECK-GI-NEXT: mov v5.s[2], w16
+; CHECK-GI-NEXT: mov v6.s[2], w17
+; CHECK-GI-NEXT: mov v7.s[2], w1
+; CHECK-GI-NEXT: mov v16.s[2], w2
+; CHECK-GI-NEXT: mov v17.s[2], w21
+; CHECK-GI-NEXT: mov v18.s[2], wzr
+; CHECK-GI-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v0.s[2], wzr
+; CHECK-GI-NEXT: mov v2.s[3], w8
+; CHECK-GI-NEXT: mov v3.s[3], w9
+; CHECK-GI-NEXT: mov v4.s[3], w10
+; CHECK-GI-NEXT: mov v5.s[3], w11
+; CHECK-GI-NEXT: mov v6.s[3], w13
+; CHECK-GI-NEXT: mov v7.s[3], w18
+; CHECK-GI-NEXT: mov v16.s[3], w0
+; CHECK-GI-NEXT: mov v17.s[3], w3
+; CHECK-GI-NEXT: mov v18.s[3], wzr
+; CHECK-GI-NEXT: mov v0.s[3], wzr
+; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: add v2.4s, v4.4s, v5.4s
+; CHECK-GI-NEXT: add v3.4s, v6.4s, v7.4s
+; CHECK-GI-NEXT: add v4.4s, v16.4s, v17.4s
+; CHECK-GI-NEXT: add v5.4s, v18.4s, v0.4s
+; CHECK-GI-NEXT: add v0.4s, v0.4s, v0.4s
+; CHECK-GI-NEXT: add v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT: add v2.4s, v3.4s, v4.4s
+; CHECK-GI-NEXT: add v3.4s, v5.4s, v0.4s
+; CHECK-GI-NEXT: add v0.4s, v0.4s, v0.4s
+; CHECK-GI-NEXT: add v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT: add v0.4s, v3.4s, v0.4s
+; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: addv s0, v0.4s
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ldr x27, [sp], #80 // 8-byte Folded Reload
+; CHECK-GI-NEXT: ret
+entry:
+ %0 = load <33 x i8>, ptr %a1
+ %1 = zext <33 x i8> %0 to <33 x i32>
+ %2 = call i32 @llvm.vector.reduce.add.v33i32(<33 x i32> %1)
+ ret i32 %2
+}
+define i32 @test_sdot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
+; CHECK-SD-LABEL: test_sdot_v33i8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ldr b0, [x0, #32]
+; CHECK-SD-NEXT: ldr b1, [x1, #32]
+; CHECK-SD-NEXT: movi v7.2d, #0000000000000000
+; CHECK-SD-NEXT: ldp q3, q4, [x1]
+; CHECK-SD-NEXT: smull v0.8h, v1.8b, v0.8b
+; CHECK-SD-NEXT: ldp q1, q2, [x0]
+; CHECK-SD-NEXT: smull v5.8h, v4.8b, v2.8b
+; CHECK-SD-NEXT: smull v6.8h, v3.8b, v1.8b
+; CHECK-SD-NEXT: smull2 v2.8h, v4.16b, v2.16b
+; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: smull2 v1.8h, v3.16b, v1.16b
+; CHECK-SD-NEXT: mov v7.s[0], v0.s[0]
+; CHECK-SD-NEXT: saddl2 v3.4s, v6.8h, v5.8h
+; CHECK-SD-NEXT: saddl2 v0.4s, v1.8h, v2.8h
+; CHECK-SD-NEXT: saddl v1.4s, v1.4h, v2.4h
+; CHECK-SD-NEXT: add v0.4s, v3.4s, v0.4s
+; CHECK-SD-NEXT: saddw v2.4s, v7.4s, v6.4h
+; CHECK-SD-NEXT: saddw v2.4s, v2.4s, v5.4h
+; CHECK-SD-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT: add v0.4s, v2.4s, v0.4s
+; CHECK-SD-NEXT: addv s0, v0.4s
+; CHECK-SD-NEXT: fmov w8, s0
+; CHECK-SD-NEXT: add w0, w8, w2
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_sdot_v33i8:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ldp q19, q4, [x1]
+; CHECK-GI-NEXT: mov v25.s[0], wzr
+; CHECK-GI-NEXT: ldp q7, q5, [x0]
+; CHECK-GI-NEXT: smov w8, v19.b[0]
+; CHECK-GI-NEXT: smov w9, v19.b[4]
+; CHECK-GI-NEXT: smov w10, v19.b[8]
+; CHECK-GI-NEXT: smov w11, v19.b[1]
+; CHECK-GI-NEXT: smov w12, v19.b[6]
+; CHECK-GI-NEXT: smov w13, v19.b[12]
+; CHECK-GI-NEXT: smov w14, v4.b[0]
+; CHECK-GI-NEXT: smov w15, v4.b[4]
+; CHECK-GI-NEXT: smov w16, v4.b[12]
+; CHECK-GI-NEXT: mov v25.s[1], wzr
+; CHECK-GI-NEXT: mov v0.s[0], w8
+; CHECK-GI-NEXT: smov w8, v19.b[5]
+; CHECK-GI-NEXT: mov v2.s[0], w9
+; CHECK-GI-NEXT: smov w9, v19.b[9]
+; CHECK-GI-NEXT: mov v1.s[0], w10
+; CHECK-GI-NEXT: smov w10, v19.b[2]
+; CHECK-GI-NEXT: mov v6.s[0], w13
+; CHECK-GI-NEXT: smov w13, v19.b[3]
+; CHECK-GI-NEXT: mov v3.s[0], w14
+; CHECK-GI-NEXT: smov w14, v19.b[13]
+; CHECK-GI-NEXT: mov v16.s[0], w15
+; CHECK-GI-NEXT: smov w15, v4.b[8]
+; CHECK-GI-NEXT: mov v0.s[1], w11
+; CHECK-GI-NEXT: smov w11, v19.b[10]
+; CHECK-GI-NEXT: mov v2.s[1], w8
+; CHECK-GI-NEXT: ldrsb w8, [x0, #32]
+; CHECK-GI-NEXT: mov v1.s[1], w9
+; CHECK-GI-NEXT: ldrsb w9, [x1, #32]
+; CHECK-GI-NEXT: mov v17.s[0], w16
+; CHECK-GI-NEXT: smov w16, v19.b[14]
+; CHECK-GI-NEXT: mov v25.s[2], wzr
+; CHECK-GI-NEXT: mul w8, w9, w8
+; CHECK-GI-NEXT: mov v6.s[1], w14
+; CHECK-GI-NEXT: smov w14, v4.b[1]
+; CHECK-GI-NEXT: mov v0.s[2], w10
+; CHECK-GI-NEXT: smov w10, v19.b[7]
+; CHECK-GI-NEXT: mov v2.s[2], w12
+; CHECK-GI-NEXT: smov w12, v19.b[11]
+; CHECK-GI-NEXT: mov v1.s[2], w11
+; CHECK-GI-NEXT: smov w11, v4.b[5]
+; CHECK-GI-NEXT: mov v18.s[0], w15
+; CHECK-GI-NEXT: smov w15, v19.b[15]
+; CHECK-GI-NEXT: smov w9, v5.b[2]
+; CHECK-GI-NEXT: mov v6.s[2], w16
+; CHECK-GI-NEXT: smov w16, v7.b[0]
+; CHECK-GI-NEXT: mov v3.s[1], w14
+; CHECK-GI-NEXT: mov v0.s[3], w13
+; CHECK-GI-NEXT: smov w13, v7.b[4]
+; CHECK-GI-NEXT: mov v2.s[3], w10
+; CHECK-GI-NEXT: smov w10, v4.b[6]
+; CHECK-GI-NEXT: mov v1.s[3], w12
+; CHECK-GI-NEXT: smov w12, v4.b[13]
+; CHECK-GI-NEXT: mov v16.s[1], w11
+; CHECK-GI-NEXT: smov w11, v4.b[9]
+; CHECK-GI-NEXT: smov w14, v7.b[5]
+; CHECK-GI-NEXT: mov v19.s[0], w16
+; CHECK-GI-NEXT: smov w16, v7.b[1]
+; CHECK-GI-NEXT: mov v6.s[3], w15
+; CHECK-GI-NEXT: mov v20.s[0], w13
+; CHECK-GI-NEXT: smov w13, v4.b[2]
+; CHECK-GI-NEXT: smov w15, v7.b[6]
+; CHECK-GI-NEXT: mov v17.s[1], w12
+; CHECK-GI-NEXT: smov w12, v4.b[14]
+; CHECK-GI-NEXT: mov v27.s[0], w8
+; CHECK-GI-NEXT: mov v16.s[2], w10
+; CHECK-GI-NEXT: smov w10, v4.b[7]
+; CHECK-GI-NEXT: mov v18.s[1], w11
+; CHECK-GI-NEXT: smov w11, v4.b[10]
+; CHECK-GI-NEXT: mov v19.s[1], w16
+; CHECK-GI-NEXT: smov w16, v5.b[4]
+; CHECK-GI-NEXT: mov v20.s[1], w14
+; CHECK-GI-NEXT: smov w14, v4.b[15]
+; CHECK-GI-NEXT: mov v3.s[2], w13
+; CHECK-GI-NEXT: mov v17.s[2], w12
+; CHECK-GI-NEXT: smov w12, v7.b[12]
+; CHECK-GI-NEXT: smov w13, v7.b[7]
+; CHECK-GI-NEXT: mov v16.s[3], w10
+; CHECK-GI-NEXT: smov w10, v7.b[8]
+; CHECK-GI-NEXT: smov w8, v7.b[3]
+; CHECK-GI-NEXT: mov v18.s[2], w11
+; CHECK-GI-NEXT: smov w11, v7.b[2]
+; CHECK-GI-NEXT: mov v23.s[0], w16
+; CHECK-GI-NEXT: mov v20.s[2], w15
+; CHECK-GI-NEXT: smov w15, v5.b[12]
+; CHECK-GI-NEXT: smov w16, v7.b[14]
+; CHECK-GI-NEXT: mov v17.s[3], w14
+; CHECK-GI-NEXT: smov w14, v7.b[13]
+; CHECK-GI-NEXT: mov v22.s[0], w12
+; CHECK-GI-NEXT: smov w12, v7.b[9]
+; CHECK-GI-NEXT: mov v21.s[0], w10
+; CHECK-GI-NEXT: smov w10, v4.b[3]
+; CHECK-GI-NEXT: mov v19.s[2], w11
+; CHECK-GI-NEXT: smov w11, v5.b[0]
+; CHECK-GI-NEXT: mov v27.s[1], wzr
+; CHECK-GI-NEXT: mov v20.s[3], w13
+; CHECK-GI-NEXT: smov w13, v5.b[5]
+; CHECK-GI-NEXT: mov v24.s[0], w15
+; CHECK-GI-NEXT: mov v22.s[1], w14
+; CHECK-GI-NEXT: smov w14, v5.b[8]
+; CHECK-GI-NEXT: smov w15, v4.b[11]
+; CHECK-GI-NEXT: mov v21.s[1], w12
+; CHECK-GI-NEXT: smov w12, v5.b[13]
+; CHECK-GI-NEXT: mov v25.s[3], wzr
+; CHECK-GI-NEXT: mov v4.s[0], w11
+; CHECK-GI-NEXT: smov w11, v5.b[1]
+; CHECK-GI-NEXT: mov v3.s[3], w10
+; CHECK-GI-NEXT: mov v23.s[1], w13
+; CHECK-GI-NEXT: smov w13, v5.b[6]
+; CHECK-GI-NEXT: mov v19.s[3], w8
+; CHECK-GI-NEXT: mov v22.s[2], w16
+; CHECK-GI-NEXT: smov w16, v5.b[9]
+; CHECK-GI-NEXT: mov v26.s[0], w14
+; CHECK-GI-NEXT: mov v24.s[1], w12
+; CHECK-GI-NEXT: smov w12, v5.b[14]
+; CHECK-GI-NEXT: smov w14, v7.b[10]
+; CHECK-GI-NEXT: mov v4.s[1], w11
+; CHECK-GI-NEXT: smov w11, v7.b[15]
+; CHECK-GI-NEXT: mov v18.s[3], w15
+; CHECK-GI-NEXT: mov v23.s[2], w13
+; CHECK-GI-NEXT: smov w13, v5.b[7]
+; CHECK-GI-NEXT: mul v2.4s, v2.4s, v20.4s
+; CHECK-GI-NEXT: mov v26.s[1], w16
+; CHECK-GI-NEXT: smov w16, v5.b[10]
+; CHECK-GI-NEXT: mov v27.s[2], wzr
+; CHECK-GI-NEXT: mov v24.s[2], w12
+; CHECK-GI-NEXT: smov w12, v5.b[15]
+; CHECK-GI-NEXT: mov v21.s[2], w14
+; CHECK-GI-NEXT: smov w14, v7.b[11]
+; CHECK-GI-NEXT: mov v4.s[2], w9
+; CHECK-GI-NEXT: smov w9, v5.b[3]
+; CHECK-GI-NEXT: mov v22.s[3], w11
+; CHECK-GI-NEXT: smov w11, v5.b[11]
+; CHECK-GI-NEXT: mov v23.s[3], w13
+; CHECK-GI-NEXT: mov v26.s[2], w16
+; CHECK-GI-NEXT: mla v2.4s, v0.4s, v19.4s
+; CHECK-GI-NEXT: mov v27.s[3], wzr
+; CHECK-GI-NEXT: mov v24.s[3], w12
+; CHECK-GI-NEXT: mov v21.s[3], w14
+; CHECK-GI-NEXT: mov v4.s[3], w9
+; CHECK-GI-NEXT: mul v5.4s, v6.4s, v22.4s
+; CHECK-GI-NEXT: mul v6.4s, v16.4s, v23.4s
+; CHECK-GI-NEXT: add v16.4s, v25.4s, v25.4s
+; CHECK-GI-NEXT: mov v26.s[3], w11
+; CHECK-GI-NEXT: mul v7.4s, v17.4s, v24.4s
+; CHECK-GI-NEXT: add v0.4s, v25.4s, v16.4s
+; CHECK-GI-NEXT: mla v5.4s, v1.4s, v21.4s
+; CHECK-GI-NEXT: mla v6.4s, v3.4s, v4.4s
+; CHECK-GI-NEXT: add v3.4s, v16.4s, v16.4s
+; CHECK-GI-NEXT: mla v7.4s, v18.4s, v26.4s
+; CHECK-GI-NEXT: add v0.4s, v27.4s, v0.4s
+; CHECK-GI-NEXT: add v1.4s, v2.4s, v5.4s
+; CHECK-GI-NEXT: add v0.4s, v0.4s, v3.4s
+; CHECK-GI-NEXT: add v2.4s, v6.4s, v7.4s
+; CHECK-GI-NEXT: add v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: addv s0, v0.4s
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: add w0, w8, w2
+; CHECK-GI-NEXT: ret
+entry:
+ %0 = load <33 x i8>, ptr %a
+ %1 = sext <33 x i8> %0 to <33 x i32>
+ %2 = load <33 x i8>, ptr %b
+ %3 = sext <33 x i8> %2 to <33 x i32>
+ %4 = mul nsw <33 x i32> %3, %1
+ %5 = call i32 @llvm.vector.reduce.add.v33i32(<33 x i32> %4)
%op.extra = add nsw i32 %5, %sum
ret i32 %op.extra
}
-define i32 @test_usdot_v32i8_double(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) {
-; CHECK-SD-LABEL: test_usdot_v32i8_double:
+define i32 @test_sdot_v33i8_double(<33 x i8> %a, <33 x i8> %b, <33 x i8> %c, <33 x i8> %d) {
+; CHECK-SD-LABEL: test_sdot_v33i8_double:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: movi v16.2d, #0000000000000000
-; CHECK-SD-NEXT: movi v17.2d, #0000000000000000
-; CHECK-SD-NEXT: movi v18.2d, #0000000000000000
+; CHECK-SD-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: .cfi_offset w29, -16
+; CHECK-SD-NEXT: ldr b0, [sp, #344]
+; CHECK-SD-NEXT: add x8, sp, #352
+; CHECK-SD-NEXT: ldr b1, [sp, #80]
+; CHECK-SD-NEXT: ldr b2, [sp, #216]
+; CHECK-SD-NEXT: add x9, sp, #96
+; CHECK-SD-NEXT: add x10, sp, #104
+; CHECK-SD-NEXT: ld1 { v0.b }[1], [x8]
+; CHECK-SD-NEXT: add x8, sp, #88
+; CHECK-SD-NEXT: ldr b4, [sp, #408]
+; CHECK-SD-NEXT: ld1 { v1.b }[1], [x8]
+; CHECK-SD-NEXT: add x8, sp, #360
+; CHECK-SD-NEXT: add x12, sp, #248
+; CHECK-SD-NEXT: add x13, sp, #432
+; CHECK-SD-NEXT: add x11, sp, #384
+; CHECK-SD-NEXT: ldr b5, [sp, #144]
+; CHECK-SD-NEXT: ld1 { v0.b }[2], [x8]
+; CHECK-SD-NEXT: add x8, sp, #224
+; CHECK-SD-NEXT: ldr b6, [sp, #280]
+; CHECK-SD-NEXT: ld1 { v2.b }[1], [x8]
+; CHECK-SD-NEXT: ld1 { v1.b }[2], [x9]
+; CHECK-SD-NEXT: add x8, sp, #368
+; CHECK-SD-NEXT: add x9, sp, #232
+; CHECK-SD-NEXT: ldr b16, [sp, #744]
+; CHECK-SD-NEXT: ldr b17, [sp, #480]
+; CHECK-SD-NEXT: ld1 { v0.b }[3], [x8]
+; CHECK-SD-NEXT: add x8, sp, #376
+; CHECK-SD-NEXT: ldr b18, [sp, #936]
+; CHECK-SD-NEXT: ld1 { v2.b }[2], [x9]
+; CHECK-SD-NEXT: ld1 { v1.b }[3], [x10]
+; CHECK-SD-NEXT: add x9, sp, #240
+; CHECK-SD-NEXT: add x10, sp, #392
+; CHECK-SD-NEXT: ldr b19, [sp, #672]
+; CHECK-SD-NEXT: ldr b7, [sp, #16]
+; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8]
+; CHECK-SD-NEXT: add x8, sp, #112
+; CHECK-SD-NEXT: ldr b21, [sp, #1000]
+; CHECK-SD-NEXT: ld1 { v2.b }[3], [x9]
+; CHECK-SD-NEXT: ld1 { v1.b }[4], [x8]
+; CHECK-SD-NEXT: add x8, sp, #416
+; CHECK-SD-NEXT: ld1 { v4.b }[1], [x8]
+; CHECK-SD-NEXT: add x8, sp, #120
+; CHECK-SD-NEXT: add x9, sp, #400
+; CHECK-SD-NEXT: ld1 { v0.b }[5], [x11]
+; CHECK-SD-NEXT: add x11, sp, #128
+; CHECK-SD-NEXT: ldr b22, [sp, #736]
+; CHECK-SD-NEXT: ld1 { v2.b }[4], [x12]
+; CHECK-SD-NEXT: add x12, sp, #424
+; CHECK-SD-NEXT: ld1 { v1.b }[5], [x8]
+; CHECK-SD-NEXT: ld1 { v4.b }[2], [x12]
+; CHECK-SD-NEXT: add x12, sp, #152
+; CHECK-SD-NEXT: add x8, sp, #136
+; CHECK-SD-NEXT: ld1 { v5.b }[1], [x12]
+; CHECK-SD-NEXT: add x12, sp, #440
+; CHECK-SD-NEXT: ld1 { v0.b }[6], [x10]
+; CHECK-SD-NEXT: ld1 { v1.b }[6], [x11]
+; CHECK-SD-NEXT: add x11, sp, #288
+; CHECK-SD-NEXT: add x10, sp, #256
+; CHECK-SD-NEXT: ld1 { v4.b }[3], [x13]
+; CHECK-SD-NEXT: ld1 { v6.b }[1], [x11]
+; CHECK-SD-NEXT: add x11, sp, #296
+; CHECK-SD-NEXT: ld1 { v0.b }[7], [x9]
+; CHECK-SD-NEXT: add x9, sp, #160
+; CHECK-SD-NEXT: ld1 { v2.b }[5], [x10]
+; CHECK-SD-NEXT: ld1 { v5.b }[2], [x9]
+; CHECK-SD-NEXT: add x10, sp, #168
+; CHECK-SD-NEXT: ld1 { v1.b }[7], [x8]
+; CHECK-SD-NEXT: ld1 { v4.b }[4], [x12]
+; CHECK-SD-NEXT: add x12, sp, #448
+; CHECK-SD-NEXT: ld1 { v6.b }[2], [x11]
+; CHECK-SD-NEXT: add x11, sp, #304
+; CHECK-SD-NEXT: add x8, sp, #464
+; CHECK-SD-NEXT: add x13, sp, #768
+; CHECK-SD-NEXT: ld1 { v5.b }[3], [x10]
+; CHECK-SD-NEXT: add x10, sp, #176
+; CHECK-SD-NEXT: add x9, sp, #264
+; CHECK-SD-NEXT: ld1 { v4.b }[5], [x12]
+; CHECK-SD-NEXT: add x12, sp, #456
+; CHECK-SD-NEXT: ld1 { v6.b }[3], [x11]
+; CHECK-SD-NEXT: add x11, sp, #760
+; CHECK-SD-NEXT: ld1 { v2.b }[6], [x9]
+; CHECK-SD-NEXT: add x9, sp, #272
+; CHECK-SD-NEXT: ld1 { v5.b }[4], [x10]
+; CHECK-SD-NEXT: add x10, sp, #312
+; CHECK-SD-NEXT: fmov s3, w0
+; CHECK-SD-NEXT: ld1 { v4.b }[6], [x12]
+; CHECK-SD-NEXT: ld1 { v6.b }[4], [x10]
+; CHECK-SD-NEXT: add x10, sp, #320
+; CHECK-SD-NEXT: add x12, sp, #680
+; CHECK-SD-NEXT: ld1 { v2.b }[7], [x9]
+; CHECK-SD-NEXT: add x9, sp, #184
+; CHECK-SD-NEXT: ld1 { v19.b }[1], [x12]
+; CHECK-SD-NEXT: add x12, sp, #776
+; CHECK-SD-NEXT: ld1 { v5.b }[5], [x9]
+; CHECK-SD-NEXT: ld1 { v4.b }[7], [x8]
+; CHECK-SD-NEXT: add x8, sp, #752
+; CHECK-SD-NEXT: ld1 { v6.b }[5], [x10]
+; CHECK-SD-NEXT: ld1 { v16.b }[1], [x8]
+; CHECK-SD-NEXT: add x10, sp, #24
+; CHECK-SD-NEXT: smull v22.8h, v22.8b, v21.8b
+; CHECK-SD-NEXT: ld1 { v7.b }[1], [x10]
+; CHECK-SD-NEXT: add x10, sp, #496
+; CHECK-SD-NEXT: mov v3.b[1], w1
+; CHECK-SD-NEXT: add x9, sp, #192
+; CHECK-SD-NEXT: ldr b20, [sp, #472]
+; CHECK-SD-NEXT: ldr b23, [sp, #208]
+; CHECK-SD-NEXT: ld1 { v16.b }[2], [x11]
+; CHECK-SD-NEXT: add x11, sp, #488
+; CHECK-SD-NEXT: ld1 { v5.b }[6], [x9]
+; CHECK-SD-NEXT: ld1 { v17.b }[1], [x11]
+; CHECK-SD-NEXT: add x11, sp, #944
+; CHECK-SD-NEXT: add x9, sp, #328
+; CHECK-SD-NEXT: ld1 { v18.b }[1], [x11]
+; CHECK-SD-NEXT: add x11, sp, #688
+; CHECK-SD-NEXT: ld1 { v6.b }[6], [x9]
+; CHECK-SD-NEXT: ld1 { v16.b }[3], [x13]
+; CHECK-SD-NEXT: ld1 { v19.b }[2], [x11]
+; CHECK-SD-NEXT: add x11, sp, #504
+; CHECK-SD-NEXT: ld1 { v17.b }[2], [x10]
+; CHECK-SD-NEXT: add x10, sp, #952
+; CHECK-SD-NEXT: add x13, sp, #784
+; CHECK-SD-NEXT: ld1 { v18.b }[2], [x10]
+; CHECK-SD-NEXT: add x10, sp, #32
+; CHECK-SD-NEXT: add x9, sp, #40
+; CHECK-SD-NEXT: ld1 { v16.b }[4], [x12]
+; CHECK-SD-NEXT: add x12, sp, #696
+; CHECK-SD-NEXT: ld1 { v7.b }[2], [x10]
+; CHECK-SD-NEXT: ld1 { v17.b }[3], [x11]
+; CHECK-SD-NEXT: add x11, sp, #960
+; CHECK-SD-NEXT: ld1 { v19.b }[3], [x12]
+; CHECK-SD-NEXT: ld1 { v18.b }[3], [x11]
+; CHECK-SD-NEXT: add x10, sp, #512
+; CHECK-SD-NEXT: add x11, sp, #704
+; CHECK-SD-NEXT: ld1 { v16.b }[5], [x13]
+; CHECK-SD-NEXT: add x12, sp, #792
+; CHECK-SD-NEXT: sshll v24.4s, v22.4h, #0
+; CHECK-SD-NEXT: ld1 { v17.b }[4], [x10]
+; CHECK-SD-NEXT: add x10, sp, #968
+; CHECK-SD-NEXT: ld1 { v19.b }[4], [x11]
+; CHECK-SD-NEXT: ld1 { v18.b }[4], [x10]
+; CHECK-SD-NEXT: add x10, sp, #520
+; CHECK-SD-NEXT: add x11, sp, #976
+; CHECK-SD-NEXT: ld1 { v16.b }[6], [x12]
+; CHECK-SD-NEXT: add x12, sp, #712
+; CHECK-SD-NEXT: smull v20.8h, v23.8b, v20.8b
+; CHECK-SD-NEXT: ld1 { v17.b }[5], [x10]
+; CHECK-SD-NEXT: ld1 { v19.b }[5], [x12]
+; CHECK-SD-NEXT: add x12, sp, #720
+; CHECK-SD-NEXT: ld1 { v18.b }[5], [x11]
+; CHECK-SD-NEXT: add x11, sp, #528
+; CHECK-SD-NEXT: add x10, sp, #800
+; CHECK-SD-NEXT: ld1 { v16.b }[7], [x10]
+; CHECK-SD-NEXT: add x10, sp, #536
+; CHECK-SD-NEXT: ldr b22, [sp, #872]
+; CHECK-SD-NEXT: ld1 { v17.b }[6], [x11]
+; CHECK-SD-NEXT: add x11, sp, #984
+; CHECK-SD-NEXT: ld1 { v19.b }[6], [x12]
+; CHECK-SD-NEXT: ld1 { v18.b }[6], [x11]
+; CHECK-SD-NEXT: add x11, sp, #992
+; CHECK-SD-NEXT: add x12, sp, #728
+; CHECK-SD-NEXT: ldr b23, [sp, #608]
+; CHECK-SD-NEXT: ld1 { v7.b }[3], [x9]
+; CHECK-SD-NEXT: add x9, sp, #880
+; CHECK-SD-NEXT: ld1 { v17.b }[7], [x10]
+; CHECK-SD-NEXT: ld1 { v19.b }[7], [x12]
+; CHECK-SD-NEXT: add x10, sp, #816
+; CHECK-SD-NEXT: ld1 { v18.b }[7], [x11]
+; CHECK-SD-NEXT: add x11, sp, #552
+; CHECK-SD-NEXT: add x12, sp, #616
+; CHECK-SD-NEXT: mov v3.b[2], w2
+; CHECK-SD-NEXT: ld1 { v22.b }[1], [x9]
+; CHECK-SD-NEXT: ld1 { v23.b }[1], [x12]
+; CHECK-SD-NEXT: smull v16.8h, v17.8b, v16.8b
+; CHECK-SD-NEXT: add x12, sp, #560
+; CHECK-SD-NEXT: add x9, sp, #888
+; CHECK-SD-NEXT: smull v17.8h, v19.8b, v18.8b
+; CHECK-SD-NEXT: ldr b18, [sp, #808]
+; CHECK-SD-NEXT: ldr b19, [sp, #544]
+; CHECK-SD-NEXT: add x13, sp, #624
+; CHECK-SD-NEXT: ld1 { v22.b }[2], [x9]
+; CHECK-SD-NEXT: add x9, sp, #896
+; CHECK-SD-NEXT: ld1 { v18.b }[1], [x10]
+; CHECK-SD-NEXT: ld1 { v19.b }[1], [x11]
+; CHECK-SD-NEXT: add x11, sp, #824
+; CHECK-SD-NEXT: add x10, sp, #48
+; CHECK-SD-NEXT: ld1 { v23.b }[2], [x13]
+; CHECK-SD-NEXT: mov v3.b[3], w3
+; CHECK-SD-NEXT: ld1 { v7.b }[4], [x10]
+; CHECK-SD-NEXT: add x10, sp, #832
+; CHECK-SD-NEXT: ld1 { v22.b }[3], [x9]
+; CHECK-SD-NEXT: ld1 { v18.b }[2], [x11]
+; CHECK-SD-NEXT: ld1 { v19.b }[2], [x12]
+; CHECK-SD-NEXT: add x11, sp, #568
+; CHECK-SD-NEXT: add x12, sp, #632
+; CHECK-SD-NEXT: add x9, sp, #904
+; CHECK-SD-NEXT: add x13, sp, #640
+; CHECK-SD-NEXT: ld1 { v23.b }[3], [x12]
+; CHECK-SD-NEXT: add x12, sp, #576
+; CHECK-SD-NEXT: mov v3.b[4], w4
+; CHECK-SD-NEXT: ld1 { v18.b }[3], [x10]
+; CHECK-SD-NEXT: ld1 { v19.b }[3], [x11]
+; CHECK-SD-NEXT: add x11, sp, #840
+; CHECK-SD-NEXT: add x10, sp, #56
+; CHECK-SD-NEXT: ld1 { v22.b }[4], [x9]
+; CHECK-SD-NEXT: add x9, sp, #912
+; CHECK-SD-NEXT: ld1 { v23.b }[4], [x13]
+; CHECK-SD-NEXT: ld1 { v7.b }[5], [x10]
+; CHECK-SD-NEXT: add x10, sp, #848
+; CHECK-SD-NEXT: ld1 { v18.b }[4], [x11]
+; CHECK-SD-NEXT: ld1 { v19.b }[4], [x12]
+; CHECK-SD-NEXT: add x11, sp, #584
+; CHECK-SD-NEXT: add x12, sp, #648
+; CHECK-SD-NEXT: mov v3.b[5], w5
+; CHECK-SD-NEXT: ld1 { v22.b }[5], [x9]
+; CHECK-SD-NEXT: ld1 { v23.b }[5], [x12]
+; CHECK-SD-NEXT: add x12, sp, #592
+; CHECK-SD-NEXT: movi v21.2d, #0000000000000000
+; CHECK-SD-NEXT: ld1 { v18.b }[5], [x10]
+; CHECK-SD-NEXT: ld1 { v19.b }[5], [x11]
+; CHECK-SD-NEXT: add x11, sp, #856
+; CHECK-SD-NEXT: add x9, sp, #920
+; CHECK-SD-NEXT: add x13, sp, #656
+; CHECK-SD-NEXT: add x10, sp, #64
+; CHECK-SD-NEXT: ld1 { v22.b }[6], [x9]
+; CHECK-SD-NEXT: ld1 { v23.b }[6], [x13]
+; CHECK-SD-NEXT: mov v3.b[6], w6
+; CHECK-SD-NEXT: ld1 { v18.b }[6], [x11]
+; CHECK-SD-NEXT: ld1 { v19.b }[6], [x12]
+; CHECK-SD-NEXT: ld1 { v7.b }[6], [x10]
+; CHECK-SD-NEXT: add x10, sp, #864
+; CHECK-SD-NEXT: add x11, sp, #600
+; CHECK-SD-NEXT: add x9, sp, #928
+; CHECK-SD-NEXT: add x12, sp, #664
+; CHECK-SD-NEXT: mov v21.s[0], v24.s[0]
+; CHECK-SD-NEXT: ld1 { v22.b }[7], [x9]
+; CHECK-SD-NEXT: ld1 { v18.b }[7], [x10]
+; CHECK-SD-NEXT: ld1 { v19.b }[7], [x11]
+; CHECK-SD-NEXT: ld1 { v23.b }[7], [x12]
+; CHECK-SD-NEXT: add x8, sp, #200
+; CHECK-SD-NEXT: mov v3.b[7], w7
+; CHECK-SD-NEXT: add x10, sp, #336
+; CHECK-SD-NEXT: ld1 { v5.b }[7], [x8]
+; CHECK-SD-NEXT: add x8, sp, #72
+; CHECK-SD-NEXT: ld1 { v6.b }[7], [x10]
+; CHECK-SD-NEXT: smull v18.8h, v19.8b, v18.8b
; CHECK-SD-NEXT: movi v19.2d, #0000000000000000
-; CHECK-SD-NEXT: usdot v16.4s, v1.16b, v3.16b
-; CHECK-SD-NEXT: usdot v18.4s, v0.16b, v2.16b
-; CHECK-SD-NEXT: usdot v17.4s, v4.16b, v6.16b
-; CHECK-SD-NEXT: usdot v19.4s, v5.16b, v7.16b
-; CHECK-SD-NEXT: add v0.4s, v18.4s, v16.4s
-; CHECK-SD-NEXT: add v1.4s, v17.4s, v19.4s
+; CHECK-SD-NEXT: ld1 { v7.b }[7], [x8]
+; CHECK-SD-NEXT: smull v22.8h, v23.8b, v22.8b
+; CHECK-SD-NEXT: sshll v20.4s, v20.4h, #0
+; CHECK-SD-NEXT: smull v0.8h, v1.8b, v0.8b
+; CHECK-SD-NEXT: saddw v1.4s, v21.4s, v16.4h
+; CHECK-SD-NEXT: smull v2.8h, v3.8b, v2.8b
+; CHECK-SD-NEXT: smull v3.8h, v5.8b, v4.8b
+; CHECK-SD-NEXT: smull v4.8h, v7.8b, v6.8b
+; CHECK-SD-NEXT: mov v19.s[0], v20.s[0]
+; CHECK-SD-NEXT: saddl2 v5.4s, v18.8h, v17.8h
+; CHECK-SD-NEXT: saddl v7.4s, v18.4h, v17.4h
+; CHECK-SD-NEXT: saddl2 v6.4s, v16.8h, v22.8h
+; CHECK-SD-NEXT: saddw v1.4s, v1.4s, v22.4h
+; CHECK-SD-NEXT: saddl2 v17.4s, v2.8h, v0.8h
+; CHECK-SD-NEXT: saddl2 v16.4s, v4.8h, v3.8h
+; CHECK-SD-NEXT: saddl v3.4s, v4.4h, v3.4h
+; CHECK-SD-NEXT: saddw v2.4s, v19.4s, v2.4h
+; CHECK-SD-NEXT: add v5.4s, v6.4s, v5.4s
+; CHECK-SD-NEXT: add v1.4s, v1.4s, v7.4s
+; CHECK-SD-NEXT: add v6.4s, v17.4s, v16.4s
+; CHECK-SD-NEXT: saddw v0.4s, v2.4s, v0.4h
+; CHECK-SD-NEXT: add v1.4s, v1.4s, v5.4s
+; CHECK-SD-NEXT: add v0.4s, v0.4s, v3.4s
+; CHECK-SD-NEXT: add v1.4s, v6.4s, v1.4s
; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-SD-NEXT: ret
;
-; CHECK-GI-LABEL: test_usdot_v32i8_double:
+; CHECK-GI-LABEL: test_sdot_v33i8_double:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-GI-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
; CHECK-GI-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-GI-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-GI-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT: .cfi_def_cfa_offset 64
-; CHECK-GI-NEXT: .cfi_offset b8, -8
-; CHECK-GI-NEXT: .cfi_offset b9, -16
-; CHECK-GI-NEXT: .cfi_offset b10, -24
-; CHECK-GI-NEXT: .cfi_offset b11, -32
-; CHECK-GI-NEXT: .cfi_offset b12, -40
-; CHECK-GI-NEXT: .cfi_offset b13, -48
-; CHECK-GI-NEXT: .cfi_offset b14, -56
-; CHECK-GI-NEXT: .cfi_offset b15, -64
-; CHECK-GI-NEXT: ushll v16.8h, v0.8b, #0
-; CHECK-GI-NEXT: ushll2 v0.8h, v0.16b, #0
-; CHECK-GI-NEXT: ushll v17.8h, v1.8b, #0
-; CHECK-GI-NEXT: ushll2 v1.8h, v1.16b, #0
-; CHECK-GI-NEXT: sshll v18.8h, v2.8b, #0
-; CHECK-GI-NEXT: sshll2 v2.8h, v2.16b, #0
-; CHECK-GI-NEXT: sshll v19.8h, v3.8b, #0
-; CHECK-GI-NEXT: sshll2 v3.8h, v3.16b, #0
-; CHECK-GI-NEXT: ushll v27.8h, v4.8b, #0
-; CHECK-GI-NEXT: ushll2 v4.8h, v4.16b, #0
-; CHECK-GI-NEXT: ushll v28.8h, v5.8b, #0
-; CHECK-GI-NEXT: sshll v29.8h, v6.8b, #0
-; CHECK-GI-NEXT: sshll2 v6.8h, v6.16b, #0
-; CHECK-GI-NEXT: ushll2 v5.8h, v5.16b, #0
-; CHECK-GI-NEXT: sshll v30.8h, v7.8b, #0
-; CHECK-GI-NEXT: sshll2 v7.8h, v7.16b, #0
-; CHECK-GI-NEXT: ushll2 v20.4s, v16.8h, #0
-; CHECK-GI-NEXT: ushll2 v21.4s, v0.8h, #0
-; CHECK-GI-NEXT: ushll2 v22.4s, v17.8h, #0
-; CHECK-GI-NEXT: ushll2 v23.4s, v1.8h, #0
-; CHECK-GI-NEXT: sshll2 v24.4s, v18.8h, #0
-; CHECK-GI-NEXT: sshll2 v25.4s, v2.8h, #0
-; CHECK-GI-NEXT: sshll2 v26.4s, v19.8h, #0
-; CHECK-GI-NEXT: sshll2 v31.4s, v3.8h, #0
-; CHECK-GI-NEXT: ushll2 v8.4s, v27.8h, #0
-; CHECK-GI-NEXT: ushll2 v9.4s, v4.8h, #0
-; CHECK-GI-NEXT: ushll2 v10.4s, v28.8h, #0
-; CHECK-GI-NEXT: sshll2 v11.4s, v29.8h, #0
-; CHECK-GI-NEXT: sshll2 v12.4s, v6.8h, #0
-; CHECK-GI-NEXT: ushll2 v13.4s, v5.8h, #0
-; CHECK-GI-NEXT: sshll2 v14.4s, v30.8h, #0
-; CHECK-GI-NEXT: sshll2 v15.4s, v7.8h, #0
-; CHECK-GI-NEXT: mul v20.4s, v20.4s, v24.4s
-; CHECK-GI-NEXT: mul v21.4s, v21.4s, v25.4s
-; CHECK-GI-NEXT: mul v22.4s, v22.4s, v26.4s
-; CHECK-GI-NEXT: mul v23.4s, v23.4s, v31.4s
-; CHECK-GI-NEXT: mul v24.4s, v8.4s, v11.4s
-; CHECK-GI-NEXT: mul v25.4s, v9.4s, v12.4s
-; CHECK-GI-NEXT: ushll v16.4s, v16.4h, #0
+; CHECK-GI-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 80
+; CHECK-GI-NEXT: .cfi_offset w29, -16
+; CHECK-GI-NEXT: .cfi_offset b8, -24
+; CHECK-GI-NEXT: .cfi_offset b9, -32
+; CHECK-GI-NEXT: .cfi_offset b10, -40
+; CHECK-GI-NEXT: .cfi_offset b11, -48
+; CHECK-GI-NEXT: .cfi_offset b12, -56
+; CHECK-GI-NEXT: .cfi_offset b13, -64
+; CHECK-GI-NEXT: .cfi_offset b14, -72
+; CHECK-GI-NEXT: .cfi_offset b15, -80
+; CHECK-GI-NEXT: ldr w8, [sp, #80]
+; CHECK-GI-NEXT: sxtb w9, w0
+; CHECK-GI-NEXT: ldr w10, [sp, #112]
+; CHECK-GI-NEXT: sxtb w11, w4
+; CHECK-GI-NEXT: sxtb w13, w7
+; CHECK-GI-NEXT: sxtb w12, w3
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: mov v0.s[0], w9
+; CHECK-GI-NEXT: sxtb w9, w10
+; CHECK-GI-NEXT: mov v3.s[0], w11
+; CHECK-GI-NEXT: sxtb w10, w1
+; CHECK-GI-NEXT: sxtb w11, w5
+; CHECK-GI-NEXT: mov v1.s[0], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #88]
+; CHECK-GI-NEXT: mov v5.s[0], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #120]
+; CHECK-GI-NEXT: ldr w14, [sp, #168]
+; CHECK-GI-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: mov v0.s[1], w10
+; CHECK-GI-NEXT: sxtb w10, w2
+; CHECK-GI-NEXT: mov v3.s[1], w11
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: sxtb w11, w6
+; CHECK-GI-NEXT: mov v1.s[1], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #96]
+; CHECK-GI-NEXT: mov v5.s[1], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #128]
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: mov v0.s[2], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #104]
+; CHECK-GI-NEXT: mov v3.s[2], w11
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: ldr w11, [sp, #136]
+; CHECK-GI-NEXT: mov v1.s[2], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #144]
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: mov v5.s[2], w9
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: ldr w9, [sp, #152]
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: mov v0.s[3], w12
+; CHECK-GI-NEXT: ldr w12, [sp, #160]
+; CHECK-GI-NEXT: mov v3.s[3], w13
+; CHECK-GI-NEXT: ldr w13, [sp, #176]
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: mov v1.s[3], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #208]
+; CHECK-GI-NEXT: mov v2.s[0], w8
+; CHECK-GI-NEXT: mov v5.s[3], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #240]
+; CHECK-GI-NEXT: sxtb w13, w13
+; CHECK-GI-NEXT: sxtb w8, w10
+; CHECK-GI-NEXT: ldr w10, [sp, #184]
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: mov v6.s[0], w13
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: ldr w13, [sp, #264]
+; CHECK-GI-NEXT: mov v4.s[0], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #216]
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: mov v7.s[0], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #248]
+; CHECK-GI-NEXT: mov v2.s[1], w9
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: ldr w9, [sp, #192]
+; CHECK-GI-NEXT: mov v6.s[1], w10
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: ldr w10, [sp, #200]
+; CHECK-GI-NEXT: mov v4.s[1], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #224]
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: mov v7.s[1], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #256]
+; CHECK-GI-NEXT: mov v2.s[2], w12
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: ldr w12, [sp, #232]
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: mov v6.s[2], w9
+; CHECK-GI-NEXT: sxtb w9, w11
+; CHECK-GI-NEXT: sxtb w11, w14
+; CHECK-GI-NEXT: mov v4.s[2], w8
+; CHECK-GI-NEXT: ldr w14, [sp, #280]
+; CHECK-GI-NEXT: ldr w8, [sp, #272]
+; CHECK-GI-NEXT: mov v7.s[2], w9
+; CHECK-GI-NEXT: mov v2.s[3], w11
+; CHECK-GI-NEXT: sxtb w11, w12
+; CHECK-GI-NEXT: sxtb w12, w13
+; CHECK-GI-NEXT: sxtb w13, w14
+; CHECK-GI-NEXT: ldr w9, [sp, #288]
+; CHECK-GI-NEXT: mov v6.s[3], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #312]
+; CHECK-GI-NEXT: ldr w14, [sp, #544]
+; CHECK-GI-NEXT: mov v4.s[3], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #344]
+; CHECK-GI-NEXT: mov v16.s[0], w13
+; CHECK-GI-NEXT: mov v7.s[3], w12
+; CHECK-GI-NEXT: ldr w12, [sp, #376]
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: ldr w13, [sp, #296]
+; CHECK-GI-NEXT: mov v19.s[0], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #320]
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: mov v17.s[0], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #352]
+; CHECK-GI-NEXT: mov v16.s[1], w9
+; CHECK-GI-NEXT: mov v21.s[0], w12
+; CHECK-GI-NEXT: ldr w12, [sp, #384]
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: sxtb w13, w13
+; CHECK-GI-NEXT: ldr w9, [sp, #304]
+; CHECK-GI-NEXT: mov v19.s[1], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #328]
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: mov v17.s[1], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #360]
+; CHECK-GI-NEXT: mov v16.s[2], w13
+; CHECK-GI-NEXT: mov v21.s[1], w12
+; CHECK-GI-NEXT: ldr w12, [sp, #392]
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: ldr w13, [sp, #336]
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: mov v19.s[2], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #368]
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: mov v17.s[2], w11
+; CHECK-GI-NEXT: sxtb w13, w13
+; CHECK-GI-NEXT: ldr w11, [sp, #400]
+; CHECK-GI-NEXT: mov v21.s[2], w12
+; CHECK-GI-NEXT: ldr w12, [sp, #408]
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: mov v16.s[3], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #416]
+; CHECK-GI-NEXT: mov v19.s[3], w13
+; CHECK-GI-NEXT: ldr w13, [sp, #440]
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: mov v17.s[3], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #472]
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: sxtb w13, w13
+; CHECK-GI-NEXT: mov v18.s[0], w12
+; CHECK-GI-NEXT: mov v21.s[3], w11
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: ldr w11, [sp, #504]
+; CHECK-GI-NEXT: ldr w12, [sp, #424]
+; CHECK-GI-NEXT: mov v22.s[0], w13
+; CHECK-GI-NEXT: ldr w13, [sp, #448]
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: mov v20.s[0], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #480]
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: sxtb w13, w13
+; CHECK-GI-NEXT: mov v18.s[1], w9
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: mov v23.s[0], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #512]
+; CHECK-GI-NEXT: mov v22.s[1], w13
+; CHECK-GI-NEXT: ldr w13, [sp, #456]
+; CHECK-GI-NEXT: ldr w9, [sp, #432]
+; CHECK-GI-NEXT: mov v20.s[1], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #488]
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: sxtb w13, w13
+; CHECK-GI-NEXT: mov v18.s[2], w12
+; CHECK-GI-NEXT: ldr w12, [sp, #464]
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: mov v23.s[1], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #520]
+; CHECK-GI-NEXT: mov v22.s[2], w13
+; CHECK-GI-NEXT: ldr w13, [sp, #496]
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: mov v20.s[2], w10
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: mov v18.s[3], w9
+; CHECK-GI-NEXT: sxtb w9, w13
+; CHECK-GI-NEXT: ldr w10, [sp, #528]
+; CHECK-GI-NEXT: mov v23.s[2], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #536]
+; CHECK-GI-NEXT: sxtb w13, w14
+; CHECK-GI-NEXT: mov v22.s[3], w12
+; CHECK-GI-NEXT: ldr w12, [sp, #576]
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: mov v20.s[3], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #608]
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: mov v24.s[0], w13
+; CHECK-GI-NEXT: ldr w13, [sp, #560]
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: mul w8, w8, w11
+; CHECK-GI-NEXT: mov v23.s[3], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #552]
+; CHECK-GI-NEXT: ldr w11, [sp, #584]
+; CHECK-GI-NEXT: mov v27.s[0], w12
+; CHECK-GI-NEXT: mov v26.s[0], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #616]
+; CHECK-GI-NEXT: sxtb w13, w13
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: mov v25.s[0], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #640]
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: ldr w12, [sp, #568]
+; CHECK-GI-NEXT: mov v24.s[1], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #592]
+; CHECK-GI-NEXT: mov v27.s[1], w11
+; CHECK-GI-NEXT: mov v26.s[1], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #624]
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: ldr w11, [sp, #600]
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: mov v28.s[0], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #648]
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: mov v24.s[2], w13
+; CHECK-GI-NEXT: ldr w13, [sp, #632]
+; CHECK-GI-NEXT: mov v27.s[2], w10
+; CHECK-GI-NEXT: mov v26.s[2], w9
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: ldr w10, [sp, #656]
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: sxtb w13, w13
+; CHECK-GI-NEXT: ldr w9, [sp, #664]
+; CHECK-GI-NEXT: mov v28.s[1], w8
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: ldr w8, [sp, #680]
+; CHECK-GI-NEXT: mov v24.s[3], w12
+; CHECK-GI-NEXT: ldr w12, [sp, #672]
+; CHECK-GI-NEXT: mov v27.s[3], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #704]
+; CHECK-GI-NEXT: mov v26.s[3], w13
+; CHECK-GI-NEXT: ldr w13, [sp, #736]
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: mov v28.s[2], w10
+; CHECK-GI-NEXT: sxtb w13, w13
+; CHECK-GI-NEXT: mov v29.s[0], w12
+; CHECK-GI-NEXT: ldr w12, [sp, #688]
+; CHECK-GI-NEXT: ldr w10, [sp, #696]
+; CHECK-GI-NEXT: mov v31.s[0], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #712]
+; CHECK-GI-NEXT: mov v30.s[0], w13
+; CHECK-GI-NEXT: ldr w13, [sp, #744]
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: ldr w14, [sp, #776]
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: mov v28.s[3], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #768]
+; CHECK-GI-NEXT: sxtb w13, w13
+; CHECK-GI-NEXT: mov v29.s[1], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #720]
+; CHECK-GI-NEXT: mov v31.s[1], w11
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: ldr w11, [sp, #728]
+; CHECK-GI-NEXT: mov v30.s[1], w13
+; CHECK-GI-NEXT: ldr w13, [sp, #752]
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: mov v8.s[0], w9
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: mul v3.4s, v3.4s, v19.4s
+; CHECK-GI-NEXT: sxtb w9, w13
+; CHECK-GI-NEXT: mov v29.s[2], w12
+; CHECK-GI-NEXT: ldr w12, [sp, #760]
+; CHECK-GI-NEXT: mov v31.s[2], w8
+; CHECK-GI-NEXT: sxtb w8, w10
+; CHECK-GI-NEXT: sxtb w10, w14
+; CHECK-GI-NEXT: mov v30.s[2], w9
+; CHECK-GI-NEXT: ldr w14, [sp, #808]
+; CHECK-GI-NEXT: ldr w13, [sp, #784]
+; CHECK-GI-NEXT: mov v8.s[1], w10
+; CHECK-GI-NEXT: sxtb w10, w12
+; CHECK-GI-NEXT: ldr w9, [sp, #792]
+; CHECK-GI-NEXT: sxtb w12, w14
+; CHECK-GI-NEXT: sxtb w13, w13
+; CHECK-GI-NEXT: mul v5.4s, v5.4s, v21.4s
+; CHECK-GI-NEXT: mov v31.s[3], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #840]
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: mov v30.s[3], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #872]
+; CHECK-GI-NEXT: mov v9.s[0], w12
+; CHECK-GI-NEXT: ldr w12, [sp, #816]
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: mov v8.s[2], w13
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: ldr w13, [sp, #824]
+; CHECK-GI-NEXT: mov v21.s[0], wzr
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: mov v11.s[0], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #848]
+; CHECK-GI-NEXT: mov v10.s[0], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #880]
+; CHECK-GI-NEXT: sxtb w13, w13
+; CHECK-GI-NEXT: mov v9.s[1], w12
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: mov v8.s[3], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #904]
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: ldr w12, [sp, #832]
+; CHECK-GI-NEXT: mov v11.s[1], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #856]
+; CHECK-GI-NEXT: mov v29.s[3], w8
+; CHECK-GI-NEXT: mov v10.s[1], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #888]
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: mov v9.s[2], w13
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: mov v12.s[0], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #912]
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: mov v11.s[2], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #896]
+; CHECK-GI-NEXT: ldr w13, [sp, #864]
+; CHECK-GI-NEXT: mov v10.s[2], w10
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: ldr w10, [sp, #920]
+; CHECK-GI-NEXT: mov v9.s[3], w12
+; CHECK-GI-NEXT: ldr w12, [sp, #968]
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: mov v12.s[1], w9
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: sxtb w13, w13
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: ldr w8, [sp, #800]
+; CHECK-GI-NEXT: ldr w9, [sp, #928]
+; CHECK-GI-NEXT: mov v10.s[3], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #1032]
+; CHECK-GI-NEXT: mov v11.s[3], w13
+; CHECK-GI-NEXT: mov v14.s[0], w12
+; CHECK-GI-NEXT: ldr w12, [sp, #976]
+; CHECK-GI-NEXT: ldr w13, [sp, #936]
+; CHECK-GI-NEXT: mov v12.s[2], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #1000]
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: sxtb w13, w13
+; CHECK-GI-NEXT: mov v21.s[1], wzr
+; CHECK-GI-NEXT: mov v15.s[0], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #1040]
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: mov v14.s[1], w12
+; CHECK-GI-NEXT: ldr w12, [sp, #984]
+; CHECK-GI-NEXT: mov v13.s[0], w13
+; CHECK-GI-NEXT: mov v19.s[0], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #1008]
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: ldr w13, [sp, #944]
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: mov v15.s[1], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #1048]
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: mov v14.s[2], w12
+; CHECK-GI-NEXT: ldr w12, [sp, #1064]
+; CHECK-GI-NEXT: sxtb w13, w13
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: mov v19.s[1], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #992]
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: mov v13.s[1], w13
+; CHECK-GI-NEXT: ldr w13, [sp, #952]
+; CHECK-GI-NEXT: mov v15.s[2], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #1016]
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: mul w8, w8, w12
+; CHECK-GI-NEXT: ldr w12, [sp, #1056]
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: sxtb w13, w13
+; CHECK-GI-NEXT: mov v12.s[3], w9
+; CHECK-GI-NEXT: sxtb w9, w11
+; CHECK-GI-NEXT: mov v14.s[3], w10
+; CHECK-GI-NEXT: mov v21.s[2], wzr
+; CHECK-GI-NEXT: sxtb w10, w12
+; CHECK-GI-NEXT: mul v6.4s, v6.4s, v22.4s
+; CHECK-GI-NEXT: mov v22.s[0], w8
+; CHECK-GI-NEXT: mov v13.s[2], w13
+; CHECK-GI-NEXT: mov v19.s[2], w9
+; CHECK-GI-NEXT: ldr w8, [sp, #960]
+; CHECK-GI-NEXT: mov v15.s[3], w10
+; CHECK-GI-NEXT: ldr w9, [sp, #1024]
+; CHECK-GI-NEXT: mov v25.s[1], wzr
+; CHECK-GI-NEXT: mul v7.4s, v7.4s, v23.4s
+; CHECK-GI-NEXT: mov v21.s[3], wzr
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: mul v23.4s, v27.4s, v11.4s
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: mov v22.s[1], wzr
+; CHECK-GI-NEXT: mul v27.4s, v28.4s, v12.4s
+; CHECK-GI-NEXT: mul v28.4s, v31.4s, v14.4s
+; CHECK-GI-NEXT: mul v31.4s, v8.4s, v15.4s
+; CHECK-GI-NEXT: mov v13.s[3], w8
+; CHECK-GI-NEXT: mov v19.s[3], w9
+; CHECK-GI-NEXT: mla v3.4s, v0.4s, v16.4s
+; CHECK-GI-NEXT: mov v25.s[2], wzr
+; CHECK-GI-NEXT: add v0.4s, v21.4s, v21.4s
+; CHECK-GI-NEXT: mla v5.4s, v1.4s, v17.4s
+; CHECK-GI-NEXT: mla v6.4s, v2.4s, v18.4s
+; CHECK-GI-NEXT: mov v22.s[2], wzr
+; CHECK-GI-NEXT: mla v7.4s, v4.4s, v20.4s
+; CHECK-GI-NEXT: mla v23.4s, v24.4s, v9.4s
+; CHECK-GI-NEXT: mla v27.4s, v26.4s, v10.4s
+; CHECK-GI-NEXT: mla v28.4s, v29.4s, v13.4s
+; CHECK-GI-NEXT: mla v31.4s, v30.4s, v19.4s
+; CHECK-GI-NEXT: add v1.4s, v21.4s, v0.4s
+; CHECK-GI-NEXT: add v0.4s, v0.4s, v0.4s
+; CHECK-GI-NEXT: mov v25.s[3], wzr
+; CHECK-GI-NEXT: add v2.4s, v3.4s, v5.4s
+; CHECK-GI-NEXT: mov v22.s[3], wzr
+; CHECK-GI-NEXT: add v3.4s, v6.4s, v7.4s
+; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: add v1.4s, v23.4s, v27.4s
+; CHECK-GI-NEXT: add v4.4s, v28.4s, v31.4s
; CHECK-GI-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mul v26.4s, v10.4s, v14.4s
+; CHECK-GI-NEXT: add v2.4s, v2.4s, v3.4s
; CHECK-GI-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-GI-NEXT: mul v31.4s, v13.4s, v15.4s
+; CHECK-GI-NEXT: add v3.4s, v25.4s, v0.4s
+; CHECK-GI-NEXT: add v0.4s, v22.4s, v0.4s
+; CHECK-GI-NEXT: add v1.4s, v1.4s, v4.4s
; CHECK-GI-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-GI-NEXT: ushll v17.4s, v17.4h, #0
-; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
-; CHECK-GI-NEXT: sshll v18.4s, v18.4h, #0
-; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0
-; CHECK-GI-NEXT: sshll v19.4s, v19.4h, #0
-; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0
-; CHECK-GI-NEXT: ushll v27.4s, v27.4h, #0
-; CHECK-GI-NEXT: ushll v4.4s, v4.4h, #0
-; CHECK-GI-NEXT: ushll v28.4s, v28.4h, #0
-; CHECK-GI-NEXT: ushll v5.4s, v5.4h, #0
-; CHECK-GI-NEXT: sshll v29.4s, v29.4h, #0
-; CHECK-GI-NEXT: sshll v6.4s, v6.4h, #0
-; CHECK-GI-NEXT: sshll v30.4s, v30.4h, #0
-; CHECK-GI-NEXT: sshll v7.4s, v7.4h, #0
-; CHECK-GI-NEXT: mla v20.4s, v16.4s, v18.4s
-; CHECK-GI-NEXT: mla v21.4s, v0.4s, v2.4s
-; CHECK-GI-NEXT: mla v22.4s, v17.4s, v19.4s
-; CHECK-GI-NEXT: mla v23.4s, v1.4s, v3.4s
-; CHECK-GI-NEXT: mla v24.4s, v27.4s, v29.4s
-; CHECK-GI-NEXT: mla v25.4s, v4.4s, v6.4s
-; CHECK-GI-NEXT: mla v26.4s, v28.4s, v30.4s
-; CHECK-GI-NEXT: mla v31.4s, v5.4s, v7.4s
-; CHECK-GI-NEXT: add v0.4s, v20.4s, v21.4s
-; CHECK-GI-NEXT: add v1.4s, v22.4s, v23.4s
-; CHECK-GI-NEXT: add v2.4s, v24.4s, v25.4s
-; CHECK-GI-NEXT: add v3.4s, v26.4s, v31.4s
-; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: add v2.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: addv s1, v2.4s
; CHECK-GI-NEXT: addv s0, v0.4s
-; CHECK-GI-NEXT: addv s1, v1.4s
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: fmov w9, s1
+; CHECK-GI-NEXT: fmov w8, s1
+; CHECK-GI-NEXT: fmov w9, s0
; CHECK-GI-NEXT: add w0, w8, w9
-; CHECK-GI-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
; CHECK-GI-NEXT: ret
-entry:
- %az = zext <32 x i8> %a to <32 x i32>
- %bz = sext <32 x i8> %b to <32 x i32>
- %m1 = mul nuw nsw <32 x i32> %az, %bz
- %r1 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %m1)
- %cz = zext <32 x i8> %c to <32 x i32>
- %dz = sext <32 x i8> %d to <32 x i32>
- %m2 = mul nuw nsw <32 x i32> %cz, %dz
- %r2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %m2)
- %x = add i32 %r1, %r2
- ret i32 %x
-}
-
-
-define i32 @test_udot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
-; CHECK-LABEL: test_udot_v33i8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #32]
-; CHECK-NEXT: ldr b1, [x1, #32]
-; CHECK-NEXT: movi v7.2d, #0000000000000000
-; CHECK-NEXT: ldp q3, q4, [x1]
-; CHECK-NEXT: umull v0.8h, v1.8b, v0.8b
-; CHECK-NEXT: ldp q1, q2, [x0]
-; CHECK-NEXT: umull v5.8h, v4.8b, v2.8b
-; CHECK-NEXT: umull v6.8h, v3.8b, v1.8b
-; CHECK-NEXT: umull2 v2.8h, v4.16b, v2.16b
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-NEXT: umull2 v1.8h, v3.16b, v1.16b
-; CHECK-NEXT: mov v7.s[0], v0.s[0]
-; CHECK-NEXT: uaddl2 v3.4s, v6.8h, v5.8h
-; CHECK-NEXT: uaddl2 v0.4s, v1.8h, v2.8h
-; CHECK-NEXT: uaddl v1.4s, v1.4h, v2.4h
-; CHECK-NEXT: add v0.4s, v3.4s, v0.4s
-; CHECK-NEXT: uaddw v2.4s, v7.4s, v6.4h
-; CHECK-NEXT: uaddw v2.4s, v2.4s, v5.4h
-; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: add v0.4s, v2.4s, v0.4s
-; CHECK-NEXT: addv s0, v0.4s
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: add w0, w8, w2
-; CHECK-NEXT: ret
-entry:
- %0 = load <33 x i8>, ptr %a
- %1 = zext <33 x i8> %0 to <33 x i32>
- %2 = load <33 x i8>, ptr %b
- %3 = zext <33 x i8> %2 to <33 x i32>
- %4 = mul nuw nsw <33 x i32> %3, %1
- %5 = call i32 @llvm.vector.reduce.add.v33i32(<33 x i32> %4)
- %op.extra = add i32 %5, %sum
- ret i32 %op.extra
-}
-
-define i32 @test_udot_v33i8_nomla(ptr nocapture readonly %a1) {
-; CHECK-LABEL: test_udot_v33i8_nomla:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b1, [x0, #32]
-; CHECK-NEXT: ldp q3, q2, [x0]
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-NEXT: ushll v4.8h, v2.8b, #0
-; CHECK-NEXT: ushll v5.8h, v3.8b, #0
-; CHECK-NEXT: ushll2 v2.8h, v2.16b, #0
-; CHECK-NEXT: ushll2 v3.8h, v3.16b, #0
-; CHECK-NEXT: ushll v1.4s, v1.4h, #0
-; CHECK-NEXT: uaddl2 v6.4s, v5.8h, v4.8h
-; CHECK-NEXT: mov v0.s[0], v1.s[0]
-; CHECK-NEXT: uaddl2 v1.4s, v3.8h, v2.8h
-; CHECK-NEXT: uaddl v2.4s, v3.4h, v2.4h
-; CHECK-NEXT: add v1.4s, v6.4s, v1.4s
-; CHECK-NEXT: uaddw v0.4s, v0.4s, v5.4h
-; CHECK-NEXT: add v1.4s, v2.4s, v1.4s
-; CHECK-NEXT: uaddw v0.4s, v0.4s, v4.4h
-; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: addv s0, v0.4s
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
-entry:
- %0 = load <33 x i8>, ptr %a1
- %1 = zext <33 x i8> %0 to <33 x i32>
- %2 = call i32 @llvm.vector.reduce.add.v33i32(<33 x i32> %1)
- ret i32 %2
-}
-define i32 @test_sdot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
-; CHECK-LABEL: test_sdot_v33i8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #32]
-; CHECK-NEXT: ldr b1, [x1, #32]
-; CHECK-NEXT: movi v7.2d, #0000000000000000
-; CHECK-NEXT: ldp q3, q4, [x1]
-; CHECK-NEXT: smull v0.8h, v1.8b, v0.8b
-; CHECK-NEXT: ldp q1, q2, [x0]
-; CHECK-NEXT: smull v5.8h, v4.8b, v2.8b
-; CHECK-NEXT: smull v6.8h, v3.8b, v1.8b
-; CHECK-NEXT: smull2 v2.8h, v4.16b, v2.16b
-; CHECK-NEXT: sshll v0.4s, v0.4h, #0
-; CHECK-NEXT: smull2 v1.8h, v3.16b, v1.16b
-; CHECK-NEXT: mov v7.s[0], v0.s[0]
-; CHECK-NEXT: saddl2 v3.4s, v6.8h, v5.8h
-; CHECK-NEXT: saddl2 v0.4s, v1.8h, v2.8h
-; CHECK-NEXT: saddl v1.4s, v1.4h, v2.4h
-; CHECK-NEXT: add v0.4s, v3.4s, v0.4s
-; CHECK-NEXT: saddw v2.4s, v7.4s, v6.4h
-; CHECK-NEXT: saddw v2.4s, v2.4s, v5.4h
-; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: add v0.4s, v2.4s, v0.4s
-; CHECK-NEXT: addv s0, v0.4s
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: add w0, w8, w2
-; CHECK-NEXT: ret
-entry:
- %0 = load <33 x i8>, ptr %a
- %1 = sext <33 x i8> %0 to <33 x i32>
- %2 = load <33 x i8>, ptr %b
- %3 = sext <33 x i8> %2 to <33 x i32>
- %4 = mul nsw <33 x i32> %3, %1
- %5 = call i32 @llvm.vector.reduce.add.v33i32(<33 x i32> %4)
- %op.extra = add nsw i32 %5, %sum
- ret i32 %op.extra
-}
-
-define i32 @test_sdot_v33i8_double(<33 x i8> %a, <33 x i8> %b, <33 x i8> %c, <33 x i8> %d) {
-; CHECK-LABEL: test_sdot_v33i8_double:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: ldr b0, [sp, #344]
-; CHECK-NEXT: add x8, sp, #352
-; CHECK-NEXT: ldr b1, [sp, #80]
-; CHECK-NEXT: ldr b2, [sp, #216]
-; CHECK-NEXT: add x9, sp, #96
-; CHECK-NEXT: add x10, sp, #104
-; CHECK-NEXT: ld1 { v0.b }[1], [x8]
-; CHECK-NEXT: add x8, sp, #88
-; CHECK-NEXT: ldr b4, [sp, #408]
-; CHECK-NEXT: ld1 { v1.b }[1], [x8]
-; CHECK-NEXT: add x8, sp, #360
-; CHECK-NEXT: add x12, sp, #248
-; CHECK-NEXT: add x13, sp, #432
-; CHECK-NEXT: add x11, sp, #384
-; CHECK-NEXT: ldr b5, [sp, #144]
-; CHECK-NEXT: ld1 { v0.b }[2], [x8]
-; CHECK-NEXT: add x8, sp, #224
-; CHECK-NEXT: ldr b6, [sp, #280]
-; CHECK-NEXT: ld1 { v2.b }[1], [x8]
-; CHECK-NEXT: ld1 { v1.b }[2], [x9]
-; CHECK-NEXT: add x8, sp, #368
-; CHECK-NEXT: add x9, sp, #232
-; CHECK-NEXT: ldr b16, [sp, #744]
-; CHECK-NEXT: ldr b17, [sp, #480]
-; CHECK-NEXT: ld1 { v0.b }[3], [x8]
-; CHECK-NEXT: add x8, sp, #376
-; CHECK-NEXT: ldr b18, [sp, #936]
-; CHECK-NEXT: ld1 { v2.b }[2], [x9]
-; CHECK-NEXT: ld1 { v1.b }[3], [x10]
-; CHECK-NEXT: add x9, sp, #240
-; CHECK-NEXT: add x10, sp, #392
-; CHECK-NEXT: ldr b19, [sp, #672]
-; CHECK-NEXT: ldr b7, [sp, #16]
-; CHECK-NEXT: ld1 { v0.b }[4], [x8]
-; CHECK-NEXT: add x8, sp, #112
-; CHECK-NEXT: ldr b21, [sp, #1000]
-; CHECK-NEXT: ld1 { v2.b }[3], [x9]
-; CHECK-NEXT: ld1 { v1.b }[4], [x8]
-; CHECK-NEXT: add x8, sp, #416
-; CHECK-NEXT: ld1 { v4.b }[1], [x8]
-; CHECK-NEXT: add x8, sp, #120
-; CHECK-NEXT: add x9, sp, #400
-; CHECK-NEXT: ld1 { v0.b }[5], [x11]
-; CHECK-NEXT: add x11, sp, #128
-; CHECK-NEXT: ldr b22, [sp, #736]
-; CHECK-NEXT: ld1 { v2.b }[4], [x12]
-; CHECK-NEXT: add x12, sp, #424
-; CHECK-NEXT: ld1 { v1.b }[5], [x8]
-; CHECK-NEXT: ld1 { v4.b }[2], [x12]
-; CHECK-NEXT: add x12, sp, #152
-; CHECK-NEXT: add x8, sp, #136
-; CHECK-NEXT: ld1 { v5.b }[1], [x12]
-; CHECK-NEXT: add x12, sp, #440
-; CHECK-NEXT: ld1 { v0.b }[6], [x10]
-; CHECK-NEXT: ld1 { v1.b }[6], [x11]
-; CHECK-NEXT: add x11, sp, #288
-; CHECK-NEXT: add x10, sp, #256
-; CHECK-NEXT: ld1 { v4.b }[3], [x13]
-; CHECK-NEXT: ld1 { v6.b }[1], [x11]
-; CHECK-NEXT: add x11, sp, #296
-; CHECK-NEXT: ld1 { v0.b }[7], [x9]
-; CHECK-NEXT: add x9, sp, #160
-; CHECK-NEXT: ld1 { v2.b }[5], [x10]
-; CHECK-NEXT: ld1 { v5.b }[2], [x9]
-; CHECK-NEXT: add x10, sp, #168
-; CHECK-NEXT: ld1 { v1.b }[7], [x8]
-; CHECK-NEXT: ld1 { v4.b }[4], [x12]
-; CHECK-NEXT: add x12, sp, #448
-; CHECK-NEXT: ld1 { v6.b }[2], [x11]
-; CHECK-NEXT: add x11, sp, #304
-; CHECK-NEXT: add x8, sp, #464
-; CHECK-NEXT: add x13, sp, #768
-; CHECK-NEXT: ld1 { v5.b }[3], [x10]
-; CHECK-NEXT: add x10, sp, #176
-; CHECK-NEXT: add x9, sp, #264
-; CHECK-NEXT: ld1 { v4.b }[5], [x12]
-; CHECK-NEXT: add x12, sp, #456
-; CHECK-NEXT: ld1 { v6.b }[3], [x11]
-; CHECK-NEXT: add x11, sp, #760
-; CHECK-NEXT: ld1 { v2.b }[6], [x9]
-; CHECK-NEXT: add x9, sp, #272
-; CHECK-NEXT: ld1 { v5.b }[4], [x10]
-; CHECK-NEXT: add x10, sp, #312
-; CHECK-NEXT: fmov s3, w0
-; CHECK-NEXT: ld1 { v4.b }[6], [x12]
-; CHECK-NEXT: ld1 { v6.b }[4], [x10]
-; CHECK-NEXT: add x10, sp, #320
-; CHECK-NEXT: add x12, sp, #680
-; CHECK-NEXT: ld1 { v2.b }[7], [x9]
-; CHECK-NEXT: add x9, sp, #184
-; CHECK-NEXT: ld1 { v19.b }[1], [x12]
-; CHECK-NEXT: add x12, sp, #776
-; CHECK-NEXT: ld1 { v5.b }[5], [x9]
-; CHECK-NEXT: ld1 { v4.b }[7], [x8]
-; CHECK-NEXT: add x8, sp, #752
-; CHECK-NEXT: ld1 { v6.b }[5], [x10]
-; CHECK-NEXT: ld1 { v16.b }[1], [x8]
-; CHECK-NEXT: add x10, sp, #24
-; CHECK-NEXT: smull v22.8h, v22.8b, v21.8b
-; CHECK-NEXT: ld1 { v7.b }[1], [x10]
-; CHECK-NEXT: add x10, sp, #496
-; CHECK-NEXT: mov v3.b[1], w1
-; CHECK-NEXT: add x9, sp, #192
-; CHECK-NEXT: ldr b20, [sp, #472]
-; CHECK-NEXT: ldr b23, [sp, #208]
-; CHECK-NEXT: ld1 { v16.b }[2], [x11]
-; CHECK-NEXT: add x11, sp, #488
-; CHECK-NEXT: ld1 { v5.b }[6], [x9]
-; CHECK-NEXT: ld1 { v17.b }[1], [x11]
-; CHECK-NEXT: add x11, sp, #944
-; CHECK-NEXT: add x9, sp, #328
-; CHECK-NEXT: ld1 { v18.b }[1], [x11]
-; CHECK-NEXT: add x11, sp, #688
-; CHECK-NEXT: ld1 { v6.b }[6], [x9]
-; CHECK-NEXT: ld1 { v16.b }[3], [x13]
-; CHECK-NEXT: ld1 { v19.b }[2], [x11]
-; CHECK-NEXT: add x11, sp, #504
-; CHECK-NEXT: ld1 { v17.b }[2], [x10]
-; CHECK-NEXT: add x10, sp, #952
-; CHECK-NEXT: add x13, sp, #784
-; CHECK-NEXT: ld1 { v18.b }[2], [x10]
-; CHECK-NEXT: add x10, sp, #32
-; CHECK-NEXT: add x9, sp, #40
-; CHECK-NEXT: ld1 { v16.b }[4], [x12]
-; CHECK-NEXT: add x12, sp, #696
-; CHECK-NEXT: ld1 { v7.b }[2], [x10]
-; CHECK-NEXT: ld1 { v17.b }[3], [x11]
-; CHECK-NEXT: add x11, sp, #960
-; CHECK-NEXT: ld1 { v19.b }[3], [x12]
-; CHECK-NEXT: ld1 { v18.b }[3], [x11]
-; CHECK-NEXT: add x10, sp, #512
-; CHECK-NEXT: add x11, sp, #704
-; CHECK-NEXT: ld1 { v16.b }[5], [x13]
-; CHECK-NEXT: add x12, sp, #792
-; CHECK-NEXT: sshll v24.4s, v22.4h, #0
-; CHECK-NEXT: ld1 { v17.b }[4], [x10]
-; CHECK-NEXT: add x10, sp, #968
-; CHECK-NEXT: ld1 { v19.b }[4], [x11]
-; CHECK-NEXT: ld1 { v18.b }[4], [x10]
-; CHECK-NEXT: add x10, sp, #520
-; CHECK-NEXT: add x11, sp, #976
-; CHECK-NEXT: ld1 { v16.b }[6], [x12]
-; CHECK-NEXT: add x12, sp, #712
-; CHECK-NEXT: smull v20.8h, v23.8b, v20.8b
-; CHECK-NEXT: ld1 { v17.b }[5], [x10]
-; CHECK-NEXT: ld1 { v19.b }[5], [x12]
-; CHECK-NEXT: add x12, sp, #720
-; CHECK-NEXT: ld1 { v18.b }[5], [x11]
-; CHECK-NEXT: add x11, sp, #528
-; CHECK-NEXT: add x10, sp, #800
-; CHECK-NEXT: ld1 { v16.b }[7], [x10]
-; CHECK-NEXT: add x10, sp, #536
-; CHECK-NEXT: ldr b22, [sp, #872]
-; CHECK-NEXT: ld1 { v17.b }[6], [x11]
-; CHECK-NEXT: add x11, sp, #984
-; CHECK-NEXT: ld1 { v19.b }[6], [x12]
-; CHECK-NEXT: ld1 { v18.b }[6], [x11]
-; CHECK-NEXT: add x11, sp, #992
-; CHECK-NEXT: add x12, sp, #728
-; CHECK-NEXT: ldr b23, [sp, #608]
-; CHECK-NEXT: ld1 { v7.b }[3], [x9]
-; CHECK-NEXT: add x9, sp, #880
-; CHECK-NEXT: ld1 { v17.b }[7], [x10]
-; CHECK-NEXT: ld1 { v19.b }[7], [x12]
-; CHECK-NEXT: add x10, sp, #816
-; CHECK-NEXT: ld1 { v18.b }[7], [x11]
-; CHECK-NEXT: add x11, sp, #552
-; CHECK-NEXT: add x12, sp, #616
-; CHECK-NEXT: mov v3.b[2], w2
-; CHECK-NEXT: ld1 { v22.b }[1], [x9]
-; CHECK-NEXT: ld1 { v23.b }[1], [x12]
-; CHECK-NEXT: smull v16.8h, v17.8b, v16.8b
-; CHECK-NEXT: add x12, sp, #560
-; CHECK-NEXT: add x9, sp, #888
-; CHECK-NEXT: smull v17.8h, v19.8b, v18.8b
-; CHECK-NEXT: ldr b18, [sp, #808]
-; CHECK-NEXT: ldr b19, [sp, #544]
-; CHECK-NEXT: add x13, sp, #624
-; CHECK-NEXT: ld1 { v22.b }[2], [x9]
-; CHECK-NEXT: add x9, sp, #896
-; CHECK-NEXT: ld1 { v18.b }[1], [x10]
-; CHECK-NEXT: ld1 { v19.b }[1], [x11]
-; CHECK-NEXT: add x11, sp, #824
-; CHECK-NEXT: add x10, sp, #48
-; CHECK-NEXT: ld1 { v23.b }[2], [x13]
-; CHECK-NEXT: mov v3.b[3], w3
-; CHECK-NEXT: ld1 { v7.b }[4], [x10]
-; CHECK-NEXT: add x10, sp, #832
-; CHECK-NEXT: ld1 { v22.b }[3], [x9]
-; CHECK-NEXT: ld1 { v18.b }[2], [x11]
-; CHECK-NEXT: ld1 { v19.b }[2], [x12]
-; CHECK-NEXT: add x11, sp, #568
-; CHECK-NEXT: add x12, sp, #632
-; CHECK-NEXT: add x9, sp, #904
-; CHECK-NEXT: add x13, sp, #640
-; CHECK-NEXT: ld1 { v23.b }[3], [x12]
-; CHECK-NEXT: add x12, sp, #576
-; CHECK-NEXT: mov v3.b[4], w4
-; CHECK-NEXT: ld1 { v18.b }[3], [x10]
-; CHECK-NEXT: ld1 { v19.b }[3], [x11]
-; CHECK-NEXT: add x11, sp, #840
-; CHECK-NEXT: add x10, sp, #56
-; CHECK-NEXT: ld1 { v22.b }[4], [x9]
-; CHECK-NEXT: add x9, sp, #912
-; CHECK-NEXT: ld1 { v23.b }[4], [x13]
-; CHECK-NEXT: ld1 { v7.b }[5], [x10]
-; CHECK-NEXT: add x10, sp, #848
-; CHECK-NEXT: ld1 { v18.b }[4], [x11]
-; CHECK-NEXT: ld1 { v19.b }[4], [x12]
-; CHECK-NEXT: add x11, sp, #584
-; CHECK-NEXT: add x12, sp, #648
-; CHECK-NEXT: mov v3.b[5], w5
-; CHECK-NEXT: ld1 { v22.b }[5], [x9]
-; CHECK-NEXT: ld1 { v23.b }[5], [x12]
-; CHECK-NEXT: add x12, sp, #592
-; CHECK-NEXT: movi v21.2d, #0000000000000000
-; CHECK-NEXT: ld1 { v18.b }[5], [x10]
-; CHECK-NEXT: ld1 { v19.b }[5], [x11]
-; CHECK-NEXT: add x11, sp, #856
-; CHECK-NEXT: add x9, sp, #920
-; CHECK-NEXT: add x13, sp, #656
-; CHECK-NEXT: add x10, sp, #64
-; CHECK-NEXT: ld1 { v22.b }[6], [x9]
-; CHECK-NEXT: ld1 { v23.b }[6], [x13]
-; CHECK-NEXT: mov v3.b[6], w6
-; CHECK-NEXT: ld1 { v18.b }[6], [x11]
-; CHECK-NEXT: ld1 { v19.b }[6], [x12]
-; CHECK-NEXT: ld1 { v7.b }[6], [x10]
-; CHECK-NEXT: add x10, sp, #864
-; CHECK-NEXT: add x11, sp, #600
-; CHECK-NEXT: add x9, sp, #928
-; CHECK-NEXT: add x12, sp, #664
-; CHECK-NEXT: mov v21.s[0], v24.s[0]
-; CHECK-NEXT: ld1 { v22.b }[7], [x9]
-; CHECK-NEXT: ld1 { v18.b }[7], [x10]
-; CHECK-NEXT: ld1 { v19.b }[7], [x11]
-; CHECK-NEXT: ld1 { v23.b }[7], [x12]
-; CHECK-NEXT: add x8, sp, #200
-; CHECK-NEXT: mov v3.b[7], w7
-; CHECK-NEXT: add x10, sp, #336
-; CHECK-NEXT: ld1 { v5.b }[7], [x8]
-; CHECK-NEXT: add x8, sp, #72
-; CHECK-NEXT: ld1 { v6.b }[7], [x10]
-; CHECK-NEXT: smull v18.8h, v19.8b, v18.8b
-; CHECK-NEXT: movi v19.2d, #0000000000000000
-; CHECK-NEXT: ld1 { v7.b }[7], [x8]
-; CHECK-NEXT: smull v22.8h, v23.8b, v22.8b
-; CHECK-NEXT: sshll v20.4s, v20.4h, #0
-; CHECK-NEXT: smull v0.8h, v1.8b, v0.8b
-; CHECK-NEXT: saddw v1.4s, v21.4s, v16.4h
-; CHECK-NEXT: smull v2.8h, v3.8b, v2.8b
-; CHECK-NEXT: smull v3.8h, v5.8b, v4.8b
-; CHECK-NEXT: smull v4.8h, v7.8b, v6.8b
-; CHECK-NEXT: mov v19.s[0], v20.s[0]
-; CHECK-NEXT: saddl2 v5.4s, v18.8h, v17.8h
-; CHECK-NEXT: saddl v7.4s, v18.4h, v17.4h
-; CHECK-NEXT: saddl2 v6.4s, v16.8h, v22.8h
-; CHECK-NEXT: saddw v1.4s, v1.4s, v22.4h
-; CHECK-NEXT: saddl2 v17.4s, v2.8h, v0.8h
-; CHECK-NEXT: saddl2 v16.4s, v4.8h, v3.8h
-; CHECK-NEXT: saddl v3.4s, v4.4h, v3.4h
-; CHECK-NEXT: saddw v2.4s, v19.4s, v2.4h
-; CHECK-NEXT: add v5.4s, v6.4s, v5.4s
-; CHECK-NEXT: add v1.4s, v1.4s, v7.4s
-; CHECK-NEXT: add v6.4s, v17.4s, v16.4s
-; CHECK-NEXT: saddw v0.4s, v2.4s, v0.4h
-; CHECK-NEXT: add v1.4s, v1.4s, v5.4s
-; CHECK-NEXT: add v0.4s, v0.4s, v3.4s
-; CHECK-NEXT: add v1.4s, v6.4s, v1.4s
-; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: addv s0, v0.4s
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT: ret
entry:
%az = sext <33 x i8> %a to <33 x i32>
%bz = sext <33 x i8> %b to <33 x i32>
@@ -3418,163 +5328,399 @@ entry:
}
define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> %c, <33 x i8> %d) {
-; CHECK-LABEL: test_sdot_v33i8_double_nomla:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: ldr b0, [sp, #80]
-; CHECK-NEXT: add x8, sp, #88
-; CHECK-NEXT: ldr b2, [sp, #144]
-; CHECK-NEXT: add x9, sp, #152
-; CHECK-NEXT: ldr b3, [sp, #16]
-; CHECK-NEXT: add x11, sp, #104
-; CHECK-NEXT: ld1 { v0.b }[1], [x8]
-; CHECK-NEXT: ld1 { v2.b }[1], [x9]
-; CHECK-NEXT: add x9, sp, #24
-; CHECK-NEXT: add x8, sp, #96
-; CHECK-NEXT: ld1 { v3.b }[1], [x9]
-; CHECK-NEXT: ldr b5, [sp, #480]
-; CHECK-NEXT: fmov s1, w0
-; CHECK-NEXT: add x10, sp, #112
-; CHECK-NEXT: add x12, sp, #168
-; CHECK-NEXT: ld1 { v0.b }[2], [x8]
-; CHECK-NEXT: add x8, sp, #160
-; CHECK-NEXT: ldr b4, [sp, #608]
-; CHECK-NEXT: ld1 { v2.b }[2], [x8]
-; CHECK-NEXT: add x8, sp, #32
-; CHECK-NEXT: add x13, sp, #496
-; CHECK-NEXT: ld1 { v3.b }[2], [x8]
-; CHECK-NEXT: mov v1.b[1], w1
-; CHECK-NEXT: ldr b6, [sp, #672]
-; CHECK-NEXT: ld1 { v0.b }[3], [x11]
-; CHECK-NEXT: add x11, sp, #488
-; CHECK-NEXT: add x9, sp, #120
-; CHECK-NEXT: ld1 { v5.b }[1], [x11]
-; CHECK-NEXT: add x11, sp, #40
-; CHECK-NEXT: ld1 { v2.b }[3], [x12]
-; CHECK-NEXT: ld1 { v3.b }[3], [x11]
-; CHECK-NEXT: add x12, sp, #616
-; CHECK-NEXT: ldr b16, [sp, #544]
-; CHECK-NEXT: ld1 { v0.b }[4], [x10]
-; CHECK-NEXT: add x10, sp, #48
-; CHECK-NEXT: ld1 { v4.b }[1], [x12]
-; CHECK-NEXT: add x12, sp, #176
-; CHECK-NEXT: ld1 { v5.b }[2], [x13]
-; CHECK-NEXT: add x13, sp, #680
-; CHECK-NEXT: ld1 { v3.b }[4], [x10]
-; CHECK-NEXT: ld1 { v2.b }[4], [x12]
-; CHECK-NEXT: ld1 { v6.b }[1], [x13]
-; CHECK-NEXT: add x13, sp, #56
-; CHECK-NEXT: ld1 { v0.b }[5], [x9]
-; CHECK-NEXT: mov v1.b[2], w2
-; CHECK-NEXT: add x8, sp, #128
-; CHECK-NEXT: add x14, sp, #184
-; CHECK-NEXT: add x11, sp, #136
-; CHECK-NEXT: ld1 { v3.b }[5], [x13]
-; CHECK-NEXT: add x13, sp, #552
-; CHECK-NEXT: ld1 { v2.b }[5], [x14]
-; CHECK-NEXT: ld1 { v16.b }[1], [x13]
-; CHECK-NEXT: add x14, sp, #624
-; CHECK-NEXT: ld1 { v0.b }[6], [x8]
-; CHECK-NEXT: add x8, sp, #688
-; CHECK-NEXT: add x13, sp, #504
-; CHECK-NEXT: ld1 { v4.b }[2], [x14]
-; CHECK-NEXT: ld1 { v6.b }[2], [x8]
-; CHECK-NEXT: add x8, sp, #560
-; CHECK-NEXT: ld1 { v5.b }[3], [x13]
-; CHECK-NEXT: ld1 { v16.b }[2], [x8]
-; CHECK-NEXT: mov v1.b[3], w3
-; CHECK-NEXT: add x9, sp, #64
-; CHECK-NEXT: add x15, sp, #632
-; CHECK-NEXT: ld1 { v3.b }[6], [x9]
-; CHECK-NEXT: ld1 { v0.b }[7], [x11]
-; CHECK-NEXT: ld1 { v4.b }[3], [x15]
-; CHECK-NEXT: add x8, sp, #696
-; CHECK-NEXT: add x9, sp, #568
-; CHECK-NEXT: add x11, sp, #512
-; CHECK-NEXT: ld1 { v6.b }[3], [x8]
-; CHECK-NEXT: ld1 { v16.b }[3], [x9]
-; CHECK-NEXT: ld1 { v5.b }[4], [x11]
-; CHECK-NEXT: add x8, sp, #640
-; CHECK-NEXT: mov v1.b[4], w4
-; CHECK-NEXT: ld1 { v4.b }[4], [x8]
-; CHECK-NEXT: add x8, sp, #704
-; CHECK-NEXT: add x9, sp, #576
-; CHECK-NEXT: add x11, sp, #520
-; CHECK-NEXT: ld1 { v6.b }[4], [x8]
-; CHECK-NEXT: ld1 { v16.b }[4], [x9]
-; CHECK-NEXT: ld1 { v5.b }[5], [x11]
-; CHECK-NEXT: ldr b18, [sp, #736]
-; CHECK-NEXT: add x12, sp, #192
-; CHECK-NEXT: ld1 { v2.b }[6], [x12]
-; CHECK-NEXT: add x8, sp, #648
-; CHECK-NEXT: add x9, sp, #528
-; CHECK-NEXT: add x11, sp, #712
-; CHECK-NEXT: add x12, sp, #584
-; CHECK-NEXT: sshll v18.8h, v18.8b, #0
-; CHECK-NEXT: mov v1.b[5], w5
-; CHECK-NEXT: ld1 { v6.b }[5], [x11]
-; CHECK-NEXT: ld1 { v16.b }[5], [x12]
-; CHECK-NEXT: ld1 { v4.b }[5], [x8]
-; CHECK-NEXT: ld1 { v5.b }[6], [x9]
-; CHECK-NEXT: movi v17.2d, #0000000000000000
-; CHECK-NEXT: add x8, sp, #656
-; CHECK-NEXT: add x9, sp, #536
-; CHECK-NEXT: add x11, sp, #720
-; CHECK-NEXT: add x12, sp, #592
-; CHECK-NEXT: sshll v18.4s, v18.4h, #0
-; CHECK-NEXT: ldr b7, [sp, #208]
-; CHECK-NEXT: ld1 { v6.b }[6], [x11]
-; CHECK-NEXT: ld1 { v16.b }[6], [x12]
-; CHECK-NEXT: ld1 { v4.b }[6], [x8]
-; CHECK-NEXT: ld1 { v5.b }[7], [x9]
-; CHECK-NEXT: mov v1.b[6], w6
-; CHECK-NEXT: sshll v7.8h, v7.8b, #0
-; CHECK-NEXT: add x8, sp, #664
-; CHECK-NEXT: add x9, sp, #728
-; CHECK-NEXT: add x11, sp, #600
-; CHECK-NEXT: mov v17.s[0], v18.s[0]
-; CHECK-NEXT: ld1 { v6.b }[7], [x9]
-; CHECK-NEXT: ld1 { v16.b }[7], [x11]
-; CHECK-NEXT: ld1 { v4.b }[7], [x8]
-; CHECK-NEXT: sshll v5.8h, v5.8b, #0
-; CHECK-NEXT: movi v18.2d, #0000000000000000
-; CHECK-NEXT: add x10, sp, #200
-; CHECK-NEXT: mov v1.b[7], w7
-; CHECK-NEXT: add x9, sp, #72
-; CHECK-NEXT: sshll v7.4s, v7.4h, #0
-; CHECK-NEXT: ld1 { v2.b }[7], [x10]
-; CHECK-NEXT: ld1 { v3.b }[7], [x9]
-; CHECK-NEXT: sshll v6.8h, v6.8b, #0
-; CHECK-NEXT: sshll v16.8h, v16.8b, #0
-; CHECK-NEXT: sshll v4.8h, v4.8b, #0
-; CHECK-NEXT: saddw v17.4s, v17.4s, v5.4h
-; CHECK-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: mov v18.s[0], v7.s[0]
-; CHECK-NEXT: sshll v1.8h, v1.8b, #0
-; CHECK-NEXT: sshll v2.8h, v2.8b, #0
-; CHECK-NEXT: sshll v3.8h, v3.8b, #0
-; CHECK-NEXT: saddl2 v7.4s, v16.8h, v6.8h
-; CHECK-NEXT: saddl2 v5.4s, v5.8h, v4.8h
-; CHECK-NEXT: saddl v6.4s, v16.4h, v6.4h
-; CHECK-NEXT: saddw v4.4s, v17.4s, v4.4h
-; CHECK-NEXT: saddl2 v17.4s, v1.8h, v0.8h
-; CHECK-NEXT: saddl2 v16.4s, v3.8h, v2.8h
-; CHECK-NEXT: saddw v1.4s, v18.4s, v1.4h
-; CHECK-NEXT: add v5.4s, v5.4s, v7.4s
-; CHECK-NEXT: add v4.4s, v4.4s, v6.4s
-; CHECK-NEXT: saddl v2.4s, v3.4h, v2.4h
-; CHECK-NEXT: add v6.4s, v17.4s, v16.4s
-; CHECK-NEXT: saddw v0.4s, v1.4s, v0.4h
-; CHECK-NEXT: add v1.4s, v4.4s, v5.4s
-; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: add v1.4s, v6.4s, v1.4s
-; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: addv s0, v0.4s
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_sdot_v33i8_double_nomla:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT: .cfi_offset w29, -16
+; CHECK-SD-NEXT: ldr b0, [sp, #80]
+; CHECK-SD-NEXT: add x8, sp, #88
+; CHECK-SD-NEXT: ldr b2, [sp, #144]
+; CHECK-SD-NEXT: add x9, sp, #152
+; CHECK-SD-NEXT: ldr b3, [sp, #16]
+; CHECK-SD-NEXT: add x11, sp, #104
+; CHECK-SD-NEXT: ld1 { v0.b }[1], [x8]
+; CHECK-SD-NEXT: ld1 { v2.b }[1], [x9]
+; CHECK-SD-NEXT: add x9, sp, #24
+; CHECK-SD-NEXT: add x8, sp, #96
+; CHECK-SD-NEXT: ld1 { v3.b }[1], [x9]
+; CHECK-SD-NEXT: ldr b5, [sp, #480]
+; CHECK-SD-NEXT: fmov s1, w0
+; CHECK-SD-NEXT: add x10, sp, #112
+; CHECK-SD-NEXT: add x12, sp, #168
+; CHECK-SD-NEXT: ld1 { v0.b }[2], [x8]
+; CHECK-SD-NEXT: add x8, sp, #160
+; CHECK-SD-NEXT: ldr b4, [sp, #608]
+; CHECK-SD-NEXT: ld1 { v2.b }[2], [x8]
+; CHECK-SD-NEXT: add x8, sp, #32
+; CHECK-SD-NEXT: add x13, sp, #496
+; CHECK-SD-NEXT: ld1 { v3.b }[2], [x8]
+; CHECK-SD-NEXT: mov v1.b[1], w1
+; CHECK-SD-NEXT: ldr b6, [sp, #672]
+; CHECK-SD-NEXT: ld1 { v0.b }[3], [x11]
+; CHECK-SD-NEXT: add x11, sp, #488
+; CHECK-SD-NEXT: add x9, sp, #120
+; CHECK-SD-NEXT: ld1 { v5.b }[1], [x11]
+; CHECK-SD-NEXT: add x11, sp, #40
+; CHECK-SD-NEXT: ld1 { v2.b }[3], [x12]
+; CHECK-SD-NEXT: ld1 { v3.b }[3], [x11]
+; CHECK-SD-NEXT: add x12, sp, #616
+; CHECK-SD-NEXT: ldr b16, [sp, #544]
+; CHECK-SD-NEXT: ld1 { v0.b }[4], [x10]
+; CHECK-SD-NEXT: add x10, sp, #48
+; CHECK-SD-NEXT: ld1 { v4.b }[1], [x12]
+; CHECK-SD-NEXT: add x12, sp, #176
+; CHECK-SD-NEXT: ld1 { v5.b }[2], [x13]
+; CHECK-SD-NEXT: add x13, sp, #680
+; CHECK-SD-NEXT: ld1 { v3.b }[4], [x10]
+; CHECK-SD-NEXT: ld1 { v2.b }[4], [x12]
+; CHECK-SD-NEXT: ld1 { v6.b }[1], [x13]
+; CHECK-SD-NEXT: add x13, sp, #56
+; CHECK-SD-NEXT: ld1 { v0.b }[5], [x9]
+; CHECK-SD-NEXT: mov v1.b[2], w2
+; CHECK-SD-NEXT: add x8, sp, #128
+; CHECK-SD-NEXT: add x14, sp, #184
+; CHECK-SD-NEXT: add x11, sp, #136
+; CHECK-SD-NEXT: ld1 { v3.b }[5], [x13]
+; CHECK-SD-NEXT: add x13, sp, #552
+; CHECK-SD-NEXT: ld1 { v2.b }[5], [x14]
+; CHECK-SD-NEXT: ld1 { v16.b }[1], [x13]
+; CHECK-SD-NEXT: add x14, sp, #624
+; CHECK-SD-NEXT: ld1 { v0.b }[6], [x8]
+; CHECK-SD-NEXT: add x8, sp, #688
+; CHECK-SD-NEXT: add x13, sp, #504
+; CHECK-SD-NEXT: ld1 { v4.b }[2], [x14]
+; CHECK-SD-NEXT: ld1 { v6.b }[2], [x8]
+; CHECK-SD-NEXT: add x8, sp, #560
+; CHECK-SD-NEXT: ld1 { v5.b }[3], [x13]
+; CHECK-SD-NEXT: ld1 { v16.b }[2], [x8]
+; CHECK-SD-NEXT: mov v1.b[3], w3
+; CHECK-SD-NEXT: add x9, sp, #64
+; CHECK-SD-NEXT: add x15, sp, #632
+; CHECK-SD-NEXT: ld1 { v3.b }[6], [x9]
+; CHECK-SD-NEXT: ld1 { v0.b }[7], [x11]
+; CHECK-SD-NEXT: ld1 { v4.b }[3], [x15]
+; CHECK-SD-NEXT: add x8, sp, #696
+; CHECK-SD-NEXT: add x9, sp, #568
+; CHECK-SD-NEXT: add x11, sp, #512
+; CHECK-SD-NEXT: ld1 { v6.b }[3], [x8]
+; CHECK-SD-NEXT: ld1 { v16.b }[3], [x9]
+; CHECK-SD-NEXT: ld1 { v5.b }[4], [x11]
+; CHECK-SD-NEXT: add x8, sp, #640
+; CHECK-SD-NEXT: mov v1.b[4], w4
+; CHECK-SD-NEXT: ld1 { v4.b }[4], [x8]
+; CHECK-SD-NEXT: add x8, sp, #704
+; CHECK-SD-NEXT: add x9, sp, #576
+; CHECK-SD-NEXT: add x11, sp, #520
+; CHECK-SD-NEXT: ld1 { v6.b }[4], [x8]
+; CHECK-SD-NEXT: ld1 { v16.b }[4], [x9]
+; CHECK-SD-NEXT: ld1 { v5.b }[5], [x11]
+; CHECK-SD-NEXT: ldr b18, [sp, #736]
+; CHECK-SD-NEXT: add x12, sp, #192
+; CHECK-SD-NEXT: ld1 { v2.b }[6], [x12]
+; CHECK-SD-NEXT: add x8, sp, #648
+; CHECK-SD-NEXT: add x9, sp, #528
+; CHECK-SD-NEXT: add x11, sp, #712
+; CHECK-SD-NEXT: add x12, sp, #584
+; CHECK-SD-NEXT: sshll v18.8h, v18.8b, #0
+; CHECK-SD-NEXT: mov v1.b[5], w5
+; CHECK-SD-NEXT: ld1 { v6.b }[5], [x11]
+; CHECK-SD-NEXT: ld1 { v16.b }[5], [x12]
+; CHECK-SD-NEXT: ld1 { v4.b }[5], [x8]
+; CHECK-SD-NEXT: ld1 { v5.b }[6], [x9]
+; CHECK-SD-NEXT: movi v17.2d, #0000000000000000
+; CHECK-SD-NEXT: add x8, sp, #656
+; CHECK-SD-NEXT: add x9, sp, #536
+; CHECK-SD-NEXT: add x11, sp, #720
+; CHECK-SD-NEXT: add x12, sp, #592
+; CHECK-SD-NEXT: sshll v18.4s, v18.4h, #0
+; CHECK-SD-NEXT: ldr b7, [sp, #208]
+; CHECK-SD-NEXT: ld1 { v6.b }[6], [x11]
+; CHECK-SD-NEXT: ld1 { v16.b }[6], [x12]
+; CHECK-SD-NEXT: ld1 { v4.b }[6], [x8]
+; CHECK-SD-NEXT: ld1 { v5.b }[7], [x9]
+; CHECK-SD-NEXT: mov v1.b[6], w6
+; CHECK-SD-NEXT: sshll v7.8h, v7.8b, #0
+; CHECK-SD-NEXT: add x8, sp, #664
+; CHECK-SD-NEXT: add x9, sp, #728
+; CHECK-SD-NEXT: add x11, sp, #600
+; CHECK-SD-NEXT: mov v17.s[0], v18.s[0]
+; CHECK-SD-NEXT: ld1 { v6.b }[7], [x9]
+; CHECK-SD-NEXT: ld1 { v16.b }[7], [x11]
+; CHECK-SD-NEXT: ld1 { v4.b }[7], [x8]
+; CHECK-SD-NEXT: sshll v5.8h, v5.8b, #0
+; CHECK-SD-NEXT: movi v18.2d, #0000000000000000
+; CHECK-SD-NEXT: add x10, sp, #200
+; CHECK-SD-NEXT: mov v1.b[7], w7
+; CHECK-SD-NEXT: add x9, sp, #72
+; CHECK-SD-NEXT: sshll v7.4s, v7.4h, #0
+; CHECK-SD-NEXT: ld1 { v2.b }[7], [x10]
+; CHECK-SD-NEXT: ld1 { v3.b }[7], [x9]
+; CHECK-SD-NEXT: sshll v6.8h, v6.8b, #0
+; CHECK-SD-NEXT: sshll v16.8h, v16.8b, #0
+; CHECK-SD-NEXT: sshll v4.8h, v4.8b, #0
+; CHECK-SD-NEXT: saddw v17.4s, v17.4s, v5.4h
+; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: mov v18.s[0], v7.s[0]
+; CHECK-SD-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: sshll v2.8h, v2.8b, #0
+; CHECK-SD-NEXT: sshll v3.8h, v3.8b, #0
+; CHECK-SD-NEXT: saddl2 v7.4s, v16.8h, v6.8h
+; CHECK-SD-NEXT: saddl2 v5.4s, v5.8h, v4.8h
+; CHECK-SD-NEXT: saddl v6.4s, v16.4h, v6.4h
+; CHECK-SD-NEXT: saddw v4.4s, v17.4s, v4.4h
+; CHECK-SD-NEXT: saddl2 v17.4s, v1.8h, v0.8h
+; CHECK-SD-NEXT: saddl2 v16.4s, v3.8h, v2.8h
+; CHECK-SD-NEXT: saddw v1.4s, v18.4s, v1.4h
+; CHECK-SD-NEXT: add v5.4s, v5.4s, v7.4s
+; CHECK-SD-NEXT: add v4.4s, v4.4s, v6.4s
+; CHECK-SD-NEXT: saddl v2.4s, v3.4h, v2.4h
+; CHECK-SD-NEXT: add v6.4s, v17.4s, v16.4s
+; CHECK-SD-NEXT: saddw v0.4s, v1.4s, v0.4h
+; CHECK-SD-NEXT: add v1.4s, v4.4s, v5.4s
+; CHECK-SD-NEXT: add v0.4s, v0.4s, v2.4s
+; CHECK-SD-NEXT: add v1.4s, v6.4s, v1.4s
+; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: addv s0, v0.4s
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_sdot_v33i8_double_nomla:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
+; CHECK-GI-NEXT: .cfi_offset w29, -16
+; CHECK-GI-NEXT: sxtb w10, w0
+; CHECK-GI-NEXT: ldr w9, [sp, #16]
+; CHECK-GI-NEXT: ldr w8, [sp, #48]
+; CHECK-GI-NEXT: sxtb w11, w4
+; CHECK-GI-NEXT: sxtb w12, w6
+; CHECK-GI-NEXT: ldr w13, [sp, #592]
+; CHECK-GI-NEXT: mov v0.s[0], w10
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: mov v1.s[0], w11
+; CHECK-GI-NEXT: sxtb w10, w1
+; CHECK-GI-NEXT: sxtb w11, w5
+; CHECK-GI-NEXT: mov v2.s[0], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #24]
+; CHECK-GI-NEXT: mov v3.s[0], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #56]
+; CHECK-GI-NEXT: mov v22.s[0], wzr
+; CHECK-GI-NEXT: mov v0.s[1], w10
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: ldr w10, [sp, #80]
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: mov v1.s[1], w11
+; CHECK-GI-NEXT: sxtb w11, w2
+; CHECK-GI-NEXT: mov v2.s[1], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #32]
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: mov v3.s[1], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #64]
+; CHECK-GI-NEXT: mov v22.s[1], wzr
+; CHECK-GI-NEXT: mov v0.s[2], w11
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: sxtb w11, w3
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: mov v1.s[2], w12
+; CHECK-GI-NEXT: mov v4.s[0], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #40]
+; CHECK-GI-NEXT: mov v2.s[2], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #88]
+; CHECK-GI-NEXT: sxtb w12, w7
+; CHECK-GI-NEXT: mov v3.s[2], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #72]
+; CHECK-GI-NEXT: mov v0.s[3], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #112]
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: mov v1.s[3], w12
+; CHECK-GI-NEXT: ldr w12, [sp, #144]
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: mov v2.s[3], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #96]
+; CHECK-GI-NEXT: mov v4.s[1], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #176]
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: mov v3.s[3], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #120]
+; CHECK-GI-NEXT: mov v5.s[0], w11
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: ldr w11, [sp, #152]
+; CHECK-GI-NEXT: mov v6.s[0], w12
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: ldr w12, [sp, #104]
+; CHECK-GI-NEXT: mov v4.s[2], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #184]
+; CHECK-GI-NEXT: mov v7.s[0], w9
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: ldr w9, [sp, #128]
+; CHECK-GI-NEXT: mov v5.s[1], w8
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: ldr w8, [sp, #160]
+; CHECK-GI-NEXT: mov v6.s[1], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #480]
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: mov v4.s[3], w12
+; CHECK-GI-NEXT: ldr w12, [sp, #192]
+; CHECK-GI-NEXT: mov v7.s[1], w10
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: ldr w10, [sp, #136]
+; CHECK-GI-NEXT: mov v5.s[2], w9
+; CHECK-GI-NEXT: sxtb w9, w11
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: ldr w11, [sp, #168]
+; CHECK-GI-NEXT: mov v6.s[2], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #488]
+; CHECK-GI-NEXT: mov v16.s[0], w9
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: ldr w9, [sp, #200]
+; CHECK-GI-NEXT: mov v7.s[2], w12
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: ldr w12, [sp, #512]
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: mov v5.s[3], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #544]
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: mov v6.s[3], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #496]
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: mov v16.s[1], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #208]
+; CHECK-GI-NEXT: mov v7.s[3], w9
+; CHECK-GI-NEXT: sxtb w9, w10
+; CHECK-GI-NEXT: ldr w10, [sp, #520]
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: mov v17.s[0], w12
+; CHECK-GI-NEXT: ldr w12, [sp, #552]
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: mov v19.s[0], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #504]
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: mov v16.s[2], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #576]
+; CHECK-GI-NEXT: mov v18.s[0], w8
+; CHECK-GI-NEXT: sxtb w8, w12
+; CHECK-GI-NEXT: ldr w12, [sp, #528]
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: mov v17.s[1], w10
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: ldr w10, [sp, #560]
+; CHECK-GI-NEXT: mov v19.s[1], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #608]
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: mov v16.s[3], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #584]
+; CHECK-GI-NEXT: mov v20.s[0], w11
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: ldr w11, [sp, #536]
+; CHECK-GI-NEXT: mov v17.s[2], w12
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: mov v19.s[2], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #616]
+; CHECK-GI-NEXT: mov v21.s[0], w8
+; CHECK-GI-NEXT: ldr w12, [sp, #568]
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: mov v22.s[2], wzr
+; CHECK-GI-NEXT: mov v20.s[1], w9
+; CHECK-GI-NEXT: sxtb w9, w10
+; CHECK-GI-NEXT: ldr w10, [sp, #640]
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: mov v17.s[3], w11
+; CHECK-GI-NEXT: sxtb w11, w13
+; CHECK-GI-NEXT: mov v21.s[1], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #672]
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: mov v19.s[3], w12
+; CHECK-GI-NEXT: ldr w12, [sp, #624]
+; CHECK-GI-NEXT: ldr w8, [sp, #600]
+; CHECK-GI-NEXT: mov v20.s[2], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #704]
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: mov v23.s[0], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #648]
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: mov v24.s[0], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #736]
+; CHECK-GI-NEXT: mov v21.s[2], w12
+; CHECK-GI-NEXT: ldr w12, [sp, #680]
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: mov v25.s[0], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #712]
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: mov v23.s[1], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #656]
+; CHECK-GI-NEXT: sxtb w11, w11
+; CHECK-GI-NEXT: mov v26.s[0], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #720]
+; CHECK-GI-NEXT: mov v24.s[1], w12
+; CHECK-GI-NEXT: ldr w12, [sp, #688]
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: mov v25.s[1], w11
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: mov v18.s[1], wzr
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: ldr w11, [sp, #632]
+; CHECK-GI-NEXT: mov v23.s[2], w10
+; CHECK-GI-NEXT: mov v26.s[1], wzr
+; CHECK-GI-NEXT: ldr w10, [sp, #664]
+; CHECK-GI-NEXT: sxtb w8, w8
+; CHECK-GI-NEXT: mov v24.s[2], w12
+; CHECK-GI-NEXT: ldr w12, [sp, #696]
+; CHECK-GI-NEXT: mov v22.s[3], wzr
+; CHECK-GI-NEXT: mov v25.s[2], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #728]
+; CHECK-GI-NEXT: mov v20.s[3], w8
+; CHECK-GI-NEXT: sxtb w8, w11
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: sxtb w11, w12
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: mov v18.s[2], wzr
+; CHECK-GI-NEXT: mov v26.s[2], wzr
+; CHECK-GI-NEXT: mov v21.s[3], w8
+; CHECK-GI-NEXT: mov v23.s[3], w10
+; CHECK-GI-NEXT: mov v24.s[3], w11
+; CHECK-GI-NEXT: mov v25.s[3], w9
+; CHECK-GI-NEXT: add v27.4s, v22.4s, v22.4s
+; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: add v2.4s, v4.4s, v5.4s
+; CHECK-GI-NEXT: add v3.4s, v6.4s, v7.4s
+; CHECK-GI-NEXT: mov v18.s[3], wzr
+; CHECK-GI-NEXT: mov v26.s[3], wzr
+; CHECK-GI-NEXT: add v4.4s, v16.4s, v17.4s
+; CHECK-GI-NEXT: add v5.4s, v22.4s, v27.4s
+; CHECK-GI-NEXT: add v6.4s, v19.4s, v20.4s
+; CHECK-GI-NEXT: add v7.4s, v21.4s, v23.4s
+; CHECK-GI-NEXT: add v16.4s, v24.4s, v25.4s
+; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: add v3.4s, v27.4s, v27.4s
+; CHECK-GI-NEXT: add v2.4s, v18.4s, v5.4s
+; CHECK-GI-NEXT: add v4.4s, v4.4s, v6.4s
+; CHECK-GI-NEXT: add v5.4s, v26.4s, v5.4s
+; CHECK-GI-NEXT: add v6.4s, v7.4s, v16.4s
+; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: add v3.4s, v5.4s, v3.4s
+; CHECK-GI-NEXT: add v2.4s, v4.4s, v6.4s
+; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: addv s0, v0.4s
+; CHECK-GI-NEXT: addv s1, v1.4s
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: fmov w9, s1
+; CHECK-GI-NEXT: add w0, w8, w9
+; CHECK-GI-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-GI-NEXT: ret
entry:
%az = sext <33 x i8> %a to <33 x i32>
%r1 = call i32 @llvm.vector.reduce.add.v33i32(<33 x i32> %az)
@@ -3602,24 +5748,25 @@ define i32 @test_udot_v48i8(ptr nocapture readonly %a, ptr nocapture readonly %b
;
; CHECK-GI-LABEL: test_udot_v48i8:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: movi v0.2d, #0000000000000000
+; CHECK-GI-NEXT: mov v0.s[0], wzr
; CHECK-GI-NEXT: movi v1.2d, #0000000000000000
; CHECK-GI-NEXT: ldr q7, [x0, #32]
-; CHECK-GI-NEXT: ldp q3, q4, [x0]
; CHECK-GI-NEXT: movi v2.2d, #0000000000000000
-; CHECK-GI-NEXT: ldp q5, q6, [x1]
-; CHECK-GI-NEXT: ldr q16, [x1, #32]
-; CHECK-GI-NEXT: udot v0.4s, v5.16b, v3.16b
+; CHECK-GI-NEXT: movi v3.2d, #0000000000000000
+; CHECK-GI-NEXT: ldr q17, [x1, #32]
+; CHECK-GI-NEXT: ldp q4, q5, [x0]
+; CHECK-GI-NEXT: ldp q6, q16, [x1]
+; CHECK-GI-NEXT: mov v0.s[1], wzr
+; CHECK-GI-NEXT: udot v2.4s, v17.16b, v7.16b
; CHECK-GI-NEXT: udot v1.4s, v6.16b, v4.16b
-; CHECK-GI-NEXT: udot v2.4s, v16.16b, v7.16b
+; CHECK-GI-NEXT: udot v3.4s, v16.16b, v5.16b
+; CHECK-GI-NEXT: mov v0.s[2], wzr
+; CHECK-GI-NEXT: add v1.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT: mov v0.s[3], wzr
+; CHECK-GI-NEXT: add v0.4s, v2.4s, v0.4s
+; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-GI-NEXT: addv s0, v0.4s
-; CHECK-GI-NEXT: addv s1, v1.4s
-; CHECK-GI-NEXT: addv s2, v2.4s
; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: add w8, w8, w9
-; CHECK-GI-NEXT: fmov w9, s2
-; CHECK-GI-NEXT: add w8, w8, w9
; CHECK-GI-NEXT: add w0, w8, w2
; CHECK-GI-NEXT: ret
entry:
@@ -3649,23 +5796,24 @@ define i32 @test_udot_v48i8_nomla(ptr nocapture readonly %a1) {
;
; CHECK-GI-LABEL: test_udot_v48i8_nomla:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: movi v0.16b, #1
-; CHECK-GI-NEXT: movi v1.2d, #0000000000000000
-; CHECK-GI-NEXT: ldr q6, [x0, #32]
+; CHECK-GI-NEXT: mov v0.s[0], wzr
+; CHECK-GI-NEXT: movi v1.16b, #1
+; CHECK-GI-NEXT: ldr q7, [x0, #32]
; CHECK-GI-NEXT: movi v2.2d, #0000000000000000
-; CHECK-GI-NEXT: ldp q4, q5, [x0]
; CHECK-GI-NEXT: movi v3.2d, #0000000000000000
-; CHECK-GI-NEXT: udot v1.4s, v4.16b, v0.16b
-; CHECK-GI-NEXT: udot v2.4s, v5.16b, v0.16b
-; CHECK-GI-NEXT: udot v3.4s, v6.16b, v0.16b
-; CHECK-GI-NEXT: addv s0, v1.4s
-; CHECK-GI-NEXT: addv s1, v2.4s
-; CHECK-GI-NEXT: addv s2, v3.4s
-; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: add w8, w8, w9
-; CHECK-GI-NEXT: fmov w9, s2
-; CHECK-GI-NEXT: add w0, w8, w9
+; CHECK-GI-NEXT: movi v4.2d, #0000000000000000
+; CHECK-GI-NEXT: ldp q5, q6, [x0]
+; CHECK-GI-NEXT: mov v0.s[1], wzr
+; CHECK-GI-NEXT: udot v2.4s, v5.16b, v1.16b
+; CHECK-GI-NEXT: udot v4.4s, v6.16b, v1.16b
+; CHECK-GI-NEXT: udot v3.4s, v7.16b, v1.16b
+; CHECK-GI-NEXT: mov v0.s[2], wzr
+; CHECK-GI-NEXT: add v1.4s, v2.4s, v4.4s
+; CHECK-GI-NEXT: mov v0.s[3], wzr
+; CHECK-GI-NEXT: add v0.4s, v3.4s, v0.4s
+; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: addv s0, v0.4s
+; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
entry:
%0 = load <48 x i8>, ptr %a1
@@ -3691,24 +5839,25 @@ define i32 @test_sdot_v48i8(ptr nocapture readonly %a, ptr nocapture readonly %b
;
; CHECK-GI-LABEL: test_sdot_v48i8:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: movi v0.2d, #0000000000000000
+; CHECK-GI-NEXT: mov v0.s[0], wzr
; CHECK-GI-NEXT: movi v1.2d, #0000000000000000
; CHECK-GI-NEXT: ldr q7, [x0, #32]
-; CHECK-GI-NEXT: ldp q3, q4, [x0]
; CHECK-GI-NEXT: movi v2.2d, #0000000000000000
-; CHECK-GI-NEXT: ldp q5, q6, [x1]
-; CHECK-GI-NEXT: ldr q16, [x1, #32]
-; CHECK-GI-NEXT: sdot v0.4s, v5.16b, v3.16b
+; CHECK-GI-NEXT: movi v3.2d, #0000000000000000
+; CHECK-GI-NEXT: ldr q17, [x1, #32]
+; CHECK-GI-NEXT: ldp q4, q5, [x0]
+; CHECK-GI-NEXT: ldp q6, q16, [x1]
+; CHECK-GI-NEXT: mov v0.s[1], wzr
+; CHECK-GI-NEXT: sdot v2.4s, v17.16b, v7.16b
; CHECK-GI-NEXT: sdot v1.4s, v6.16b, v4.16b
-; CHECK-GI-NEXT: sdot v2.4s, v16.16b, v7.16b
+; CHECK-GI-NEXT: sdot v3.4s, v16.16b, v5.16b
+; CHECK-GI-NEXT: mov v0.s[2], wzr
+; CHECK-GI-NEXT: add v1.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT: mov v0.s[3], wzr
+; CHECK-GI-NEXT: add v0.4s, v2.4s, v0.4s
+; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-GI-NEXT: addv s0, v0.4s
-; CHECK-GI-NEXT: addv s1, v1.4s
-; CHECK-GI-NEXT: addv s2, v2.4s
; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: add w8, w8, w9
-; CHECK-GI-NEXT: fmov w9, s2
-; CHECK-GI-NEXT: add w8, w8, w9
; CHECK-GI-NEXT: add w0, w8, w2
; CHECK-GI-NEXT: ret
entry:
@@ -4113,409 +6262,407 @@ define i32 @test_sdot_v48i8_double(<48 x i8> %a, <48 x i8> %b, <48 x i8> %c, <48
; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
; CHECK-GI-NEXT: .cfi_offset w29, -16
; CHECK-GI-NEXT: ldr w11, [sp, #80]
-; CHECK-GI-NEXT: ldr w10, [sp, #208]
+; CHECK-GI-NEXT: ldr w8, [sp, #208]
; CHECK-GI-NEXT: fmov s0, w0
-; CHECK-GI-NEXT: ldr w8, [sp, #88]
-; CHECK-GI-NEXT: ldr w12, [sp, #344]
-; CHECK-GI-NEXT: movi v20.2d, #0000000000000000
+; CHECK-GI-NEXT: ldr w12, [sp, #336]
+; CHECK-GI-NEXT: ldr w9, [sp, #88]
+; CHECK-GI-NEXT: mov v20.s[0], wzr
; CHECK-GI-NEXT: fmov s1, w11
-; CHECK-GI-NEXT: ldr w11, [sp, #336]
-; CHECK-GI-NEXT: fmov s2, w10
-; CHECK-GI-NEXT: ldr w10, [sp, #464]
-; CHECK-GI-NEXT: ldr w9, [sp, #216]
+; CHECK-GI-NEXT: fmov s2, w8
+; CHECK-GI-NEXT: ldr w11, [sp, #464]
+; CHECK-GI-NEXT: ldr w8, [sp, #592]
+; CHECK-GI-NEXT: ldr w10, [sp, #216]
+; CHECK-GI-NEXT: fmov s3, w12
+; CHECK-GI-NEXT: fmov s4, w11
; CHECK-GI-NEXT: mov v0.b[1], w1
-; CHECK-GI-NEXT: fmov s3, w11
; CHECK-GI-NEXT: ldr w11, [sp, #600]
-; CHECK-GI-NEXT: movi v21.2d, #0000000000000000
-; CHECK-GI-NEXT: mov v1.b[1], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #592]
-; CHECK-GI-NEXT: fmov s4, w10
-; CHECK-GI-NEXT: mov v2.b[1], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #472]
-; CHECK-GI-NEXT: ldr w10, [sp, #608]
-; CHECK-GI-NEXT: mov v3.b[1], w12
; CHECK-GI-NEXT: fmov s5, w8
+; CHECK-GI-NEXT: mov v1.b[1], w9
+; CHECK-GI-NEXT: mov v2.b[1], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #344]
+; CHECK-GI-NEXT: ldr w9, [sp, #472]
; CHECK-GI-NEXT: ldr w8, [sp, #96]
+; CHECK-GI-NEXT: ldr w12, [sp, #848]
+; CHECK-GI-NEXT: ldr w13, [sp, #728]
+; CHECK-GI-NEXT: mov v20.s[1], wzr
+; CHECK-GI-NEXT: mov v3.b[1], w10
; CHECK-GI-NEXT: mov v4.b[1], w9
+; CHECK-GI-NEXT: mov v5.b[1], w11
; CHECK-GI-NEXT: ldr w9, [sp, #224]
; CHECK-GI-NEXT: mov v0.b[2], w2
+; CHECK-GI-NEXT: ldr w10, [sp, #352]
+; CHECK-GI-NEXT: ldr w11, [sp, #480]
; CHECK-GI-NEXT: mov v1.b[2], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #352]
-; CHECK-GI-NEXT: ldr w12, [sp, #848]
+; CHECK-GI-NEXT: ldr w8, [sp, #608]
; CHECK-GI-NEXT: mov v2.b[2], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #480]
-; CHECK-GI-NEXT: mov v5.b[1], w11
-; CHECK-GI-NEXT: mov v3.b[2], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #104]
-; CHECK-GI-NEXT: ldr w11, [sp, #16]
-; CHECK-GI-NEXT: mov v4.b[2], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #232]
+; CHECK-GI-NEXT: ldr w9, [sp, #104]
+; CHECK-GI-NEXT: fmov s7, w12
+; CHECK-GI-NEXT: mov v3.b[2], w10
+; CHECK-GI-NEXT: mov v4.b[2], w11
+; CHECK-GI-NEXT: mov v5.b[2], w8
+; CHECK-GI-NEXT: ldr w10, [sp, #232]
; CHECK-GI-NEXT: mov v0.b[3], w3
-; CHECK-GI-NEXT: mov v1.b[3], w8
; CHECK-GI-NEXT: ldr w8, [sp, #360]
-; CHECK-GI-NEXT: fmov s7, w12
-; CHECK-GI-NEXT: mov v2.b[3], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #488]
-; CHECK-GI-NEXT: mov v5.b[2], w10
+; CHECK-GI-NEXT: ldr w11, [sp, #488]
+; CHECK-GI-NEXT: mov v1.b[3], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #616]
+; CHECK-GI-NEXT: mov v2.b[3], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #240]
+; CHECK-GI-NEXT: ldr w12, [sp, #1232]
; CHECK-GI-NEXT: mov v3.b[3], w8
+; CHECK-GI-NEXT: mov v4.b[3], w11
; CHECK-GI-NEXT: ldr w8, [sp, #112]
-; CHECK-GI-NEXT: ldr w10, [sp, #616]
-; CHECK-GI-NEXT: mov v4.b[3], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #240]
+; CHECK-GI-NEXT: mov v5.b[3], w9
; CHECK-GI-NEXT: mov v0.b[4], w4
+; CHECK-GI-NEXT: ldr w9, [sp, #368]
+; CHECK-GI-NEXT: ldr w11, [sp, #496]
; CHECK-GI-NEXT: mov v1.b[4], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #368]
-; CHECK-GI-NEXT: ldr w12, [sp, #1112]
-; CHECK-GI-NEXT: mov v2.b[4], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #496]
-; CHECK-GI-NEXT: mov v5.b[3], w10
-; CHECK-GI-NEXT: mov v3.b[4], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #120]
-; CHECK-GI-NEXT: ldr w10, [sp, #624]
-; CHECK-GI-NEXT: mov v4.b[4], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #248]
+; CHECK-GI-NEXT: ldr w8, [sp, #624]
+; CHECK-GI-NEXT: mov v2.b[4], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #248]
+; CHECK-GI-NEXT: fmov s18, w12
+; CHECK-GI-NEXT: mov v3.b[4], w9
+; CHECK-GI-NEXT: mov v4.b[4], w11
+; CHECK-GI-NEXT: ldr w9, [sp, #120]
+; CHECK-GI-NEXT: mov v5.b[4], w8
; CHECK-GI-NEXT: mov v0.b[5], w5
-; CHECK-GI-NEXT: mov v1.b[5], w8
; CHECK-GI-NEXT: ldr w8, [sp, #376]
-; CHECK-GI-NEXT: movi v22.2d, #0000000000000000
-; CHECK-GI-NEXT: mov v2.b[5], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #504]
-; CHECK-GI-NEXT: mov v5.b[4], w10
+; CHECK-GI-NEXT: ldr w11, [sp, #504]
+; CHECK-GI-NEXT: mov v1.b[5], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #632]
+; CHECK-GI-NEXT: mov v2.b[5], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #256]
+; CHECK-GI-NEXT: ldr w12, [sp, #744]
; CHECK-GI-NEXT: mov v3.b[5], w8
+; CHECK-GI-NEXT: mov v4.b[5], w11
; CHECK-GI-NEXT: ldr w8, [sp, #128]
-; CHECK-GI-NEXT: ldr w10, [sp, #632]
-; CHECK-GI-NEXT: mov v4.b[5], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #256]
+; CHECK-GI-NEXT: mov v5.b[5], w9
; CHECK-GI-NEXT: mov v0.b[6], w6
+; CHECK-GI-NEXT: ldr w9, [sp, #384]
+; CHECK-GI-NEXT: ldr w11, [sp, #512]
; CHECK-GI-NEXT: mov v1.b[6], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #384]
-; CHECK-GI-NEXT: movi v23.2d, #0000000000000000
-; CHECK-GI-NEXT: mov v2.b[6], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #512]
-; CHECK-GI-NEXT: mov v5.b[5], w10
-; CHECK-GI-NEXT: mov v3.b[6], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #136]
-; CHECK-GI-NEXT: ldr w10, [sp, #640]
-; CHECK-GI-NEXT: mov v4.b[6], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #264]
+; CHECK-GI-NEXT: ldr w8, [sp, #640]
+; CHECK-GI-NEXT: mov v2.b[6], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #264]
+; CHECK-GI-NEXT: movi v21.2d, #0000000000000000
+; CHECK-GI-NEXT: mov v3.b[6], w9
+; CHECK-GI-NEXT: mov v4.b[6], w11
+; CHECK-GI-NEXT: ldr w9, [sp, #136]
+; CHECK-GI-NEXT: mov v5.b[6], w8
; CHECK-GI-NEXT: mov v0.b[7], w7
-; CHECK-GI-NEXT: mov v1.b[7], w8
; CHECK-GI-NEXT: ldr w8, [sp, #392]
-; CHECK-GI-NEXT: movi v24.2d, #0000000000000000
-; CHECK-GI-NEXT: mov v2.b[7], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #520]
-; CHECK-GI-NEXT: mov v5.b[6], w10
+; CHECK-GI-NEXT: ldr w11, [sp, #520]
+; CHECK-GI-NEXT: mov v1.b[7], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #648]
+; CHECK-GI-NEXT: mov v2.b[7], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #16]
+; CHECK-GI-NEXT: movi v22.2d, #0000000000000000
; CHECK-GI-NEXT: mov v3.b[7], w8
+; CHECK-GI-NEXT: mov v4.b[7], w11
; CHECK-GI-NEXT: ldr w8, [sp, #144]
-; CHECK-GI-NEXT: ldr w10, [sp, #648]
-; CHECK-GI-NEXT: mov v4.b[7], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #272]
-; CHECK-GI-NEXT: mov v0.b[8], w11
+; CHECK-GI-NEXT: mov v5.b[7], w9
+; CHECK-GI-NEXT: ldr w11, [sp, #272]
+; CHECK-GI-NEXT: mov v0.b[8], w10
+; CHECK-GI-NEXT: ldr w9, [sp, #400]
+; CHECK-GI-NEXT: ldr w10, [sp, #528]
; CHECK-GI-NEXT: mov v1.b[8], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #400]
+; CHECK-GI-NEXT: ldr w8, [sp, #656]
+; CHECK-GI-NEXT: mov v2.b[8], w11
; CHECK-GI-NEXT: ldr w11, [sp, #24]
-; CHECK-GI-NEXT: mov v2.b[8], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #528]
-; CHECK-GI-NEXT: mov v5.b[7], w10
-; CHECK-GI-NEXT: mov v3.b[8], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #152]
-; CHECK-GI-NEXT: ldr w10, [sp, #656]
-; CHECK-GI-NEXT: mov v4.b[8], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #280]
+; CHECK-GI-NEXT: mov v3.b[8], w9
+; CHECK-GI-NEXT: mov v4.b[8], w10
+; CHECK-GI-NEXT: ldr w9, [sp, #152]
+; CHECK-GI-NEXT: mov v5.b[8], w8
+; CHECK-GI-NEXT: ldr w10, [sp, #280]
; CHECK-GI-NEXT: mov v0.b[9], w11
-; CHECK-GI-NEXT: mov v1.b[9], w8
; CHECK-GI-NEXT: ldr w8, [sp, #408]
-; CHECK-GI-NEXT: ldr w11, [sp, #32]
-; CHECK-GI-NEXT: mov v2.b[9], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #536]
-; CHECK-GI-NEXT: mov v5.b[8], w10
+; CHECK-GI-NEXT: ldr w11, [sp, #536]
+; CHECK-GI-NEXT: mov v1.b[9], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #664]
+; CHECK-GI-NEXT: mov v2.b[9], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #32]
; CHECK-GI-NEXT: mov v3.b[9], w8
+; CHECK-GI-NEXT: mov v4.b[9], w11
; CHECK-GI-NEXT: ldr w8, [sp, #160]
-; CHECK-GI-NEXT: ldr w10, [sp, #664]
-; CHECK-GI-NEXT: mov v4.b[9], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #288]
-; CHECK-GI-NEXT: mov v0.b[10], w11
+; CHECK-GI-NEXT: mov v5.b[9], w9
+; CHECK-GI-NEXT: ldr w11, [sp, #288]
+; CHECK-GI-NEXT: mov v0.b[10], w10
+; CHECK-GI-NEXT: ldr w9, [sp, #416]
+; CHECK-GI-NEXT: ldr w10, [sp, #544]
; CHECK-GI-NEXT: mov v1.b[10], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #416]
+; CHECK-GI-NEXT: ldr w8, [sp, #672]
+; CHECK-GI-NEXT: mov v2.b[10], w11
; CHECK-GI-NEXT: ldr w11, [sp, #40]
-; CHECK-GI-NEXT: mov v2.b[10], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #544]
-; CHECK-GI-NEXT: mov v5.b[9], w10
-; CHECK-GI-NEXT: mov v3.b[10], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #168]
-; CHECK-GI-NEXT: ldr w10, [sp, #672]
-; CHECK-GI-NEXT: mov v4.b[10], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #296]
+; CHECK-GI-NEXT: mov v3.b[10], w9
+; CHECK-GI-NEXT: mov v4.b[10], w10
+; CHECK-GI-NEXT: ldr w9, [sp, #168]
+; CHECK-GI-NEXT: mov v5.b[10], w8
+; CHECK-GI-NEXT: ldr w10, [sp, #296]
; CHECK-GI-NEXT: mov v0.b[11], w11
-; CHECK-GI-NEXT: mov v1.b[11], w8
; CHECK-GI-NEXT: ldr w8, [sp, #424]
-; CHECK-GI-NEXT: ldr w11, [sp, #48]
-; CHECK-GI-NEXT: mov v2.b[11], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #552]
-; CHECK-GI-NEXT: mov v5.b[10], w10
+; CHECK-GI-NEXT: ldr w11, [sp, #552]
+; CHECK-GI-NEXT: mov v1.b[11], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #680]
+; CHECK-GI-NEXT: mov v2.b[11], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #48]
; CHECK-GI-NEXT: mov v3.b[11], w8
+; CHECK-GI-NEXT: mov v4.b[11], w11
; CHECK-GI-NEXT: ldr w8, [sp, #176]
-; CHECK-GI-NEXT: ldr w10, [sp, #680]
-; CHECK-GI-NEXT: mov v4.b[11], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #304]
-; CHECK-GI-NEXT: mov v0.b[12], w11
+; CHECK-GI-NEXT: mov v5.b[11], w9
+; CHECK-GI-NEXT: ldr w11, [sp, #304]
+; CHECK-GI-NEXT: mov v0.b[12], w10
+; CHECK-GI-NEXT: ldr w9, [sp, #432]
+; CHECK-GI-NEXT: ldr w10, [sp, #560]
; CHECK-GI-NEXT: mov v1.b[12], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #432]
+; CHECK-GI-NEXT: ldr w8, [sp, #688]
+; CHECK-GI-NEXT: mov v2.b[12], w11
; CHECK-GI-NEXT: ldr w11, [sp, #56]
-; CHECK-GI-NEXT: mov v2.b[12], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #560]
-; CHECK-GI-NEXT: mov v5.b[11], w10
-; CHECK-GI-NEXT: mov v3.b[12], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #184]
-; CHECK-GI-NEXT: ldr w10, [sp, #688]
-; CHECK-GI-NEXT: mov v4.b[12], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #312]
+; CHECK-GI-NEXT: mov v3.b[12], w9
+; CHECK-GI-NEXT: mov v4.b[12], w10
+; CHECK-GI-NEXT: ldr w9, [sp, #184]
+; CHECK-GI-NEXT: mov v5.b[12], w8
+; CHECK-GI-NEXT: ldr w10, [sp, #312]
; CHECK-GI-NEXT: mov v0.b[13], w11
-; CHECK-GI-NEXT: mov v1.b[13], w8
; CHECK-GI-NEXT: ldr w8, [sp, #440]
-; CHECK-GI-NEXT: ldr w11, [sp, #64]
-; CHECK-GI-NEXT: mov v2.b[13], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #568]
-; CHECK-GI-NEXT: mov v5.b[12], w10
+; CHECK-GI-NEXT: ldr w11, [sp, #568]
+; CHECK-GI-NEXT: mov v1.b[13], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #696]
+; CHECK-GI-NEXT: mov v2.b[13], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #64]
; CHECK-GI-NEXT: mov v3.b[13], w8
+; CHECK-GI-NEXT: mov v4.b[13], w11
; CHECK-GI-NEXT: ldr w8, [sp, #192]
-; CHECK-GI-NEXT: ldr w10, [sp, #696]
-; CHECK-GI-NEXT: mov v4.b[13], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #320]
-; CHECK-GI-NEXT: mov v0.b[14], w11
+; CHECK-GI-NEXT: mov v5.b[13], w9
+; CHECK-GI-NEXT: ldr w11, [sp, #320]
+; CHECK-GI-NEXT: mov v0.b[14], w10
+; CHECK-GI-NEXT: ldr w9, [sp, #448]
+; CHECK-GI-NEXT: ldr w10, [sp, #576]
; CHECK-GI-NEXT: mov v1.b[14], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #448]
+; CHECK-GI-NEXT: ldr w8, [sp, #704]
+; CHECK-GI-NEXT: mov v2.b[14], w11
; CHECK-GI-NEXT: ldr w11, [sp, #72]
-; CHECK-GI-NEXT: mov v2.b[14], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #576]
-; CHECK-GI-NEXT: mov v5.b[13], w10
-; CHECK-GI-NEXT: mov v3.b[14], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #720]
-; CHECK-GI-NEXT: ldr w10, [sp, #704]
-; CHECK-GI-NEXT: mov v4.b[14], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #728]
+; CHECK-GI-NEXT: mov v3.b[14], w9
+; CHECK-GI-NEXT: mov v4.b[14], w10
+; CHECK-GI-NEXT: ldr w9, [sp, #200]
+; CHECK-GI-NEXT: mov v5.b[14], w8
+; CHECK-GI-NEXT: ldr w10, [sp, #328]
; CHECK-GI-NEXT: mov v0.b[15], w11
-; CHECK-GI-NEXT: fmov s6, w8
-; CHECK-GI-NEXT: ldr w8, [sp, #328]
-; CHECK-GI-NEXT: ldr w11, [sp, #456]
-; CHECK-GI-NEXT: mov v5.b[14], w10
-; CHECK-GI-NEXT: ldr w10, [sp, #200]
-; CHECK-GI-NEXT: movi v25.2d, #0000000000000000
-; CHECK-GI-NEXT: mov v2.b[15], w8
-; CHECK-GI-NEXT: mov v3.b[15], w11
-; CHECK-GI-NEXT: ldr w11, [sp, #736]
-; CHECK-GI-NEXT: mov v6.b[1], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #584]
-; CHECK-GI-NEXT: ldr w8, [sp, #856]
-; CHECK-GI-NEXT: mov v1.b[15], w10
-; CHECK-GI-NEXT: ldr w10, [sp, #712]
-; CHECK-GI-NEXT: mov v4.b[15], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #976]
-; CHECK-GI-NEXT: mov v7.b[1], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #1232]
-; CHECK-GI-NEXT: mov v5.b[15], w10
-; CHECK-GI-NEXT: ldr w10, [sp, #984]
-; CHECK-GI-NEXT: mov v6.b[2], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #584]
+; CHECK-GI-NEXT: mov v1.b[15], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #712]
+; CHECK-GI-NEXT: mov v2.b[15], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #720]
+; CHECK-GI-NEXT: ldr w8, [sp, #456]
+; CHECK-GI-NEXT: mov v4.b[15], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #976]
+; CHECK-GI-NEXT: movi v23.2d, #0000000000000000
+; CHECK-GI-NEXT: mov v5.b[15], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #856]
+; CHECK-GI-NEXT: fmov s6, w10
+; CHECK-GI-NEXT: fmov s16, w11
; CHECK-GI-NEXT: ldr w11, [sp, #1104]
-; CHECK-GI-NEXT: fmov s16, w9
+; CHECK-GI-NEXT: ldr w10, [sp, #984]
+; CHECK-GI-NEXT: mov v7.b[1], w9
; CHECK-GI-NEXT: ldr w9, [sp, #1360]
-; CHECK-GI-NEXT: fmov s18, w8
-; CHECK-GI-NEXT: ldr w8, [sp, #1368]
+; CHECK-GI-NEXT: mov v3.b[15], w8
; CHECK-GI-NEXT: fmov s17, w11
-; CHECK-GI-NEXT: ldr w11, [sp, #1240]
-; CHECK-GI-NEXT: sdot v20.4s, v0.16b, v3.16b
-; CHECK-GI-NEXT: mov v16.b[1], w10
+; CHECK-GI-NEXT: mov v6.b[1], w13
+; CHECK-GI-NEXT: ldr w13, [sp, #1112]
; CHECK-GI-NEXT: fmov s19, w9
-; CHECK-GI-NEXT: ldr w10, [sp, #864]
-; CHECK-GI-NEXT: mov v18.b[1], w11
-; CHECK-GI-NEXT: ldr w11, [sp, #992]
-; CHECK-GI-NEXT: ldr w9, [sp, #1120]
-; CHECK-GI-NEXT: mov v17.b[1], w12
-; CHECK-GI-NEXT: mov v7.b[2], w10
-; CHECK-GI-NEXT: ldr w10, [sp, #1248]
-; CHECK-GI-NEXT: mov v19.b[1], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #744]
-; CHECK-GI-NEXT: sdot v21.4s, v1.16b, v4.16b
-; CHECK-GI-NEXT: mov v16.b[2], w11
-; CHECK-GI-NEXT: ldr w11, [sp, #872]
-; CHECK-GI-NEXT: addv s0, v20.4s
-; CHECK-GI-NEXT: mov v6.b[3], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #1000]
-; CHECK-GI-NEXT: mov v18.b[2], w10
-; CHECK-GI-NEXT: mov v17.b[2], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #1376]
-; CHECK-GI-NEXT: ldr w10, [sp, #1128]
-; CHECK-GI-NEXT: mov v7.b[3], w11
-; CHECK-GI-NEXT: ldr w11, [sp, #880]
-; CHECK-GI-NEXT: addv s1, v21.4s
-; CHECK-GI-NEXT: mov v19.b[2], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #752]
-; CHECK-GI-NEXT: mov v16.b[3], w8
+; CHECK-GI-NEXT: mov v16.b[1], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #1240]
+; CHECK-GI-NEXT: ldr w11, [sp, #1368]
+; CHECK-GI-NEXT: ldr w8, [sp, #736]
+; CHECK-GI-NEXT: ldr w9, [sp, #864]
+; CHECK-GI-NEXT: mov v17.b[1], w13
+; CHECK-GI-NEXT: mov v18.b[1], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #992]
+; CHECK-GI-NEXT: mov v19.b[1], w11
+; CHECK-GI-NEXT: mov v6.b[2], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #1120]
+; CHECK-GI-NEXT: mov v7.b[2], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #1248]
+; CHECK-GI-NEXT: ldr w11, [sp, #1376]
+; CHECK-GI-NEXT: mov v16.b[2], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #752]
+; CHECK-GI-NEXT: mov v20.s[2], wzr
+; CHECK-GI-NEXT: mov v17.b[2], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #872]
+; CHECK-GI-NEXT: mov v18.b[2], w9
+; CHECK-GI-NEXT: mov v19.b[2], w11
+; CHECK-GI-NEXT: ldr w9, [sp, #1000]
+; CHECK-GI-NEXT: mov v6.b[3], w12
+; CHECK-GI-NEXT: ldr w11, [sp, #1128]
+; CHECK-GI-NEXT: mov v7.b[3], w8
; CHECK-GI-NEXT: ldr w8, [sp, #1256]
-; CHECK-GI-NEXT: sdot v25.4s, v2.16b, v5.16b
-; CHECK-GI-NEXT: mov v17.b[3], w10
-; CHECK-GI-NEXT: ldr w10, [sp, #1384]
-; CHECK-GI-NEXT: mov v6.b[4], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #1008]
+; CHECK-GI-NEXT: ldr w12, [sp, #1384]
+; CHECK-GI-NEXT: mov v16.b[3], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #760]
+; CHECK-GI-NEXT: mov v17.b[3], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #880]
; CHECK-GI-NEXT: mov v18.b[3], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #1136]
-; CHECK-GI-NEXT: mov v19.b[3], w10
-; CHECK-GI-NEXT: ldr w10, [sp, #760]
+; CHECK-GI-NEXT: mov v19.b[3], w12
+; CHECK-GI-NEXT: ldr w8, [sp, #1008]
+; CHECK-GI-NEXT: mov v6.b[4], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #1136]
; CHECK-GI-NEXT: mov v7.b[4], w11
-; CHECK-GI-NEXT: mov v16.b[4], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #1264]
-; CHECK-GI-NEXT: ldr w11, [sp, #888]
-; CHECK-GI-NEXT: mov v17.b[4], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #1392]
-; CHECK-GI-NEXT: mov v6.b[5], w10
-; CHECK-GI-NEXT: ldr w10, [sp, #1016]
-; CHECK-GI-NEXT: mov v18.b[4], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #1144]
-; CHECK-GI-NEXT: mov v19.b[4], w8
+; CHECK-GI-NEXT: ldr w11, [sp, #1264]
+; CHECK-GI-NEXT: ldr w12, [sp, #1392]
+; CHECK-GI-NEXT: mov v16.b[4], w8
; CHECK-GI-NEXT: ldr w8, [sp, #768]
-; CHECK-GI-NEXT: mov v7.b[5], w11
-; CHECK-GI-NEXT: mov v16.b[5], w10
+; CHECK-GI-NEXT: mov v17.b[4], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #888]
+; CHECK-GI-NEXT: mov v18.b[4], w11
+; CHECK-GI-NEXT: mov v19.b[4], w12
+; CHECK-GI-NEXT: ldr w11, [sp, #1016]
+; CHECK-GI-NEXT: mov v6.b[5], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #1144]
+; CHECK-GI-NEXT: mov v7.b[5], w10
; CHECK-GI-NEXT: ldr w10, [sp, #1272]
-; CHECK-GI-NEXT: ldr w11, [sp, #896]
+; CHECK-GI-NEXT: ldr w12, [sp, #1400]
+; CHECK-GI-NEXT: mov v16.b[5], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #776]
; CHECK-GI-NEXT: mov v17.b[5], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #1400]
-; CHECK-GI-NEXT: mov v6.b[6], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #1024]
+; CHECK-GI-NEXT: ldr w9, [sp, #896]
; CHECK-GI-NEXT: mov v18.b[5], w10
-; CHECK-GI-NEXT: ldr w10, [sp, #1152]
-; CHECK-GI-NEXT: mov v19.b[5], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #776]
-; CHECK-GI-NEXT: mov v7.b[6], w11
-; CHECK-GI-NEXT: mov v16.b[6], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #1280]
-; CHECK-GI-NEXT: ldr w11, [sp, #904]
-; CHECK-GI-NEXT: mov v17.b[6], w10
-; CHECK-GI-NEXT: ldr w10, [sp, #1408]
-; CHECK-GI-NEXT: mov v6.b[7], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #1032]
-; CHECK-GI-NEXT: mov v18.b[6], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #1160]
-; CHECK-GI-NEXT: mov v19.b[6], w10
+; CHECK-GI-NEXT: mov v19.b[5], w12
+; CHECK-GI-NEXT: ldr w10, [sp, #1024]
+; CHECK-GI-NEXT: mov v6.b[6], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #1152]
+; CHECK-GI-NEXT: mov v7.b[6], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #1280]
+; CHECK-GI-NEXT: ldr w12, [sp, #1408]
+; CHECK-GI-NEXT: mov v16.b[6], w10
; CHECK-GI-NEXT: ldr w10, [sp, #784]
-; CHECK-GI-NEXT: mov v7.b[7], w11
+; CHECK-GI-NEXT: mov v17.b[6], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #904]
+; CHECK-GI-NEXT: mov v18.b[6], w9
+; CHECK-GI-NEXT: mov v19.b[6], w12
+; CHECK-GI-NEXT: ldr w9, [sp, #1032]
+; CHECK-GI-NEXT: mov v6.b[7], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #1160]
+; CHECK-GI-NEXT: mov v7.b[7], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #1288]
+; CHECK-GI-NEXT: ldr w12, [sp, #1416]
; CHECK-GI-NEXT: mov v16.b[7], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #1288]
+; CHECK-GI-NEXT: ldr w9, [sp, #792]
+; CHECK-GI-NEXT: mov v17.b[7], w11
; CHECK-GI-NEXT: ldr w11, [sp, #912]
-; CHECK-GI-NEXT: mov v17.b[7], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #1416]
+; CHECK-GI-NEXT: mov v18.b[7], w8
+; CHECK-GI-NEXT: mov v19.b[7], w12
+; CHECK-GI-NEXT: ldr w8, [sp, #1040]
; CHECK-GI-NEXT: mov v6.b[8], w10
-; CHECK-GI-NEXT: ldr w10, [sp, #1040]
-; CHECK-GI-NEXT: mov v18.b[7], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #1168]
-; CHECK-GI-NEXT: mov v19.b[7], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #792]
+; CHECK-GI-NEXT: ldr w10, [sp, #1168]
; CHECK-GI-NEXT: mov v7.b[8], w11
-; CHECK-GI-NEXT: mov v16.b[8], w10
-; CHECK-GI-NEXT: ldr w10, [sp, #1296]
-; CHECK-GI-NEXT: ldr w11, [sp, #920]
-; CHECK-GI-NEXT: mov v17.b[8], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #1424]
-; CHECK-GI-NEXT: mov v6.b[9], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #1048]
-; CHECK-GI-NEXT: mov v18.b[8], w10
-; CHECK-GI-NEXT: ldr w10, [sp, #1176]
-; CHECK-GI-NEXT: mov v19.b[8], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #800]
-; CHECK-GI-NEXT: mov v7.b[9], w11
-; CHECK-GI-NEXT: mov v16.b[9], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #1304]
-; CHECK-GI-NEXT: ldr w11, [sp, #928]
-; CHECK-GI-NEXT: mov v17.b[9], w10
-; CHECK-GI-NEXT: ldr w10, [sp, #1432]
-; CHECK-GI-NEXT: mov v6.b[10], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #1056]
-; CHECK-GI-NEXT: mov v18.b[9], w8
+; CHECK-GI-NEXT: ldr w11, [sp, #1296]
+; CHECK-GI-NEXT: ldr w12, [sp, #1424]
+; CHECK-GI-NEXT: mov v16.b[8], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #800]
+; CHECK-GI-NEXT: mov v17.b[8], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #920]
+; CHECK-GI-NEXT: mov v18.b[8], w11
+; CHECK-GI-NEXT: mov v19.b[8], w12
+; CHECK-GI-NEXT: ldr w11, [sp, #1048]
+; CHECK-GI-NEXT: mov v6.b[9], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #1176]
+; CHECK-GI-NEXT: mov v7.b[9], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #1304]
+; CHECK-GI-NEXT: ldr w12, [sp, #1432]
+; CHECK-GI-NEXT: mov v16.b[9], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #808]
+; CHECK-GI-NEXT: mov v17.b[9], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #928]
+; CHECK-GI-NEXT: mov v18.b[9], w10
+; CHECK-GI-NEXT: mov v19.b[9], w12
+; CHECK-GI-NEXT: ldr w10, [sp, #1056]
+; CHECK-GI-NEXT: mov v6.b[10], w8
; CHECK-GI-NEXT: ldr w8, [sp, #1184]
-; CHECK-GI-NEXT: mov v19.b[9], w10
-; CHECK-GI-NEXT: ldr w10, [sp, #808]
-; CHECK-GI-NEXT: mov v7.b[10], w11
-; CHECK-GI-NEXT: mov v16.b[10], w9
+; CHECK-GI-NEXT: mov v7.b[10], w9
; CHECK-GI-NEXT: ldr w9, [sp, #1312]
-; CHECK-GI-NEXT: ldr w11, [sp, #936]
+; CHECK-GI-NEXT: ldr w12, [sp, #1440]
+; CHECK-GI-NEXT: mov v16.b[10], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #816]
; CHECK-GI-NEXT: mov v17.b[10], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #1440]
-; CHECK-GI-NEXT: mov v6.b[11], w10
-; CHECK-GI-NEXT: ldr w10, [sp, #1064]
+; CHECK-GI-NEXT: ldr w8, [sp, #936]
; CHECK-GI-NEXT: mov v18.b[10], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #1192]
-; CHECK-GI-NEXT: mov v19.b[10], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #816]
-; CHECK-GI-NEXT: mov v7.b[11], w11
-; CHECK-GI-NEXT: mov v16.b[11], w10
-; CHECK-GI-NEXT: ldr w10, [sp, #1320]
+; CHECK-GI-NEXT: mov v19.b[10], w12
+; CHECK-GI-NEXT: ldr w9, [sp, #1064]
+; CHECK-GI-NEXT: mov v6.b[11], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #1192]
+; CHECK-GI-NEXT: mov v7.b[11], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #1320]
+; CHECK-GI-NEXT: ldr w12, [sp, #1448]
+; CHECK-GI-NEXT: mov v16.b[11], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #824]
+; CHECK-GI-NEXT: mov v17.b[11], w11
; CHECK-GI-NEXT: ldr w11, [sp, #944]
-; CHECK-GI-NEXT: mov v17.b[11], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #1448]
-; CHECK-GI-NEXT: mov v6.b[12], w8
+; CHECK-GI-NEXT: mov v18.b[11], w8
+; CHECK-GI-NEXT: mov v19.b[11], w12
; CHECK-GI-NEXT: ldr w8, [sp, #1072]
-; CHECK-GI-NEXT: mov v18.b[11], w10
+; CHECK-GI-NEXT: mov v6.b[12], w10
; CHECK-GI-NEXT: ldr w10, [sp, #1200]
-; CHECK-GI-NEXT: mov v19.b[11], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #824]
; CHECK-GI-NEXT: mov v7.b[12], w11
+; CHECK-GI-NEXT: ldr w11, [sp, #1328]
+; CHECK-GI-NEXT: ldr w12, [sp, #1456]
; CHECK-GI-NEXT: mov v16.b[12], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #1328]
-; CHECK-GI-NEXT: ldr w11, [sp, #952]
+; CHECK-GI-NEXT: ldr w8, [sp, #832]
; CHECK-GI-NEXT: mov v17.b[12], w10
-; CHECK-GI-NEXT: ldr w10, [sp, #1456]
+; CHECK-GI-NEXT: ldr w10, [sp, #952]
+; CHECK-GI-NEXT: mov v18.b[12], w11
+; CHECK-GI-NEXT: mov v19.b[12], w12
+; CHECK-GI-NEXT: ldr w11, [sp, #1080]
; CHECK-GI-NEXT: mov v6.b[13], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #1080]
-; CHECK-GI-NEXT: mov v18.b[12], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #1208]
-; CHECK-GI-NEXT: mov v19.b[12], w10
-; CHECK-GI-NEXT: ldr w10, [sp, #832]
-; CHECK-GI-NEXT: mov v7.b[13], w11
-; CHECK-GI-NEXT: mov v16.b[13], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #1336]
+; CHECK-GI-NEXT: ldr w9, [sp, #1208]
+; CHECK-GI-NEXT: mov v7.b[13], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #1336]
+; CHECK-GI-NEXT: ldr w12, [sp, #1464]
+; CHECK-GI-NEXT: mov v16.b[13], w11
; CHECK-GI-NEXT: ldr w11, [sp, #960]
-; CHECK-GI-NEXT: mov v17.b[13], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #1464]
-; CHECK-GI-NEXT: mov v6.b[14], w10
-; CHECK-GI-NEXT: ldr w10, [sp, #1088]
-; CHECK-GI-NEXT: mov v18.b[13], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #1216]
-; CHECK-GI-NEXT: mov v19.b[13], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #840]
+; CHECK-GI-NEXT: mov v17.b[13], w9
+; CHECK-GI-NEXT: mov v18.b[13], w10
+; CHECK-GI-NEXT: ldr w9, [sp, #1088]
+; CHECK-GI-NEXT: mov v19.b[13], w12
+; CHECK-GI-NEXT: mov v6.b[14], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #1216]
; CHECK-GI-NEXT: mov v7.b[14], w11
-; CHECK-GI-NEXT: mov v16.b[14], w10
; CHECK-GI-NEXT: ldr w10, [sp, #1344]
-; CHECK-GI-NEXT: ldr w11, [sp, #968]
-; CHECK-GI-NEXT: mov v17.b[14], w9
-; CHECK-GI-NEXT: mov v6.b[15], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #1096]
+; CHECK-GI-NEXT: ldr w11, [sp, #1472]
+; CHECK-GI-NEXT: ldr w12, [sp, #840]
+; CHECK-GI-NEXT: mov v16.b[14], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #1096]
+; CHECK-GI-NEXT: mov v17.b[14], w8
; CHECK-GI-NEXT: mov v18.b[14], w10
-; CHECK-GI-NEXT: ldr w9, [sp, #1472]
+; CHECK-GI-NEXT: ldr w8, [sp, #968]
+; CHECK-GI-NEXT: mov v19.b[14], w11
; CHECK-GI-NEXT: ldr w10, [sp, #1224]
-; CHECK-GI-NEXT: mov v7.b[15], w11
-; CHECK-GI-NEXT: addv s4, v25.4s
-; CHECK-GI-NEXT: mov v16.b[15], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #1352]
-; CHECK-GI-NEXT: mov v19.b[14], w9
+; CHECK-GI-NEXT: mov v6.b[15], w12
+; CHECK-GI-NEXT: ldr w11, [sp, #1352]
+; CHECK-GI-NEXT: ldr w12, [sp, #1480]
+; CHECK-GI-NEXT: mov v7.b[15], w8
+; CHECK-GI-NEXT: mov v16.b[15], w9
+; CHECK-GI-NEXT: movi v24.2d, #0000000000000000
+; CHECK-GI-NEXT: movi v25.2d, #0000000000000000
; CHECK-GI-NEXT: mov v17.b[15], w10
-; CHECK-GI-NEXT: ldr w9, [sp, #1480]
-; CHECK-GI-NEXT: mov v18.b[15], w8
+; CHECK-GI-NEXT: mov v18.b[15], w11
+; CHECK-GI-NEXT: movi v26.2d, #0000000000000000
+; CHECK-GI-NEXT: mov v19.b[15], w12
+; CHECK-GI-NEXT: sdot v21.4s, v0.16b, v3.16b
+; CHECK-GI-NEXT: sdot v22.4s, v1.16b, v4.16b
+; CHECK-GI-NEXT: sdot v23.4s, v2.16b, v5.16b
+; CHECK-GI-NEXT: mov v20.s[3], wzr
+; CHECK-GI-NEXT: sdot v25.4s, v6.16b, v17.16b
+; CHECK-GI-NEXT: sdot v26.4s, v7.16b, v18.16b
+; CHECK-GI-NEXT: sdot v24.4s, v16.16b, v19.16b
+; CHECK-GI-NEXT: add v0.4s, v21.4s, v22.4s
+; CHECK-GI-NEXT: add v1.4s, v23.4s, v20.4s
+; CHECK-GI-NEXT: add v2.4s, v25.4s, v26.4s
+; CHECK-GI-NEXT: add v3.4s, v24.4s, v20.4s
+; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: addv s0, v0.4s
+; CHECK-GI-NEXT: addv s1, v1.4s
; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: fmov w11, s4
-; CHECK-GI-NEXT: mov v19.b[15], w9
; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: sdot v22.4s, v6.16b, v17.16b
-; CHECK-GI-NEXT: sdot v23.4s, v7.16b, v18.16b
-; CHECK-GI-NEXT: add w8, w8, w9
-; CHECK-GI-NEXT: sdot v24.4s, v16.16b, v19.16b
-; CHECK-GI-NEXT: add w8, w8, w11
-; CHECK-GI-NEXT: addv s2, v22.4s
-; CHECK-GI-NEXT: addv s3, v23.4s
-; CHECK-GI-NEXT: addv s5, v24.4s
-; CHECK-GI-NEXT: fmov w9, s2
-; CHECK-GI-NEXT: fmov w10, s3
-; CHECK-GI-NEXT: add w9, w9, w10
-; CHECK-GI-NEXT: fmov w10, s5
-; CHECK-GI-NEXT: add w9, w9, w10
; CHECK-GI-NEXT: add w0, w8, w9
; CHECK-GI-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-GI-NEXT: ret
@@ -4738,218 +6885,216 @@ define i32 @test_sdot_v48i8_double_nomla(<48 x i8> %a, <48 x i8> %b, <48 x i8> %
; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
; CHECK-GI-NEXT: .cfi_offset w29, -16
; CHECK-GI-NEXT: ldr w10, [sp, #80]
-; CHECK-GI-NEXT: ldr w11, [sp, #208]
-; CHECK-GI-NEXT: fmov s0, w0
; CHECK-GI-NEXT: ldr w9, [sp, #88]
-; CHECK-GI-NEXT: ldr w12, [sp, #728]
-; CHECK-GI-NEXT: movi v6.16b, #1
+; CHECK-GI-NEXT: fmov s0, w0
+; CHECK-GI-NEXT: ldr w11, [sp, #208]
+; CHECK-GI-NEXT: ldr w8, [sp, #216]
+; CHECK-GI-NEXT: mov v6.s[0], wzr
; CHECK-GI-NEXT: fmov s1, w10
+; CHECK-GI-NEXT: ldr w10, [sp, #720]
+; CHECK-GI-NEXT: movi v7.16b, #1
; CHECK-GI-NEXT: fmov s2, w11
-; CHECK-GI-NEXT: ldr w11, [sp, #720]
-; CHECK-GI-NEXT: ldr w10, [sp, #216]
; CHECK-GI-NEXT: mov v0.b[1], w1
-; CHECK-GI-NEXT: ldr w13, [sp, #856]
-; CHECK-GI-NEXT: fmov s3, w11
-; CHECK-GI-NEXT: ldr w8, [sp, #96]
-; CHECK-GI-NEXT: ldr w11, [sp, #224]
-; CHECK-GI-NEXT: mov v1.b[1], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #848]
-; CHECK-GI-NEXT: mov v2.b[1], w10
+; CHECK-GI-NEXT: ldr w11, [sp, #728]
+; CHECK-GI-NEXT: fmov s3, w10
; CHECK-GI-NEXT: ldr w10, [sp, #976]
-; CHECK-GI-NEXT: movi v7.2d, #0000000000000000
; CHECK-GI-NEXT: movi v16.2d, #0000000000000000
+; CHECK-GI-NEXT: mov v1.b[1], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #848]
+; CHECK-GI-NEXT: mov v6.s[1], wzr
+; CHECK-GI-NEXT: fmov s5, w10
+; CHECK-GI-NEXT: mov v2.b[1], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #856]
; CHECK-GI-NEXT: fmov s4, w9
-; CHECK-GI-NEXT: mov v3.b[1], w12
; CHECK-GI-NEXT: ldr w9, [sp, #984]
-; CHECK-GI-NEXT: fmov s5, w10
+; CHECK-GI-NEXT: mov v3.b[1], w11
+; CHECK-GI-NEXT: ldr w10, [sp, #224]
; CHECK-GI-NEXT: mov v0.b[2], w2
-; CHECK-GI-NEXT: ldr w10, [sp, #736]
-; CHECK-GI-NEXT: mov v1.b[2], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #864]
-; CHECK-GI-NEXT: mov v2.b[2], w11
-; CHECK-GI-NEXT: mov v4.b[1], w13
-; CHECK-GI-NEXT: ldr w11, [sp, #992]
-; CHECK-GI-NEXT: ldr w12, [sp, #776]
+; CHECK-GI-NEXT: ldr w11, [sp, #16]
; CHECK-GI-NEXT: mov v5.b[1], w9
-; CHECK-GI-NEXT: mov v3.b[2], w10
-; CHECK-GI-NEXT: ldr w9, [sp, #104]
-; CHECK-GI-NEXT: ldr w10, [sp, #232]
-; CHECK-GI-NEXT: mov v0.b[3], w3
+; CHECK-GI-NEXT: ldr w9, [sp, #864]
; CHECK-GI-NEXT: movi v17.2d, #0000000000000000
-; CHECK-GI-NEXT: mov v1.b[3], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #872]
+; CHECK-GI-NEXT: mov v4.b[1], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #96]
+; CHECK-GI-NEXT: mov v2.b[2], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #992]
; CHECK-GI-NEXT: movi v18.2d, #0000000000000000
-; CHECK-GI-NEXT: mov v4.b[2], w8
+; CHECK-GI-NEXT: movi v19.2d, #0000000000000000
+; CHECK-GI-NEXT: mov v1.b[2], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #736]
+; CHECK-GI-NEXT: mov v0.b[3], w3
+; CHECK-GI-NEXT: mov v5.b[2], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #872]
+; CHECK-GI-NEXT: movi v20.2d, #0000000000000000
+; CHECK-GI-NEXT: mov v3.b[2], w8
+; CHECK-GI-NEXT: mov v4.b[2], w9
+; CHECK-GI-NEXT: ldr w8, [sp, #104]
+; CHECK-GI-NEXT: ldr w9, [sp, #232]
+; CHECK-GI-NEXT: mov v6.s[2], wzr
+; CHECK-GI-NEXT: movi v21.2d, #0000000000000000
+; CHECK-GI-NEXT: mov v1.b[3], w8
; CHECK-GI-NEXT: ldr w8, [sp, #744]
-; CHECK-GI-NEXT: mov v2.b[3], w10
-; CHECK-GI-NEXT: mov v5.b[2], w11
-; CHECK-GI-NEXT: ldr w11, [sp, #1000]
-; CHECK-GI-NEXT: ldr w10, [sp, #240]
+; CHECK-GI-NEXT: mov v0.b[4], w4
+; CHECK-GI-NEXT: mov v2.b[3], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #1000]
; CHECK-GI-NEXT: mov v3.b[3], w8
+; CHECK-GI-NEXT: mov v4.b[3], w10
; CHECK-GI-NEXT: ldr w8, [sp, #112]
-; CHECK-GI-NEXT: mov v0.b[4], w4
-; CHECK-GI-NEXT: movi v19.2d, #0000000000000000
-; CHECK-GI-NEXT: movi v20.2d, #0000000000000000
-; CHECK-GI-NEXT: mov v4.b[3], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #752]
+; CHECK-GI-NEXT: ldr w10, [sp, #240]
+; CHECK-GI-NEXT: mov v5.b[3], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #880]
; CHECK-GI-NEXT: mov v1.b[4], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #880]
-; CHECK-GI-NEXT: mov v5.b[3], w11
-; CHECK-GI-NEXT: mov v2.b[4], w10
-; CHECK-GI-NEXT: mov v3.b[4], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #120]
-; CHECK-GI-NEXT: ldr w11, [sp, #1008]
-; CHECK-GI-NEXT: ldr w10, [sp, #248]
+; CHECK-GI-NEXT: ldr w8, [sp, #752]
; CHECK-GI-NEXT: mov v0.b[5], w5
-; CHECK-GI-NEXT: mov v4.b[4], w8
+; CHECK-GI-NEXT: mov v2.b[4], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #1008]
+; CHECK-GI-NEXT: mov v6.s[3], wzr
+; CHECK-GI-NEXT: mov v3.b[4], w8
+; CHECK-GI-NEXT: mov v4.b[4], w9
+; CHECK-GI-NEXT: ldr w8, [sp, #120]
+; CHECK-GI-NEXT: ldr w9, [sp, #248]
+; CHECK-GI-NEXT: mov v5.b[4], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #888]
+; CHECK-GI-NEXT: mov v1.b[5], w8
; CHECK-GI-NEXT: ldr w8, [sp, #760]
-; CHECK-GI-NEXT: mov v1.b[5], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #888]
-; CHECK-GI-NEXT: mov v5.b[4], w11
-; CHECK-GI-NEXT: mov v2.b[5], w10
+; CHECK-GI-NEXT: mov v0.b[6], w6
+; CHECK-GI-NEXT: mov v2.b[5], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #1016]
; CHECK-GI-NEXT: mov v3.b[5], w8
+; CHECK-GI-NEXT: mov v4.b[5], w10
; CHECK-GI-NEXT: ldr w8, [sp, #128]
-; CHECK-GI-NEXT: ldr w11, [sp, #1016]
; CHECK-GI-NEXT: ldr w10, [sp, #256]
-; CHECK-GI-NEXT: mov v0.b[6], w6
-; CHECK-GI-NEXT: mov v4.b[5], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #768]
+; CHECK-GI-NEXT: mov v5.b[5], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #896]
; CHECK-GI-NEXT: mov v1.b[6], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #896]
-; CHECK-GI-NEXT: mov v5.b[5], w11
-; CHECK-GI-NEXT: mov v2.b[6], w10
-; CHECK-GI-NEXT: mov v3.b[6], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #136]
-; CHECK-GI-NEXT: ldr w11, [sp, #1024]
-; CHECK-GI-NEXT: ldr w10, [sp, #264]
+; CHECK-GI-NEXT: ldr w8, [sp, #768]
; CHECK-GI-NEXT: mov v0.b[7], w7
-; CHECK-GI-NEXT: mov v4.b[6], w8
-; CHECK-GI-NEXT: mov v1.b[7], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #904]
-; CHECK-GI-NEXT: mov v5.b[6], w11
-; CHECK-GI-NEXT: mov v2.b[7], w10
-; CHECK-GI-NEXT: ldr w8, [sp, #16]
-; CHECK-GI-NEXT: mov v3.b[7], w12
-; CHECK-GI-NEXT: ldr w10, [sp, #144]
-; CHECK-GI-NEXT: ldr w12, [sp, #1032]
-; CHECK-GI-NEXT: mov v0.b[8], w8
+; CHECK-GI-NEXT: mov v2.b[6], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #1024]
+; CHECK-GI-NEXT: mov v3.b[6], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #136]
+; CHECK-GI-NEXT: mov v4.b[6], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #264]
+; CHECK-GI-NEXT: mov v5.b[6], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #904]
+; CHECK-GI-NEXT: mov v1.b[7], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #776]
+; CHECK-GI-NEXT: mov v0.b[8], w11
+; CHECK-GI-NEXT: mov v2.b[7], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #1032]
+; CHECK-GI-NEXT: ldr w11, [sp, #24]
+; CHECK-GI-NEXT: mov v3.b[7], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #144]
+; CHECK-GI-NEXT: mov v4.b[7], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #272]
+; CHECK-GI-NEXT: mov v5.b[7], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #912]
+; CHECK-GI-NEXT: mov v1.b[8], w8
; CHECK-GI-NEXT: ldr w8, [sp, #784]
-; CHECK-GI-NEXT: ldr w11, [sp, #272]
-; CHECK-GI-NEXT: mov v4.b[7], w9
-; CHECK-GI-NEXT: mov v1.b[8], w10
-; CHECK-GI-NEXT: ldr w10, [sp, #912]
-; CHECK-GI-NEXT: mov v5.b[7], w12
-; CHECK-GI-NEXT: ldr w9, [sp, #24]
-; CHECK-GI-NEXT: ldr w12, [sp, #1040]
+; CHECK-GI-NEXT: mov v0.b[9], w11
+; CHECK-GI-NEXT: mov v2.b[8], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #1040]
+; CHECK-GI-NEXT: ldr w11, [sp, #32]
; CHECK-GI-NEXT: mov v3.b[8], w8
; CHECK-GI-NEXT: ldr w8, [sp, #152]
-; CHECK-GI-NEXT: mov v2.b[8], w11
-; CHECK-GI-NEXT: mov v0.b[9], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #792]
-; CHECK-GI-NEXT: ldr w11, [sp, #280]
-; CHECK-GI-NEXT: mov v4.b[8], w10
-; CHECK-GI-NEXT: mov v1.b[9], w8
+; CHECK-GI-NEXT: mov v4.b[8], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #280]
+; CHECK-GI-NEXT: mov v5.b[8], w10
; CHECK-GI-NEXT: ldr w10, [sp, #920]
-; CHECK-GI-NEXT: mov v5.b[8], w12
-; CHECK-GI-NEXT: ldr w8, [sp, #32]
-; CHECK-GI-NEXT: ldr w12, [sp, #1048]
-; CHECK-GI-NEXT: mov v3.b[9], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #160]
-; CHECK-GI-NEXT: mov v2.b[9], w11
-; CHECK-GI-NEXT: mov v0.b[10], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #800]
-; CHECK-GI-NEXT: ldr w11, [sp, #288]
+; CHECK-GI-NEXT: mov v1.b[9], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #792]
+; CHECK-GI-NEXT: mov v0.b[10], w11
+; CHECK-GI-NEXT: mov v2.b[9], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #1048]
+; CHECK-GI-NEXT: ldr w11, [sp, #40]
+; CHECK-GI-NEXT: mov v3.b[9], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #160]
; CHECK-GI-NEXT: mov v4.b[9], w10
-; CHECK-GI-NEXT: mov v1.b[10], w9
-; CHECK-GI-NEXT: ldr w10, [sp, #928]
-; CHECK-GI-NEXT: mov v5.b[9], w12
-; CHECK-GI-NEXT: ldr w9, [sp, #40]
-; CHECK-GI-NEXT: ldr w12, [sp, #1056]
+; CHECK-GI-NEXT: ldr w10, [sp, #288]
+; CHECK-GI-NEXT: mov v5.b[9], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #928]
+; CHECK-GI-NEXT: mov v1.b[10], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #800]
+; CHECK-GI-NEXT: mov v0.b[11], w11
+; CHECK-GI-NEXT: mov v2.b[10], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #1056]
+; CHECK-GI-NEXT: ldr w11, [sp, #48]
; CHECK-GI-NEXT: mov v3.b[10], w8
; CHECK-GI-NEXT: ldr w8, [sp, #168]
-; CHECK-GI-NEXT: mov v2.b[10], w11
-; CHECK-GI-NEXT: mov v0.b[11], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #808]
-; CHECK-GI-NEXT: ldr w11, [sp, #296]
-; CHECK-GI-NEXT: mov v4.b[10], w10
-; CHECK-GI-NEXT: mov v1.b[11], w8
+; CHECK-GI-NEXT: mov v4.b[10], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #296]
+; CHECK-GI-NEXT: mov v5.b[10], w10
; CHECK-GI-NEXT: ldr w10, [sp, #936]
-; CHECK-GI-NEXT: mov v5.b[10], w12
-; CHECK-GI-NEXT: ldr w8, [sp, #48]
-; CHECK-GI-NEXT: ldr w12, [sp, #1064]
-; CHECK-GI-NEXT: mov v3.b[11], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #176]
-; CHECK-GI-NEXT: mov v2.b[11], w11
-; CHECK-GI-NEXT: mov v0.b[12], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #816]
-; CHECK-GI-NEXT: ldr w11, [sp, #304]
+; CHECK-GI-NEXT: mov v1.b[11], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #808]
+; CHECK-GI-NEXT: mov v0.b[12], w11
+; CHECK-GI-NEXT: mov v2.b[11], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #1064]
+; CHECK-GI-NEXT: ldr w11, [sp, #56]
+; CHECK-GI-NEXT: mov v3.b[11], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #176]
; CHECK-GI-NEXT: mov v4.b[11], w10
-; CHECK-GI-NEXT: mov v1.b[12], w9
-; CHECK-GI-NEXT: ldr w10, [sp, #944]
-; CHECK-GI-NEXT: mov v5.b[11], w12
-; CHECK-GI-NEXT: ldr w9, [sp, #56]
-; CHECK-GI-NEXT: ldr w12, [sp, #1072]
+; CHECK-GI-NEXT: ldr w10, [sp, #304]
+; CHECK-GI-NEXT: mov v5.b[11], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #944]
+; CHECK-GI-NEXT: mov v1.b[12], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #816]
+; CHECK-GI-NEXT: mov v0.b[13], w11
+; CHECK-GI-NEXT: mov v2.b[12], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #1072]
+; CHECK-GI-NEXT: ldr w11, [sp, #64]
; CHECK-GI-NEXT: mov v3.b[12], w8
; CHECK-GI-NEXT: ldr w8, [sp, #184]
-; CHECK-GI-NEXT: mov v2.b[12], w11
-; CHECK-GI-NEXT: mov v0.b[13], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #824]
-; CHECK-GI-NEXT: ldr w11, [sp, #312]
-; CHECK-GI-NEXT: mov v4.b[12], w10
-; CHECK-GI-NEXT: mov v1.b[13], w8
+; CHECK-GI-NEXT: mov v4.b[12], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #312]
+; CHECK-GI-NEXT: mov v5.b[12], w10
; CHECK-GI-NEXT: ldr w10, [sp, #952]
-; CHECK-GI-NEXT: mov v5.b[12], w12
-; CHECK-GI-NEXT: ldr w8, [sp, #64]
-; CHECK-GI-NEXT: ldr w12, [sp, #1080]
-; CHECK-GI-NEXT: mov v3.b[13], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #192]
-; CHECK-GI-NEXT: mov v2.b[13], w11
-; CHECK-GI-NEXT: mov v0.b[14], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #832]
-; CHECK-GI-NEXT: ldr w11, [sp, #320]
+; CHECK-GI-NEXT: mov v1.b[13], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #824]
+; CHECK-GI-NEXT: mov v0.b[14], w11
+; CHECK-GI-NEXT: mov v2.b[13], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #1080]
+; CHECK-GI-NEXT: ldr w11, [sp, #72]
+; CHECK-GI-NEXT: mov v3.b[13], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #192]
; CHECK-GI-NEXT: mov v4.b[13], w10
-; CHECK-GI-NEXT: mov v1.b[14], w9
-; CHECK-GI-NEXT: ldr w10, [sp, #960]
-; CHECK-GI-NEXT: mov v5.b[13], w12
-; CHECK-GI-NEXT: ldr w9, [sp, #72]
-; CHECK-GI-NEXT: ldr w12, [sp, #1088]
+; CHECK-GI-NEXT: ldr w10, [sp, #320]
+; CHECK-GI-NEXT: mov v5.b[13], w9
+; CHECK-GI-NEXT: ldr w9, [sp, #960]
+; CHECK-GI-NEXT: mov v1.b[14], w8
+; CHECK-GI-NEXT: ldr w8, [sp, #832]
+; CHECK-GI-NEXT: mov v0.b[15], w11
+; CHECK-GI-NEXT: mov v2.b[14], w10
+; CHECK-GI-NEXT: ldr w10, [sp, #1088]
+; CHECK-GI-NEXT: ldr w11, [sp, #968]
; CHECK-GI-NEXT: mov v3.b[14], w8
+; CHECK-GI-NEXT: mov v4.b[14], w9
; CHECK-GI-NEXT: ldr w8, [sp, #200]
-; CHECK-GI-NEXT: mov v2.b[14], w11
-; CHECK-GI-NEXT: mov v0.b[15], w9
-; CHECK-GI-NEXT: ldr w9, [sp, #840]
-; CHECK-GI-NEXT: ldr w11, [sp, #328]
-; CHECK-GI-NEXT: mov v4.b[14], w10
+; CHECK-GI-NEXT: mov v5.b[14], w10
+; CHECK-GI-NEXT: ldr w9, [sp, #328]
+; CHECK-GI-NEXT: ldr w10, [sp, #840]
; CHECK-GI-NEXT: mov v1.b[15], w8
-; CHECK-GI-NEXT: ldr w8, [sp, #968]
-; CHECK-GI-NEXT: mov v5.b[14], w12
-; CHECK-GI-NEXT: ldr w10, [sp, #1096]
-; CHECK-GI-NEXT: mov v3.b[15], w9
-; CHECK-GI-NEXT: mov v2.b[15], w11
-; CHECK-GI-NEXT: sdot v7.4s, v0.16b, v6.16b
-; CHECK-GI-NEXT: mov v4.b[15], w8
-; CHECK-GI-NEXT: sdot v16.4s, v1.16b, v6.16b
-; CHECK-GI-NEXT: mov v5.b[15], w10
-; CHECK-GI-NEXT: sdot v17.4s, v3.16b, v6.16b
-; CHECK-GI-NEXT: sdot v20.4s, v2.16b, v6.16b
-; CHECK-GI-NEXT: addv s0, v7.4s
-; CHECK-GI-NEXT: sdot v18.4s, v4.16b, v6.16b
-; CHECK-GI-NEXT: addv s1, v16.4s
-; CHECK-GI-NEXT: sdot v19.4s, v5.16b, v6.16b
-; CHECK-GI-NEXT: addv s2, v17.4s
-; CHECK-GI-NEXT: addv s4, v20.4s
+; CHECK-GI-NEXT: ldr w8, [sp, #1096]
+; CHECK-GI-NEXT: sdot v16.4s, v0.16b, v7.16b
+; CHECK-GI-NEXT: mov v2.b[15], w9
+; CHECK-GI-NEXT: mov v3.b[15], w10
+; CHECK-GI-NEXT: mov v4.b[15], w11
+; CHECK-GI-NEXT: mov v5.b[15], w8
+; CHECK-GI-NEXT: sdot v17.4s, v1.16b, v7.16b
+; CHECK-GI-NEXT: sdot v18.4s, v2.16b, v7.16b
+; CHECK-GI-NEXT: sdot v19.4s, v3.16b, v7.16b
+; CHECK-GI-NEXT: sdot v21.4s, v4.16b, v7.16b
+; CHECK-GI-NEXT: sdot v20.4s, v5.16b, v7.16b
+; CHECK-GI-NEXT: add v0.4s, v16.4s, v17.4s
+; CHECK-GI-NEXT: add v1.4s, v18.4s, v6.4s
+; CHECK-GI-NEXT: add v2.4s, v19.4s, v21.4s
+; CHECK-GI-NEXT: add v3.4s, v20.4s, v6.4s
+; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: addv s0, v0.4s
+; CHECK-GI-NEXT: addv s1, v1.4s
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: fmov w9, s1
-; CHECK-GI-NEXT: addv s3, v18.4s
-; CHECK-GI-NEXT: addv s5, v19.4s
-; CHECK-GI-NEXT: fmov w10, s2
-; CHECK-GI-NEXT: add w8, w8, w9
-; CHECK-GI-NEXT: fmov w9, s4
-; CHECK-GI-NEXT: fmov w11, s3
-; CHECK-GI-NEXT: add w8, w8, w9
-; CHECK-GI-NEXT: add w10, w10, w11
-; CHECK-GI-NEXT: fmov w11, s5
-; CHECK-GI-NEXT: add w9, w10, w11
; CHECK-GI-NEXT: add w0, w8, w9
; CHECK-GI-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-GI-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index 5d6b523f1549ac..c72d00e65fcab1 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -2069,39 +2069,32 @@ define i32 @test_udot_v24i8(ptr %p1, ptr %p2) {
;
; CHECK-GI-BASE-LABEL: test_udot_v24i8:
; CHECK-GI-BASE: // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT: ldr q0, [x0]
-; CHECK-GI-BASE-NEXT: ldr q1, [x1]
+; CHECK-GI-BASE-NEXT: mov v0.s[0], wzr
+; CHECK-GI-BASE-NEXT: ldr q1, [x0]
; CHECK-GI-BASE-NEXT: ldr d2, [x0, #16]
-; CHECK-GI-BASE-NEXT: ldr d3, [x1, #16]
-; CHECK-GI-BASE-NEXT: ushll v4.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT: ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT: ldr q3, [x1]
+; CHECK-GI-BASE-NEXT: ldr d4, [x1, #16]
; CHECK-GI-BASE-NEXT: ushll v5.8h, v1.8b, #0
-; CHECK-GI-BASE-NEXT: ushll v2.8h, v2.8b, #0
; CHECK-GI-BASE-NEXT: ushll2 v1.8h, v1.16b, #0
-; CHECK-GI-BASE-NEXT: ushll v3.8h, v3.8b, #0
-; CHECK-GI-BASE-NEXT: umull v6.4s, v5.4h, v4.4h
-; CHECK-GI-BASE-NEXT: umull2 v4.4s, v5.8h, v4.8h
-; CHECK-GI-BASE-NEXT: umull2 v5.4s, v1.8h, v0.8h
-; CHECK-GI-BASE-NEXT: umull v7.4s, v3.4h, v2.4h
-; CHECK-GI-BASE-NEXT: umull v0.4s, v1.4h, v0.4h
-; CHECK-GI-BASE-NEXT: umull2 v1.4s, v3.8h, v2.8h
-; CHECK-GI-BASE-NEXT: addv s2, v6.4s
-; CHECK-GI-BASE-NEXT: addv s3, v4.4s
-; CHECK-GI-BASE-NEXT: addv s4, v5.4s
-; CHECK-GI-BASE-NEXT: addv s5, v7.4s
+; CHECK-GI-BASE-NEXT: ushll v2.8h, v2.8b, #0
+; CHECK-GI-BASE-NEXT: ushll v6.8h, v3.8b, #0
+; CHECK-GI-BASE-NEXT: ushll2 v3.8h, v3.16b, #0
+; CHECK-GI-BASE-NEXT: ushll v4.8h, v4.8b, #0
+; CHECK-GI-BASE-NEXT: mov v0.s[1], wzr
+; CHECK-GI-BASE-NEXT: umull v7.4s, v6.4h, v5.4h
+; CHECK-GI-BASE-NEXT: umull v16.4s, v3.4h, v1.4h
+; CHECK-GI-BASE-NEXT: umull v17.4s, v4.4h, v2.4h
+; CHECK-GI-BASE-NEXT: mov v0.s[2], wzr
+; CHECK-GI-BASE-NEXT: umlal2 v7.4s, v6.8h, v5.8h
+; CHECK-GI-BASE-NEXT: umlal2 v16.4s, v3.8h, v1.8h
+; CHECK-GI-BASE-NEXT: umlal2 v17.4s, v4.8h, v2.8h
+; CHECK-GI-BASE-NEXT: mov v0.s[3], wzr
+; CHECK-GI-BASE-NEXT: add v1.4s, v7.4s, v16.4s
+; CHECK-GI-BASE-NEXT: add v0.4s, v0.4s, v0.4s
+; CHECK-GI-BASE-NEXT: add v0.4s, v17.4s, v0.4s
+; CHECK-GI-BASE-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
-; CHECK-GI-BASE-NEXT: addv s1, v1.4s
-; CHECK-GI-BASE-NEXT: fmov w8, s2
-; CHECK-GI-BASE-NEXT: fmov w9, s3
-; CHECK-GI-BASE-NEXT: fmov w10, s4
-; CHECK-GI-BASE-NEXT: fmov w11, s5
-; CHECK-GI-BASE-NEXT: add w8, w8, w9
-; CHECK-GI-BASE-NEXT: fmov w9, s0
-; CHECK-GI-BASE-NEXT: add w10, w10, w11
-; CHECK-GI-BASE-NEXT: fmov w11, s1
-; CHECK-GI-BASE-NEXT: add w8, w8, w9
-; CHECK-GI-BASE-NEXT: add w9, w10, w11
-; CHECK-GI-BASE-NEXT: add w0, w8, w9
+; CHECK-GI-BASE-NEXT: fmov w0, s0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: test_udot_v24i8:
@@ -2172,91 +2165,71 @@ define i32 @test_udot_v48i8(ptr %p1, ptr %p2) {
;
; CHECK-GI-BASE-LABEL: test_udot_v48i8:
; CHECK-GI-BASE: // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT: ldp q0, q3, [x1]
-; CHECK-GI-BASE-NEXT: ldr q6, [x0, #32]
-; CHECK-GI-BASE-NEXT: ldp q1, q2, [x0]
-; CHECK-GI-BASE-NEXT: ldr q7, [x1, #32]
-; CHECK-GI-BASE-NEXT: ushll v20.8h, v6.8b, #0
-; CHECK-GI-BASE-NEXT: ushll2 v6.8h, v6.16b, #0
-; CHECK-GI-BASE-NEXT: ushll v4.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT: ushll2 v0.8h, v0.16b, #0
-; CHECK-GI-BASE-NEXT: ushll v16.8h, v3.8b, #0
-; CHECK-GI-BASE-NEXT: ushll v5.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT: mov v0.s[0], wzr
+; CHECK-GI-BASE-NEXT: ldp q1, q5, [x1]
+; CHECK-GI-BASE-NEXT: ldp q2, q3, [x0]
+; CHECK-GI-BASE-NEXT: ldr q4, [x0, #32]
+; CHECK-GI-BASE-NEXT: ldr q6, [x1, #32]
+; CHECK-GI-BASE-NEXT: ushll v7.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: ushll2 v1.8h, v1.16b, #0
-; CHECK-GI-BASE-NEXT: ushll v17.8h, v2.8b, #0
-; CHECK-GI-BASE-NEXT: ushll2 v3.8h, v3.16b, #0
+; CHECK-GI-BASE-NEXT: ushll v16.8h, v5.8b, #0
+; CHECK-GI-BASE-NEXT: mov v0.s[1], wzr
+; CHECK-GI-BASE-NEXT: ushll2 v5.8h, v5.16b, #0
+; CHECK-GI-BASE-NEXT: ushll v17.8h, v6.8b, #0
+; CHECK-GI-BASE-NEXT: ushll2 v6.8h, v6.16b, #0
+; CHECK-GI-BASE-NEXT: ushll v18.8h, v2.8b, #0
; CHECK-GI-BASE-NEXT: ushll2 v2.8h, v2.16b, #0
-; CHECK-GI-BASE-NEXT: umull v18.4s, v4.4h, v5.4h
-; CHECK-GI-BASE-NEXT: umull2 v4.4s, v4.8h, v5.8h
-; CHECK-GI-BASE-NEXT: umull v5.4s, v0.4h, v1.4h
-; CHECK-GI-BASE-NEXT: umull2 v0.4s, v0.8h, v1.8h
-; CHECK-GI-BASE-NEXT: umull v19.4s, v16.4h, v17.4h
-; CHECK-GI-BASE-NEXT: ushll v1.8h, v7.8b, #0
-; CHECK-GI-BASE-NEXT: umull2 v16.4s, v16.8h, v17.8h
-; CHECK-GI-BASE-NEXT: umull v17.4s, v3.4h, v2.4h
-; CHECK-GI-BASE-NEXT: umull2 v2.4s, v3.8h, v2.8h
-; CHECK-GI-BASE-NEXT: ushll2 v7.8h, v7.16b, #0
-; CHECK-GI-BASE-NEXT: addv s18, v18.4s
-; CHECK-GI-BASE-NEXT: addv s4, v4.4s
-; CHECK-GI-BASE-NEXT: addv s5, v5.4s
-; CHECK-GI-BASE-NEXT: addv s0, v0.4s
-; CHECK-GI-BASE-NEXT: addv s19, v19.4s
-; CHECK-GI-BASE-NEXT: umull v3.4s, v1.4h, v20.4h
-; CHECK-GI-BASE-NEXT: addv s2, v2.4s
-; CHECK-GI-BASE-NEXT: umull2 v1.4s, v1.8h, v20.8h
-; CHECK-GI-BASE-NEXT: umull v20.4s, v7.4h, v6.4h
-; CHECK-GI-BASE-NEXT: fmov w8, s18
-; CHECK-GI-BASE-NEXT: fmov w9, s4
-; CHECK-GI-BASE-NEXT: fmov w10, s5
-; CHECK-GI-BASE-NEXT: fmov w11, s0
-; CHECK-GI-BASE-NEXT: fmov w12, s19
-; CHECK-GI-BASE-NEXT: addv s4, v16.4s
-; CHECK-GI-BASE-NEXT: addv s5, v17.4s
-; CHECK-GI-BASE-NEXT: addv s3, v3.4s
-; CHECK-GI-BASE-NEXT: umull2 v0.4s, v7.8h, v6.8h
-; CHECK-GI-BASE-NEXT: add w8, w8, w9
-; CHECK-GI-BASE-NEXT: addv s1, v1.4s
-; CHECK-GI-BASE-NEXT: add w9, w11, w12
-; CHECK-GI-BASE-NEXT: add w8, w8, w10
-; CHECK-GI-BASE-NEXT: fmov w10, s4
-; CHECK-GI-BASE-NEXT: fmov w11, s5
-; CHECK-GI-BASE-NEXT: fmov w12, s2
-; CHECK-GI-BASE-NEXT: addv s4, v20.4s
+; CHECK-GI-BASE-NEXT: ushll v19.8h, v3.8b, #0
+; CHECK-GI-BASE-NEXT: ushll2 v3.8h, v3.16b, #0
+; CHECK-GI-BASE-NEXT: ushll v20.8h, v4.8b, #0
+; CHECK-GI-BASE-NEXT: ushll2 v4.8h, v4.16b, #0
+; CHECK-GI-BASE-NEXT: mov v0.s[2], wzr
+; CHECK-GI-BASE-NEXT: umull v21.4s, v7.4h, v18.4h
+; CHECK-GI-BASE-NEXT: umull v22.4s, v1.4h, v2.4h
+; CHECK-GI-BASE-NEXT: umull v23.4s, v16.4h, v19.4h
+; CHECK-GI-BASE-NEXT: umull v24.4s, v5.4h, v3.4h
+; CHECK-GI-BASE-NEXT: umull v25.4s, v17.4h, v20.4h
+; CHECK-GI-BASE-NEXT: umull v26.4s, v6.4h, v4.4h
+; CHECK-GI-BASE-NEXT: mov v0.s[3], wzr
+; CHECK-GI-BASE-NEXT: umlal2 v21.4s, v7.8h, v18.8h
+; CHECK-GI-BASE-NEXT: umlal2 v22.4s, v1.8h, v2.8h
+; CHECK-GI-BASE-NEXT: umlal2 v23.4s, v16.8h, v19.8h
+; CHECK-GI-BASE-NEXT: umlal2 v24.4s, v5.8h, v3.8h
+; CHECK-GI-BASE-NEXT: umlal2 v25.4s, v17.8h, v20.8h
+; CHECK-GI-BASE-NEXT: umlal2 v26.4s, v6.8h, v4.8h
+; CHECK-GI-BASE-NEXT: add v0.4s, v0.4s, v0.4s
+; CHECK-GI-BASE-NEXT: add v1.4s, v21.4s, v22.4s
+; CHECK-GI-BASE-NEXT: add v2.4s, v23.4s, v24.4s
+; CHECK-GI-BASE-NEXT: add v3.4s, v25.4s, v26.4s
+; CHECK-GI-BASE-NEXT: add v0.4s, v0.4s, v0.4s
+; CHECK-GI-BASE-NEXT: add v1.4s, v1.4s, v2.4s
+; CHECK-GI-BASE-NEXT: add v0.4s, v3.4s, v0.4s
+; CHECK-GI-BASE-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
-; CHECK-GI-BASE-NEXT: add w9, w9, w10
-; CHECK-GI-BASE-NEXT: add w10, w11, w12
-; CHECK-GI-BASE-NEXT: fmov w11, s3
-; CHECK-GI-BASE-NEXT: add w8, w8, w9
-; CHECK-GI-BASE-NEXT: add w9, w10, w11
-; CHECK-GI-BASE-NEXT: fmov w10, s1
-; CHECK-GI-BASE-NEXT: fmov w11, s0
-; CHECK-GI-BASE-NEXT: add w9, w9, w10
-; CHECK-GI-BASE-NEXT: fmov w10, s4
-; CHECK-GI-BASE-NEXT: add w8, w8, w9
-; CHECK-GI-BASE-NEXT: add w9, w10, w11
-; CHECK-GI-BASE-NEXT: add w0, w8, w9
+; CHECK-GI-BASE-NEXT: fmov w0, s0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: test_udot_v48i8:
; CHECK-GI-DOT: // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT: movi v0.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT: mov v0.s[0], wzr
; CHECK-GI-DOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: ldr q7, [x0, #32]
-; CHECK-GI-DOT-NEXT: ldp q3, q4, [x0]
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT: ldp q5, q6, [x1]
-; CHECK-GI-DOT-NEXT: ldr q16, [x1, #32]
-; CHECK-GI-DOT-NEXT: udot v0.4s, v5.16b, v3.16b
+; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT: ldr q17, [x1, #32]
+; CHECK-GI-DOT-NEXT: ldp q4, q5, [x0]
+; CHECK-GI-DOT-NEXT: ldp q6, q16, [x1]
+; CHECK-GI-DOT-NEXT: mov v0.s[1], wzr
+; CHECK-GI-DOT-NEXT: udot v2.4s, v17.16b, v7.16b
; CHECK-GI-DOT-NEXT: udot v1.4s, v6.16b, v4.16b
-; CHECK-GI-DOT-NEXT: udot v2.4s, v16.16b, v7.16b
+; CHECK-GI-DOT-NEXT: udot v3.4s, v16.16b, v5.16b
+; CHECK-GI-DOT-NEXT: mov v0.s[2], wzr
+; CHECK-GI-DOT-NEXT: add v1.4s, v1.4s, v3.4s
+; CHECK-GI-DOT-NEXT: mov v0.s[3], wzr
+; CHECK-GI-DOT-NEXT: add v0.4s, v2.4s, v0.4s
+; CHECK-GI-DOT-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
-; CHECK-GI-DOT-NEXT: addv s1, v1.4s
-; CHECK-GI-DOT-NEXT: addv s2, v2.4s
-; CHECK-GI-DOT-NEXT: fmov w8, s0
-; CHECK-GI-DOT-NEXT: fmov w9, s1
-; CHECK-GI-DOT-NEXT: add w8, w8, w9
-; CHECK-GI-DOT-NEXT: fmov w9, s2
-; CHECK-GI-DOT-NEXT: add w0, w8, w9
+; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
%a = load <48 x i8>, ptr %p1
@@ -2397,39 +2370,32 @@ define i32 @test_sdot_v24i8(ptr %p1, ptr %p2) {
;
; CHECK-GI-BASE-LABEL: test_sdot_v24i8:
; CHECK-GI-BASE: // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT: ldr q0, [x0]
-; CHECK-GI-BASE-NEXT: ldr q1, [x1]
+; CHECK-GI-BASE-NEXT: mov v0.s[0], wzr
+; CHECK-GI-BASE-NEXT: ldr q1, [x0]
; CHECK-GI-BASE-NEXT: ldr d2, [x0, #16]
-; CHECK-GI-BASE-NEXT: ldr d3, [x1, #16]
-; CHECK-GI-BASE-NEXT: sshll v4.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT: sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-BASE-NEXT: ldr q3, [x1]
+; CHECK-GI-BASE-NEXT: ldr d4, [x1, #16]
; CHECK-GI-BASE-NEXT: sshll v5.8h, v1.8b, #0
-; CHECK-GI-BASE-NEXT: sshll v2.8h, v2.8b, #0
; CHECK-GI-BASE-NEXT: sshll2 v1.8h, v1.16b, #0
-; CHECK-GI-BASE-NEXT: sshll v3.8h, v3.8b, #0
-; CHECK-GI-BASE-NEXT: smull v6.4s, v5.4h, v4.4h
-; CHECK-GI-BASE-NEXT: smull2 v4.4s, v5.8h, v4.8h
-; CHECK-GI-BASE-NEXT: smull2 v5.4s, v1.8h, v0.8h
-; CHECK-GI-BASE-NEXT: smull v7.4s, v3.4h, v2.4h
-; CHECK-GI-BASE-NEXT: smull v0.4s, v1.4h, v0.4h
-; CHECK-GI-BASE-NEXT: smull2 v1.4s, v3.8h, v2.8h
-; CHECK-GI-BASE-NEXT: addv s2, v6.4s
-; CHECK-GI-BASE-NEXT: addv s3, v4.4s
-; CHECK-GI-BASE-NEXT: addv s4, v5.4s
-; CHECK-GI-BASE-NEXT: addv s5, v7.4s
+; CHECK-GI-BASE-NEXT: sshll v2.8h, v2.8b, #0
+; CHECK-GI-BASE-NEXT: sshll v6.8h, v3.8b, #0
+; CHECK-GI-BASE-NEXT: sshll2 v3.8h, v3.16b, #0
+; CHECK-GI-BASE-NEXT: sshll v4.8h, v4.8b, #0
+; CHECK-GI-BASE-NEXT: mov v0.s[1], wzr
+; CHECK-GI-BASE-NEXT: smull v7.4s, v6.4h, v5.4h
+; CHECK-GI-BASE-NEXT: smull v16.4s, v3.4h, v1.4h
+; CHECK-GI-BASE-NEXT: smull v17.4s, v4.4h, v2.4h
+; CHECK-GI-BASE-NEXT: mov v0.s[2], wzr
+; CHECK-GI-BASE-NEXT: smlal2 v7.4s, v6.8h, v5.8h
+; CHECK-GI-BASE-NEXT: smlal2 v16.4s, v3.8h, v1.8h
+; CHECK-GI-BASE-NEXT: smlal2 v17.4s, v4.8h, v2.8h
+; CHECK-GI-BASE-NEXT: mov v0.s[3], wzr
+; CHECK-GI-BASE-NEXT: add v1.4s, v7.4s, v16.4s
+; CHECK-GI-BASE-NEXT: add v0.4s, v0.4s, v0.4s
+; CHECK-GI-BASE-NEXT: add v0.4s, v17.4s, v0.4s
+; CHECK-GI-BASE-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
-; CHECK-GI-BASE-NEXT: addv s1, v1.4s
-; CHECK-GI-BASE-NEXT: fmov w8, s2
-; CHECK-GI-BASE-NEXT: fmov w9, s3
-; CHECK-GI-BASE-NEXT: fmov w10, s4
-; CHECK-GI-BASE-NEXT: fmov w11, s5
-; CHECK-GI-BASE-NEXT: add w8, w8, w9
-; CHECK-GI-BASE-NEXT: fmov w9, s0
-; CHECK-GI-BASE-NEXT: add w10, w10, w11
-; CHECK-GI-BASE-NEXT: fmov w11, s1
-; CHECK-GI-BASE-NEXT: add w8, w8, w9
-; CHECK-GI-BASE-NEXT: add w9, w10, w11
-; CHECK-GI-BASE-NEXT: add w0, w8, w9
+; CHECK-GI-BASE-NEXT: fmov w0, s0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: test_sdot_v24i8:
@@ -2500,91 +2466,71 @@ define i32 @test_sdot_v48i8(ptr %p1, ptr %p2) {
;
; CHECK-GI-BASE-LABEL: test_sdot_v48i8:
; CHECK-GI-BASE: // %bb.0: // %entry
-; CHECK-GI-BASE-NEXT: ldp q0, q3, [x1]
-; CHECK-GI-BASE-NEXT: ldr q6, [x0, #32]
-; CHECK-GI-BASE-NEXT: ldp q1, q2, [x0]
-; CHECK-GI-BASE-NEXT: ldr q7, [x1, #32]
-; CHECK-GI-BASE-NEXT: sshll v20.8h, v6.8b, #0
-; CHECK-GI-BASE-NEXT: sshll2 v6.8h, v6.16b, #0
-; CHECK-GI-BASE-NEXT: sshll v4.8h, v0.8b, #0
-; CHECK-GI-BASE-NEXT: sshll2 v0.8h, v0.16b, #0
-; CHECK-GI-BASE-NEXT: sshll v16.8h, v3.8b, #0
-; CHECK-GI-BASE-NEXT: sshll v5.8h, v1.8b, #0
+; CHECK-GI-BASE-NEXT: mov v0.s[0], wzr
+; CHECK-GI-BASE-NEXT: ldp q1, q5, [x1]
+; CHECK-GI-BASE-NEXT: ldp q2, q3, [x0]
+; CHECK-GI-BASE-NEXT: ldr q4, [x0, #32]
+; CHECK-GI-BASE-NEXT: ldr q6, [x1, #32]
+; CHECK-GI-BASE-NEXT: sshll v7.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: sshll2 v1.8h, v1.16b, #0
-; CHECK-GI-BASE-NEXT: sshll v17.8h, v2.8b, #0
-; CHECK-GI-BASE-NEXT: sshll2 v3.8h, v3.16b, #0
+; CHECK-GI-BASE-NEXT: sshll v16.8h, v5.8b, #0
+; CHECK-GI-BASE-NEXT: mov v0.s[1], wzr
+; CHECK-GI-BASE-NEXT: sshll2 v5.8h, v5.16b, #0
+; CHECK-GI-BASE-NEXT: sshll v17.8h, v6.8b, #0
+; CHECK-GI-BASE-NEXT: sshll2 v6.8h, v6.16b, #0
+; CHECK-GI-BASE-NEXT: sshll v18.8h, v2.8b, #0
; CHECK-GI-BASE-NEXT: sshll2 v2.8h, v2.16b, #0
-; CHECK-GI-BASE-NEXT: smull v18.4s, v4.4h, v5.4h
-; CHECK-GI-BASE-NEXT: smull2 v4.4s, v4.8h, v5.8h
-; CHECK-GI-BASE-NEXT: smull v5.4s, v0.4h, v1.4h
-; CHECK-GI-BASE-NEXT: smull2 v0.4s, v0.8h, v1.8h
-; CHECK-GI-BASE-NEXT: smull v19.4s, v16.4h, v17.4h
-; CHECK-GI-BASE-NEXT: sshll v1.8h, v7.8b, #0
-; CHECK-GI-BASE-NEXT: smull2 v16.4s, v16.8h, v17.8h
-; CHECK-GI-BASE-NEXT: smull v17.4s, v3.4h, v2.4h
-; CHECK-GI-BASE-NEXT: smull2 v2.4s, v3.8h, v2.8h
-; CHECK-GI-BASE-NEXT: sshll2 v7.8h, v7.16b, #0
-; CHECK-GI-BASE-NEXT: addv s18, v18.4s
-; CHECK-GI-BASE-NEXT: addv s4, v4.4s
-; CHECK-GI-BASE-NEXT: addv s5, v5.4s
-; CHECK-GI-BASE-NEXT: addv s0, v0.4s
-; CHECK-GI-BASE-NEXT: addv s19, v19.4s
-; CHECK-GI-BASE-NEXT: smull v3.4s, v1.4h, v20.4h
-; CHECK-GI-BASE-NEXT: addv s2, v2.4s
-; CHECK-GI-BASE-NEXT: smull2 v1.4s, v1.8h, v20.8h
-; CHECK-GI-BASE-NEXT: smull v20.4s, v7.4h, v6.4h
-; CHECK-GI-BASE-NEXT: fmov w8, s18
-; CHECK-GI-BASE-NEXT: fmov w9, s4
-; CHECK-GI-BASE-NEXT: fmov w10, s5
-; CHECK-GI-BASE-NEXT: fmov w11, s0
-; CHECK-GI-BASE-NEXT: fmov w12, s19
-; CHECK-GI-BASE-NEXT: addv s4, v16.4s
-; CHECK-GI-BASE-NEXT: addv s5, v17.4s
-; CHECK-GI-BASE-NEXT: addv s3, v3.4s
-; CHECK-GI-BASE-NEXT: smull2 v0.4s, v7.8h, v6.8h
-; CHECK-GI-BASE-NEXT: add w8, w8, w9
-; CHECK-GI-BASE-NEXT: addv s1, v1.4s
-; CHECK-GI-BASE-NEXT: add w9, w11, w12
-; CHECK-GI-BASE-NEXT: add w8, w8, w10
-; CHECK-GI-BASE-NEXT: fmov w10, s4
-; CHECK-GI-BASE-NEXT: fmov w11, s5
-; CHECK-GI-BASE-NEXT: fmov w12, s2
-; CHECK-GI-BASE-NEXT: addv s4, v20.4s
+; CHECK-GI-BASE-NEXT: sshll v19.8h, v3.8b, #0
+; CHECK-GI-BASE-NEXT: sshll2 v3.8h, v3.16b, #0
+; CHECK-GI-BASE-NEXT: sshll v20.8h, v4.8b, #0
+; CHECK-GI-BASE-NEXT: sshll2 v4.8h, v4.16b, #0
+; CHECK-GI-BASE-NEXT: mov v0.s[2], wzr
+; CHECK-GI-BASE-NEXT: smull v21.4s, v7.4h, v18.4h
+; CHECK-GI-BASE-NEXT: smull v22.4s, v1.4h, v2.4h
+; CHECK-GI-BASE-NEXT: smull v23.4s, v16.4h, v19.4h
+; CHECK-GI-BASE-NEXT: smull v24.4s, v5.4h, v3.4h
+; CHECK-GI-BASE-NEXT: smull v25.4s, v17.4h, v20.4h
+; CHECK-GI-BASE-NEXT: smull v26.4s, v6.4h, v4.4h
+; CHECK-GI-BASE-NEXT: mov v0.s[3], wzr
+; CHECK-GI-BASE-NEXT: smlal2 v21.4s, v7.8h, v18.8h
+; CHECK-GI-BASE-NEXT: smlal2 v22.4s, v1.8h, v2.8h
+; CHECK-GI-BASE-NEXT: smlal2 v23.4s, v16.8h, v19.8h
+; CHECK-GI-BASE-NEXT: smlal2 v24.4s, v5.8h, v3.8h
+; CHECK-GI-BASE-NEXT: smlal2 v25.4s, v17.8h, v20.8h
+; CHECK-GI-BASE-NEXT: smlal2 v26.4s, v6.8h, v4.8h
+; CHECK-GI-BASE-NEXT: add v0.4s, v0.4s, v0.4s
+; CHECK-GI-BASE-NEXT: add v1.4s, v21.4s, v22.4s
+; CHECK-GI-BASE-NEXT: add v2.4s, v23.4s, v24.4s
+; CHECK-GI-BASE-NEXT: add v3.4s, v25.4s, v26.4s
+; CHECK-GI-BASE-NEXT: add v0.4s, v0.4s, v0.4s
+; CHECK-GI-BASE-NEXT: add v1.4s, v1.4s, v2.4s
+; CHECK-GI-BASE-NEXT: add v0.4s, v3.4s, v0.4s
+; CHECK-GI-BASE-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
-; CHECK-GI-BASE-NEXT: add w9, w9, w10
-; CHECK-GI-BASE-NEXT: add w10, w11, w12
-; CHECK-GI-BASE-NEXT: fmov w11, s3
-; CHECK-GI-BASE-NEXT: add w8, w8, w9
-; CHECK-GI-BASE-NEXT: add w9, w10, w11
-; CHECK-GI-BASE-NEXT: fmov w10, s1
-; CHECK-GI-BASE-NEXT: fmov w11, s0
-; CHECK-GI-BASE-NEXT: add w9, w9, w10
-; CHECK-GI-BASE-NEXT: fmov w10, s4
-; CHECK-GI-BASE-NEXT: add w8, w8, w9
-; CHECK-GI-BASE-NEXT: add w9, w10, w11
-; CHECK-GI-BASE-NEXT: add w0, w8, w9
+; CHECK-GI-BASE-NEXT: fmov w0, s0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: test_sdot_v48i8:
; CHECK-GI-DOT: // %bb.0: // %entry
-; CHECK-GI-DOT-NEXT: movi v0.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT: mov v0.s[0], wzr
; CHECK-GI-DOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: ldr q7, [x0, #32]
-; CHECK-GI-DOT-NEXT: ldp q3, q4, [x0]
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
-; CHECK-GI-DOT-NEXT: ldp q5, q6, [x1]
-; CHECK-GI-DOT-NEXT: ldr q16, [x1, #32]
-; CHECK-GI-DOT-NEXT: sdot v0.4s, v5.16b, v3.16b
+; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000
+; CHECK-GI-DOT-NEXT: ldr q17, [x1, #32]
+; CHECK-GI-DOT-NEXT: ldp q4, q5, [x0]
+; CHECK-GI-DOT-NEXT: ldp q6, q16, [x1]
+; CHECK-GI-DOT-NEXT: mov v0.s[1], wzr
+; CHECK-GI-DOT-NEXT: sdot v2.4s, v17.16b, v7.16b
; CHECK-GI-DOT-NEXT: sdot v1.4s, v6.16b, v4.16b
-; CHECK-GI-DOT-NEXT: sdot v2.4s, v16.16b, v7.16b
+; CHECK-GI-DOT-NEXT: sdot v3.4s, v16.16b, v5.16b
+; CHECK-GI-DOT-NEXT: mov v0.s[2], wzr
+; CHECK-GI-DOT-NEXT: add v1.4s, v1.4s, v3.4s
+; CHECK-GI-DOT-NEXT: mov v0.s[3], wzr
+; CHECK-GI-DOT-NEXT: add v0.4s, v2.4s, v0.4s
+; CHECK-GI-DOT-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
-; CHECK-GI-DOT-NEXT: addv s1, v1.4s
-; CHECK-GI-DOT-NEXT: addv s2, v2.4s
-; CHECK-GI-DOT-NEXT: fmov w8, s0
-; CHECK-GI-DOT-NEXT: fmov w9, s1
-; CHECK-GI-DOT-NEXT: add w8, w8, w9
-; CHECK-GI-DOT-NEXT: fmov w9, s2
-; CHECK-GI-DOT-NEXT: add w0, w8, w9
+; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
%a = load <48 x i8>, ptr %p1
More information about the llvm-commits
mailing list