[llvm] a35640f - [AArch64] Extend vecreduce to udot/sdot transformation to support usdot (#120094)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 23 04:34:50 PST 2024
Author: Igor Kirillov
Date: 2024-12-23T12:34:46Z
New Revision: a35640f29e82dffbe87fb75af9b50c6e1312b455
URL: https://github.com/llvm/llvm-project/commit/a35640f29e82dffbe87fb75af9b50c6e1312b455
DIFF: https://github.com/llvm/llvm-project/commit/a35640f29e82dffbe87fb75af9b50c6e1312b455.diff
LOG: [AArch64] Extend vecreduce to udot/sdot transformation to support usdot (#120094)
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/neon-dotreduce.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 505fae4e840f7e..e455fabfe2e8d9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18177,16 +18177,38 @@ static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
unsigned ExtOpcode = Op0.getOpcode();
SDValue A = Op0;
SDValue B;
+ unsigned DotOpcode;
if (ExtOpcode == ISD::MUL) {
A = Op0.getOperand(0);
B = Op0.getOperand(1);
- if (A.getOpcode() != B.getOpcode() ||
- A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
+ if (A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
return SDValue();
- ExtOpcode = A.getOpcode();
- }
- if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND)
+ auto OpCodeA = A.getOpcode();
+ if (OpCodeA != ISD::ZERO_EXTEND && OpCodeA != ISD::SIGN_EXTEND)
+ return SDValue();
+
+ auto OpCodeB = B.getOpcode();
+ if (OpCodeB != ISD::ZERO_EXTEND && OpCodeB != ISD::SIGN_EXTEND)
+ return SDValue();
+
+ if (OpCodeA == OpCodeB) {
+ DotOpcode =
+ OpCodeA == ISD::ZERO_EXTEND ? AArch64ISD::UDOT : AArch64ISD::SDOT;
+ } else {
+ // Check USDOT support
+ if (!ST->hasMatMulInt8())
+ return SDValue();
+ DotOpcode = AArch64ISD::USDOT;
+ if (OpCodeA == ISD::SIGN_EXTEND)
+ std::swap(A, B);
+ }
+ } else if (ExtOpcode == ISD::ZERO_EXTEND) {
+ DotOpcode = AArch64ISD::UDOT;
+ } else if (ExtOpcode == ISD::SIGN_EXTEND) {
+ DotOpcode = AArch64ISD::SDOT;
+ } else {
return SDValue();
+ }
EVT Op0VT = A.getOperand(0).getValueType();
bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
@@ -18212,8 +18234,6 @@ static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
TargetType = MVT::v2i32;
}
- auto DotOpcode =
- (ExtOpcode == ISD::ZERO_EXTEND) ? AArch64ISD::UDOT : AArch64ISD::SDOT;
// Handle the case where we need to generate only one Dot operation.
if (NumOfVecReduce == 1) {
SDValue Zeros = DAG.getConstant(0, DL, TargetType);
diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
index c345c1e50bbbb7..748555d7bdfa15 100644
--- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
+++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple aarch64-linux-gnu -mattr=+dotprod < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc -mtriple aarch64-linux-gnu -mattr=+dotprod -global-isel -global-isel-abort=2 < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; RUN: llc -mtriple aarch64-linux-gnu -mattr=+dotprod,+i8mm < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple aarch64-linux-gnu -mattr=+dotprod,+i8mm -global-isel -global-isel-abort=2 < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
; CHECK-GI: warning: Instruction selection used fallback path for test_udot_v5i8
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_udot_v5i8_nomla
@@ -290,6 +290,128 @@ entry:
ret i32 %x
}
+define i32 @test_usdot_v4i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
+; CHECK-SD-LABEL: test_usdot_v4i8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ldr s0, [x0]
+; CHECK-SD-NEXT: ldr s1, [x1]
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-SD-NEXT: smull v0.4s, v1.4h, v0.4h
+; CHECK-SD-NEXT: addv s0, v0.4s
+; CHECK-SD-NEXT: fmov w8, s0
+; CHECK-SD-NEXT: add w0, w8, w2
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_usdot_v4i8:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ldr w8, [x0]
+; CHECK-GI-NEXT: ldr w9, [x1]
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: fmov s2, w9
+; CHECK-GI-NEXT: uxtb w8, w8
+; CHECK-GI-NEXT: sxtb w9, w9
+; CHECK-GI-NEXT: mov b1, v0.b[1]
+; CHECK-GI-NEXT: mov b3, v0.b[2]
+; CHECK-GI-NEXT: mov b5, v2.b[2]
+; CHECK-GI-NEXT: mov b4, v0.b[3]
+; CHECK-GI-NEXT: mov b0, v2.b[1]
+; CHECK-GI-NEXT: mov b6, v2.b[3]
+; CHECK-GI-NEXT: fmov s2, w9
+; CHECK-GI-NEXT: fmov w10, s1
+; CHECK-GI-NEXT: fmov w11, s3
+; CHECK-GI-NEXT: fmov s1, w8
+; CHECK-GI-NEXT: fmov w13, s5
+; CHECK-GI-NEXT: fmov w8, s4
+; CHECK-GI-NEXT: fmov w12, s0
+; CHECK-GI-NEXT: uxtb w10, w10
+; CHECK-GI-NEXT: uxtb w11, w11
+; CHECK-GI-NEXT: sxtb w13, w13
+; CHECK-GI-NEXT: uxtb w8, w8
+; CHECK-GI-NEXT: sxtb w12, w12
+; CHECK-GI-NEXT: mov v1.h[1], w10
+; CHECK-GI-NEXT: fmov w10, s6
+; CHECK-GI-NEXT: fmov s0, w11
+; CHECK-GI-NEXT: fmov s3, w13
+; CHECK-GI-NEXT: mov v2.h[1], w12
+; CHECK-GI-NEXT: sxtb w10, w10
+; CHECK-GI-NEXT: mov v0.h[1], w8
+; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT: mov v3.h[1], w10
+; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT: mov v1.d[1], v0.d[0]
+; CHECK-GI-NEXT: mov v2.d[1], v3.d[0]
+; CHECK-GI-NEXT: mul v0.4s, v2.4s, v1.4s
+; CHECK-GI-NEXT: addv s0, v0.4s
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: add w0, w8, w2
+; CHECK-GI-NEXT: ret
+entry:
+ %0 = load <4 x i8>, ptr %a
+ %1 = zext <4 x i8> %0 to <4 x i32>
+ %2 = load <4 x i8>, ptr %b
+ %3 = sext <4 x i8> %2 to <4 x i32>
+ %4 = mul nsw <4 x i32> %3, %1
+ %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4)
+ %op.extra = add nsw i32 %5, %sum
+ ret i32 %op.extra
+}
+
+define i32 @test_usdot_v4i8_double(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
+; CHECK-SD-LABEL: test_usdot_v4i8_double:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: ushll v3.4s, v3.4h, #0
+; CHECK-SD-NEXT: bic v2.4h, #255, lsl #8
+; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8
+; CHECK-SD-NEXT: shl v3.4s, v3.4s, #24
+; CHECK-SD-NEXT: ushll v2.4s, v2.4h, #0
+; CHECK-SD-NEXT: shl v1.4s, v1.4s, #24
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: sshr v3.4s, v3.4s, #24
+; CHECK-SD-NEXT: sshr v1.4s, v1.4s, #24
+; CHECK-SD-NEXT: mul v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT: mla v2.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: addv s0, v2.4s
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_usdot_v4i8_double:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT: movi v4.2d, #0x0000ff000000ff
+; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT: shl v1.4s, v1.4s, #24
+; CHECK-GI-NEXT: shl v3.4s, v3.4s, #24
+; CHECK-GI-NEXT: and v0.16b, v0.16b, v4.16b
+; CHECK-GI-NEXT: and v2.16b, v2.16b, v4.16b
+; CHECK-GI-NEXT: sshr v1.4s, v1.4s, #24
+; CHECK-GI-NEXT: sshr v3.4s, v3.4s, #24
+; CHECK-GI-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: mul v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: addv s0, v0.4s
+; CHECK-GI-NEXT: addv s1, v1.4s
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: fmov w9, s1
+; CHECK-GI-NEXT: add w0, w8, w9
+; CHECK-GI-NEXT: ret
+entry:
+ %az = zext <4 x i8> %a to <4 x i32>
+ %bz = sext <4 x i8> %b to <4 x i32>
+ %m1 = mul nuw nsw <4 x i32> %az, %bz
+ %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m1)
+ %cz = zext <4 x i8> %c to <4 x i32>
+ %dz = sext <4 x i8> %d to <4 x i32>
+ %m2 = mul nuw nsw <4 x i32> %cz, %dz
+ %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m2)
+ %x = add i32 %r1, %r2
+ ret i32 %x
+}
+
define i32 @test_udot_v5i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
; CHECK-LABEL: test_udot_v5i8:
; CHECK: // %bb.0: // %entry
@@ -508,6 +630,77 @@ entry:
ret i32 %2
}
+define i32 @test_usdot_v8i8(ptr nocapture readonly %a, ptr nocapture readonly %b) {
+; CHECK-SD-LABEL: test_usdot_v8i8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
+; CHECK-SD-NEXT: ldr d1, [x0]
+; CHECK-SD-NEXT: ldr d2, [x1]
+; CHECK-SD-NEXT: usdot v0.2s, v1.8b, v2.8b
+; CHECK-SD-NEXT: addp v0.2s, v0.2s, v0.2s
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_usdot_v8i8:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT: ushll2 v2.4s, v0.8h, #0
+; CHECK-GI-NEXT: sshll2 v3.4s, v1.8h, #0
+; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT: mul v2.4s, v3.4s, v2.4s
+; CHECK-GI-NEXT: mla v2.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: addv s0, v2.4s
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
+entry:
+ %0 = load <8 x i8>, ptr %a
+ %1 = zext <8 x i8> %0 to <8 x i32>
+ %2 = load <8 x i8>, ptr %b
+ %3 = sext <8 x i8> %2 to <8 x i32>
+ %4 = mul nsw <8 x i32> %3, %1
+ %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
+ ret i32 %5
+}
+
+define i32 @test_usdot_swapped_operands_v8i8(ptr nocapture readonly %a, ptr nocapture readonly %b) {
+; CHECK-SD-LABEL: test_usdot_swapped_operands_v8i8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
+; CHECK-SD-NEXT: ldr d1, [x0]
+; CHECK-SD-NEXT: ldr d2, [x1]
+; CHECK-SD-NEXT: usdot v0.2s, v2.8b, v1.8b
+; CHECK-SD-NEXT: addp v0.2s, v0.2s, v0.2s
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_usdot_swapped_operands_v8i8:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ldr d0, [x0]
+; CHECK-GI-NEXT: ldr d1, [x1]
+; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT: sshll2 v2.4s, v0.8h, #0
+; CHECK-GI-NEXT: ushll2 v3.4s, v1.8h, #0
+; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT: mul v2.4s, v3.4s, v2.4s
+; CHECK-GI-NEXT: mla v2.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: addv s0, v2.4s
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
+entry:
+ %0 = load <8 x i8>, ptr %a
+ %1 = sext <8 x i8> %0 to <8 x i32>
+ %2 = load <8 x i8>, ptr %b
+ %3 = zext <8 x i8> %2 to <8 x i32>
+ %4 = mul nsw <8 x i32> %3, %1
+ %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
+ ret i32 %5
+}
define i32 @test_udot_v16i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
; CHECK-LABEL: test_udot_v16i8:
@@ -587,6 +780,101 @@ entry:
ret i32 %2
}
+define i32 @test_usdot_v16i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
+; CHECK-SD-LABEL: test_usdot_v16i8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
+; CHECK-SD-NEXT: ldr q1, [x0]
+; CHECK-SD-NEXT: ldr q2, [x1]
+; CHECK-SD-NEXT: usdot v0.4s, v1.16b, v2.16b
+; CHECK-SD-NEXT: addv s0, v0.4s
+; CHECK-SD-NEXT: fmov w8, s0
+; CHECK-SD-NEXT: add w0, w8, w2
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_usdot_v16i8:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: ushll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT: ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT: sshll v3.8h, v1.8b, #0
+; CHECK-GI-NEXT: sshll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT: ushll2 v4.4s, v2.8h, #0
+; CHECK-GI-NEXT: ushll2 v5.4s, v0.8h, #0
+; CHECK-GI-NEXT: sshll2 v6.4s, v3.8h, #0
+; CHECK-GI-NEXT: sshll2 v7.4s, v1.8h, #0
+; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT: mul v4.4s, v6.4s, v4.4s
+; CHECK-GI-NEXT: mul v5.4s, v7.4s, v5.4s
+; CHECK-GI-NEXT: mla v4.4s, v3.4s, v2.4s
+; CHECK-GI-NEXT: mla v5.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: add v0.4s, v4.4s, v5.4s
+; CHECK-GI-NEXT: addv s0, v0.4s
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: add w0, w8, w2
+; CHECK-GI-NEXT: ret
+entry:
+ %0 = load <16 x i8>, ptr %a
+ %1 = zext <16 x i8> %0 to <16 x i32>
+ %2 = load <16 x i8>, ptr %b
+ %3 = sext <16 x i8> %2 to <16 x i32>
+ %4 = mul nsw <16 x i32> %3, %1
+ %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
+ %op.extra = add nsw i32 %5, %sum
+ ret i32 %op.extra
+}
+
+define i32 @test_usdot_swapped_operands_v16i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
+; CHECK-SD-LABEL: test_usdot_swapped_operands_v16i8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
+; CHECK-SD-NEXT: ldr q1, [x0]
+; CHECK-SD-NEXT: ldr q2, [x1]
+; CHECK-SD-NEXT: usdot v0.4s, v2.16b, v1.16b
+; CHECK-SD-NEXT: addv s0, v0.4s
+; CHECK-SD-NEXT: fmov w8, s0
+; CHECK-SD-NEXT: add w0, w8, w2
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_usdot_swapped_operands_v16i8:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ldr q0, [x0]
+; CHECK-GI-NEXT: ldr q1, [x1]
+; CHECK-GI-NEXT: sshll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT: sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT: ushll v3.8h, v1.8b, #0
+; CHECK-GI-NEXT: ushll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT: sshll2 v4.4s, v2.8h, #0
+; CHECK-GI-NEXT: sshll2 v5.4s, v0.8h, #0
+; CHECK-GI-NEXT: ushll2 v6.4s, v3.8h, #0
+; CHECK-GI-NEXT: ushll2 v7.4s, v1.8h, #0
+; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT: mul v4.4s, v6.4s, v4.4s
+; CHECK-GI-NEXT: mul v5.4s, v7.4s, v5.4s
+; CHECK-GI-NEXT: mla v4.4s, v3.4s, v2.4s
+; CHECK-GI-NEXT: mla v5.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: add v0.4s, v4.4s, v5.4s
+; CHECK-GI-NEXT: addv s0, v0.4s
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: add w0, w8, w2
+; CHECK-GI-NEXT: ret
+entry:
+ %0 = load <16 x i8>, ptr %a
+ %1 = sext <16 x i8> %0 to <16 x i32>
+ %2 = load <16 x i8>, ptr %b
+ %3 = zext <16 x i8> %2 to <16 x i32>
+ %4 = mul nsw <16 x i32> %3, %1
+ %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
+ %op.extra = add nsw i32 %5, %sum
+ ret i32 %op.extra
+}
define i32 @test_udot_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
; CHECK-SD-LABEL: test_udot_v8i8_double:
@@ -860,6 +1148,240 @@ entry:
ret i32 %x
}
+
+define i32 @test_usdot_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
+; CHECK-SD-LABEL: test_usdot_v8i8_double:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v4.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
+; CHECK-SD-NEXT: usdot v5.2s, v0.8b, v1.8b
+; CHECK-SD-NEXT: usdot v4.2s, v2.8b, v3.8b
+; CHECK-SD-NEXT: add v0.2s, v5.2s, v4.2s
+; CHECK-SD-NEXT: addp v0.2s, v0.2s, v0.2s
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_usdot_v8i8_double:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT: ushll v2.8h, v2.8b, #0
+; CHECK-GI-NEXT: sshll v3.8h, v3.8b, #0
+; CHECK-GI-NEXT: ushll2 v4.4s, v0.8h, #0
+; CHECK-GI-NEXT: sshll2 v5.4s, v1.8h, #0
+; CHECK-GI-NEXT: ushll2 v6.4s, v2.8h, #0
+; CHECK-GI-NEXT: sshll2 v7.4s, v3.8h, #0
+; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT: mul v4.4s, v4.4s, v5.4s
+; CHECK-GI-NEXT: mul v5.4s, v6.4s, v7.4s
+; CHECK-GI-NEXT: mla v4.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: mla v5.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: addv s0, v4.4s
+; CHECK-GI-NEXT: addv s1, v5.4s
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: fmov w9, s1
+; CHECK-GI-NEXT: add w0, w8, w9
+; CHECK-GI-NEXT: ret
+entry:
+ %az = zext <8 x i8> %a to <8 x i32>
+ %bz = sext <8 x i8> %b to <8 x i32>
+ %m1 = mul nuw nsw <8 x i32> %az, %bz
+ %r1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m1)
+ %cz = zext <8 x i8> %c to <8 x i32>
+ %dz = sext <8 x i8> %d to <8 x i32>
+ %m2 = mul nuw nsw <8 x i32> %cz, %dz
+ %r2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m2)
+ %x = add i32 %r1, %r2
+ ret i32 %x
+}
+
+define i32 @test_usdot_swapped_operands_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
+; CHECK-SD-LABEL: test_usdot_swapped_operands_v8i8_double:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v4.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
+; CHECK-SD-NEXT: usdot v5.2s, v1.8b, v0.8b
+; CHECK-SD-NEXT: usdot v4.2s, v3.8b, v2.8b
+; CHECK-SD-NEXT: add v0.2s, v5.2s, v4.2s
+; CHECK-SD-NEXT: addp v0.2s, v0.2s, v0.2s
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_usdot_swapped_operands_v8i8_double:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-GI-NEXT: sshll v2.8h, v2.8b, #0
+; CHECK-GI-NEXT: ushll v3.8h, v3.8b, #0
+; CHECK-GI-NEXT: sshll2 v4.4s, v0.8h, #0
+; CHECK-GI-NEXT: ushll2 v5.4s, v1.8h, #0
+; CHECK-GI-NEXT: sshll2 v6.4s, v2.8h, #0
+; CHECK-GI-NEXT: ushll2 v7.4s, v3.8h, #0
+; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT: mul v4.4s, v4.4s, v5.4s
+; CHECK-GI-NEXT: mul v5.4s, v6.4s, v7.4s
+; CHECK-GI-NEXT: mla v4.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: mla v5.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: addv s0, v4.4s
+; CHECK-GI-NEXT: addv s1, v5.4s
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: fmov w9, s1
+; CHECK-GI-NEXT: add w0, w8, w9
+; CHECK-GI-NEXT: ret
+entry:
+ %az = sext <8 x i8> %a to <8 x i32>
+ %bz = zext <8 x i8> %b to <8 x i32>
+ %m1 = mul nuw nsw <8 x i32> %az, %bz
+ %r1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m1)
+ %cz = sext <8 x i8> %c to <8 x i32>
+ %dz = zext <8 x i8> %d to <8 x i32>
+ %m2 = mul nuw nsw <8 x i32> %cz, %dz
+ %r2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m2)
+ %x = add i32 %r1, %r2
+ ret i32 %x
+}
+
+define i32 @test_usdot_v16i8_double(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+; CHECK-SD-LABEL: test_usdot_v16i8_double:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v4.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
+; CHECK-SD-NEXT: usdot v5.4s, v0.16b, v1.16b
+; CHECK-SD-NEXT: usdot v4.4s, v2.16b, v3.16b
+; CHECK-SD-NEXT: add v0.4s, v5.4s, v4.4s
+; CHECK-SD-NEXT: addv s0, v0.4s
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_usdot_v16i8_double:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ushll v4.8h, v0.8b, #0
+; CHECK-GI-NEXT: ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT: sshll v5.8h, v1.8b, #0
+; CHECK-GI-NEXT: sshll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT: ushll v6.8h, v2.8b, #0
+; CHECK-GI-NEXT: sshll v7.8h, v3.8b, #0
+; CHECK-GI-NEXT: ushll2 v2.8h, v2.16b, #0
+; CHECK-GI-NEXT: sshll2 v3.8h, v3.16b, #0
+; CHECK-GI-NEXT: ushll2 v16.4s, v4.8h, #0
+; CHECK-GI-NEXT: ushll2 v17.4s, v0.8h, #0
+; CHECK-GI-NEXT: sshll2 v18.4s, v5.8h, #0
+; CHECK-GI-NEXT: sshll2 v19.4s, v1.8h, #0
+; CHECK-GI-NEXT: ushll2 v20.4s, v6.8h, #0
+; CHECK-GI-NEXT: sshll2 v21.4s, v7.8h, #0
+; CHECK-GI-NEXT: ushll2 v22.4s, v2.8h, #0
+; CHECK-GI-NEXT: sshll2 v23.4s, v3.8h, #0
+; CHECK-GI-NEXT: ushll v4.4s, v4.4h, #0
+; CHECK-GI-NEXT: mul v16.4s, v16.4s, v18.4s
+; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: sshll v5.4s, v5.4h, #0
+; CHECK-GI-NEXT: mul v17.4s, v17.4s, v19.4s
+; CHECK-GI-NEXT: mul v18.4s, v20.4s, v21.4s
+; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT: mul v19.4s, v22.4s, v23.4s
+; CHECK-GI-NEXT: ushll v6.4s, v6.4h, #0
+; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT: sshll v7.4s, v7.4h, #0
+; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT: mla v16.4s, v4.4s, v5.4s
+; CHECK-GI-NEXT: mla v17.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: mla v18.4s, v6.4s, v7.4s
+; CHECK-GI-NEXT: mla v19.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: add v0.4s, v16.4s, v17.4s
+; CHECK-GI-NEXT: add v1.4s, v18.4s, v19.4s
+; CHECK-GI-NEXT: addv s0, v0.4s
+; CHECK-GI-NEXT: addv s1, v1.4s
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: fmov w9, s1
+; CHECK-GI-NEXT: add w0, w8, w9
+; CHECK-GI-NEXT: ret
+entry:
+ %az = zext <16 x i8> %a to <16 x i32>
+ %bz = sext <16 x i8> %b to <16 x i32>
+ %m1 = mul nuw nsw <16 x i32> %az, %bz
+ %r1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m1)
+ %cz = zext <16 x i8> %c to <16 x i32>
+ %dz = sext <16 x i8> %d to <16 x i32>
+ %m2 = mul nuw nsw <16 x i32> %cz, %dz
+ %r2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m2)
+ %x = add i32 %r1, %r2
+ ret i32 %x
+}
+
+
+define i32 @test_usdot_swapped_operands_v16i8_double(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+; CHECK-SD-LABEL: test_usdot_swapped_operands_v16i8_double:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v4.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
+; CHECK-SD-NEXT: usdot v5.4s, v1.16b, v0.16b
+; CHECK-SD-NEXT: usdot v4.4s, v3.16b, v2.16b
+; CHECK-SD-NEXT: add v0.4s, v5.4s, v4.4s
+; CHECK-SD-NEXT: addv s0, v0.4s
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_usdot_swapped_operands_v16i8_double:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sshll v4.8h, v0.8b, #0
+; CHECK-GI-NEXT: sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT: ushll v5.8h, v1.8b, #0
+; CHECK-GI-NEXT: ushll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT: sshll v6.8h, v2.8b, #0
+; CHECK-GI-NEXT: ushll v7.8h, v3.8b, #0
+; CHECK-GI-NEXT: sshll2 v2.8h, v2.16b, #0
+; CHECK-GI-NEXT: ushll2 v3.8h, v3.16b, #0
+; CHECK-GI-NEXT: sshll2 v16.4s, v4.8h, #0
+; CHECK-GI-NEXT: sshll2 v17.4s, v0.8h, #0
+; CHECK-GI-NEXT: ushll2 v18.4s, v5.8h, #0
+; CHECK-GI-NEXT: ushll2 v19.4s, v1.8h, #0
+; CHECK-GI-NEXT: sshll2 v20.4s, v6.8h, #0
+; CHECK-GI-NEXT: ushll2 v21.4s, v7.8h, #0
+; CHECK-GI-NEXT: sshll2 v22.4s, v2.8h, #0
+; CHECK-GI-NEXT: ushll2 v23.4s, v3.8h, #0
+; CHECK-GI-NEXT: sshll v4.4s, v4.4h, #0
+; CHECK-GI-NEXT: mul v16.4s, v16.4s, v18.4s
+; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: ushll v5.4s, v5.4h, #0
+; CHECK-GI-NEXT: mul v17.4s, v17.4s, v19.4s
+; CHECK-GI-NEXT: mul v18.4s, v20.4s, v21.4s
+; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT: mul v19.4s, v22.4s, v23.4s
+; CHECK-GI-NEXT: sshll v6.4s, v6.4h, #0
+; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT: ushll v7.4s, v7.4h, #0
+; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT: mla v16.4s, v4.4s, v5.4s
+; CHECK-GI-NEXT: mla v17.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: mla v18.4s, v6.4s, v7.4s
+; CHECK-GI-NEXT: mla v19.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: add v0.4s, v16.4s, v17.4s
+; CHECK-GI-NEXT: add v1.4s, v18.4s, v19.4s
+; CHECK-GI-NEXT: addv s0, v0.4s
+; CHECK-GI-NEXT: addv s1, v1.4s
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: fmov w9, s1
+; CHECK-GI-NEXT: add w0, w8, w9
+; CHECK-GI-NEXT: ret
+entry:
+ %az = sext <16 x i8> %a to <16 x i32>
+ %bz = zext <16 x i8> %b to <16 x i32>
+ %m1 = mul nuw nsw <16 x i32> %az, %bz
+ %r1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m1)
+ %cz = sext <16 x i8> %c to <16 x i32>
+ %dz = zext <16 x i8> %d to <16 x i32>
+ %m2 = mul nuw nsw <16 x i32> %cz, %dz
+ %r2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m2)
+ %x = add i32 %r1, %r2
+ ret i32 %x
+}
+
define i32 @test_udot_v24i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
; CHECK-SD-LABEL: test_udot_v24i8:
; CHECK-SD: // %bb.0: // %entry
@@ -1658,7 +2180,6 @@ entry:
ret i32 %x
}
-
define i32 @test_udot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
; CHECK-LABEL: test_udot_v25i8:
; CHECK: // %bb.0: // %entry
@@ -2301,6 +2822,202 @@ entry:
ret i32 %x
}
+define i32 @test_usdot_v32i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
+; CHECK-SD-LABEL: test_usdot_v32i8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
+; CHECK-SD-NEXT: ldp q2, q3, [x0]
+; CHECK-SD-NEXT: ldp q4, q5, [x1]
+; CHECK-SD-NEXT: usdot v1.4s, v3.16b, v5.16b
+; CHECK-SD-NEXT: usdot v0.4s, v2.16b, v4.16b
+; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: addv s0, v0.4s
+; CHECK-SD-NEXT: fmov w8, s0
+; CHECK-SD-NEXT: add w0, w8, w2
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_usdot_v32i8:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: ldp q0, q1, [x1]
+; CHECK-GI-NEXT: ldp q2, q3, [x0]
+; CHECK-GI-NEXT: sshll v4.8h, v0.8b, #0
+; CHECK-GI-NEXT: sshll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT: sshll v5.8h, v1.8b, #0
+; CHECK-GI-NEXT: sshll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT: ushll v6.8h, v2.8b, #0
+; CHECK-GI-NEXT: ushll2 v2.8h, v2.16b, #0
+; CHECK-GI-NEXT: ushll v7.8h, v3.8b, #0
+; CHECK-GI-NEXT: ushll2 v3.8h, v3.16b, #0
+; CHECK-GI-NEXT: sshll2 v16.4s, v4.8h, #0
+; CHECK-GI-NEXT: sshll2 v17.4s, v0.8h, #0
+; CHECK-GI-NEXT: sshll2 v18.4s, v5.8h, #0
+; CHECK-GI-NEXT: sshll2 v19.4s, v1.8h, #0
+; CHECK-GI-NEXT: ushll2 v20.4s, v6.8h, #0
+; CHECK-GI-NEXT: ushll2 v21.4s, v2.8h, #0
+; CHECK-GI-NEXT: ushll2 v22.4s, v7.8h, #0
+; CHECK-GI-NEXT: ushll2 v23.4s, v3.8h, #0
+; CHECK-GI-NEXT: sshll v4.4s, v4.4h, #0
+; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: sshll v5.4s, v5.4h, #0
+; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT: mul v16.4s, v16.4s, v20.4s
+; CHECK-GI-NEXT: mul v17.4s, v17.4s, v21.4s
+; CHECK-GI-NEXT: ushll v6.4s, v6.4h, #0
+; CHECK-GI-NEXT: mul v18.4s, v18.4s, v22.4s
+; CHECK-GI-NEXT: mul v19.4s, v19.4s, v23.4s
+; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT: ushll v7.4s, v7.4h, #0
+; CHECK-GI-NEXT: ushll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT: mla v16.4s, v4.4s, v6.4s
+; CHECK-GI-NEXT: mla v17.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT: mla v18.4s, v5.4s, v7.4s
+; CHECK-GI-NEXT: mla v19.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT: add v0.4s, v16.4s, v17.4s
+; CHECK-GI-NEXT: add v1.4s, v18.4s, v19.4s
+; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: addv s0, v0.4s
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: add w0, w8, w2
+; CHECK-GI-NEXT: ret
+entry:
+ %0 = load <32 x i8>, ptr %a
+ %1 = zext <32 x i8> %0 to <32 x i32>
+ %2 = load <32 x i8>, ptr %b
+ %3 = sext <32 x i8> %2 to <32 x i32>
+ %4 = mul nsw <32 x i32> %3, %1
+ %5 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %4)
+ %op.extra = add nsw i32 %5, %sum
+ ret i32 %op.extra
+}
+
+define i32 @test_usdot_v32i8_double(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) {
+; CHECK-SD-LABEL: test_usdot_v32i8_double:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v16.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v17.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v18.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v19.2d, #0000000000000000
+; CHECK-SD-NEXT: usdot v16.4s, v1.16b, v3.16b
+; CHECK-SD-NEXT: usdot v18.4s, v0.16b, v2.16b
+; CHECK-SD-NEXT: usdot v17.4s, v4.16b, v6.16b
+; CHECK-SD-NEXT: usdot v19.4s, v5.16b, v7.16b
+; CHECK-SD-NEXT: add v0.4s, v18.4s, v16.4s
+; CHECK-SD-NEXT: add v1.4s, v17.4s, v19.4s
+; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: addv s0, v0.4s
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_usdot_v32i8_double:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-GI-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT: .cfi_offset b8, -8
+; CHECK-GI-NEXT: .cfi_offset b9, -16
+; CHECK-GI-NEXT: .cfi_offset b10, -24
+; CHECK-GI-NEXT: .cfi_offset b11, -32
+; CHECK-GI-NEXT: .cfi_offset b12, -40
+; CHECK-GI-NEXT: .cfi_offset b13, -48
+; CHECK-GI-NEXT: .cfi_offset b14, -56
+; CHECK-GI-NEXT: .cfi_offset b15, -64
+; CHECK-GI-NEXT: ushll v16.8h, v0.8b, #0
+; CHECK-GI-NEXT: ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT: ushll v17.8h, v1.8b, #0
+; CHECK-GI-NEXT: ushll2 v1.8h, v1.16b, #0
+; CHECK-GI-NEXT: sshll v18.8h, v2.8b, #0
+; CHECK-GI-NEXT: sshll2 v2.8h, v2.16b, #0
+; CHECK-GI-NEXT: sshll v19.8h, v3.8b, #0
+; CHECK-GI-NEXT: sshll2 v3.8h, v3.16b, #0
+; CHECK-GI-NEXT: ushll v27.8h, v4.8b, #0
+; CHECK-GI-NEXT: ushll2 v4.8h, v4.16b, #0
+; CHECK-GI-NEXT: ushll v28.8h, v5.8b, #0
+; CHECK-GI-NEXT: sshll v29.8h, v6.8b, #0
+; CHECK-GI-NEXT: sshll2 v6.8h, v6.16b, #0
+; CHECK-GI-NEXT: ushll2 v5.8h, v5.16b, #0
+; CHECK-GI-NEXT: sshll v30.8h, v7.8b, #0
+; CHECK-GI-NEXT: sshll2 v7.8h, v7.16b, #0
+; CHECK-GI-NEXT: ushll2 v20.4s, v16.8h, #0
+; CHECK-GI-NEXT: ushll2 v21.4s, v0.8h, #0
+; CHECK-GI-NEXT: ushll2 v22.4s, v17.8h, #0
+; CHECK-GI-NEXT: ushll2 v23.4s, v1.8h, #0
+; CHECK-GI-NEXT: sshll2 v24.4s, v18.8h, #0
+; CHECK-GI-NEXT: sshll2 v25.4s, v2.8h, #0
+; CHECK-GI-NEXT: sshll2 v26.4s, v19.8h, #0
+; CHECK-GI-NEXT: sshll2 v31.4s, v3.8h, #0
+; CHECK-GI-NEXT: ushll2 v8.4s, v27.8h, #0
+; CHECK-GI-NEXT: ushll2 v9.4s, v4.8h, #0
+; CHECK-GI-NEXT: ushll2 v10.4s, v28.8h, #0
+; CHECK-GI-NEXT: sshll2 v11.4s, v29.8h, #0
+; CHECK-GI-NEXT: sshll2 v12.4s, v6.8h, #0
+; CHECK-GI-NEXT: ushll2 v13.4s, v5.8h, #0
+; CHECK-GI-NEXT: sshll2 v14.4s, v30.8h, #0
+; CHECK-GI-NEXT: sshll2 v15.4s, v7.8h, #0
+; CHECK-GI-NEXT: mul v20.4s, v20.4s, v24.4s
+; CHECK-GI-NEXT: mul v21.4s, v21.4s, v25.4s
+; CHECK-GI-NEXT: mul v22.4s, v22.4s, v26.4s
+; CHECK-GI-NEXT: mul v23.4s, v23.4s, v31.4s
+; CHECK-GI-NEXT: mul v24.4s, v8.4s, v11.4s
+; CHECK-GI-NEXT: mul v25.4s, v9.4s, v12.4s
+; CHECK-GI-NEXT: ushll v16.4s, v16.4h, #0
+; CHECK-GI-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mul v26.4s, v10.4s, v14.4s
+; CHECK-GI-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mul v31.4s, v13.4s, v15.4s
+; CHECK-GI-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: ushll v17.4s, v17.4h, #0
+; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-GI-NEXT: sshll v18.4s, v18.4h, #0
+; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0
+; CHECK-GI-NEXT: sshll v19.4s, v19.4h, #0
+; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0
+; CHECK-GI-NEXT: ushll v27.4s, v27.4h, #0
+; CHECK-GI-NEXT: ushll v4.4s, v4.4h, #0
+; CHECK-GI-NEXT: ushll v28.4s, v28.4h, #0
+; CHECK-GI-NEXT: ushll v5.4s, v5.4h, #0
+; CHECK-GI-NEXT: sshll v29.4s, v29.4h, #0
+; CHECK-GI-NEXT: sshll v6.4s, v6.4h, #0
+; CHECK-GI-NEXT: sshll v30.4s, v30.4h, #0
+; CHECK-GI-NEXT: sshll v7.4s, v7.4h, #0
+; CHECK-GI-NEXT: mla v20.4s, v16.4s, v18.4s
+; CHECK-GI-NEXT: mla v21.4s, v0.4s, v2.4s
+; CHECK-GI-NEXT: mla v22.4s, v17.4s, v19.4s
+; CHECK-GI-NEXT: mla v23.4s, v1.4s, v3.4s
+; CHECK-GI-NEXT: mla v24.4s, v27.4s, v29.4s
+; CHECK-GI-NEXT: mla v25.4s, v4.4s, v6.4s
+; CHECK-GI-NEXT: mla v26.4s, v28.4s, v30.4s
+; CHECK-GI-NEXT: mla v31.4s, v5.4s, v7.4s
+; CHECK-GI-NEXT: add v0.4s, v20.4s, v21.4s
+; CHECK-GI-NEXT: add v1.4s, v22.4s, v23.4s
+; CHECK-GI-NEXT: add v2.4s, v24.4s, v25.4s
+; CHECK-GI-NEXT: add v3.4s, v26.4s, v31.4s
+; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: addv s0, v0.4s
+; CHECK-GI-NEXT: addv s1, v1.4s
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: fmov w9, s1
+; CHECK-GI-NEXT: add w0, w8, w9
+; CHECK-GI-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-GI-NEXT: ret
+entry:
+ %az = zext <32 x i8> %a to <32 x i32>
+ %bz = sext <32 x i8> %b to <32 x i32>
+ %m1 = mul nuw nsw <32 x i32> %az, %bz
+ %r1 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %m1)
+ %cz = zext <32 x i8> %c to <32 x i32>
+ %dz = sext <32 x i8> %d to <32 x i32>
+ %m2 = mul nuw nsw <32 x i32> %cz, %dz
+ %r2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %m2)
+ %x = add i32 %r1, %r2
+ ret i32 %x
+}
+
+
define i32 @test_udot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
; CHECK-LABEL: test_udot_v33i8:
; CHECK: // %bb.0: // %entry
@@ -2866,6 +3583,7 @@ entry:
%x = add i32 %r1, %r2
ret i32 %x
}
+
define i32 @test_udot_v48i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
; CHECK-SD-LABEL: test_udot_v48i8:
; CHECK-SD: // %bb.0: // %entry
@@ -4527,3 +5245,385 @@ entry:
%x = add i32 %r1, %r2
ret i32 %x
}
+
+; NOTE(review): FileCheck expectations below look auto-generated
+; (update_llc_test_checks style) -- do not hand-edit the CHECK lines.
+; Mixed-sign (usdot) reduction from memory: the zero-extended load of %a is
+; multiplied by the sign-extended load of %b and summed with
+; llvm.vector.reduce.add, plus the scalar %sum. The CHECK-SD output lowers
+; the whole thing to four usdot instructions; the CHECK-GI output still
+; expands to widening shifts and mul/mla chains. Presumably the usdot form
+; requires the i8mm feature -- confirm against the file's RUN lines.
+define i32 @test_usdot_v64i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
+; CHECK-SD-LABEL: test_usdot_v64i8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v3.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v4.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
+; CHECK-SD-NEXT: ldp q1, q2, [x0, #32]
+; CHECK-SD-NEXT: ldp q6, q7, [x1, #32]
+; CHECK-SD-NEXT: ldp q16, q17, [x0]
+; CHECK-SD-NEXT: ldp q18, q19, [x1]
+; CHECK-SD-NEXT: usdot v0.4s, v2.16b, v7.16b
+; CHECK-SD-NEXT: usdot v5.4s, v1.16b, v6.16b
+; CHECK-SD-NEXT: usdot v4.4s, v17.16b, v19.16b
+; CHECK-SD-NEXT: usdot v3.4s, v16.16b, v18.16b
+; CHECK-SD-NEXT: add v0.4s, v4.4s, v0.4s
+; CHECK-SD-NEXT: add v1.4s, v3.4s, v5.4s
+; CHECK-SD-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT: addv s0, v0.4s
+; CHECK-SD-NEXT: fmov w8, s0
+; CHECK-SD-NEXT: add w0, w8, w2
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_usdot_v64i8:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-GI-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT: .cfi_offset b8, -8
+; CHECK-GI-NEXT: .cfi_offset b9, -16
+; CHECK-GI-NEXT: .cfi_offset b10, -24
+; CHECK-GI-NEXT: .cfi_offset b11, -32
+; CHECK-GI-NEXT: .cfi_offset b12, -40
+; CHECK-GI-NEXT: .cfi_offset b13, -48
+; CHECK-GI-NEXT: .cfi_offset b14, -56
+; CHECK-GI-NEXT: .cfi_offset b15, -64
+; CHECK-GI-NEXT: ldp q0, q1, [x1]
+; CHECK-GI-NEXT: ldp q21, q17, [x0]
+; CHECK-GI-NEXT: ldp q3, q19, [x1, #32]
+; CHECK-GI-NEXT: ldp q18, q4, [x0, #32]
+; CHECK-GI-NEXT: sshll v2.8h, v0.8b, #0
+; CHECK-GI-NEXT: sshll2 v5.8h, v0.16b, #0
+; CHECK-GI-NEXT: sshll v7.8h, v1.8b, #0
+; CHECK-GI-NEXT: sshll2 v22.8h, v1.16b, #0
+; CHECK-GI-NEXT: sshll v23.8h, v3.8b, #0
+; CHECK-GI-NEXT: sshll2 v24.8h, v3.16b, #0
+; CHECK-GI-NEXT: sshll v25.8h, v19.8b, #0
+; CHECK-GI-NEXT: sshll2 v26.8h, v19.16b, #0
+; CHECK-GI-NEXT: ushll v27.8h, v21.8b, #0
+; CHECK-GI-NEXT: ushll2 v28.8h, v21.16b, #0
+; CHECK-GI-NEXT: ushll v30.8h, v17.8b, #0
+; CHECK-GI-NEXT: ushll2 v17.8h, v17.16b, #0
+; CHECK-GI-NEXT: ushll v8.8h, v18.8b, #0
+; CHECK-GI-NEXT: ushll2 v18.8h, v18.16b, #0
+; CHECK-GI-NEXT: ushll v9.8h, v4.8b, #0
+; CHECK-GI-NEXT: ushll2 v4.8h, v4.16b, #0
+; CHECK-GI-NEXT: sshll v0.4s, v2.4h, #0
+; CHECK-GI-NEXT: sshll2 v6.4s, v2.8h, #0
+; CHECK-GI-NEXT: sshll v1.4s, v5.4h, #0
+; CHECK-GI-NEXT: sshll2 v16.4s, v5.8h, #0
+; CHECK-GI-NEXT: sshll v2.4s, v7.4h, #0
+; CHECK-GI-NEXT: sshll2 v20.4s, v7.8h, #0
+; CHECK-GI-NEXT: sshll v3.4s, v22.4h, #0
+; CHECK-GI-NEXT: sshll2 v22.4s, v22.8h, #0
+; CHECK-GI-NEXT: sshll v5.4s, v23.4h, #0
+; CHECK-GI-NEXT: sshll2 v23.4s, v23.8h, #0
+; CHECK-GI-NEXT: sshll v7.4s, v24.4h, #0
+; CHECK-GI-NEXT: sshll2 v24.4s, v24.8h, #0
+; CHECK-GI-NEXT: sshll v19.4s, v25.4h, #0
+; CHECK-GI-NEXT: sshll2 v25.4s, v25.8h, #0
+; CHECK-GI-NEXT: sshll v21.4s, v26.4h, #0
+; CHECK-GI-NEXT: sshll2 v26.4s, v26.8h, #0
+; CHECK-GI-NEXT: ushll v29.4s, v27.4h, #0
+; CHECK-GI-NEXT: ushll2 v27.4s, v27.8h, #0
+; CHECK-GI-NEXT: ushll v31.4s, v28.4h, #0
+; CHECK-GI-NEXT: ushll2 v28.4s, v28.8h, #0
+; CHECK-GI-NEXT: ushll v10.4s, v30.4h, #0
+; CHECK-GI-NEXT: ushll2 v30.4s, v30.8h, #0
+; CHECK-GI-NEXT: ushll v11.4s, v17.4h, #0
+; CHECK-GI-NEXT: ushll2 v17.4s, v17.8h, #0
+; CHECK-GI-NEXT: ushll2 v12.4s, v8.8h, #0
+; CHECK-GI-NEXT: ushll2 v13.4s, v18.8h, #0
+; CHECK-GI-NEXT: ushll2 v14.4s, v9.8h, #0
+; CHECK-GI-NEXT: ushll2 v15.4s, v4.8h, #0
+; CHECK-GI-NEXT: mul v6.4s, v6.4s, v27.4s
+; CHECK-GI-NEXT: mul v16.4s, v16.4s, v28.4s
+; CHECK-GI-NEXT: mul v20.4s, v20.4s, v30.4s
+; CHECK-GI-NEXT: mul v17.4s, v22.4s, v17.4s
+; CHECK-GI-NEXT: ushll v8.4s, v8.4h, #0
+; CHECK-GI-NEXT: mul v22.4s, v23.4s, v12.4s
+; CHECK-GI-NEXT: mul v23.4s, v24.4s, v13.4s
+; CHECK-GI-NEXT: mul v24.4s, v25.4s, v14.4s
+; CHECK-GI-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mul v25.4s, v26.4s, v15.4s
+; CHECK-GI-NEXT: ushll v18.4s, v18.4h, #0
+; CHECK-GI-NEXT: ushll v26.4s, v9.4h, #0
+; CHECK-GI-NEXT: ushll v4.4s, v4.4h, #0
+; CHECK-GI-NEXT: mla v6.4s, v0.4s, v29.4s
+; CHECK-GI-NEXT: mla v16.4s, v1.4s, v31.4s
+; CHECK-GI-NEXT: mla v20.4s, v2.4s, v10.4s
+; CHECK-GI-NEXT: mla v17.4s, v3.4s, v11.4s
+; CHECK-GI-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mla v22.4s, v5.4s, v8.4s
+; CHECK-GI-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mla v23.4s, v7.4s, v18.4s
+; CHECK-GI-NEXT: mla v24.4s, v19.4s, v26.4s
+; CHECK-GI-NEXT: mla v25.4s, v21.4s, v4.4s
+; CHECK-GI-NEXT: add v0.4s, v6.4s, v16.4s
+; CHECK-GI-NEXT: add v1.4s, v20.4s, v17.4s
+; CHECK-GI-NEXT: add v2.4s, v22.4s, v23.4s
+; CHECK-GI-NEXT: add v3.4s, v24.4s, v25.4s
+; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-GI-NEXT: addv s0, v0.4s
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: add w0, w8, w2
+; CHECK-GI-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-GI-NEXT: ret
+entry:
+ %0 = load <64 x i8>, ptr %a
+ %1 = zext <64 x i8> %0 to <64 x i32>
+ %2 = load <64 x i8>, ptr %b
+ %3 = sext <64 x i8> %2 to <64 x i32>
+ %4 = mul nsw <64 x i32> %3, %1
+ %5 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %4)
+ %op.extra = add nsw i32 %5, %sum
+ ret i32 %op.extra
+}
+
+; NOTE(review): FileCheck expectations below look auto-generated
+; (update_llc_test_checks style) -- do not hand-edit the CHECK lines.
+; Two independent mixed-sign dot-product reductions whose scalar results are
+; added: zext(%a)*sext(%b) and zext(%c)*sext(%d), each reduced with
+; llvm.vector.reduce.add. The CHECK-SD output uses eight usdot instructions
+; (one per 16-byte chunk of each product); the CHECK-GI output still expands
+; to widening shifts with mul/mla chains and heavy spill traffic.
+define i32 @test_usdot_v64i8_double(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
+; CHECK-SD-LABEL: test_usdot_v64i8_double:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v18.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v21.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v22.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v23.2d, #0000000000000000
+; CHECK-SD-NEXT: ldp q16, q17, [sp, #64]
+; CHECK-SD-NEXT: movi v24.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v25.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v26.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v27.2d, #0000000000000000
+; CHECK-SD-NEXT: ldp q19, q20, [sp, #96]
+; CHECK-SD-NEXT: usdot v18.4s, v3.16b, v7.16b
+; CHECK-SD-NEXT: ldp q3, q7, [sp, #32]
+; CHECK-SD-NEXT: usdot v21.4s, v1.16b, v5.16b
+; CHECK-SD-NEXT: ldp q1, q5, [sp]
+; CHECK-SD-NEXT: usdot v22.4s, v2.16b, v6.16b
+; CHECK-SD-NEXT: usdot v23.4s, v0.16b, v4.16b
+; CHECK-SD-NEXT: usdot v24.4s, v7.16b, v20.16b
+; CHECK-SD-NEXT: usdot v27.4s, v3.16b, v19.16b
+; CHECK-SD-NEXT: usdot v26.4s, v5.16b, v17.16b
+; CHECK-SD-NEXT: usdot v25.4s, v1.16b, v16.16b
+; CHECK-SD-NEXT: add v0.4s, v21.4s, v18.4s
+; CHECK-SD-NEXT: add v1.4s, v23.4s, v22.4s
+; CHECK-SD-NEXT: add v2.4s, v26.4s, v24.4s
+; CHECK-SD-NEXT: add v3.4s, v25.4s, v27.4s
+; CHECK-SD-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT: add v1.4s, v3.4s, v2.4s
+; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: addv s0, v0.4s
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_usdot_v64i8_double:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: sub sp, sp, #304
+; CHECK-GI-NEXT: stp d15, d14, [sp, #224] // 16-byte Folded Spill
+; CHECK-GI-NEXT: stp d13, d12, [sp, #240] // 16-byte Folded Spill
+; CHECK-GI-NEXT: stp d11, d10, [sp, #256] // 16-byte Folded Spill
+; CHECK-GI-NEXT: stp d9, d8, [sp, #272] // 16-byte Folded Spill
+; CHECK-GI-NEXT: str x29, [sp, #288] // 8-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 304
+; CHECK-GI-NEXT: .cfi_offset w29, -16
+; CHECK-GI-NEXT: .cfi_offset b8, -24
+; CHECK-GI-NEXT: .cfi_offset b9, -32
+; CHECK-GI-NEXT: .cfi_offset b10, -40
+; CHECK-GI-NEXT: .cfi_offset b11, -48
+; CHECK-GI-NEXT: .cfi_offset b12, -56
+; CHECK-GI-NEXT: .cfi_offset b13, -64
+; CHECK-GI-NEXT: .cfi_offset b14, -72
+; CHECK-GI-NEXT: .cfi_offset b15, -80
+; CHECK-GI-NEXT: ushll v17.8h, v0.8b, #0
+; CHECK-GI-NEXT: ushll2 v0.8h, v0.16b, #0
+; CHECK-GI-NEXT: ldr x29, [sp, #288] // 8-byte Folded Reload
+; CHECK-GI-NEXT: mov v20.16b, v3.16b
+; CHECK-GI-NEXT: ushll v16.8h, v1.8b, #0
+; CHECK-GI-NEXT: ushll2 v18.8h, v1.16b, #0
+; CHECK-GI-NEXT: ushll v26.8h, v2.8b, #0
+; CHECK-GI-NEXT: ldp q27, q28, [sp, #304]
+; CHECK-GI-NEXT: ushll2 v29.8h, v2.16b, #0
+; CHECK-GI-NEXT: ushll v2.4s, v17.4h, #0
+; CHECK-GI-NEXT: ushll v1.4s, v0.4h, #0
+; CHECK-GI-NEXT: sshll v8.8h, v4.8b, #0
+; CHECK-GI-NEXT: ldp q23, q21, [sp, #368]
+; CHECK-GI-NEXT: sshll2 v9.8h, v4.16b, #0
+; CHECK-GI-NEXT: sshll2 v11.8h, v5.16b, #0
+; CHECK-GI-NEXT: mov v25.16b, v7.16b
+; CHECK-GI-NEXT: ushll2 v19.4s, v17.8h, #0
+; CHECK-GI-NEXT: stp q1, q2, [sp, #192] // 32-byte Folded Spill
+; CHECK-GI-NEXT: ushll2 v3.4s, v0.8h, #0
+; CHECK-GI-NEXT: ushll2 v17.4s, v18.8h, #0
+; CHECK-GI-NEXT: ldp q24, q22, [sp, #336]
+; CHECK-GI-NEXT: sshll v10.8h, v5.8b, #0
+; CHECK-GI-NEXT: sshll v12.8h, v6.8b, #0
+; CHECK-GI-NEXT: sshll2 v13.8h, v6.16b, #0
+; CHECK-GI-NEXT: mov v2.16b, v20.16b
+; CHECK-GI-NEXT: sshll2 v0.4s, v8.8h, #0
+; CHECK-GI-NEXT: sshll2 v4.4s, v9.8h, #0
+; CHECK-GI-NEXT: sshll2 v6.4s, v11.8h, #0
+; CHECK-GI-NEXT: ushll2 v7.4s, v16.8h, #0
+; CHECK-GI-NEXT: ushll2 v31.4s, v29.8h, #0
+; CHECK-GI-NEXT: sshll2 v5.4s, v10.8h, #0
+; CHECK-GI-NEXT: sshll2 v1.4s, v13.8h, #0
+; CHECK-GI-NEXT: ushll2 v30.4s, v26.8h, #0
+; CHECK-GI-NEXT: ushll v14.8h, v2.8b, #0
+; CHECK-GI-NEXT: mul v20.4s, v19.4s, v0.4s
+; CHECK-GI-NEXT: mul v19.4s, v3.4s, v4.4s
+; CHECK-GI-NEXT: sshll v0.8h, v25.8b, #0
+; CHECK-GI-NEXT: mul v4.4s, v17.4s, v6.4s
+; CHECK-GI-NEXT: sshll2 v15.4s, v12.8h, #0
+; CHECK-GI-NEXT: ldp q17, q3, [sp, #400]
+; CHECK-GI-NEXT: mul v5.4s, v7.4s, v5.4s
+; CHECK-GI-NEXT: mul v7.4s, v31.4s, v1.4s
+; CHECK-GI-NEXT: ushll2 v31.8h, v2.16b, #0
+; CHECK-GI-NEXT: sshll2 v25.8h, v25.16b, #0
+; CHECK-GI-NEXT: sshll2 v1.4s, v0.8h, #0
+; CHECK-GI-NEXT: ushll v2.4s, v14.4h, #0
+; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-GI-NEXT: str q3, [sp, #96] // 16-byte Folded Spill
+; CHECK-GI-NEXT: ushll2 v3.4s, v14.8h, #0
+; CHECK-GI-NEXT: mul v6.4s, v30.4s, v15.4s
+; CHECK-GI-NEXT: str q31, [sp, #160] // 16-byte Folded Spill
+; CHECK-GI-NEXT: ushll v30.4s, v26.4h, #0
+; CHECK-GI-NEXT: sshll v26.4s, v8.4h, #0
+; CHECK-GI-NEXT: ushll v14.8h, v27.8b, #0
+; CHECK-GI-NEXT: ushll v15.4s, v29.4h, #0
+; CHECK-GI-NEXT: sshll v29.4s, v9.4h, #0
+; CHECK-GI-NEXT: mul v1.4s, v3.4s, v1.4s
+; CHECK-GI-NEXT: ushll2 v3.4s, v31.8h, #0
+; CHECK-GI-NEXT: ushll v31.8h, v28.8b, #0
+; CHECK-GI-NEXT: ushll v16.4s, v16.4h, #0
+; CHECK-GI-NEXT: sshll v8.4s, v10.4h, #0
+; CHECK-GI-NEXT: sshll v9.4s, v11.4h, #0
+; CHECK-GI-NEXT: sshll v10.4s, v12.4h, #0
+; CHECK-GI-NEXT: sshll v11.4s, v13.4h, #0
+; CHECK-GI-NEXT: ushll v18.4s, v18.4h, #0
+; CHECK-GI-NEXT: stp q3, q25, [sp, #112] // 32-byte Folded Spill
+; CHECK-GI-NEXT: ldr q3, [sp, #208] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ushll2 v28.8h, v28.16b, #0
+; CHECK-GI-NEXT: mla v1.4s, v2.4s, v0.4s
+; CHECK-GI-NEXT: ushll2 v0.4s, v31.8h, #0
+; CHECK-GI-NEXT: mla v5.4s, v16.4s, v8.4s
+; CHECK-GI-NEXT: mla v20.4s, v3.4s, v26.4s
+; CHECK-GI-NEXT: sshll2 v3.4s, v25.8h, #0
+; CHECK-GI-NEXT: mla v6.4s, v30.4s, v10.4s
+; CHECK-GI-NEXT: mla v7.4s, v15.4s, v11.4s
+; CHECK-GI-NEXT: sshll v25.8h, v23.8b, #0
+; CHECK-GI-NEXT: mla v4.4s, v18.4s, v9.4s
+; CHECK-GI-NEXT: ushll v30.8h, v22.8b, #0
+; CHECK-GI-NEXT: ushll2 v26.8h, v22.16b, #0
+; CHECK-GI-NEXT: sshll v22.8h, v21.8b, #0
+; CHECK-GI-NEXT: str q3, [sp, #32] // 16-byte Folded Spill
+; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ushll2 v8.8h, v27.16b, #0
+; CHECK-GI-NEXT: str q1, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT: ldr q9, [sp, #32] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ushll2 v1.4s, v14.8h, #0
+; CHECK-GI-NEXT: stp q7, q6, [sp, #64] // 32-byte Folded Spill
+; CHECK-GI-NEXT: mla v19.4s, v3.4s, v29.4s
+; CHECK-GI-NEXT: sshll2 v7.4s, v25.8h, #0
+; CHECK-GI-NEXT: str q5, [sp, #176] // 16-byte Folded Spill
+; CHECK-GI-NEXT: ushll v29.8h, v24.8b, #0
+; CHECK-GI-NEXT: ushll2 v27.8h, v24.16b, #0
+; CHECK-GI-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill
+; CHECK-GI-NEXT: ldp q0, q16, [sp, #96] // 32-byte Folded Reload
+; CHECK-GI-NEXT: str q4, [sp, #144] // 16-byte Folded Spill
+; CHECK-GI-NEXT: sshll2 v24.8h, v23.16b, #0
+; CHECK-GI-NEXT: ushll2 v18.4s, v26.8h, #0
+; CHECK-GI-NEXT: stp q19, q20, [sp, #192] // 32-byte Folded Spill
+; CHECK-GI-NEXT: sshll2 v20.8h, v21.16b, #0
+; CHECK-GI-NEXT: sshll v21.8h, v17.8b, #0
+; CHECK-GI-NEXT: sshll2 v19.8h, v17.16b, #0
+; CHECK-GI-NEXT: sshll2 v17.8h, v0.16b, #0
+; CHECK-GI-NEXT: mul v16.4s, v16.4s, v9.4s
+; CHECK-GI-NEXT: ldr q9, [sp, #16] // 16-byte Folded Reload
+; CHECK-GI-NEXT: sshll v23.8h, v0.8b, #0
+; CHECK-GI-NEXT: sshll2 v2.4s, v22.8h, #0
+; CHECK-GI-NEXT: ushll2 v12.4s, v27.8h, #0
+; CHECK-GI-NEXT: ushll v26.4s, v26.4h, #0
+; CHECK-GI-NEXT: ushll2 v10.4s, v28.8h, #0
+; CHECK-GI-NEXT: sshll2 v0.4s, v17.8h, #0
+; CHECK-GI-NEXT: mul v7.4s, v9.4s, v7.4s
+; CHECK-GI-NEXT: ldr q9, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT: sshll2 v5.4s, v19.8h, #0
+; CHECK-GI-NEXT: sshll v17.4s, v17.4h, #0
+; CHECK-GI-NEXT: sshll2 v3.4s, v20.8h, #0
+; CHECK-GI-NEXT: mul v2.4s, v9.4s, v2.4s
+; CHECK-GI-NEXT: ldr q9, [sp, #128] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ushll2 v15.4s, v8.8h, #0
+; CHECK-GI-NEXT: mul v0.4s, v18.4s, v0.4s
+; CHECK-GI-NEXT: ldr q18, [sp, #160] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ushll2 v11.4s, v29.8h, #0
+; CHECK-GI-NEXT: sshll v9.4s, v9.4h, #0
+; CHECK-GI-NEXT: ushll2 v13.4s, v30.8h, #0
+; CHECK-GI-NEXT: sshll2 v1.4s, v24.8h, #0
+; CHECK-GI-NEXT: ushll v18.4s, v18.4h, #0
+; CHECK-GI-NEXT: sshll2 v4.4s, v21.8h, #0
+; CHECK-GI-NEXT: sshll2 v6.4s, v23.8h, #0
+; CHECK-GI-NEXT: mul v5.4s, v12.4s, v5.4s
+; CHECK-GI-NEXT: ushll v27.4s, v27.4h, #0
+; CHECK-GI-NEXT: sshll v19.4s, v19.4h, #0
+; CHECK-GI-NEXT: mla v0.4s, v26.4s, v17.4s
+; CHECK-GI-NEXT: mul v3.4s, v10.4s, v3.4s
+; CHECK-GI-NEXT: mul v1.4s, v15.4s, v1.4s
+; CHECK-GI-NEXT: mla v16.4s, v18.4s, v9.4s
+; CHECK-GI-NEXT: ldp q18, q17, [sp, #192] // 32-byte Folded Reload
+; CHECK-GI-NEXT: mul v4.4s, v11.4s, v4.4s
+; CHECK-GI-NEXT: mul v6.4s, v13.4s, v6.4s
+; CHECK-GI-NEXT: ushll v28.4s, v28.4h, #0
+; CHECK-GI-NEXT: ldp d13, d12, [sp, #240] // 16-byte Folded Reload
+; CHECK-GI-NEXT: sshll v20.4s, v20.4h, #0
+; CHECK-GI-NEXT: ushll v10.4s, v14.4h, #0
+; CHECK-GI-NEXT: ldp d15, d14, [sp, #224] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ushll v8.4s, v8.4h, #0
+; CHECK-GI-NEXT: ushll v31.4s, v31.4h, #0
+; CHECK-GI-NEXT: ushll v29.4s, v29.4h, #0
+; CHECK-GI-NEXT: ushll v30.4s, v30.4h, #0
+; CHECK-GI-NEXT: sshll v25.4s, v25.4h, #0
+; CHECK-GI-NEXT: sshll v24.4s, v24.4h, #0
+; CHECK-GI-NEXT: sshll v22.4s, v22.4h, #0
+; CHECK-GI-NEXT: sshll v21.4s, v21.4h, #0
+; CHECK-GI-NEXT: sshll v23.4s, v23.4h, #0
+; CHECK-GI-NEXT: mla v5.4s, v27.4s, v19.4s
+; CHECK-GI-NEXT: ldr q19, [sp, #144] // 16-byte Folded Reload
+; CHECK-GI-NEXT: add v17.4s, v17.4s, v18.4s
+; CHECK-GI-NEXT: ldr q18, [sp, #176] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mla v3.4s, v28.4s, v20.4s
+; CHECK-GI-NEXT: mla v7.4s, v10.4s, v25.4s
+; CHECK-GI-NEXT: ldp d11, d10, [sp, #256] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mla v1.4s, v8.4s, v24.4s
+; CHECK-GI-NEXT: ldp d9, d8, [sp, #272] // 16-byte Folded Reload
+; CHECK-GI-NEXT: add v18.4s, v18.4s, v19.4s
+; CHECK-GI-NEXT: ldp q20, q19, [sp, #64] // 32-byte Folded Reload
+; CHECK-GI-NEXT: mla v2.4s, v31.4s, v22.4s
+; CHECK-GI-NEXT: mla v4.4s, v29.4s, v21.4s
+; CHECK-GI-NEXT: mla v6.4s, v30.4s, v23.4s
+; CHECK-GI-NEXT: add v1.4s, v7.4s, v1.4s
+; CHECK-GI-NEXT: add v19.4s, v19.4s, v20.4s
+; CHECK-GI-NEXT: ldr q20, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT: add v2.4s, v2.4s, v3.4s
+; CHECK-GI-NEXT: add v16.4s, v20.4s, v16.4s
+; CHECK-GI-NEXT: add v3.4s, v4.4s, v5.4s
+; CHECK-GI-NEXT: add v0.4s, v6.4s, v0.4s
+; CHECK-GI-NEXT: add v4.4s, v17.4s, v18.4s
+; CHECK-GI-NEXT: add v1.4s, v1.4s, v2.4s
+; CHECK-GI-NEXT: add v5.4s, v19.4s, v16.4s
+; CHECK-GI-NEXT: add v0.4s, v3.4s, v0.4s
+; CHECK-GI-NEXT: add v2.4s, v4.4s, v5.4s
+; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT: addv s1, v2.4s
+; CHECK-GI-NEXT: addv s0, v0.4s
+; CHECK-GI-NEXT: fmov w8, s1
+; CHECK-GI-NEXT: fmov w9, s0
+; CHECK-GI-NEXT: add w0, w8, w9
+; CHECK-GI-NEXT: add sp, sp, #304
+; CHECK-GI-NEXT: ret
+entry:
+ %az = zext <64 x i8> %a to <64 x i32>
+ %bz = sext <64 x i8> %b to <64 x i32>
+ %m1 = mul nuw nsw <64 x i32> %az, %bz
+ %r1 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %m1)
+ %cz = zext <64 x i8> %c to <64 x i32>
+ %dz = sext <64 x i8> %d to <64 x i32>
+ %m2 = mul nuw nsw <64 x i32> %cz, %dz
+ %r2 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %m2)
+ %x = add i32 %r1, %r2
+ ret i32 %x
+}
More information about the llvm-commits
mailing list