[llvm] [AArch64] Push mul into extend operands (PR #94960)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 10 03:58:25 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: David Green (davemgreen)
<details>
<summary>Changes</summary>
In a similar way to how we push vector adds into extends, this pushes 'mul(zext,zext)' into 'zext(mul(zext,zext))' if the extend can be done in two or more steps.
https://alive2.llvm.org/ce/z/WjU7Kr
---
Patch is 79.00 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/94960.diff
5 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+44-36)
- (modified) llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll (+41-67)
- (modified) llvm/test/CodeGen/AArch64/neon-dotreduce.ll (+531-591)
- (modified) llvm/test/CodeGen/AArch64/neon-extmul.ll (+40-66)
- (modified) llvm/test/CodeGen/AArch64/vecreduce-add.ll (+84-132)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 48bf648b00522..87d737d7ffe61 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17720,6 +17720,47 @@ static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) {
return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
}
+// Transform vector add/sub/mul(ext i8 to i32, ext i8 to i32)
+// into ext(op(ext(i8 to i16), ext(i8 to i16)) to i32). The outer ext is a
+// sext for add/sub; a mul needs matching operand exts and reuses that kind.
+// This allows narrower saddl/uaddl/smull/umull and fewer extends.
+static SDValue performVectorExtCombine(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
+ (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
+ N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
+ (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
+ N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
+ N->getOperand(0).getOperand(0).getValueType() !=
+ N->getOperand(1).getOperand(0).getValueType())
+ return SDValue();
+
+ if (N->getOpcode() == ISD::MUL &&
+ N->getOperand(0).getOpcode() != N->getOperand(1).getOpcode())
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0).getOperand(0);
+ SDValue N1 = N->getOperand(1).getOperand(0);
+ EVT InVT = N0.getValueType();
+
+ EVT S1 = InVT.getScalarType();
+ EVT S2 = VT.getScalarType();
+ if ((S2 == MVT::i32 && S1 == MVT::i8) ||
+ (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
+ SDLoc DL(N);
+ EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
+ S2.getHalfSizedIntegerVT(*DAG.getContext()),
+ VT.getVectorElementCount());
+ SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
+ SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
+ SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
+ return DAG.getNode(N->getOpcode() == ISD::MUL ? N->getOperand(0).getOpcode()
+ : (unsigned)ISD::SIGN_EXTEND,
+ DL, VT, NewOp);
+ }
+ return SDValue();
+}
+
static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
@@ -17728,6 +17769,8 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
return Ext;
if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG))
return Ext;
+ if (SDValue Ext = performVectorExtCombine(N, DAG))
+ return Ext;
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -19604,41 +19647,6 @@ static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) {
return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
}
-// Transform vector add(zext i8 to i32, zext i8 to i32)
-// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
-// This allows extra uses of saddl/uaddl at the lower vector widths, and less
-// extends.
-static SDValue performVectorAddSubExtCombine(SDNode *N, SelectionDAG &DAG) {
- EVT VT = N->getValueType(0);
- if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
- (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
- N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
- (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
- N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
- N->getOperand(0).getOperand(0).getValueType() !=
- N->getOperand(1).getOperand(0).getValueType())
- return SDValue();
-
- SDValue N0 = N->getOperand(0).getOperand(0);
- SDValue N1 = N->getOperand(1).getOperand(0);
- EVT InVT = N0.getValueType();
-
- EVT S1 = InVT.getScalarType();
- EVT S2 = VT.getScalarType();
- if ((S2 == MVT::i32 && S1 == MVT::i8) ||
- (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
- SDLoc DL(N);
- EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
- S2.getHalfSizedIntegerVT(*DAG.getContext()),
- VT.getVectorElementCount());
- SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
- SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
- SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
- return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NewOp);
- }
- return SDValue();
-}
-
static SDValue performBuildVectorCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
@@ -20260,7 +20268,7 @@ static SDValue performAddSubCombine(SDNode *N,
return Val;
if (SDValue Val = performNegCSelCombine(N, DCI.DAG))
return Val;
- if (SDValue Val = performVectorAddSubExtCombine(N, DCI.DAG))
+ if (SDValue Val = performVectorExtCombine(N, DCI.DAG))
return Val;
if (SDValue Val = performAddCombineForShiftedOperands(N, DCI.DAG))
return Val;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll b/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll
index 4c0d1efb99498..410c2d9021d6d 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll
@@ -28,14 +28,12 @@ entry:
define <16 x i32> @mul_i32(<16 x i8> %a, <16 x i8> %b) {
; CHECK-SD-LABEL: mul_i32:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ushll v2.8h, v0.8b, #0
-; CHECK-SD-NEXT: ushll v4.8h, v1.8b, #0
-; CHECK-SD-NEXT: ushll2 v5.8h, v0.16b, #0
-; CHECK-SD-NEXT: ushll2 v6.8h, v1.16b, #0
-; CHECK-SD-NEXT: umull v0.4s, v2.4h, v4.4h
-; CHECK-SD-NEXT: umull2 v1.4s, v2.8h, v4.8h
-; CHECK-SD-NEXT: umull2 v3.4s, v5.8h, v6.8h
-; CHECK-SD-NEXT: umull v2.4s, v5.4h, v6.4h
+; CHECK-SD-NEXT: umull v2.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT: umull2 v4.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT: ushll v0.4s, v2.4h, #0
+; CHECK-SD-NEXT: ushll2 v3.4s, v4.8h, #0
+; CHECK-SD-NEXT: ushll2 v1.4s, v2.8h, #0
+; CHECK-SD-NEXT: ushll v2.4s, v4.4h, #0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: mul_i32:
@@ -59,26 +57,20 @@ entry:
define <16 x i64> @mul_i64(<16 x i8> %a, <16 x i8> %b) {
; CHECK-SD-LABEL: mul_i64:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ushll v2.8h, v0.8b, #0
-; CHECK-SD-NEXT: ushll2 v0.8h, v0.16b, #0
-; CHECK-SD-NEXT: ushll v3.8h, v1.8b, #0
-; CHECK-SD-NEXT: ushll2 v1.8h, v1.16b, #0
-; CHECK-SD-NEXT: ushll v4.4s, v2.4h, #0
-; CHECK-SD-NEXT: ushll v5.4s, v0.4h, #0
-; CHECK-SD-NEXT: ushll v6.4s, v3.4h, #0
+; CHECK-SD-NEXT: umull v2.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT: umull2 v0.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT: ushll v3.4s, v2.4h, #0
; CHECK-SD-NEXT: ushll2 v2.4s, v2.8h, #0
-; CHECK-SD-NEXT: ushll v16.4s, v1.4h, #0
-; CHECK-SD-NEXT: ushll2 v7.4s, v3.8h, #0
-; CHECK-SD-NEXT: ushll2 v17.4s, v0.8h, #0
-; CHECK-SD-NEXT: ushll2 v18.4s, v1.8h, #0
-; CHECK-SD-NEXT: umull2 v1.2d, v4.4s, v6.4s
-; CHECK-SD-NEXT: umull v0.2d, v4.2s, v6.2s
-; CHECK-SD-NEXT: umull2 v3.2d, v2.4s, v7.4s
-; CHECK-SD-NEXT: umull v2.2d, v2.2s, v7.2s
-; CHECK-SD-NEXT: umull v4.2d, v5.2s, v16.2s
-; CHECK-SD-NEXT: umull2 v7.2d, v17.4s, v18.4s
-; CHECK-SD-NEXT: umull2 v5.2d, v5.4s, v16.4s
-; CHECK-SD-NEXT: umull v6.2d, v17.2s, v18.2s
+; CHECK-SD-NEXT: ushll v5.4s, v0.4h, #0
+; CHECK-SD-NEXT: ushll2 v6.4s, v0.8h, #0
+; CHECK-SD-NEXT: ushll2 v1.2d, v3.4s, #0
+; CHECK-SD-NEXT: ushll v0.2d, v3.2s, #0
+; CHECK-SD-NEXT: ushll2 v3.2d, v2.4s, #0
+; CHECK-SD-NEXT: ushll v2.2d, v2.2s, #0
+; CHECK-SD-NEXT: ushll v4.2d, v5.2s, #0
+; CHECK-SD-NEXT: ushll2 v7.2d, v6.4s, #0
+; CHECK-SD-NEXT: ushll2 v5.2d, v5.4s, #0
+; CHECK-SD-NEXT: ushll v6.2d, v6.2s, #0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: mul_i64:
@@ -139,17 +131,12 @@ entry:
define <16 x i32> @mla_i32(<16 x i8> %a, <16 x i8> %b, <16 x i32> %c) {
; CHECK-SD-LABEL: mla_i32:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ushll v6.8h, v0.8b, #0
-; CHECK-SD-NEXT: ushll v7.8h, v1.8b, #0
-; CHECK-SD-NEXT: ushll2 v0.8h, v0.16b, #0
-; CHECK-SD-NEXT: ushll2 v1.8h, v1.16b, #0
-; CHECK-SD-NEXT: umlal v2.4s, v6.4h, v7.4h
-; CHECK-SD-NEXT: umlal2 v3.4s, v6.8h, v7.8h
-; CHECK-SD-NEXT: umlal2 v5.4s, v0.8h, v1.8h
-; CHECK-SD-NEXT: umlal v4.4s, v0.4h, v1.4h
-; CHECK-SD-NEXT: mov v0.16b, v2.16b
-; CHECK-SD-NEXT: mov v1.16b, v3.16b
-; CHECK-SD-NEXT: mov v2.16b, v4.16b
+; CHECK-SD-NEXT: umull2 v7.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT: umull v6.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT: uaddw2 v5.4s, v5.4s, v7.8h
+; CHECK-SD-NEXT: uaddw v0.4s, v2.4s, v6.4h
+; CHECK-SD-NEXT: uaddw2 v1.4s, v3.4s, v6.8h
+; CHECK-SD-NEXT: uaddw v2.4s, v4.4s, v7.4h
; CHECK-SD-NEXT: mov v3.16b, v5.16b
; CHECK-SD-NEXT: ret
;
@@ -179,35 +166,22 @@ entry:
define <16 x i64> @mla_i64(<16 x i8> %a, <16 x i8> %b, <16 x i64> %c) {
; CHECK-SD-LABEL: mla_i64:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mov v17.16b, v7.16b
-; CHECK-SD-NEXT: mov v16.16b, v6.16b
-; CHECK-SD-NEXT: ushll v6.8h, v0.8b, #0
-; CHECK-SD-NEXT: ushll2 v0.8h, v0.16b, #0
-; CHECK-SD-NEXT: ushll v7.8h, v1.8b, #0
-; CHECK-SD-NEXT: ushll2 v1.8h, v1.16b, #0
-; CHECK-SD-NEXT: ushll v18.4s, v6.4h, #0
-; CHECK-SD-NEXT: ushll2 v21.4s, v6.8h, #0
-; CHECK-SD-NEXT: ushll v19.4s, v0.4h, #0
-; CHECK-SD-NEXT: ushll v20.4s, v7.4h, #0
-; CHECK-SD-NEXT: ushll v22.4s, v1.4h, #0
-; CHECK-SD-NEXT: ushll2 v23.4s, v7.8h, #0
-; CHECK-SD-NEXT: ldp q6, q7, [sp]
-; CHECK-SD-NEXT: ushll2 v0.4s, v0.8h, #0
-; CHECK-SD-NEXT: ushll2 v1.4s, v1.8h, #0
-; CHECK-SD-NEXT: umlal2 v3.2d, v18.4s, v20.4s
-; CHECK-SD-NEXT: umlal v2.2d, v18.2s, v20.2s
-; CHECK-SD-NEXT: umlal v16.2d, v19.2s, v22.2s
-; CHECK-SD-NEXT: umlal2 v5.2d, v21.4s, v23.4s
-; CHECK-SD-NEXT: umlal v4.2d, v21.2s, v23.2s
-; CHECK-SD-NEXT: umlal2 v17.2d, v19.4s, v22.4s
-; CHECK-SD-NEXT: umlal2 v7.2d, v0.4s, v1.4s
-; CHECK-SD-NEXT: umlal v6.2d, v0.2s, v1.2s
-; CHECK-SD-NEXT: mov v0.16b, v2.16b
-; CHECK-SD-NEXT: mov v1.16b, v3.16b
-; CHECK-SD-NEXT: mov v2.16b, v4.16b
-; CHECK-SD-NEXT: mov v3.16b, v5.16b
-; CHECK-SD-NEXT: mov v4.16b, v16.16b
-; CHECK-SD-NEXT: mov v5.16b, v17.16b
+; CHECK-SD-NEXT: umull v16.8h, v0.8b, v1.8b
+; CHECK-SD-NEXT: umull2 v0.8h, v0.16b, v1.16b
+; CHECK-SD-NEXT: ldp q20, q21, [sp]
+; CHECK-SD-NEXT: ushll v17.4s, v16.4h, #0
+; CHECK-SD-NEXT: ushll2 v16.4s, v16.8h, #0
+; CHECK-SD-NEXT: ushll2 v19.4s, v0.8h, #0
+; CHECK-SD-NEXT: ushll v18.4s, v0.4h, #0
+; CHECK-SD-NEXT: uaddw2 v1.2d, v3.2d, v17.4s
+; CHECK-SD-NEXT: uaddw v0.2d, v2.2d, v17.2s
+; CHECK-SD-NEXT: uaddw2 v3.2d, v5.2d, v16.4s
+; CHECK-SD-NEXT: uaddw v2.2d, v4.2d, v16.2s
+; CHECK-SD-NEXT: uaddw2 v16.2d, v21.2d, v19.4s
+; CHECK-SD-NEXT: uaddw v4.2d, v6.2d, v18.2s
+; CHECK-SD-NEXT: uaddw2 v5.2d, v7.2d, v18.4s
+; CHECK-SD-NEXT: uaddw v6.2d, v20.2d, v19.2s
+; CHECK-SD-NEXT: mov v7.16b, v16.16b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: mla_i64:
diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
index 40b8a47f92aa7..33245a2b120ea 100644
--- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
+++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
@@ -132,13 +132,12 @@ define i32 @test_udot_v5i8(ptr nocapture readonly %a, ptr nocapture readonly %b,
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: movi v3.2d, #0000000000000000
-; CHECK-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-NEXT: umull2 v2.4s, v1.8h, v0.8h
-; CHECK-NEXT: mov v3.s[0], v2.s[0]
-; CHECK-NEXT: umlal v3.4s, v1.4h, v0.4h
-; CHECK-NEXT: addv s0, v3.4s
+; CHECK-NEXT: umull v0.8h, v1.8b, v0.8b
+; CHECK-NEXT: movi v1.2d, #0000000000000000
+; CHECK-NEXT: ushll2 v2.4s, v0.8h, #0
+; CHECK-NEXT: mov v1.s[0], v2.s[0]
+; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h
+; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: add w0, w8, w2
; CHECK-NEXT: ret
@@ -176,13 +175,12 @@ define i32 @test_sdot_v5i8(ptr nocapture readonly %a, ptr nocapture readonly %b,
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ldr d1, [x1]
-; CHECK-NEXT: movi v3.2d, #0000000000000000
-; CHECK-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: sshll v1.8h, v1.8b, #0
-; CHECK-NEXT: smull2 v2.4s, v1.8h, v0.8h
-; CHECK-NEXT: mov v3.s[0], v2.s[0]
-; CHECK-NEXT: smlal v3.4s, v1.4h, v0.4h
-; CHECK-NEXT: addv s0, v3.4s
+; CHECK-NEXT: smull v0.8h, v1.8b, v0.8b
+; CHECK-NEXT: movi v1.2d, #0000000000000000
+; CHECK-NEXT: sshll2 v2.4s, v0.8h, #0
+; CHECK-NEXT: mov v1.s[0], v2.s[0]
+; CHECK-NEXT: saddw v0.4s, v1.4s, v0.4h
+; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: add w0, w8, w2
; CHECK-NEXT: ret
@@ -200,19 +198,17 @@ entry:
define i32 @test_sdot_v5i8_double(<5 x i8> %a, <5 x i8> %b, <5 x i8> %c, <5 x i8> %d) {
; CHECK-LABEL: test_sdot_v5i8_double:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sshll v1.8h, v1.8b, #0
-; CHECK-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: sshll v2.8h, v2.8b, #0
-; CHECK-NEXT: sshll v3.8h, v3.8b, #0
-; CHECK-NEXT: movi v5.2d, #0000000000000000
-; CHECK-NEXT: movi v6.2d, #0000000000000000
-; CHECK-NEXT: smull2 v4.4s, v0.8h, v1.8h
-; CHECK-NEXT: smull2 v7.4s, v2.8h, v3.8h
-; CHECK-NEXT: mov v6.s[0], v4.s[0]
-; CHECK-NEXT: mov v5.s[0], v7.s[0]
-; CHECK-NEXT: smlal v6.4s, v0.4h, v1.4h
-; CHECK-NEXT: smlal v5.4s, v2.4h, v3.4h
-; CHECK-NEXT: add v0.4s, v6.4s, v5.4s
+; CHECK-NEXT: smull v2.8h, v2.8b, v3.8b
+; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b
+; CHECK-NEXT: movi v1.2d, #0000000000000000
+; CHECK-NEXT: movi v3.2d, #0000000000000000
+; CHECK-NEXT: sshll2 v4.4s, v0.8h, #0
+; CHECK-NEXT: sshll2 v5.4s, v2.8h, #0
+; CHECK-NEXT: mov v3.s[0], v4.s[0]
+; CHECK-NEXT: mov v1.s[0], v5.s[0]
+; CHECK-NEXT: saddw v0.4s, v3.4s, v0.4h
+; CHECK-NEXT: saddw v1.4s, v1.4s, v2.4h
+; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
@@ -998,27 +994,21 @@ entry:
define i32 @test_udot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
; CHECK-LABEL: test_udot_v25i8:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldp q2, q0, [x0]
-; CHECK-NEXT: ldp q5, q1, [x1]
-; CHECK-NEXT: ushll2 v3.8h, v0.16b, #0
-; CHECK-NEXT: ushll v6.8h, v2.8b, #0
-; CHECK-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-NEXT: ushll2 v4.8h, v1.16b, #0
-; CHECK-NEXT: ushll v7.8h, v5.8b, #0
-; CHECK-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-NEXT: ushll2 v2.8h, v2.16b, #0
-; CHECK-NEXT: umull v3.4s, v4.4h, v3.4h
-; CHECK-NEXT: movi v4.2d, #0000000000000000
-; CHECK-NEXT: umull2 v16.4s, v7.8h, v6.8h
-; CHECK-NEXT: umull v6.4s, v7.4h, v6.4h
-; CHECK-NEXT: mov v4.s[0], v3.s[0]
-; CHECK-NEXT: ushll2 v3.8h, v5.16b, #0
-; CHECK-NEXT: umlal2 v16.4s, v1.8h, v0.8h
-; CHECK-NEXT: umlal v6.4s, v1.4h, v0.4h
-; CHECK-NEXT: umlal v4.4s, v3.4h, v2.4h
-; CHECK-NEXT: umlal2 v16.4s, v3.8h, v2.8h
-; CHECK-NEXT: add v0.4s, v6.4s, v4.4s
-; CHECK-NEXT: add v0.4s, v0.4s, v16.4s
+; CHECK-NEXT: ldp q3, q0, [x1]
+; CHECK-NEXT: movi v5.2d, #0000000000000000
+; CHECK-NEXT: ldp q2, q1, [x0]
+; CHECK-NEXT: umull2 v4.8h, v0.16b, v1.16b
+; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
+; CHECK-NEXT: umull v1.8h, v3.8b, v2.8b
+; CHECK-NEXT: umull2 v2.8h, v3.16b, v2.16b
+; CHECK-NEXT: ushll v3.4s, v4.4h, #0
+; CHECK-NEXT: uaddl2 v4.4s, v1.8h, v0.8h
+; CHECK-NEXT: uaddl v0.4s, v1.4h, v0.4h
+; CHECK-NEXT: mov v5.s[0], v3.s[0]
+; CHECK-NEXT: uaddw2 v1.4s, v4.4s, v2.8h
+; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: uaddw v2.4s, v5.4s, v2.4h
+; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: add w0, w8, w2
@@ -1063,27 +1053,21 @@ entry:
define i32 @test_sdot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
; CHECK-LABEL: test_sdot_v25i8:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldp q2, q0, [x0]
-; CHECK-NEXT: ldp q5, q1, [x1]
-; CHECK-NEXT: sshll2 v3.8h, v0.16b, #0
-; CHECK-NEXT: sshll v6.8h, v2.8b, #0
-; CHECK-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: sshll2 v4.8h, v1.16b, #0
-; CHECK-NEXT: sshll v7.8h, v5.8b, #0
-; CHECK-NEXT: sshll v1.8h, v1.8b, #0
-; CHECK-NEXT: sshll2 v2.8h, v2.16b, #0
-; CHECK-NEXT: smull v3.4s, v4.4h, v3.4h
-; CHECK-NEXT: movi v4.2d, #0000000000000000
-; CHECK-NEXT: smull2 v16.4s, v7.8h, v6.8h
-; CHECK-NEXT: smull v6.4s, v7.4h, v6.4h
-; CHECK-NEXT: mov v4.s[0], v3.s[0]
-; CHECK-NEXT: sshll2 v3.8h, v5.16b, #0
-; CHECK-NEXT: smlal2 v16.4s, v1.8h, v0.8h
-; CHECK-NEXT: smlal v6.4s, v1.4h, v0.4h
-; CHECK-NEXT: smlal v4.4s, v3.4h, v2.4h
-; CHECK-NEXT: smlal2 v16.4s, v3.8h, v2.8h
-; CHECK-NEXT: add v0.4s, v6.4s, v4.4s
-; CHECK-NEXT: add v0.4s, v0.4s, v16.4s
+; CHECK-NEXT: ldp q3, q0, [x1]
+; CHECK-NEXT: movi v5.2d, #0000000000000000
+; CHECK-NEXT: ldp q2, q1, [x0]
+; CHECK-NEXT: smull2 v4.8h, v0.16b, v1.16b
+; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b
+; CHECK-NEXT: smull v1.8h, v3.8b, v2.8b
+; CHECK-NEXT: smull2 v2.8h, v3.16b, v2.16b
+; CHECK-NEXT: sshll v3.4s, v4.4h, #0
+; CHECK-NEXT: saddl2 v4.4s, v1.8h, v0.8h
+; CHECK-NEXT: saddl v0.4s, v1.4h, v0.4h
+; CHECK-NEXT: mov v5.s[0], v3.s[0]
+; CHECK-NEXT: saddw2 v1.4s, v4.4s, v2.8h
+; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: saddw v2.4s, v5.4s, v2.4h
+; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: add w0, w8, w2
@@ -1105,222 +1089,210 @@ define i32 @test_sdot_v25i8_double(<25 x i8> %a, <25 x i8> %b, <25 x i8> %c, <25
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: ldr b1, [sp, #16]
-; CHECK-NEXT: ldr b0, [sp, #80]
-; CHECK-NEXT: add x11, sp, #24
-; CHECK-NEXT: ldr b3, [sp, #216]
-; CHECK-NEXT: add x10, sp, #88
-; CHECK-NEXT: ldr b2, [sp, #280]
-; CHECK-NEXT: ld1 { v1.b }[1], [x11]
-; CHECK-NEXT: add x11, sp, #224
-; CHECK-NEXT: ldr b4, [sp, #152]
-; CHECK-NEXT: ldr b6, [sp, #480]
-; CHECK-NEXT: ld1 { v0.b }[1], [x10]
-; CHECK-NEXT: add x10, sp, #288
-; CHECK-NEXT: add x12, sp, #160
-; CHECK-NEXT: ld1 { v3.b }[1], [x11]
-; CHECK-NEXT: add x11, sp, #488
-; CHECK-NEXT: ld1 { v2.b }[1], [x10]
-; CHECK-NEXT: ld1 { v4.b }[1], [x12]
-; CHECK-NEXT: ld1 { v6.b }[1], [x11]
-; CHECK-NEXT: add x11, sp, #32
-; CHECK-NEXT: add x9, sp, #96
-; CHECK-NEXT: add x8, sp, #104
-; CHECK-NEXT: ld1 { v1.b }[2], [x11]
-; CHECK-NEXT: add x11, sp, #232
-; CHECK-NEXT: ld1 { v0.b }[2], [x9]
+; CHECK-NEXT: ldr b0, [sp, #280]
+; CHECK-NEXT: add x8, sp, #288
+; CHECK-NEXT: ldr b1, [sp, #80]
+; CHECK-NEXT: ldr b2, [sp, #152]
; CHECK-NEXT: add x9, sp, #296
-; CHECK-NEXT: ld1 { v3.b }[2], [x11]
-; CHECK-NEXT: add x11, sp, #168
+; CHECK-NEXT: ldr b4, [sp, #216]
+; CHECK-NEXT: ld1 { v0.b }[1], [x8]
+; CHECK-NEXT: add x8, sp, #88
+; CHECK-NEXT: add x10, sp, #320
+; CHECK-NEXT: ld1 { v1.b }[1], [x8]
+; CHECK-NEXT: add x8, sp, #160
+; CHECK-NEXT: add x12, sp, #192
+; CHECK-NEXT: ld1 { v2.b }[1], [x8]
+; CHECK-NEXT: add x8, sp, #304
+; CHECK-NEXT: add x11, sp, #328
+; CHECK-NEXT: ld1 { v0.b }[2], [x9]
+; CHECK-NEXT: add x9, sp, #96
+; CHECK-NEXT: ldr b5, [sp, #16]
+; CHECK-NEXT: ld1 { v1.b }[2], [x9]
+; CHECK-NEXT: add x9, sp, #168
+; CHECK-NEXT: ldr b6, [sp, #680]
; CHECK-NEXT: ld1 { v2.b }[2], [x9]
-; CHECK-NEXT: ld1 { v4.b }[2], [x11]
-; CHECK-NEXT: add x11, sp, #40
-; CHECK-NEXT: ld1 { v1.b }[3], [x11]
+; CHECK-NEXT: add x9, sp, #104
+; CHECK-NEXT: ldr b7, [sp, #480]
; CHECK-NEXT: ld1 { v0.b }[3], [x8]
-; CHECK-NEXT: add x8, sp, #304
-; CHECK-NEXT: add ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/94960
More information about the llvm-commits
mailing list