[llvm] [AArch64][SVE] Add lowering for PARTIAL_REDUCE_U/SMLA to USDOT (PR #131327)
Nicholas Guy via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 23 06:16:02 PDT 2025
https://github.com/NickGuy-Arm updated https://github.com/llvm/llvm-project/pull/131327
>From 94bf3481b6114ca2ff9790c9f8cf839a5994b88e Mon Sep 17 00:00:00 2001
From: James Chesterman <james.chesterman at arm.com>
Date: Fri, 28 Feb 2025 17:31:08 +0000
Subject: [PATCH] [AArch64][SVE] Add lowering for PARTIAL_REDUCE_U/SMLA to
USDOT
Add lowering for PARTIAL_REDUCE_U/SMLA nodes to USDOT instructions.
This applies when the second operand of the ISD node is a MUL whose
two operands are both extends, and those extends have different
signedness (one zero-extended, the other sign-extended).
---
.../CodeGen/SelectionDAG/LegalizeTypes.cpp | 15 +-
.../Target/AArch64/AArch64ISelLowering.cpp | 81 +++++++++
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 1 +
.../AArch64/sve-partial-reduce-dot-product.ll | 158 ++----------------
4 files changed, 109 insertions(+), 146 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index 83fade45d1892..1af60d6896e6d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -924,8 +924,19 @@ SDValue DAGTypeLegalizer::CreateStackStoreLoad(SDValue Op,
/// illegal ResNo in that case.
bool DAGTypeLegalizer::CustomLowerNode(SDNode *N, EVT VT, bool LegalizeResult) {
// See if the target wants to custom lower this node.
- if (TLI.getOperationAction(N->getOpcode(), VT) != TargetLowering::Custom)
- return false;
+ unsigned Opcode = N->getOpcode();
+ bool IsPRMLAOpcode =
+ Opcode == ISD::PARTIAL_REDUCE_UMLA || Opcode == ISD::PARTIAL_REDUCE_SMLA;
+
+ if (IsPRMLAOpcode) {
+ if (TLI.getPartialReduceMLAAction(N->getValueType(0),
+ N->getOperand(1).getValueType()) !=
+ TargetLowering::Custom)
+ return false;
+ } else {
+ if (TLI.getOperationAction(Opcode, VT) != TargetLowering::Custom)
+ return false;
+ }
SmallVector<SDValue, 8> Results;
if (LegalizeResult)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 447794cc2b744..66ab66063614c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7756,6 +7756,9 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerFLDEXP(Op, DAG);
case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
return LowerVECTOR_HISTOGRAM(Op, DAG);
+ case ISD::PARTIAL_REDUCE_UMLA:
+ case ISD::PARTIAL_REDUCE_SMLA:
+ return LowerPARTIAL_REDUCE_MLA(Op, DAG);
}
}
@@ -27560,6 +27563,10 @@ void AArch64TargetLowering::ReplaceNodeResults(
if (SDValue Res = LowerVECTOR_COMPRESS(SDValue(N, 0), DAG))
Results.push_back(Res);
return;
+ case ISD::PARTIAL_REDUCE_UMLA:
+ case ISD::PARTIAL_REDUCE_SMLA:
+ Results.push_back(LowerPARTIAL_REDUCE_MLA(SDValue(N, 0), DAG));
+ return;
case ISD::ADD:
case ISD::FADD:
ReplaceAddWithADDP(N, Results, DAG, Subtarget);
@@ -29506,6 +29513,80 @@ SDValue AArch64TargetLowering::LowerVECTOR_HISTOGRAM(SDValue Op,
return Scatter;
}
+// Lower PARTIAL_REDUCE_*MLA(Acc, MUL(ZEXT(MulOpLHS), SEXT(MulOpRHS)), Splat 1)
+// to USDOT(Acc, MulOpLHS, MulOpRHS), and the mirrored form
+// PARTIAL_REDUCE_*MLA(Acc, MUL(SEXT(MulOpLHS), ZEXT(MulOpRHS)), Splat 1)
+// to USDOT(Acc, MulOpRHS, MulOpLHS). Returns SDValue() on every bail-out path.
+SDValue
+AArch64TargetLowering::LowerPARTIAL_REDUCE_MLA(SDValue Op,
+ SelectionDAG &DAG) const {
+ bool Scalable = Op.getValueType().isScalableVector();
+ auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
+ if (Scalable && !Subtarget.isSVEorStreamingSVEAvailable())
+ return SDValue();
+ if (!Scalable && (!Subtarget.isNeonAvailable() || !Subtarget.hasDotProd()))
+ return SDValue();
+ if (!Subtarget.hasMatMulInt8())
+ return SDValue();
+ SDLoc DL(Op);
+
+ if (Op.getOperand(1).getOpcode() != ISD::MUL)
+ return SDValue();
+
+ SDValue Acc = Op.getOperand(0);
+ SDValue Mul = Op.getOperand(1);
+
+ APInt ConstantOne;
+ if (!ISD::isConstantSplatVector(Op.getOperand(2).getNode(), ConstantOne) ||
+ !ConstantOne.isOne()) // The third operand must be a constant splat of 1.
+ return SDValue();
+
+ SDValue ExtMulOpLHS = Mul.getOperand(0);
+ SDValue ExtMulOpRHS = Mul.getOperand(1);
+ unsigned ExtMulOpLHSOpcode = ExtMulOpLHS.getOpcode();
+ unsigned ExtMulOpRHSOpcode = ExtMulOpRHS.getOpcode();
+ if (!ISD::isExtOpcode(ExtMulOpLHSOpcode) ||
+ !ISD::isExtOpcode(ExtMulOpRHSOpcode))
+ return SDValue();
+
+ SDValue MulOpLHS = ExtMulOpLHS.getOperand(0);
+ SDValue MulOpRHS = ExtMulOpRHS.getOperand(0);
+ EVT MulOpLHSVT = MulOpLHS.getValueType();
+ if (MulOpLHSVT != MulOpRHS.getValueType())
+ return SDValue();
+
+ bool LHSIsSigned = ExtMulOpLHSOpcode == ISD::SIGN_EXTEND;
+ bool RHSIsSigned = ExtMulOpRHSOpcode == ISD::SIGN_EXTEND;
+ if (LHSIsSigned == RHSIsSigned) // Same signedness: not a USDOT pattern.
+ return SDValue();
+
+ EVT AccVT = Acc.getValueType();
+ // There is no nxv2i64 version of usdot, so scalable accumulators must be
+ if (Scalable && AccVT != MVT::nxv4i32 && AccVT != MVT::nxv4i64)
+ return SDValue();
+
+ // USDOT expects the signed operand last; swap if the LHS is the signed one.
+ if (!RHSIsSigned)
+ std::swap(MulOpLHS, MulOpRHS);
+
+ unsigned Opcode = AArch64ISD::USDOT;
+ // For (nx)v16i8 inputs with an (nx)v4i64 accumulator, emit an i32 USDOT on a
+ // zero accumulator, sign-extend that result to i64 and add it to Acc. This is
+ // done here rather than by splitting: there is no nxv2i64 version of usdot.
+ if ((AccVT == MVT::nxv4i64 && MulOpLHSVT == MVT::nxv16i8) ||
+ (AccVT == MVT::v4i64 && MulOpLHSVT == MVT::v16i8)) {
+ EVT AccVTI32 = (AccVT.isScalableVector()) ? MVT::nxv4i32 : MVT::v4i32;
+
+ SDValue DotI32 =
+ DAG.getNode(Opcode, DL, AccVTI32, DAG.getConstant(0, DL, AccVTI32),
+ MulOpLHS, MulOpRHS);
+ SDValue Extended = DAG.getSExtOrTrunc(DotI32, DL, AccVT);
+ return DAG.getNode(ISD::ADD, DL, AccVT, Acc, Extended);
+ }
+
+ return DAG.getNode(Opcode, DL, AccVT, Acc, MulOpLHS, MulOpRHS);
+}
+
SDValue
AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
SelectionDAG &DAG) const {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index d9b535b910b80..9d8d1c22258be 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1181,6 +1181,7 @@ class AArch64TargetLowering : public TargetLowering {
SDValue LowerVECTOR_DEINTERLEAVE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_INTERLEAVE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_HISTOGRAM(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerPARTIAL_REDUCE_MLA(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDIV(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
index ed27f40aba774..f0c35b191c0a4 100644
--- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
@@ -106,25 +106,7 @@ define <vscale x 4 x i32> @usdot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a,
;
; CHECK-NEWLOWERING-LABEL: usdot:
; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: uunpklo z3.h, z1.b
-; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z2.b
-; CHECK-NEWLOWERING-NEXT: uunpkhi z1.h, z1.b
-; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT: ptrue p0.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z5.s, z3.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z1.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z24.s, z2.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z5.s, z6.s
-; CHECK-NEWLOWERING-NEXT: mul z3.s, z3.s, z4.s
-; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s
-; CHECK-NEWLOWERING-NEXT: movprfx z1, z3
-; CHECK-NEWLOWERING-NEXT: mla z1.s, p0/m, z7.s, z24.s
-; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s
+; CHECK-NEWLOWERING-NEXT: usdot z0.s, z1.b, z2.b
; CHECK-NEWLOWERING-NEXT: ret
entry:
%a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
@@ -165,25 +147,7 @@ define <vscale x 4 x i32> @sudot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a,
;
; CHECK-NEWLOWERING-LABEL: sudot:
; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: sunpklo z3.h, z1.b
-; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z2.b
-; CHECK-NEWLOWERING-NEXT: sunpkhi z1.h, z1.b
-; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT: ptrue p0.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z5.s, z3.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z1.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z24.s, z2.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z5.s, z6.s
-; CHECK-NEWLOWERING-NEXT: mul z3.s, z3.s, z4.s
-; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s
-; CHECK-NEWLOWERING-NEXT: movprfx z1, z3
-; CHECK-NEWLOWERING-NEXT: mla z1.s, p0/m, z7.s, z24.s
-; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s
+; CHECK-NEWLOWERING-NEXT: usdot z0.s, z2.b, z1.b
; CHECK-NEWLOWERING-NEXT: ret
entry:
%a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
@@ -415,59 +379,12 @@ define <vscale x 4 x i64> @usdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i
;
; CHECK-NEWLOWERING-LABEL: usdot_8to64:
; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #-2
-; CHECK-NEWLOWERING-NEXT: str z9, [sp] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16
-; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
-; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z2.b
-; CHECK-NEWLOWERING-NEXT: sunpklo z5.h, z3.b
-; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT: sunpkhi z3.h, z3.b
-; CHECK-NEWLOWERING-NEXT: ptrue p0.d
-; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z5.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z5.s, z5.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z24.s, z2.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z25.s, z3.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z26.d, z6.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z6.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z27.d, z4.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z28.d, z7.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z29.d, z5.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z7.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z5.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z30.d, z24.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z31.d, z2.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z24.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z8.d, z25.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z25.d, z25.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z9.d, z3.s
-; CHECK-NEWLOWERING-NEXT: mul z27.d, z27.d, z29.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z28.d
-; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: mul z4.d, z4.d, z5.d
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z7.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z9.d
-; CHECK-NEWLOWERING-NEXT: movprfx z2, z27
-; CHECK-NEWLOWERING-NEXT: mla z2.d, p0/m, z24.d, z25.d
-; CHECK-NEWLOWERING-NEXT: ldr z9, [sp] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z31.d, z3.d
-; CHECK-NEWLOWERING-NEXT: movprfx z3, z4
-; CHECK-NEWLOWERING-NEXT: mla z3.d, p0/m, z30.d, z8.d
-; CHECK-NEWLOWERING-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
-; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d
-; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #2
-; CHECK-NEWLOWERING-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: mov z4.s, #0 // =0x0
+; CHECK-NEWLOWERING-NEXT: usdot z4.s, z2.b, z3.b
+; CHECK-NEWLOWERING-NEXT: sunpklo z2.d, z4.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z4.s
+; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d
+; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z3.d
; CHECK-NEWLOWERING-NEXT: ret
entry:
%a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
@@ -548,59 +465,12 @@ define <vscale x 4 x i64> @sudot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i
;
; CHECK-NEWLOWERING-LABEL: sudot_8to64:
; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #-2
-; CHECK-NEWLOWERING-NEXT: str z9, [sp] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16
-; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
-; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z2.b
-; CHECK-NEWLOWERING-NEXT: uunpklo z5.h, z3.b
-; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT: uunpkhi z3.h, z3.b
-; CHECK-NEWLOWERING-NEXT: ptrue p0.d
-; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z5.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z5.s, z5.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z24.s, z2.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z25.s, z3.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z26.d, z6.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z6.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z27.d, z4.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z28.d, z7.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z29.d, z5.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z7.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z5.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z30.d, z24.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z31.d, z2.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z24.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z8.d, z25.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z25.d, z25.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z9.d, z3.s
-; CHECK-NEWLOWERING-NEXT: mul z27.d, z27.d, z29.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z28.d
-; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: mul z4.d, z4.d, z5.d
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z7.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z9.d
-; CHECK-NEWLOWERING-NEXT: movprfx z2, z27
-; CHECK-NEWLOWERING-NEXT: mla z2.d, p0/m, z24.d, z25.d
-; CHECK-NEWLOWERING-NEXT: ldr z9, [sp] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z31.d, z3.d
-; CHECK-NEWLOWERING-NEXT: movprfx z3, z4
-; CHECK-NEWLOWERING-NEXT: mla z3.d, p0/m, z30.d, z8.d
-; CHECK-NEWLOWERING-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
-; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d
-; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #2
-; CHECK-NEWLOWERING-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: mov z4.s, #0 // =0x0
+; CHECK-NEWLOWERING-NEXT: usdot z4.s, z3.b, z2.b
+; CHECK-NEWLOWERING-NEXT: sunpklo z2.d, z4.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z4.s
+; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d
+; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z3.d
; CHECK-NEWLOWERING-NEXT: ret
entry:
%a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
More information about the llvm-commits
mailing list