[llvm] 1af3f59 - [DAG] Fold Op(vecreduce(a), vecreduce(b)) into vecreduce(Op(a,b))
David Green via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 8 03:43:41 PST 2023
Author: David Green
Date: 2023-02-08T11:43:36Z
New Revision: 1af3f596f6c6b213cec9b3acd7099f8c4f11d0d0
URL: https://github.com/llvm/llvm-project/commit/1af3f596f6c6b213cec9b3acd7099f8c4f11d0d0
DIFF: https://github.com/llvm/llvm-project/commit/1af3f596f6c6b213cec9b3acd7099f8c4f11d0d0.diff
LOG: [DAG] Fold Op(vecreduce(a), vecreduce(b)) into vecreduce(Op(a,b))
So long as the operation is reassociative, we can reassociate a double
vecreduce such as fadd(vecreduce(a), vecreduce(b)) into
vecreduce(fadd(a,b)). This will in general save a few instructions, but
some architectures (MVE) require the opposite fold, so a
shouldReassociateReduction target hook is added to account for it.
Targets that override shouldReassociateReduction opt out of the fold, as
MVE does for VECREDUCE_ADD.
Differential Revision: https://reviews.llvm.org/D141870
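To illustrate the fold, here is a minimal IR sketch (the function name and
values are illustrative, not taken from the patch) of the integer-add case.
The combine requires both reductions to operate on the same vector type and
each reduction to have a single use:

define i32 @two_reductions(<4 x i32> %a, <4 x i32> %b) {
  ; Before the combine: two reductions feeding a scalar add.
  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
  %r = add i32 %r1, %r2
  ret i32 %r
}
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)

; After the combine the generated code is equivalent to a single reduction:
;   %v = add <4 x i32> %a, %b
;   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)

Note that the combine itself runs on SelectionDAG nodes (an ISD::ADD of two
ISD::VECREDUCE_ADD nodes), so the IR above is only meant to show the shape of
the input and of the resulting code.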
Added:
Modified:
llvm/include/llvm/CodeGen/TargetLowering.h
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/lib/Target/ARM/ARMISelLowering.h
llvm/test/CodeGen/AArch64/aarch64-addv.ll
llvm/test/CodeGen/AArch64/double_reduct.ll
llvm/test/CodeGen/AArch64/sve-doublereduct.ll
llvm/test/CodeGen/AArch64/sve-fp-reduce.ll
llvm/test/CodeGen/AArch64/vecreduce-add.ll
llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
llvm/test/CodeGen/RISCV/double_reduct.ll
llvm/test/CodeGen/Thumb2/mve-doublereduct.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index e26540a684c1..2046f4b8a085 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -444,6 +444,12 @@ class TargetLoweringBase {
return true;
}
+ // Return true if op(vecreduce(x), vecreduce(y)) should be reassociated to
+ // vecreduce(op(x, y)) for the reduction opcode RedOpc.
+ virtual bool shouldReassociateReduction(unsigned RedOpc, EVT VT) const {
+ return true;
+ }
+
/// Return true if it is profitable to convert a select of FP constants into
/// a constant pool load whose address depends on the select condition. The
/// parameter may be used to differentiate a select with FP compare from
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b6e0578dba9b..7249c637028f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -550,6 +550,9 @@ namespace {
SDValue N1);
SDValue reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
SDValue N1, SDNodeFlags Flags);
+  SDValue reassociateReduction(unsigned RedOpc, unsigned Opc, const SDLoc &DL,
+ EVT VT, SDValue N0, SDValue N1,
+ SDNodeFlags Flags = SDNodeFlags());
SDValue visitShiftByConstant(SDNode *N);
@@ -1310,6 +1313,25 @@ SDValue DAGCombiner::reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
return SDValue();
}
+// Try to fold Opc(vecreduce(x), vecreduce(y)) -> vecreduce(Opc(x, y))
+// Note that we only expect Flags to be passed from FP operations. For integer
+// operations they need to be dropped.
+SDValue DAGCombiner::reassociateReduction(unsigned RedOpc, unsigned Opc,
+ const SDLoc &DL, EVT VT, SDValue N0,
+ SDValue N1, SDNodeFlags Flags) {
+ if (N0.getOpcode() == RedOpc && N1.getOpcode() == RedOpc &&
+ N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType() &&
+ N0->hasOneUse() && N1->hasOneUse() &&
+ TLI.isOperationLegalOrCustom(Opc, N0.getOperand(0).getValueType()) &&
+ TLI.shouldReassociateReduction(RedOpc, N0.getOperand(0).getValueType())) {
+ SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
+ return DAG.getNode(RedOpc, DL, VT,
+ DAG.getNode(Opc, DL, N0.getOperand(0).getValueType(),
+ N0.getOperand(0), N1.getOperand(0)));
+ }
+ return SDValue();
+}
+
SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
bool AddTo) {
assert(N->getNumValues() == NumTo && "Broken CombineTo call!");
@@ -2650,6 +2672,11 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) {
return Add;
if (SDValue Add = ReassociateAddOr(N1, N0))
return Add;
+
+ // Fold add(vecreduce(x), vecreduce(y)) -> vecreduce(add(x, y))
+ if (SDValue SD =
+ reassociateReduction(ISD::VECREDUCE_ADD, ISD::ADD, DL, VT, N0, N1))
+ return SD;
}
// fold ((0-A) + B) -> B-A
if (N0.getOpcode() == ISD::SUB && isNullOrNullSplat(N0.getOperand(0)))
@@ -4351,6 +4378,11 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
if (SDValue RMUL = reassociateOps(ISD::MUL, DL, N0, N1, N->getFlags()))
return RMUL;
+ // Fold mul(vecreduce(x), vecreduce(y)) -> vecreduce(mul(x, y))
+ if (SDValue SD =
+ reassociateReduction(ISD::VECREDUCE_MUL, ISD::MUL, DL, VT, N0, N1))
+ return SD;
+
// Simplify the operands using demanded-bits information.
if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
@@ -5486,6 +5518,25 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
if (SDValue S = PerformUMinFpToSatCombine(N0, N1, N0, N1, ISD::SETULT, DAG))
return S;
+  // Fold min/max(vecreduce(x), vecreduce(y)) -> vecreduce(min/max(x, y))
+ auto ReductionOpcode = [](unsigned Opcode) {
+ switch (Opcode) {
+ case ISD::SMIN:
+ return ISD::VECREDUCE_SMIN;
+ case ISD::SMAX:
+ return ISD::VECREDUCE_SMAX;
+ case ISD::UMIN:
+ return ISD::VECREDUCE_UMIN;
+ case ISD::UMAX:
+ return ISD::VECREDUCE_UMAX;
+ default:
+ llvm_unreachable("Unexpected opcode");
+ }
+ };
+ if (SDValue SD = reassociateReduction(ReductionOpcode(Opcode), Opcode,
+ SDLoc(N), VT, N0, N1))
+ return SD;
+
// Simplify the operands using demanded-bits information.
if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
@@ -6525,6 +6576,11 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
if (SDValue RAND = reassociateOps(ISD::AND, SDLoc(N), N0, N1, N->getFlags()))
return RAND;
+ // Fold and(vecreduce(x), vecreduce(y)) -> vecreduce(and(x, y))
+ if (SDValue SD = reassociateReduction(ISD::VECREDUCE_AND, ISD::AND, SDLoc(N),
+ VT, N0, N1))
+ return SD;
+
// fold (and (or x, C), D) -> D if (C & D) == D
auto MatchSubset = [](ConstantSDNode *LHS, ConstantSDNode *RHS) {
return RHS->getAPIntValue().isSubsetOf(LHS->getAPIntValue());
@@ -7419,6 +7475,11 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
if (SDValue ROR = reassociateOps(ISD::OR, SDLoc(N), N0, N1, N->getFlags()))
return ROR;
+ // Fold or(vecreduce(x), vecreduce(y)) -> vecreduce(or(x, y))
+ if (SDValue SD = reassociateReduction(ISD::VECREDUCE_OR, ISD::OR, SDLoc(N),
+ VT, N0, N1))
+ return SD;
+
// Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2)
// iff (c1 & c2) != 0 or c1/c2 are undef.
auto MatchIntersect = [](ConstantSDNode *C1, ConstantSDNode *C2) {
@@ -8903,6 +8964,11 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
if (SDValue RXOR = reassociateOps(ISD::XOR, DL, N0, N1, N->getFlags()))
return RXOR;
+ // Fold xor(vecreduce(x), vecreduce(y)) -> vecreduce(xor(x, y))
+ if (SDValue SD =
+ reassociateReduction(ISD::VECREDUCE_XOR, ISD::XOR, DL, VT, N0, N1))
+ return SD;
+
// fold (a^b) -> (a|b) iff a and b share no bits.
if ((!LegalOperations || TLI.isOperationLegal(ISD::OR, VT)) &&
DAG.haveNoCommonBitsSet(N0, N1))
@@ -15621,6 +15687,11 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
DAG.getConstantFP(4.0, DL, VT));
}
}
+
+ // Fold fadd(vecreduce(x), vecreduce(y)) -> vecreduce(fadd(x, y))
+ if (SDValue SD = reassociateReduction(ISD::VECREDUCE_FADD, ISD::FADD, DL,
+ VT, N0, N1, Flags))
+ return SD;
} // enable-unsafe-fp-math
// FADD -> FMA combines:
@@ -15795,6 +15866,11 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {
SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, Two, N1);
return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), MulConsts);
}
+
+ // Fold fmul(vecreduce(x), vecreduce(y)) -> vecreduce(fmul(x, y))
+ if (SDValue SD = reassociateReduction(ISD::VECREDUCE_FMUL, ISD::FMUL, DL,
+ VT, N0, N1, Flags))
+ return SD;
}
// fold (fmul X, 2.0) -> (fadd X, X)
@@ -16845,6 +16921,14 @@ SDValue DAGCombiner::visitFMinMax(SDNode *N) {
}
}
+ const TargetOptions &Options = DAG.getTarget().Options;
+ if ((Options.UnsafeFPMath && Options.NoSignedZerosFPMath) ||
+ (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros()))
+ if (SDValue SD = reassociateReduction(IsMin ? ISD::VECREDUCE_FMIN
+ : ISD::VECREDUCE_FMAX,
+ Opc, SDLoc(N), VT, N0, N1, Flags))
+ return SD;
+
return SDValue();
}
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 86ad9a476796..3bc936b6cce2 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -617,6 +617,10 @@ class VectorType;
return TargetLowering::shouldFormOverflowOp(Opcode, VT, true);
}
+ bool shouldReassociateReduction(unsigned Opc, EVT VT) const override {
+ return Opc != ISD::VECREDUCE_ADD;
+ }
+
/// Returns true if an argument of type Ty needs to be passed in a
/// contiguous block of registers in calling convention CallConv.
bool functionArgumentNeedsConsecutiveRegisters(
diff --git a/llvm/test/CodeGen/AArch64/aarch64-addv.ll b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
index 2b71126ee175..15736933b61f 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-addv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-addv.ll
@@ -102,11 +102,9 @@ define i32 @oversized_ADDV_512(ptr %arr) {
define i8 @addv_combine_i8(<8 x i8> %a1, <8 x i8> %a2) {
; CHECK-LABEL: addv_combine_i8:
; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: add v0.8b, v0.8b, v1.8b
; CHECK-NEXT: addv b0, v0.8b
-; CHECK-NEXT: addv b1, v1.8b
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: fmov w9, s1
-; CHECK-NEXT: add w0, w8, w9
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
%rdx.1 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a1)
@@ -118,11 +116,9 @@ entry:
define i16 @addv_combine_i16(<4 x i16> %a1, <4 x i16> %a2) {
; CHECK-LABEL: addv_combine_i16:
; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: add v0.4h, v0.4h, v1.4h
; CHECK-NEXT: addv h0, v0.4h
-; CHECK-NEXT: addv h1, v1.4h
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: fmov w9, s1
-; CHECK-NEXT: add w0, w8, w9
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
%rdx.1 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a1)
diff --git a/llvm/test/CodeGen/AArch64/double_reduct.ll b/llvm/test/CodeGen/AArch64/double_reduct.ll
index 1fd1eb6fc5dd..78408ae10e0b 100644
--- a/llvm/test/CodeGen/AArch64/double_reduct.ll
+++ b/llvm/test/CodeGen/AArch64/double_reduct.ll
@@ -5,11 +5,9 @@ define float @add_f32(<8 x float> %a, <4 x float> %b) {
; CHECK-LABEL: add_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: faddp v2.4s, v2.4s, v2.4s
+; CHECK-NEXT: fadd v0.4s, v0.4s, v2.4s
; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s
-; CHECK-NEXT: faddp s1, v2.2s
; CHECK-NEXT: faddp s0, v0.2s
-; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: ret
%r1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %a)
%r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
@@ -20,14 +18,11 @@ define float @add_f32(<8 x float> %a, <4 x float> %b) {
define float @fmul_f32(<8 x float> %a, <4 x float> %b) {
; CHECK-LABEL: fmul_f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ext v3.16b, v2.16b, v2.16b, #8
; CHECK-NEXT: fmul v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: fmul v1.2s, v2.2s, v3.2s
-; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: fmul s1, s1, v1.s[1]
-; CHECK-NEXT: fmul v0.2s, v0.2s, v2.2s
+; CHECK-NEXT: fmul v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: fmul v0.2s, v0.2s, v1.2s
; CHECK-NEXT: fmul s0, s0, v0.s[1]
-; CHECK-NEXT: fmul s0, s0, s1
; CHECK-NEXT: ret
%r1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a)
%r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b)
@@ -81,11 +76,10 @@ define i32 @add_i32(<8 x i32> %a, <4 x i32> %b) {
define i16 @add_ext_i16(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: add_ext_i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: uaddlv h0, v0.16b
-; CHECK-NEXT: uaddlv h1, v1.16b
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: fmov w9, s1
-; CHECK-NEXT: add w0, w8, w9
+; CHECK-NEXT: uaddlp v1.8h, v1.16b
+; CHECK-NEXT: uadalp v1.8h, v0.16b
+; CHECK-NEXT: addv h0, v1.8h
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%ae = zext <16 x i8> %a to <16 x i16>
%be = zext <16 x i8> %b to <16 x i16>
@@ -100,12 +94,10 @@ define i16 @add_ext_v32i16(<32 x i8> %a, <16 x i8> %b) {
; CHECK: // %bb.0:
; CHECK-NEXT: uaddl2 v3.8h, v0.16b, v1.16b
; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b
-; CHECK-NEXT: uaddlv h2, v2.16b
; CHECK-NEXT: add v0.8h, v0.8h, v3.8h
-; CHECK-NEXT: fmov w9, s2
+; CHECK-NEXT: uadalp v0.8h, v2.16b
; CHECK-NEXT: addv h0, v0.8h
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: add w0, w8, w9
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%ae = zext <32 x i8> %a to <32 x i16>
%be = zext <16 x i8> %b to <16 x i16>
@@ -118,18 +110,13 @@ define i16 @add_ext_v32i16(<32 x i8> %a, <16 x i8> %b) {
define i32 @mul_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: mul_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ext v3.16b, v2.16b, v2.16b, #8
; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: mul v1.2s, v2.2s, v3.2s
-; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: mov w9, v1.s[1]
-; CHECK-NEXT: fmov w11, s1
-; CHECK-NEXT: mul v0.2s, v0.2s, v2.2s
-; CHECK-NEXT: mul w9, w11, w9
+; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: mul v0.2s, v0.2s, v1.2s
; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w10, s0
-; CHECK-NEXT: mul w8, w10, w8
-; CHECK-NEXT: mul w0, w8, w9
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: mul w0, w9, w8
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.mul.i32.v8i32(<8 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32> %b)
@@ -141,16 +128,11 @@ define i32 @and_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: and_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ext v3.16b, v2.16b, v2.16b, #8
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: and v2.8b, v2.8b, v3.8b
; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: mov w8, v2.s[1]
-; CHECK-NEXT: mov w9, v0.s[1]
-; CHECK-NEXT: fmov w10, s0
-; CHECK-NEXT: fmov w11, s2
-; CHECK-NEXT: and w9, w10, w9
-; CHECK-NEXT: and w8, w11, w8
+; CHECK-NEXT: mov w8, v0.s[1]
+; CHECK-NEXT: fmov w9, s0
; CHECK-NEXT: and w0, w9, w8
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.and.i32.v8i32(<8 x i32> %a)
@@ -163,16 +145,11 @@ define i32 @or_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: or_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ext v3.16b, v2.16b, v2.16b, #8
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: orr v2.8b, v2.8b, v3.8b
; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: mov w8, v2.s[1]
-; CHECK-NEXT: mov w9, v0.s[1]
-; CHECK-NEXT: fmov w10, s0
-; CHECK-NEXT: fmov w11, s2
-; CHECK-NEXT: orr w9, w10, w9
-; CHECK-NEXT: orr w8, w11, w8
+; CHECK-NEXT: mov w8, v0.s[1]
+; CHECK-NEXT: fmov w9, s0
; CHECK-NEXT: orr w0, w9, w8
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.or.i32.v8i32(<8 x i32> %a)
@@ -185,16 +162,11 @@ define i32 @xor_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: xor_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ext v3.16b, v2.16b, v2.16b, #8
+; CHECK-NEXT: eor v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: eor v2.8b, v2.8b, v3.8b
; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: mov w8, v2.s[1]
-; CHECK-NEXT: mov w9, v0.s[1]
-; CHECK-NEXT: fmov w10, s0
-; CHECK-NEXT: fmov w11, s2
-; CHECK-NEXT: eor w9, w10, w9
-; CHECK-NEXT: eor w8, w11, w8
+; CHECK-NEXT: mov w8, v0.s[1]
+; CHECK-NEXT: fmov w9, s0
; CHECK-NEXT: eor w0, w9, w8
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.xor.i32.v8i32(<8 x i32> %a)
@@ -207,12 +179,9 @@ define i32 @umin_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: umin_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: uminv s2, v2.4s
+; CHECK-NEXT: umin v0.4s, v0.4s, v2.4s
; CHECK-NEXT: uminv s0, v0.4s
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: cmp w9, w8
-; CHECK-NEXT: csel w0, w9, w8, lo
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.umin.i32.v8i32(<8 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32> %b)
@@ -224,12 +193,9 @@ define i32 @umax_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: umax_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: umaxv s2, v2.4s
+; CHECK-NEXT: umax v0.4s, v0.4s, v2.4s
; CHECK-NEXT: umaxv s0, v0.4s
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: cmp w9, w8
-; CHECK-NEXT: csel w0, w9, w8, hi
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.umax.i32.v8i32(<8 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32> %b)
@@ -241,12 +207,9 @@ define i32 @smin_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: smin_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: sminv s2, v2.4s
+; CHECK-NEXT: smin v0.4s, v0.4s, v2.4s
; CHECK-NEXT: sminv s0, v0.4s
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: cmp w9, w8
-; CHECK-NEXT: csel w0, w9, w8, lt
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.smin.i32.v8i32(<8 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32> %b)
@@ -258,12 +221,9 @@ define i32 @smax_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: smax_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: smaxv s2, v2.4s
+; CHECK-NEXT: smax v0.4s, v0.4s, v2.4s
; CHECK-NEXT: smaxv s0, v0.4s
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: cmp w9, w8
-; CHECK-NEXT: csel w0, w9, w8, gt
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.smax.i32.v8i32(<8 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32> %b)
diff --git a/llvm/test/CodeGen/AArch64/sve-doublereduct.ll b/llvm/test/CodeGen/AArch64/sve-doublereduct.ll
index c79c87b29507..6a06d38e0671 100644
--- a/llvm/test/CodeGen/AArch64/sve-doublereduct.ll
+++ b/llvm/test/CodeGen/AArch64/sve-doublereduct.ll
@@ -4,11 +4,11 @@
define float @add_f32(<vscale x 8 x float> %a, <vscale x 4 x float> %b) {
; CHECK-LABEL: add_f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: fadd z0.s, z0.s, z1.s
-; CHECK-NEXT: faddv s2, p0, z2.s
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fadd z0.s, z0.s, z2.s
; CHECK-NEXT: faddv s0, p0, z0.s
-; CHECK-NEXT: fadd s0, s0, s2
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
; CHECK-NEXT: ret
%r1 = call fast float @llvm.vector.reduce.fadd.f32.nxv8f32(float -0.0, <vscale x 8 x float> %a)
%r2 = call fast float @llvm.vector.reduce.fadd.f32.nxv4f32(float -0.0, <vscale x 4 x float> %b)
@@ -57,13 +57,12 @@ define float @fmax_f32(<vscale x 8 x float> %a, <vscale x 4 x float> %b) {
define i32 @add_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: add_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: add z0.s, z0.s, z1.s
-; CHECK-NEXT: uaddv d2, p0, z2.s
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: add z0.s, z0.s, z2.s
; CHECK-NEXT: uaddv d0, p0, z0.s
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: fmov x9, d2
-; CHECK-NEXT: add w0, w8, w9
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.add.i32.nxv8i32(<vscale x 8 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.add.i32.nxv4i32(<vscale x 4 x i32> %b)
@@ -78,14 +77,13 @@ define i16 @add_ext_i16(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-NEXT: uunpklo z0.h, z0.b
; CHECK-NEXT: uunpkhi z3.h, z1.b
; CHECK-NEXT: uunpklo z1.h, z1.b
-; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: add z0.h, z0.h, z2.h
; CHECK-NEXT: add z1.h, z1.h, z3.h
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: add z0.h, z0.h, z1.h
; CHECK-NEXT: uaddv d0, p0, z0.h
-; CHECK-NEXT: uaddv d1, p0, z1.h
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: fmov x9, d1
-; CHECK-NEXT: add w0, w8, w9
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT: ret
%ae = zext <vscale x 16 x i8> %a to <vscale x 16 x i16>
%be = zext <vscale x 16 x i8> %b to <vscale x 16 x i16>
@@ -106,14 +104,13 @@ define i16 @add_ext_v32i16(<vscale x 32 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-NEXT: uunpklo z2.h, z2.b
; CHECK-NEXT: add z0.h, z0.h, z1.h
; CHECK-NEXT: add z1.h, z4.h, z3.h
-; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: add z0.h, z1.h, z0.h
; CHECK-NEXT: add z1.h, z2.h, z5.h
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: add z0.h, z0.h, z1.h
; CHECK-NEXT: uaddv d0, p0, z0.h
-; CHECK-NEXT: uaddv d1, p0, z1.h
-; CHECK-NEXT: fmov x8, d0
-; CHECK-NEXT: fmov x9, d1
-; CHECK-NEXT: add w0, w8, w9
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT: ret
%ae = zext <vscale x 32 x i8> %a to <vscale x 32 x i16>
%be = zext <vscale x 16 x i8> %b to <vscale x 16 x i16>
@@ -133,13 +130,11 @@ define i16 @add_ext_v32i16(<vscale x 32 x i8> %a, <vscale x 16 x i8> %b) {
define i32 @and_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: and_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: and z0.d, z0.d, z1.d
-; CHECK-NEXT: andv s2, p0, z2.s
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: and z0.d, z0.d, z2.d
; CHECK-NEXT: andv s0, p0, z0.s
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: and w0, w8, w9
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.and.i32.nxv8i32(<vscale x 8 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.and.i32.nxv4i32(<vscale x 4 x i32> %b)
@@ -150,13 +145,11 @@ define i32 @and_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) {
define i32 @or_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: or_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: orr z0.d, z0.d, z1.d
-; CHECK-NEXT: orv s2, p0, z2.s
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: orr z0.d, z0.d, z2.d
; CHECK-NEXT: orv s0, p0, z0.s
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: orr w0, w8, w9
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.or.i32.nxv8i32(<vscale x 8 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.or.i32.nxv4i32(<vscale x 4 x i32> %b)
@@ -168,12 +161,9 @@ define i32 @xor_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-LABEL: xor_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: eor z0.d, z0.d, z1.d
-; CHECK-NEXT: eorv s2, p0, z2.s
+; CHECK-NEXT: eor3 z0.d, z0.d, z1.d, z2.d
; CHECK-NEXT: eorv s0, p0, z0.s
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: eor w0, w8, w9
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.xor.i32.nxv8i32(<vscale x 8 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.xor.i32.nxv4i32(<vscale x 4 x i32> %b)
@@ -186,12 +176,9 @@ define i32 @umin_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: umin z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: uminv s2, p0, z2.s
+; CHECK-NEXT: umin z0.s, p0/m, z0.s, z2.s
; CHECK-NEXT: uminv s0, p0, z0.s
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: cmp w8, w9
-; CHECK-NEXT: csel w0, w8, w9, lo
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.umin.i32.nxv8i32(<vscale x 8 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.umin.i32.nxv4i32(<vscale x 4 x i32> %b)
@@ -204,12 +191,9 @@ define i32 @umax_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: umaxv s2, p0, z2.s
+; CHECK-NEXT: umax z0.s, p0/m, z0.s, z2.s
; CHECK-NEXT: umaxv s0, p0, z0.s
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: cmp w8, w9
-; CHECK-NEXT: csel w0, w8, w9, hi
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.umax.i32.nxv8i32(<vscale x 8 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.umax.i32.nxv4i32(<vscale x 4 x i32> %b)
@@ -222,12 +206,9 @@ define i32 @smin_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: smin z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: sminv s2, p0, z2.s
+; CHECK-NEXT: smin z0.s, p0/m, z0.s, z2.s
; CHECK-NEXT: sminv s0, p0, z0.s
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: cmp w8, w9
-; CHECK-NEXT: csel w0, w8, w9, lt
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.smin.i32.nxv8i32(<vscale x 8 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.smin.i32.nxv4i32(<vscale x 4 x i32> %b)
@@ -240,12 +221,9 @@ define i32 @smax_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: smax z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: smaxv s2, p0, z2.s
+; CHECK-NEXT: smax z0.s, p0/m, z0.s, z2.s
; CHECK-NEXT: smaxv s0, p0, z0.s
-; CHECK-NEXT: fmov w9, s2
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: cmp w8, w9
-; CHECK-NEXT: csel w0, w8, w9, gt
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.smax.i32.nxv8i32(<vscale x 8 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.smax.i32.nxv4i32(<vscale x 4 x i32> %b)
diff --git a/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll
index 0106dc2e7f7f..4183a83ed01b 100644
--- a/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fp-reduce.ll
@@ -357,11 +357,11 @@ define double @fminv_nxv2f64(<vscale x 2 x double> %a) {
define float @fadd_reduct_reassoc_v4v8f32(<vscale x 4 x float> %a, <vscale x 8 x float> %b) {
; CHECK-LABEL: fadd_reduct_reassoc_v4v8f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: fadd z1.s, z1.s, z2.s
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fadd z0.s, z0.s, z1.s
; CHECK-NEXT: faddv s0, p0, z0.s
-; CHECK-NEXT: faddv s1, p0, z1.s
-; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
; CHECK-NEXT: ret
%r1 = call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.0, <vscale x 4 x float> %a)
%r2 = call fast float @llvm.vector.reduce.fadd.nxv8f32(float -0.0, <vscale x 8 x float> %b)
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index 7b4da9ecf3cd..9e113be31488 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -1219,9 +1219,9 @@ entry:
define i64 @add_pair_v4i32_v4i64_zext(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: add_pair_v4i32_v4i64_zext:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: uadalp v0.2d, v1.4s
-; CHECK-NEXT: addp d0, v0.2d
+; CHECK-NEXT: uaddlp v1.2d, v1.4s
+; CHECK-NEXT: uadalp v1.2d, v0.4s
+; CHECK-NEXT: addp d0, v1.2d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
entry:
@@ -1236,9 +1236,9 @@ entry:
define i64 @add_pair_v4i32_v4i64_sext(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: add_pair_v4i32_v4i64_sext:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: saddlp v0.2d, v0.4s
-; CHECK-NEXT: sadalp v0.2d, v1.4s
-; CHECK-NEXT: addp d0, v0.2d
+; CHECK-NEXT: saddlp v1.2d, v1.4s
+; CHECK-NEXT: sadalp v1.2d, v0.4s
+; CHECK-NEXT: addp d0, v1.2d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
entry:
@@ -1285,9 +1285,9 @@ entry:
define i32 @add_pair_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: add_pair_v8i16_v8i32_zext:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: uaddlp v0.4s, v0.8h
-; CHECK-NEXT: uadalp v0.4s, v1.8h
-; CHECK-NEXT: addv s0, v0.4s
+; CHECK-NEXT: uaddlp v1.4s, v1.8h
+; CHECK-NEXT: uadalp v1.4s, v0.8h
+; CHECK-NEXT: addv s0, v1.4s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
@@ -1302,9 +1302,9 @@ entry:
define i32 @add_pair_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: add_pair_v8i16_v8i32_sext:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: saddlp v0.4s, v0.8h
-; CHECK-NEXT: sadalp v0.4s, v1.8h
-; CHECK-NEXT: addv s0, v0.4s
+; CHECK-NEXT: saddlp v1.4s, v1.8h
+; CHECK-NEXT: sadalp v1.4s, v0.8h
+; CHECK-NEXT: addv s0, v1.4s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
@@ -1351,12 +1351,9 @@ entry:
define zeroext i16 @add_pair_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: add_pair_v8i16_v8i16:
; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: add v0.8h, v0.8h, v1.8h
; CHECK-NEXT: addv h0, v0.8h
-; CHECK-NEXT: addv h1, v1.8h
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: fmov w9, s1
-; CHECK-NEXT: add w8, w8, w9
-; CHECK-NEXT: and w0, w8, #0xffff
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
%z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
@@ -1420,11 +1417,11 @@ entry:
define i64 @add_pair_v4i16_v4i64_zext(<4 x i16> %x, <4 x i16> %y) {
; CHECK-LABEL: add_pair_v4i16_v4i64_zext:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: ushll v1.4s, v1.4h, #0
-; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: uadalp v0.2d, v1.4s
-; CHECK-NEXT: addp d0, v0.2d
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: uaddlp v1.2d, v1.4s
+; CHECK-NEXT: uadalp v1.2d, v0.4s
+; CHECK-NEXT: addp d0, v1.2d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
entry:
@@ -1439,11 +1436,11 @@ entry:
define i64 @add_pair_v4i16_v4i64_sext(<4 x i16> %x, <4 x i16> %y) {
; CHECK-LABEL: add_pair_v4i16_v4i64_sext:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-NEXT: sshll v1.4s, v1.4h, #0
-; CHECK-NEXT: saddlp v0.2d, v0.4s
-; CHECK-NEXT: sadalp v0.2d, v1.4s
-; CHECK-NEXT: addp d0, v0.2d
+; CHECK-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-NEXT: saddlp v1.2d, v1.4s
+; CHECK-NEXT: sadalp v1.2d, v0.4s
+; CHECK-NEXT: addp d0, v1.2d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
entry:
@@ -1570,11 +1567,11 @@ entry:
define i32 @add_pair_v8i8_v8i32_zext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-BASE-LABEL: add_pair_v8i8_v8i32_zext:
; CHECK-BASE: // %bb.0: // %entry
-; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-BASE-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-BASE-NEXT: uaddlp v0.4s, v0.8h
-; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
-; CHECK-BASE-NEXT: addv s0, v0.4s
+; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT: uaddlp v1.4s, v1.8h
+; CHECK-BASE-NEXT: uadalp v1.4s, v0.8h
+; CHECK-BASE-NEXT: addv s0, v1.4s
; CHECK-BASE-NEXT: fmov w0, s0
; CHECK-BASE-NEXT: ret
;
@@ -1599,11 +1596,11 @@ entry:
define i32 @add_pair_v8i8_v8i32_sext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-BASE-LABEL: add_pair_v8i8_v8i32_sext:
; CHECK-BASE: // %bb.0: // %entry
-; CHECK-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-BASE-NEXT: sshll v1.8h, v1.8b, #0
-; CHECK-BASE-NEXT: saddlp v0.4s, v0.8h
-; CHECK-BASE-NEXT: sadalp v0.4s, v1.8h
-; CHECK-BASE-NEXT: addv s0, v0.4s
+; CHECK-BASE-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT: saddlp v1.4s, v1.8h
+; CHECK-BASE-NEXT: sadalp v1.4s, v0.8h
+; CHECK-BASE-NEXT: addv s0, v1.4s
; CHECK-BASE-NEXT: fmov w0, s0
; CHECK-BASE-NEXT: ret
;
@@ -1667,12 +1664,10 @@ entry:
define zeroext i16 @add_pair_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_pair_v16i8_v16i16_zext:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: uaddlv h0, v0.16b
-; CHECK-NEXT: uaddlv h1, v1.16b
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: fmov w9, s1
-; CHECK-NEXT: add w8, w8, w9
-; CHECK-NEXT: and w0, w8, #0xffff
+; CHECK-NEXT: uaddlp v1.8h, v1.16b
+; CHECK-NEXT: uadalp v1.8h, v0.16b
+; CHECK-NEXT: addv h0, v1.8h
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
%xx = zext <16 x i8> %x to <16 x i16>
@@ -1686,12 +1681,10 @@ entry:
define signext i16 @add_pair_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_pair_v16i8_v16i16_sext:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: saddlv h0, v0.16b
-; CHECK-NEXT: saddlv h1, v1.16b
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: fmov w9, s1
-; CHECK-NEXT: add w8, w8, w9
-; CHECK-NEXT: sxth w0, w8
+; CHECK-NEXT: saddlp v1.8h, v1.16b
+; CHECK-NEXT: sadalp v1.8h, v0.16b
+; CHECK-NEXT: addv h0, v1.8h
+; CHECK-NEXT: smov w0, v0.h[0]
; CHECK-NEXT: ret
entry:
%xx = sext <16 x i8> %x to <16 x i16>
@@ -1705,14 +1698,9 @@ entry:
define zeroext i16 @add_pair_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-LABEL: add_pair_v8i8_v8i16_zext:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b
; CHECK-NEXT: addv h0, v0.8h
-; CHECK-NEXT: addv h1, v1.8h
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: fmov w9, s1
-; CHECK-NEXT: add w8, w8, w9
-; CHECK-NEXT: and w0, w8, #0xffff
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
%xx = zext <8 x i8> %x to <8 x i16>
@@ -1726,14 +1714,9 @@ entry:
define signext i16 @add_pair_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-LABEL: add_pair_v8i8_v8i16_sext:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: sshll v1.8h, v1.8b, #0
+; CHECK-NEXT: saddl v0.8h, v0.8b, v1.8b
; CHECK-NEXT: addv h0, v0.8h
-; CHECK-NEXT: addv h1, v1.8h
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: fmov w9, s1
-; CHECK-NEXT: add w8, w8, w9
-; CHECK-NEXT: sxth w0, w8
+; CHECK-NEXT: smov w0, v0.h[0]
; CHECK-NEXT: ret
entry:
%xx = sext <8 x i8> %x to <8 x i16>
@@ -1747,12 +1730,9 @@ entry:
define zeroext i8 @add_pair_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_pair_v16i8_v16i8:
; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: add v0.16b, v0.16b, v1.16b
; CHECK-NEXT: addv b0, v0.16b
-; CHECK-NEXT: addv b1, v1.16b
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: fmov w9, s1
-; CHECK-NEXT: add w8, w8, w9
-; CHECK-NEXT: and w0, w8, #0xff
+; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
%z1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
@@ -1904,13 +1884,13 @@ entry:
define i64 @add_pair_v4i8_v4i64_zext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: add_pair_v4i8_v4i64_zext:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: bic v0.4h, #255, lsl #8
; CHECK-NEXT: bic v1.4h, #255, lsl #8
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: bic v0.4h, #255, lsl #8
; CHECK-NEXT: ushll v1.4s, v1.4h, #0
-; CHECK-NEXT: uaddlp v0.2d, v0.4s
-; CHECK-NEXT: uadalp v0.2d, v1.4s
-; CHECK-NEXT: addp d0, v0.2d
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: uaddlp v1.2d, v1.4s
+; CHECK-NEXT: uadalp v1.2d, v0.4s
+; CHECK-NEXT: addp d0, v1.2d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
entry:
@@ -1995,15 +1975,15 @@ entry:
define i32 @add_pair_v8i8_v8i32_double_sext_zext(<8 x i8> %ax, <8 x i8> %ay, <8 x i8> %bx, <8 x i8> %by) {
; CHECK-BASE-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
; CHECK-BASE: // %bb.0: // %entry
-; CHECK-BASE-NEXT: sshll v2.8h, v2.8b, #0
-; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-BASE-NEXT: saddlp v2.4s, v2.8h
-; CHECK-BASE-NEXT: uaddlp v0.4s, v0.8h
-; CHECK-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-BASE-NEXT: sshll v3.8h, v3.8b, #0
-; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
-; CHECK-BASE-NEXT: sadalp v2.4s, v3.8h
-; CHECK-BASE-NEXT: add v0.4s, v0.4s, v2.4s
+; CHECK-BASE-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-BASE-NEXT: saddlp v3.4s, v3.8h
+; CHECK-BASE-NEXT: uaddlp v1.4s, v1.8h
+; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BASE-NEXT: sshll v2.8h, v2.8b, #0
+; CHECK-BASE-NEXT: uadalp v1.4s, v0.8h
+; CHECK-BASE-NEXT: sadalp v3.4s, v2.8h
+; CHECK-BASE-NEXT: add v0.4s, v3.4s, v1.4s
; CHECK-BASE-NEXT: addv s0, v0.4s
; CHECK-BASE-NEXT: fmov w0, s0
; CHECK-BASE-NEXT: ret
@@ -2091,48 +2071,48 @@ define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) {
; CHECK-BASE-NEXT: // kill: def $w3 killed $w3 def $x3
; CHECK-BASE-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-BASE-NEXT: sxtw x8, w1
-; CHECK-BASE-NEXT: sxtw x9, w3
-; CHECK-BASE-NEXT: add x10, x0, x8
-; CHECK-BASE-NEXT: add x11, x2, x9
-; CHECK-BASE-NEXT: ldr d2, [x0]
-; CHECK-BASE-NEXT: ldr d3, [x2]
-; CHECK-BASE-NEXT: ldr d0, [x10]
-; CHECK-BASE-NEXT: add x10, x10, x8
-; CHECK-BASE-NEXT: ldr d1, [x11]
-; CHECK-BASE-NEXT: add x11, x11, x9
+; CHECK-BASE-NEXT: sxtw x10, w3
+; CHECK-BASE-NEXT: add x9, x0, x8
+; CHECK-BASE-NEXT: ldr d0, [x0]
+; CHECK-BASE-NEXT: ldr d1, [x2]
+; CHECK-BASE-NEXT: add x11, x2, x10
+; CHECK-BASE-NEXT: ldr d2, [x9]
+; CHECK-BASE-NEXT: add x9, x9, x8
; CHECK-BASE-NEXT: uabdl v0.8h, v0.8b, v1.8b
-; CHECK-BASE-NEXT: uabdl v1.8h, v2.8b, v3.8b
-; CHECK-BASE-NEXT: ldr d2, [x10]
-; CHECK-BASE-NEXT: ldr d3, [x11]
-; CHECK-BASE-NEXT: add x10, x10, x8
+; CHECK-BASE-NEXT: ldr d1, [x11]
+; CHECK-BASE-NEXT: add x11, x11, x10
; CHECK-BASE-NEXT: uaddlp v0.4s, v0.8h
-; CHECK-BASE-NEXT: add x11, x11, x9
+; CHECK-BASE-NEXT: uabdl v1.8h, v2.8b, v1.8b
+; CHECK-BASE-NEXT: ldr d2, [x9]
+; CHECK-BASE-NEXT: ldr d3, [x11]
+; CHECK-BASE-NEXT: add x9, x9, x8
+; CHECK-BASE-NEXT: add x11, x11, x10
; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
; CHECK-BASE-NEXT: uabdl v1.8h, v2.8b, v3.8b
-; CHECK-BASE-NEXT: ldr d2, [x10]
+; CHECK-BASE-NEXT: ldr d2, [x9]
; CHECK-BASE-NEXT: ldr d3, [x11]
-; CHECK-BASE-NEXT: add x10, x10, x8
-; CHECK-BASE-NEXT: add x11, x11, x9
+; CHECK-BASE-NEXT: add x9, x9, x8
+; CHECK-BASE-NEXT: add x11, x11, x10
; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
; CHECK-BASE-NEXT: uabdl v1.8h, v2.8b, v3.8b
-; CHECK-BASE-NEXT: ldr d2, [x10]
+; CHECK-BASE-NEXT: ldr d2, [x9]
; CHECK-BASE-NEXT: ldr d3, [x11]
-; CHECK-BASE-NEXT: add x10, x10, x8
-; CHECK-BASE-NEXT: add x11, x11, x9
+; CHECK-BASE-NEXT: add x9, x9, x8
+; CHECK-BASE-NEXT: add x11, x11, x10
; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
; CHECK-BASE-NEXT: uabdl v1.8h, v2.8b, v3.8b
-; CHECK-BASE-NEXT: ldr d2, [x10]
+; CHECK-BASE-NEXT: ldr d2, [x9]
; CHECK-BASE-NEXT: ldr d3, [x11]
-; CHECK-BASE-NEXT: add x10, x10, x8
-; CHECK-BASE-NEXT: add x11, x11, x9
+; CHECK-BASE-NEXT: add x9, x9, x8
+; CHECK-BASE-NEXT: add x11, x11, x10
; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
; CHECK-BASE-NEXT: uabdl v1.8h, v2.8b, v3.8b
-; CHECK-BASE-NEXT: ldr d2, [x10]
+; CHECK-BASE-NEXT: ldr d2, [x9]
; CHECK-BASE-NEXT: ldr d3, [x11]
; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
-; CHECK-BASE-NEXT: ldr d1, [x10, x8]
+; CHECK-BASE-NEXT: ldr d1, [x9, x8]
; CHECK-BASE-NEXT: uabdl v2.8h, v2.8b, v3.8b
-; CHECK-BASE-NEXT: ldr d3, [x11, x9]
+; CHECK-BASE-NEXT: ldr d3, [x11, x10]
; CHECK-BASE-NEXT: uadalp v0.4s, v2.8h
; CHECK-BASE-NEXT: uabdl v1.8h, v1.8b, v3.8b
; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
index 452fc36571ef..ba44bc99ce8c 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
@@ -438,13 +438,10 @@ exit:
define half @fadd_reduct_reassoc_v8f16(<8 x half> %a, <8 x half> %b) {
; FULLFP16-LABEL: fadd_reduct_reassoc_v8f16:
; FULLFP16: // %bb.0:
-; FULLFP16-NEXT: faddp v2.8h, v0.8h, v0.8h
-; FULLFP16-NEXT: faddp v3.8h, v1.8h, v1.8h
-; FULLFP16-NEXT: faddp v0.8h, v2.8h, v0.8h
-; FULLFP16-NEXT: faddp v1.8h, v3.8h, v1.8h
+; FULLFP16-NEXT: fadd v0.8h, v0.8h, v1.8h
+; FULLFP16-NEXT: faddp v1.8h, v0.8h, v0.8h
+; FULLFP16-NEXT: faddp v0.8h, v1.8h, v0.8h
; FULLFP16-NEXT: faddp h0, v0.2h
-; FULLFP16-NEXT: faddp h1, v1.2h
-; FULLFP16-NEXT: fadd h0, h0, h1
; FULLFP16-NEXT: ret
;
; CHECKNOFP16-LABEL: fadd_reduct_reassoc_v8f16:
@@ -535,11 +532,9 @@ define float @fadd_reduct_reassoc_v8f32(<8 x float> %a, <8 x float> %b) {
; CHECK: // %bb.0:
; CHECK-NEXT: fadd v2.4s, v2.4s, v3.4s
; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: faddp v1.4s, v2.4s, v2.4s
+; CHECK-NEXT: fadd v0.4s, v0.4s, v2.4s
; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s
-; CHECK-NEXT: faddp s1, v1.2s
; CHECK-NEXT: faddp s0, v0.2s
-; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: ret
%r1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %a)
%r2 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %b)
@@ -550,11 +545,9 @@ define float @fadd_reduct_reassoc_v8f32(<8 x float> %a, <8 x float> %b) {
define float @fadd_reduct_reassoc_v4f32(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: fadd_reduct_reassoc_v4f32:
; CHECK: // %bb.0:
+; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s
-; CHECK-NEXT: faddp v1.4s, v1.4s, v1.4s
; CHECK-NEXT: faddp s0, v0.2s
-; CHECK-NEXT: faddp s1, v1.2s
-; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: ret
%r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
%r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
@@ -582,11 +575,9 @@ define float @fadd_reduct_reassoc_v4v8f32(<4 x float> %a, <8 x float> %b) {
; CHECK-LABEL: fadd_reduct_reassoc_v4v8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: fadd v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s
-; CHECK-NEXT: faddp v1.4s, v1.4s, v1.4s
; CHECK-NEXT: faddp s0, v0.2s
-; CHECK-NEXT: faddp s1, v1.2s
-; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: ret
%r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
%r2 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %b)
@@ -599,9 +590,8 @@ define double @fadd_reduct_reassoc_v4f64(<4 x double> %a, <4 x double> %b) {
; CHECK: // %bb.0:
; CHECK-NEXT: fadd v2.2d, v2.2d, v3.2d
; CHECK-NEXT: fadd v0.2d, v0.2d, v1.2d
-; CHECK-NEXT: faddp d1, v2.2d
+; CHECK-NEXT: fadd v0.2d, v0.2d, v2.2d
; CHECK-NEXT: faddp d0, v0.2d
-; CHECK-NEXT: fadd d0, d0, d1
; CHECK-NEXT: ret
%r1 = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double -0.0, <4 x double> %a)
%r2 = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double -0.0, <4 x double> %b)
diff --git a/llvm/test/CodeGen/RISCV/double_reduct.ll b/llvm/test/CodeGen/RISCV/double_reduct.ll
index bd910f1230a7..d5bde8bad7c3 100644
--- a/llvm/test/CodeGen/RISCV/double_reduct.ll
+++ b/llvm/test/CodeGen/RISCV/double_reduct.ll
@@ -8,12 +8,10 @@ define float @add_f32(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: add_f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vmv.s.x v10, zero
-; CHECK-NEXT: vfredusum.vs v8, v8, v10
-; CHECK-NEXT: vfmv.f.s ft0, v8
-; CHECK-NEXT: vfredusum.vs v8, v9, v10
-; CHECK-NEXT: vfmv.f.s ft1, v8
-; CHECK-NEXT: fadd.s fa0, ft0, ft1
+; CHECK-NEXT: vfadd.vv v8, v8, v9
+; CHECK-NEXT: vmv.s.x v9, zero
+; CHECK-NEXT: vfredusum.vs v8, v8, v9
+; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: ret
%r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
%r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
@@ -83,27 +81,14 @@ define float @fmax_f32(<4 x float> %a, <4 x float> %b) {
define i32 @add_i32(<4 x i32> %a, <4 x i32> %b) {
-; RV32-LABEL: add_i32:
-; RV32: # %bb.0:
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.s.x v10, zero
-; RV32-NEXT: vredsum.vs v8, v8, v10
-; RV32-NEXT: vmv.x.s a0, v8
-; RV32-NEXT: vredsum.vs v8, v9, v10
-; RV32-NEXT: vmv.x.s a1, v8
-; RV32-NEXT: add a0, a0, a1
-; RV32-NEXT: ret
-;
-; RV64-LABEL: add_i32:
-; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT: vmv.s.x v10, zero
-; RV64-NEXT: vredsum.vs v8, v8, v10
-; RV64-NEXT: vmv.x.s a0, v8
-; RV64-NEXT: vredsum.vs v8, v9, v10
-; RV64-NEXT: vmv.x.s a1, v8
-; RV64-NEXT: addw a0, a0, a1
-; RV64-NEXT: ret
+; CHECK-LABEL: add_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v10, zero
+; CHECK-NEXT: vadd.vv v8, v8, v9
+; CHECK-NEXT: vredsum.vs v8, v8, v10
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %b)
%r = add i32 %r1, %r2
@@ -116,14 +101,10 @@ define i16 @add_ext_i16(<16 x i8> %a, <16 x i8> %b) {
; CHECK-NEXT: vsetivli zero, 16, e16, m1, ta, ma
; CHECK-NEXT: vmv.s.x v10, zero
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT: vwredsumu.vs v8, v8, v10
-; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, ma
+; CHECK-NEXT: vwaddu.vv v12, v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT: vredsum.vs v8, v12, v10
; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT: vwredsumu.vs v8, v9, v10
-; CHECK-NEXT: vsetivli zero, 0, e16, m1, ta, ma
-; CHECK-NEXT: vmv.x.s a1, v8
-; CHECK-NEXT: add a0, a0, a1
; CHECK-NEXT: ret
%ae = zext <16 x i8> %a to <16 x i16>
%be = zext <16 x i8> %b to <16 x i16>
@@ -200,12 +181,10 @@ define i32 @and_i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: and_i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v10, -1
-; CHECK-NEXT: vredand.vs v8, v8, v10
+; CHECK-NEXT: vand.vv v8, v8, v9
+; CHECK-NEXT: vmv.v.i v9, -1
+; CHECK-NEXT: vredand.vs v8, v8, v9
; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: vredand.vs v8, v9, v10
-; CHECK-NEXT: vmv.x.s a1, v8
-; CHECK-NEXT: and a0, a0, a1
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32> %b)
@@ -218,11 +197,9 @@ define i32 @or_i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vmv.s.x v10, zero
+; CHECK-NEXT: vor.vv v8, v8, v9
; CHECK-NEXT: vredor.vs v8, v8, v10
; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: vredor.vs v8, v9, v10
-; CHECK-NEXT: vmv.x.s a1, v8
-; CHECK-NEXT: or a0, a0, a1
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32> %b)
@@ -235,11 +212,9 @@ define i32 @xor_i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vmv.s.x v10, zero
+; CHECK-NEXT: vxor.vv v8, v8, v9
; CHECK-NEXT: vredxor.vs v8, v8, v10
; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: vredxor.vs v8, v9, v10
-; CHECK-NEXT: vmv.x.s a1, v8
-; CHECK-NEXT: xor a0, a0, a1
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32> %b)
@@ -251,15 +226,10 @@ define i32 @umin_i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: umin_i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v10, -1
-; CHECK-NEXT: vredminu.vs v8, v8, v10
+; CHECK-NEXT: vminu.vv v8, v8, v9
+; CHECK-NEXT: vmv.v.i v9, -1
+; CHECK-NEXT: vredminu.vs v8, v8, v9
; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: vredminu.vs v8, v9, v10
-; CHECK-NEXT: vmv.x.s a1, v8
-; CHECK-NEXT: bltu a0, a1, .LBB11_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB11_2:
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32> %b)
@@ -272,14 +242,9 @@ define i32 @umax_i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vmv.s.x v10, zero
+; CHECK-NEXT: vmaxu.vv v8, v8, v9
; CHECK-NEXT: vredmaxu.vs v8, v8, v10
; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: vredmaxu.vs v8, v9, v10
-; CHECK-NEXT: vmv.x.s a1, v8
-; CHECK-NEXT: bltu a1, a0, .LBB12_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB12_2:
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32> %b)
@@ -290,34 +255,24 @@ define i32 @umax_i32(<4 x i32> %a, <4 x i32> %b) {
define i32 @smin_i32(<4 x i32> %a, <4 x i32> %b) {
; RV32-LABEL: smin_i32:
; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT: vmin.vv v8, v8, v9
; RV32-NEXT: lui a0, 524288
; RV32-NEXT: addi a0, a0, -1
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.s.x v10, a0
-; RV32-NEXT: vredmin.vs v8, v8, v10
+; RV32-NEXT: vmv.s.x v9, a0
+; RV32-NEXT: vredmin.vs v8, v8, v9
; RV32-NEXT: vmv.x.s a0, v8
-; RV32-NEXT: vredmin.vs v8, v9, v10
-; RV32-NEXT: vmv.x.s a1, v8
-; RV32-NEXT: blt a0, a1, .LBB13_2
-; RV32-NEXT: # %bb.1:
-; RV32-NEXT: mv a0, a1
-; RV32-NEXT: .LBB13_2:
; RV32-NEXT: ret
;
; RV64-LABEL: smin_i32:
; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT: vmin.vv v8, v8, v9
; RV64-NEXT: lui a0, 524288
; RV64-NEXT: addiw a0, a0, -1
-; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT: vmv.s.x v10, a0
-; RV64-NEXT: vredmin.vs v8, v8, v10
+; RV64-NEXT: vmv.s.x v9, a0
+; RV64-NEXT: vredmin.vs v8, v8, v9
; RV64-NEXT: vmv.x.s a0, v8
-; RV64-NEXT: vredmin.vs v8, v9, v10
-; RV64-NEXT: vmv.x.s a1, v8
-; RV64-NEXT: blt a0, a1, .LBB13_2
-; RV64-NEXT: # %bb.1:
-; RV64-NEXT: mv a0, a1
-; RV64-NEXT: .LBB13_2:
; RV64-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32> %b)
@@ -331,14 +286,9 @@ define i32 @smax_i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-NEXT: lui a0, 524288
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vmv.s.x v10, a0
+; CHECK-NEXT: vmax.vv v8, v8, v9
; CHECK-NEXT: vredmax.vs v8, v8, v10
; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: vredmax.vs v8, v9, v10
-; CHECK-NEXT: vmv.x.s a1, v8
-; CHECK-NEXT: blt a1, a0, .LBB14_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: mv a0, a1
-; CHECK-NEXT: .LBB14_2:
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32> %b)
diff --git a/llvm/test/CodeGen/Thumb2/mve-doublereduct.ll b/llvm/test/CodeGen/Thumb2/mve-doublereduct.ll
index 975f7b43067b..fad110df937f 100644
--- a/llvm/test/CodeGen/Thumb2/mve-doublereduct.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-doublereduct.ll
@@ -5,12 +5,9 @@ define float @add_f32(<8 x float> %a, <4 x float> %b) {
; CHECK-LABEL: add_f32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vadd.f32 q0, q0, q1
-; CHECK-NEXT: vadd.f32 s4, s10, s11
+; CHECK-NEXT: vadd.f32 q0, q0, q2
; CHECK-NEXT: vadd.f32 s2, s2, s3
; CHECK-NEXT: vadd.f32 s0, s0, s1
-; CHECK-NEXT: vadd.f32 s6, s8, s9
-; CHECK-NEXT: vadd.f32 s0, s0, s2
-; CHECK-NEXT: vadd.f32 s2, s6, s4
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: bx lr
%r1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %a)
@@ -23,12 +20,9 @@ define float @fmul_f32(<8 x float> %a, <4 x float> %b) {
; CHECK-LABEL: fmul_f32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmul.f32 q0, q0, q1
-; CHECK-NEXT: vmul.f32 s4, s10, s11
+; CHECK-NEXT: vmul.f32 q0, q0, q2
; CHECK-NEXT: vmul.f32 s2, s2, s3
; CHECK-NEXT: vmul.f32 s0, s0, s1
-; CHECK-NEXT: vmul.f32 s6, s8, s9
-; CHECK-NEXT: vmul.f32 s0, s0, s2
-; CHECK-NEXT: vmul.f32 s2, s6, s4
; CHECK-NEXT: vmul.f32 s0, s0, s2
; CHECK-NEXT: bx lr
%r1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a)
@@ -132,21 +126,14 @@ define i16 @add_ext_v32i16(<32 x i8> %a, <16 x i8> %b) {
define i32 @mul_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: mul_i32:
; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, r5, r6, lr}
-; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: vmul.i32 q0, q0, q1
-; CHECK-NEXT: vmov r0, r1, d5
-; CHECK-NEXT: vmov r6, r3, d0
-; CHECK-NEXT: vmov r12, lr, d1
-; CHECK-NEXT: vmov r4, r5, d4
+; CHECK-NEXT: vmul.i32 q0, q0, q2
+; CHECK-NEXT: vmov r0, r1, d1
+; CHECK-NEXT: vmov r2, r3, d0
; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: mul r2, r12, lr
-; CHECK-NEXT: muls r3, r6, r3
-; CHECK-NEXT: mul r1, r4, r5
-; CHECK-NEXT: muls r2, r3, r2
+; CHECK-NEXT: mul r1, r2, r3
; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: muls r0, r2, r0
-; CHECK-NEXT: pop {r4, r5, r6, pc}
+; CHECK-NEXT: bx lr
%r1 = call i32 @llvm.vector.reduce.mul.i32.v8i32(<8 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32> %b)
%r = mul i32 %r1, %r2
@@ -156,21 +143,14 @@ define i32 @mul_i32(<8 x i32> %a, <4 x i32> %b) {
define i32 @and_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: and_i32:
; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, r5, r6, lr}
-; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: vand q0, q0, q1
-; CHECK-NEXT: vmov r6, r1, d5
+; CHECK-NEXT: vand q0, q0, q2
+; CHECK-NEXT: vmov r0, r1, d1
; CHECK-NEXT: vmov r2, r3, d0
-; CHECK-NEXT: vmov r12, lr, d1
-; CHECK-NEXT: vmov r4, r5, d4
-; CHECK-NEXT: ands r1, r6
-; CHECK-NEXT: ands r2, r3
-; CHECK-NEXT: and.w r0, r12, lr
-; CHECK-NEXT: ands r0, r2
-; CHECK-NEXT: and.w r2, r4, r5
-; CHECK-NEXT: ands r1, r2
; CHECK-NEXT: ands r0, r1
-; CHECK-NEXT: pop {r4, r5, r6, pc}
+; CHECK-NEXT: and.w r1, r2, r3
+; CHECK-NEXT: ands r0, r1
+; CHECK-NEXT: bx lr
%r1 = call i32 @llvm.vector.reduce.and.i32.v8i32(<8 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32> %b)
%r = and i32 %r1, %r2
@@ -180,21 +160,14 @@ define i32 @and_i32(<8 x i32> %a, <4 x i32> %b) {
define i32 @or_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: or_i32:
; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, r5, r6, lr}
-; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: vorr q0, q0, q1
-; CHECK-NEXT: vmov r6, r1, d5
+; CHECK-NEXT: vorr q0, q0, q2
+; CHECK-NEXT: vmov r0, r1, d1
; CHECK-NEXT: vmov r2, r3, d0
-; CHECK-NEXT: vmov r12, lr, d1
-; CHECK-NEXT: vmov r4, r5, d4
-; CHECK-NEXT: orrs r1, r6
-; CHECK-NEXT: orrs r2, r3
-; CHECK-NEXT: orr.w r0, r12, lr
-; CHECK-NEXT: orrs r0, r2
-; CHECK-NEXT: orr.w r2, r4, r5
-; CHECK-NEXT: orrs r1, r2
; CHECK-NEXT: orrs r0, r1
-; CHECK-NEXT: pop {r4, r5, r6, pc}
+; CHECK-NEXT: orr.w r1, r2, r3
+; CHECK-NEXT: orrs r0, r1
+; CHECK-NEXT: bx lr
%r1 = call i32 @llvm.vector.reduce.or.i32.v8i32(<8 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32> %b)
%r = or i32 %r1, %r2
@@ -204,21 +177,14 @@ define i32 @or_i32(<8 x i32> %a, <4 x i32> %b) {
define i32 @xor_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: xor_i32:
; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, r5, r6, lr}
-; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: veor q0, q0, q1
-; CHECK-NEXT: vmov r6, r1, d5
+; CHECK-NEXT: veor q0, q0, q2
+; CHECK-NEXT: vmov r0, r1, d1
; CHECK-NEXT: vmov r2, r3, d0
-; CHECK-NEXT: vmov r12, lr, d1
-; CHECK-NEXT: vmov r4, r5, d4
-; CHECK-NEXT: eors r1, r6
-; CHECK-NEXT: eors r2, r3
-; CHECK-NEXT: eor.w r0, r12, lr
-; CHECK-NEXT: eors r0, r2
-; CHECK-NEXT: eor.w r2, r4, r5
-; CHECK-NEXT: eors r1, r2
; CHECK-NEXT: eors r0, r1
-; CHECK-NEXT: pop {r4, r5, r6, pc}
+; CHECK-NEXT: eor.w r1, r2, r3
+; CHECK-NEXT: eors r0, r1
+; CHECK-NEXT: bx lr
%r1 = call i32 @llvm.vector.reduce.xor.i32.v8i32(<8 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32> %b)
%r = xor i32 %r1, %r2
@@ -228,13 +194,10 @@ define i32 @xor_i32(<8 x i32> %a, <4 x i32> %b) {
define i32 @umin_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: umin_i32:
; CHECK: @ %bb.0:
-; CHECK-NEXT: mov.w r0, #-1
; CHECK-NEXT: vmin.u32 q0, q0, q1
-; CHECK-NEXT: mov.w r1, #-1
-; CHECK-NEXT: vminv.u32 r0, q2
-; CHECK-NEXT: vminv.u32 r1, q0
-; CHECK-NEXT: cmp r1, r0
-; CHECK-NEXT: csel r0, r1, r0, lo
+; CHECK-NEXT: mov.w r0, #-1
+; CHECK-NEXT: vmin.u32 q0, q0, q2
+; CHECK-NEXT: vminv.u32 r0, q0
; CHECK-NEXT: bx lr
%r1 = call i32 @llvm.vector.reduce.umin.i32.v8i32(<8 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32> %b)
@@ -245,13 +208,10 @@ define i32 @umin_i32(<8 x i32> %a, <4 x i32> %b) {
define i32 @umax_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: umax_i32:
; CHECK: @ %bb.0:
-; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: vmax.u32 q0, q0, q1
-; CHECK-NEXT: movs r1, #0
-; CHECK-NEXT: vmaxv.u32 r0, q2
-; CHECK-NEXT: vmaxv.u32 r1, q0
-; CHECK-NEXT: cmp r1, r0
-; CHECK-NEXT: csel r0, r1, r0, hi
+; CHECK-NEXT: movs r0, #0
+; CHECK-NEXT: vmax.u32 q0, q0, q2
+; CHECK-NEXT: vmaxv.u32 r0, q0
; CHECK-NEXT: bx lr
%r1 = call i32 @llvm.vector.reduce.umax.i32.v8i32(<8 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32> %b)
@@ -262,13 +222,10 @@ define i32 @umax_i32(<8 x i32> %a, <4 x i32> %b) {
define i32 @smin_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: smin_i32:
; CHECK: @ %bb.0:
-; CHECK-NEXT: mvn r0, #-2147483648
; CHECK-NEXT: vmin.s32 q0, q0, q1
-; CHECK-NEXT: mvn r1, #-2147483648
-; CHECK-NEXT: vminv.s32 r0, q2
-; CHECK-NEXT: vminv.s32 r1, q0
-; CHECK-NEXT: cmp r1, r0
-; CHECK-NEXT: csel r0, r1, r0, lt
+; CHECK-NEXT: mvn r0, #-2147483648
+; CHECK-NEXT: vmin.s32 q0, q0, q2
+; CHECK-NEXT: vminv.s32 r0, q0
; CHECK-NEXT: bx lr
%r1 = call i32 @llvm.vector.reduce.smin.i32.v8i32(<8 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32> %b)
@@ -279,13 +236,10 @@ define i32 @smin_i32(<8 x i32> %a, <4 x i32> %b) {
define i32 @smax_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: smax_i32:
; CHECK: @ %bb.0:
-; CHECK-NEXT: mov.w r0, #-2147483648
; CHECK-NEXT: vmax.s32 q0, q0, q1
-; CHECK-NEXT: mov.w r1, #-2147483648
-; CHECK-NEXT: vmaxv.s32 r0, q2
-; CHECK-NEXT: vmaxv.s32 r1, q0
-; CHECK-NEXT: cmp r1, r0
-; CHECK-NEXT: csel r0, r1, r0, gt
+; CHECK-NEXT: mov.w r0, #-2147483648
+; CHECK-NEXT: vmax.s32 q0, q0, q2
+; CHECK-NEXT: vmaxv.s32 r0, q0
; CHECK-NEXT: bx lr
%r1 = call i32 @llvm.vector.reduce.smax.i32.v8i32(<8 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32> %b)