[llvm] 8e46ac3 - [AArch64] Add more efficient bitwise vector reductions.
David Green via llvm-commits
llvm-commits at lists.llvm.org
Wed May 3 07:56:22 PDT 2023
Author: Sp00ph
Date: 2023-05-03T15:56:16+01:00
New Revision: 8e46ac3623b4dfbd7127fc0deee5cf82a3f9d472
URL: https://github.com/llvm/llvm-project/commit/8e46ac3623b4dfbd7127fc0deee5cf82a3f9d472
DIFF: https://github.com/llvm/llvm-project/commit/8e46ac3623b4dfbd7127fc0deee5cf82a3f9d472.diff
LOG: [AArch64] Add more efficient bitwise vector reductions.
Improves the codegen for VECREDUCE_{AND,OR,XOR} operations on AArch64.
Currently, these are fully scalarized, except if the vector is a <N x i1>. This
patch improves the codegen to O(log(N)) operations, where N is the vector
length, for vectors whose elements are not i1, by repeatedly applying the
bitwise operation to the two halves of the vector. <N x i1> bitwise reductions
are handled using VECREDUCE_{UMAX,UMIN,ADD} instead.
I had to update quite a few codegen tests with these changes, with a general
downward trend in instruction count. Since the vector reductions already have
tests, I haven't added any new tests myself.
Differential Revision: https://reviews.llvm.org/D148185
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
llvm/test/CodeGen/AArch64/double_reduct.ll
llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll
llvm/test/CodeGen/AArch64/reduce-and.ll
llvm/test/CodeGen/AArch64/reduce-or.ll
llvm/test/CodeGen/AArch64/reduce-xor.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-log-reduce.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-ptest.ll
llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll
llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ae1969b03ce4..f55e269e4dd6 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1001,6 +1001,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::CTLZ);
+ setTargetDAGCombine(ISD::VECREDUCE_AND);
+ setTargetDAGCombine(ISD::VECREDUCE_OR);
+ setTargetDAGCombine(ISD::VECREDUCE_XOR);
+
// In case of strict alignment, avoid an excessive number of byte wide stores.
MaxStoresPerMemsetOptSize = 8;
MaxStoresPerMemset =
@@ -1165,8 +1169,14 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
}
setOperationAction(ISD::VECREDUCE_ADD, MVT::v2i64, Custom);
+ setOperationAction(ISD::VECREDUCE_AND, MVT::v2i64, Custom);
+ setOperationAction(ISD::VECREDUCE_OR, MVT::v2i64, Custom);
+ setOperationAction(ISD::VECREDUCE_XOR, MVT::v2i64, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
@@ -13306,6 +13316,106 @@ static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
DAG.getConstant(0, DL, MVT::i64));
}
+static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
+ SDLoc DL, SelectionDAG &DAG) {
+ unsigned ScalarOpcode;
+ switch (Opcode) {
+ case ISD::VECREDUCE_AND:
+ ScalarOpcode = ISD::AND;
+ break;
+ case ISD::VECREDUCE_OR:
+ ScalarOpcode = ISD::OR;
+ break;
+ case ISD::VECREDUCE_XOR:
+ ScalarOpcode = ISD::XOR;
+ break;
+ default:
+ llvm_unreachable("Expected bitwise vector reduction");
+ return SDValue();
+ }
+
+ EVT VecVT = Vec.getValueType();
+ assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
+ "Expected power-of-2 length vector");
+
+ EVT ElemVT = VecVT.getVectorElementType();
+
+ SDValue Result;
+ unsigned NumElems = VecVT.getVectorNumElements();
+
+ // Special case for boolean reductions
+ if (ElemVT == MVT::i1) {
+ // Split large vectors into smaller ones
+ if (NumElems > 16) {
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
+ EVT HalfVT = Lo.getValueType();
+ SDValue HalfVec = DAG.getNode(ScalarOpcode, DL, HalfVT, Lo, Hi);
+ return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
+ }
+
+ // Vectors that are less than 64 bits get widened to neatly fit a 64 bit
+ // register, so e.g. <4 x i1> gets lowered to <4 x i16>. Sign extending to
+ // this element size leads to the best codegen, since e.g. setcc results
+ // might need to be truncated otherwise.
+ EVT ExtendedVT = MVT::getIntegerVT(std::max(64u / NumElems, 8u));
+
+ // any_ext doesn't work with umin/umax, so only use it for uadd.
+ unsigned ExtendOp =
+ ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
+ SDValue Extended = DAG.getNode(
+ ExtendOp, DL, VecVT.changeVectorElementType(ExtendedVT), Vec);
+ switch (ScalarOpcode) {
+ case ISD::AND:
+ Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
+ break;
+ case ISD::OR:
+ Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
+ break;
+ case ISD::XOR:
+ Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);
+ break;
+ default:
+ llvm_unreachable("Unexpected Opcode");
+ }
+
+ Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1);
+ } else {
+ // Iteratively split the vector in half and combine using the bitwise
+ // operation until it fits in a 64 bit register.
+ while (VecVT.getSizeInBits() > 64) {
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
+ VecVT = Lo.getValueType();
+ NumElems = VecVT.getVectorNumElements();
+ Vec = DAG.getNode(ScalarOpcode, DL, VecVT, Lo, Hi);
+ }
+
+ EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), VecVT.getSizeInBits());
+
+ // Do the remaining work on a scalar since it allows the code generator to
+ // combine the shift and bitwise operation into one instruction and since
+ // integer instructions can have higher throughput than vector instructions.
+ SDValue Scalar = DAG.getBitcast(ScalarVT, Vec);
+
+ // Iteratively combine the lower and upper halves of the scalar using the
+ // bitwise operation, halving the relevant region of the scalar in each
+ // iteration, until the relevant region is just one element of the original
+ // vector.
+ for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
+ SDValue ShiftAmount =
+ DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64);
+ SDValue Shifted =
+ DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmount);
+ Scalar = DAG.getNode(ScalarOpcode, DL, ScalarVT, Scalar, Shifted);
+ }
+
+ Result = DAG.getAnyExtOrTrunc(Scalar, DL, ElemVT);
+ }
+
+ return DAG.getAnyExtOrTrunc(Result, DL, VT);
+}
+
SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
SelectionDAG &DAG) const {
SDValue Src = Op.getOperand(0);
@@ -13357,6 +13467,11 @@ SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
// Lower NEON reductions.
SDLoc dl(Op);
switch (Op.getOpcode()) {
+ case ISD::VECREDUCE_AND:
+ case ISD::VECREDUCE_OR:
+ case ISD::VECREDUCE_XOR:
+ return getVectorBitwiseReduce(Op.getOpcode(), Op.getOperand(0),
+ Op.getValueType(), dl, DAG);
case ISD::VECREDUCE_ADD:
return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
case ISD::VECREDUCE_SMAX:
@@ -20892,6 +21007,22 @@ static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
Op0ExtV, Op1ExtV, Op->getOperand(2));
}
+static SDValue
+performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
+ SDValue Vec = N->getOperand(0);
+ if (DCI.isBeforeLegalize() &&
+ Vec.getValueType().getVectorElementType() == MVT::i1 &&
+ Vec.getValueType().isFixedLengthVector() &&
+ Vec.getValueType().isPow2VectorType()) {
+ SDLoc DL(N);
+ return getVectorBitwiseReduce(N->getOpcode(), Vec, N->getValueType(0), DL,
+ DAG);
+ }
+
+ return SDValue();
+}
+
static SDValue performSETCCCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
@@ -22060,6 +22191,10 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
default:
LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
break;
+ case ISD::VECREDUCE_AND:
+ case ISD::VECREDUCE_OR:
+ case ISD::VECREDUCE_XOR:
+ return performVecReduceBitwiseCombine(N, DCI, DAG);
case ISD::ADD:
case ISD::SUB:
return performAddSubCombine(N, DCI, DAG);
diff --git a/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll b/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
index 0bbe869c5da0..71b4ed3880e6 100644
--- a/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
+++ b/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
@@ -193,10 +193,10 @@ define i1 @combine_setcc_ne_vecreduce_and_v8i1(<8 x i8> %a) {
; CHECK-LABEL: combine_setcc_ne_vecreduce_and_v8i1:
; CHECK: // %bb.0:
; CHECK-NEXT: cmtst v0.8b, v0.8b, v0.8b
+; CHECK-NEXT: mov w8, #1 // =0x1
; CHECK-NEXT: uminv b0, v0.8b
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: tst w8, #0x1
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: bic w0, w8, w9
; CHECK-NEXT: ret
%cmp1 = icmp ne <8 x i8> %a, zeroinitializer
%cast = bitcast <8 x i1> %cmp1 to i8
@@ -208,10 +208,10 @@ define i1 @combine_setcc_ne_vecreduce_and_v16i1(<16 x i8> %a) {
; CHECK-LABEL: combine_setcc_ne_vecreduce_and_v16i1:
; CHECK: // %bb.0:
; CHECK-NEXT: cmtst v0.16b, v0.16b, v0.16b
+; CHECK-NEXT: mov w8, #1 // =0x1
; CHECK-NEXT: uminv b0, v0.16b
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: tst w8, #0x1
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: bic w0, w8, w9
; CHECK-NEXT: ret
%cmp1 = icmp ne <16 x i8> %a, zeroinitializer
%cast = bitcast <16 x i1> %cmp1 to i16
@@ -223,12 +223,14 @@ define i1 @combine_setcc_ne_vecreduce_and_v32i1(<32 x i8> %a) {
; CHECK-LABEL: combine_setcc_ne_vecreduce_and_v32i1:
; CHECK: // %bb.0:
; CHECK-NEXT: cmtst v0.16b, v0.16b, v0.16b
+; CHECK-NEXT: mov w8, #1 // =0x1
; CHECK-NEXT: cmeq v1.16b, v1.16b, #0
; CHECK-NEXT: bic v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: shl v0.16b, v0.16b, #7
+; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
; CHECK-NEXT: uminv b0, v0.16b
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: tst w8, #0x1
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: bic w0, w8, w9
; CHECK-NEXT: ret
%cmp1 = icmp ne <32 x i8> %a, zeroinitializer
%cast = bitcast <32 x i1> %cmp1 to i32
@@ -240,16 +242,18 @@ define i1 @combine_setcc_ne_vecreduce_and_v64i1(<64 x i8> %a) {
; CHECK-LABEL: combine_setcc_ne_vecreduce_and_v64i1:
; CHECK: // %bb.0:
; CHECK-NEXT: cmtst v1.16b, v1.16b, v1.16b
+; CHECK-NEXT: mov w8, #1 // =0x1
; CHECK-NEXT: cmtst v0.16b, v0.16b, v0.16b
; CHECK-NEXT: cmeq v3.16b, v3.16b, #0
; CHECK-NEXT: cmeq v2.16b, v2.16b, #0
; CHECK-NEXT: bic v1.16b, v1.16b, v3.16b
; CHECK-NEXT: bic v0.16b, v0.16b, v2.16b
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: shl v0.16b, v0.16b, #7
+; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
; CHECK-NEXT: uminv b0, v0.16b
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: tst w8, #0x1
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: bic w0, w8, w9
; CHECK-NEXT: ret
%cmp1 = icmp ne <64 x i8> %a, zeroinitializer
%cast = bitcast <64 x i1> %cmp1 to i64
diff --git a/llvm/test/CodeGen/AArch64/double_reduct.ll b/llvm/test/CodeGen/AArch64/double_reduct.ll
index 78408ae10e0b..b2dfe3f160e3 100644
--- a/llvm/test/CodeGen/AArch64/double_reduct.ll
+++ b/llvm/test/CodeGen/AArch64/double_reduct.ll
@@ -131,9 +131,9 @@ define i32 @and_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: and w0, w9, w8
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: lsr x9, x8, #32
+; CHECK-NEXT: and w0, w8, w9
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.and.i32.v8i32(<8 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32> %b)
@@ -148,9 +148,9 @@ define i32 @or_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: orr w0, w9, w8
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: lsr x9, x8, #32
+; CHECK-NEXT: orr w0, w8, w9
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.or.i32.v8i32(<8 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32> %b)
@@ -165,9 +165,9 @@ define i32 @xor_i32(<8 x i32> %a, <4 x i32> %b) {
; CHECK-NEXT: eor v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: eor w0, w9, w8
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: lsr x9, x8, #32
+; CHECK-NEXT: eor w0, w8, w9
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.xor.i32.v8i32(<8 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32> %b)
diff --git a/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll b/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll
index 278a36d1a66f..f098b1e7b620 100644
--- a/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll
+++ b/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll
@@ -59,12 +59,14 @@ define i1 @unordered_floating_point_compare_on_v32f32(<32 x float> %a_vec) {
; CHECK-NEXT: fcmgt v4.4s, v4.4s, #0.0
; CHECK-NEXT: uzp1 v2.8h, v2.8h, v3.8h
; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: uzp1 v6.8h, v6.8h, v7.8h
-; CHECK-NEXT: uzp1 v1.8h, v4.8h, v5.8h
+; CHECK-NEXT: uzp1 v1.8h, v6.8h, v7.8h
+; CHECK-NEXT: uzp1 v3.8h, v4.8h, v5.8h
; CHECK-NEXT: uzp1 v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: uzp1 v1.16b, v1.16b, v6.16b
+; CHECK-NEXT: uzp1 v1.16b, v3.16b, v1.16b
; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: orn v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: shl v0.16b, v0.16b, #7
+; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
; CHECK-NEXT: umaxv b0, v0.16b
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: bic w0, w9, w8
diff --git a/llvm/test/CodeGen/AArch64/reduce-and.ll b/llvm/test/CodeGen/AArch64/reduce-and.ll
index 66eb3080f14e..71b3f1d24799 100644
--- a/llvm/test/CodeGen/AArch64/reduce-and.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-and.ll
@@ -256,9 +256,14 @@ define i8 @test_redand_v1i8(<1 x i8> %a) {
define i8 @test_redand_v3i8(<3 x i8> %a) {
; CHECK-LABEL: test_redand_v3i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: and w8, w0, w1
-; CHECK-NEXT: and w8, w8, w2
-; CHECK-NEXT: and w0, w8, #0xff
+; CHECK-NEXT: movi d0, #0xff00ff00ff00ff
+; CHECK-NEXT: mov v0.h[0], w0
+; CHECK-NEXT: mov v0.h[1], w1
+; CHECK-NEXT: mov v0.h[2], w2
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: and x8, x8, x8, lsr #32
+; CHECK-NEXT: lsr x9, x8, #16
+; CHECK-NEXT: and w0, w8, w9
; CHECK-NEXT: ret
;
; GISEL-LABEL: test_redand_v3i8:
@@ -273,14 +278,10 @@ define i8 @test_redand_v3i8(<3 x i8> %a) {
define i8 @test_redand_v4i8(<4 x i8> %a) {
; CHECK-LABEL: test_redand_v4i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: umov w8, v0.h[3]
-; CHECK-NEXT: umov w9, v0.h[2]
-; CHECK-NEXT: umov w10, v0.h[1]
-; CHECK-NEXT: umov w11, v0.h[0]
-; CHECK-NEXT: and w8, w9, w8
-; CHECK-NEXT: and w10, w11, w10
-; CHECK-NEXT: and w0, w10, w8
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: and x8, x8, x8, lsr #32
+; CHECK-NEXT: lsr x9, x8, #16
+; CHECK-NEXT: and w0, w8, w9
; CHECK-NEXT: ret
;
; GISEL-LABEL: test_redand_v4i8:
@@ -304,22 +305,11 @@ define i8 @test_redand_v4i8(<4 x i8> %a) {
define i8 @test_redand_v8i8(<8 x i8> %a) {
; CHECK-LABEL: test_redand_v8i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: umov w8, v0.b[5]
-; CHECK-NEXT: umov w9, v0.b[4]
-; CHECK-NEXT: umov w10, v0.b[1]
-; CHECK-NEXT: umov w11, v0.b[0]
-; CHECK-NEXT: umov w12, v0.b[3]
-; CHECK-NEXT: umov w13, v0.b[2]
-; CHECK-NEXT: umov w14, v0.b[6]
-; CHECK-NEXT: umov w15, v0.b[7]
-; CHECK-NEXT: and w8, w9, w8
-; CHECK-NEXT: and w10, w11, w10
-; CHECK-NEXT: and w11, w13, w12
-; CHECK-NEXT: and w9, w10, w11
-; CHECK-NEXT: and w8, w8, w14
-; CHECK-NEXT: and w8, w9, w8
-; CHECK-NEXT: and w0, w8, w15
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: and x8, x8, x8, lsr #32
+; CHECK-NEXT: and x8, x8, x8, lsr #16
+; CHECK-NEXT: lsr x9, x8, #8
+; CHECK-NEXT: and w0, w8, w9
; CHECK-NEXT: ret
;
; GISEL-LABEL: test_redand_v8i8:
@@ -357,20 +347,10 @@ define i8 @test_redand_v16i8(<16 x i8> %a) {
; CHECK: // %bb.0:
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: umov w8, v0.b[1]
-; CHECK-NEXT: umov w9, v0.b[0]
-; CHECK-NEXT: umov w10, v0.b[2]
-; CHECK-NEXT: umov w11, v0.b[3]
-; CHECK-NEXT: umov w12, v0.b[4]
-; CHECK-NEXT: umov w13, v0.b[5]
-; CHECK-NEXT: umov w14, v0.b[6]
-; CHECK-NEXT: and w8, w9, w8
-; CHECK-NEXT: umov w9, v0.b[7]
-; CHECK-NEXT: and w10, w10, w11
-; CHECK-NEXT: and w11, w12, w13
-; CHECK-NEXT: and w8, w8, w10
-; CHECK-NEXT: and w10, w11, w14
-; CHECK-NEXT: and w8, w8, w10
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: and x8, x8, x8, lsr #32
+; CHECK-NEXT: and x8, x8, x8, lsr #16
+; CHECK-NEXT: lsr x9, x8, #8
; CHECK-NEXT: and w0, w8, w9
; CHECK-NEXT: ret
;
@@ -411,20 +391,10 @@ define i8 @test_redand_v32i8(<32 x i8> %a) {
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: umov w8, v0.b[1]
-; CHECK-NEXT: umov w9, v0.b[0]
-; CHECK-NEXT: umov w10, v0.b[2]
-; CHECK-NEXT: umov w11, v0.b[3]
-; CHECK-NEXT: umov w12, v0.b[4]
-; CHECK-NEXT: umov w13, v0.b[5]
-; CHECK-NEXT: umov w14, v0.b[6]
-; CHECK-NEXT: and w8, w9, w8
-; CHECK-NEXT: umov w9, v0.b[7]
-; CHECK-NEXT: and w10, w10, w11
-; CHECK-NEXT: and w11, w12, w13
-; CHECK-NEXT: and w8, w8, w10
-; CHECK-NEXT: and w10, w11, w14
-; CHECK-NEXT: and w8, w8, w10
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: and x8, x8, x8, lsr #32
+; CHECK-NEXT: and x8, x8, x8, lsr #16
+; CHECK-NEXT: lsr x9, x8, #8
; CHECK-NEXT: and w0, w8, w9
; CHECK-NEXT: ret
;
@@ -463,14 +433,10 @@ define i8 @test_redand_v32i8(<32 x i8> %a) {
define i16 @test_redand_v4i16(<4 x i16> %a) {
; CHECK-LABEL: test_redand_v4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: umov w8, v0.h[3]
-; CHECK-NEXT: umov w9, v0.h[2]
-; CHECK-NEXT: umov w10, v0.h[1]
-; CHECK-NEXT: umov w11, v0.h[0]
-; CHECK-NEXT: and w8, w9, w8
-; CHECK-NEXT: and w10, w11, w10
-; CHECK-NEXT: and w0, w10, w8
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: and x8, x8, x8, lsr #32
+; CHECK-NEXT: lsr x9, x8, #16
+; CHECK-NEXT: and w0, w8, w9
; CHECK-NEXT: ret
;
; GISEL-LABEL: test_redand_v4i16:
@@ -496,12 +462,9 @@ define i16 @test_redand_v8i16(<8 x i16> %a) {
; CHECK: // %bb.0:
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: umov w8, v0.h[1]
-; CHECK-NEXT: umov w9, v0.h[0]
-; CHECK-NEXT: umov w10, v0.h[2]
-; CHECK-NEXT: umov w11, v0.h[3]
-; CHECK-NEXT: and w8, w9, w8
-; CHECK-NEXT: and w9, w10, w11
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: and x8, x8, x8, lsr #32
+; CHECK-NEXT: lsr x9, x8, #16
; CHECK-NEXT: and w0, w8, w9
; CHECK-NEXT: ret
;
@@ -530,12 +493,9 @@ define i16 @test_redand_v16i16(<16 x i16> %a) {
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: umov w8, v0.h[1]
-; CHECK-NEXT: umov w9, v0.h[0]
-; CHECK-NEXT: umov w10, v0.h[2]
-; CHECK-NEXT: umov w11, v0.h[3]
-; CHECK-NEXT: and w8, w9, w8
-; CHECK-NEXT: and w9, w10, w11
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: and x8, x8, x8, lsr #32
+; CHECK-NEXT: lsr x9, x8, #16
; CHECK-NEXT: and w0, w8, w9
; CHECK-NEXT: ret
;
@@ -562,10 +522,9 @@ define i16 @test_redand_v16i16(<16 x i16> %a) {
define i32 @test_redand_v2i32(<2 x i32> %a) {
; CHECK-LABEL: test_redand_v2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: and w0, w9, w8
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: lsr x9, x8, #32
+; CHECK-NEXT: and w0, w8, w9
; CHECK-NEXT: ret
;
; GISEL-LABEL: test_redand_v2i32:
@@ -585,9 +544,9 @@ define i32 @test_redand_v4i32(<4 x i32> %a) {
; CHECK: // %bb.0:
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: and w0, w9, w8
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: lsr x9, x8, #32
+; CHECK-NEXT: and w0, w8, w9
; CHECK-NEXT: ret
;
; GISEL-LABEL: test_redand_v4i32:
@@ -609,9 +568,9 @@ define i32 @test_redand_v8i32(<8 x i32> %a) {
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: and w0, w9, w8
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: lsr x9, x8, #32
+; CHECK-NEXT: and w0, w8, w9
; CHECK-NEXT: ret
;
; GISEL-LABEL: test_redand_v8i32:
diff --git a/llvm/test/CodeGen/AArch64/reduce-or.ll b/llvm/test/CodeGen/AArch64/reduce-or.ll
index a44dcee3978b..591182018164 100644
--- a/llvm/test/CodeGen/AArch64/reduce-or.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-or.ll
@@ -256,8 +256,16 @@ define i8 @test_redor_v1i8(<1 x i8> %a) {
define i8 @test_redor_v3i8(<3 x i8> %a) {
; CHECK-LABEL: test_redor_v3i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: orr w8, w0, w1
-; CHECK-NEXT: orr w0, w8, w2
+; CHECK-NEXT: movi v0.2d, #0000000000000000
+; CHECK-NEXT: mov v0.h[0], w0
+; CHECK-NEXT: mov v0.h[1], w1
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: mov v0.h[2], w2
+; CHECK-NEXT: fmov x9, d0
+; CHECK-NEXT: lsr x10, x9, #32
+; CHECK-NEXT: lsr x9, x9, #16
+; CHECK-NEXT: orr w8, w8, w10
+; CHECK-NEXT: orr w0, w8, w9
; CHECK-NEXT: ret
;
; GISEL-LABEL: test_redor_v3i8:
@@ -272,14 +280,10 @@ define i8 @test_redor_v3i8(<3 x i8> %a) {
define i8 @test_redor_v4i8(<4 x i8> %a) {
; CHECK-LABEL: test_redor_v4i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: umov w8, v0.h[3]
-; CHECK-NEXT: umov w9, v0.h[2]
-; CHECK-NEXT: umov w10, v0.h[1]
-; CHECK-NEXT: umov w11, v0.h[0]
-; CHECK-NEXT: orr w8, w9, w8
-; CHECK-NEXT: orr w10, w11, w10
-; CHECK-NEXT: orr w0, w10, w8
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: orr x8, x8, x8, lsr #32
+; CHECK-NEXT: lsr x9, x8, #16
+; CHECK-NEXT: orr w0, w8, w9
; CHECK-NEXT: ret
;
; GISEL-LABEL: test_redor_v4i8:
@@ -303,22 +307,11 @@ define i8 @test_redor_v4i8(<4 x i8> %a) {
define i8 @test_redor_v8i8(<8 x i8> %a) {
; CHECK-LABEL: test_redor_v8i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: umov w8, v0.b[5]
-; CHECK-NEXT: umov w9, v0.b[4]
-; CHECK-NEXT: umov w10, v0.b[1]
-; CHECK-NEXT: umov w11, v0.b[0]
-; CHECK-NEXT: umov w12, v0.b[3]
-; CHECK-NEXT: umov w13, v0.b[2]
-; CHECK-NEXT: umov w14, v0.b[6]
-; CHECK-NEXT: umov w15, v0.b[7]
-; CHECK-NEXT: orr w8, w9, w8
-; CHECK-NEXT: orr w10, w11, w10
-; CHECK-NEXT: orr w11, w13, w12
-; CHECK-NEXT: orr w9, w10, w11
-; CHECK-NEXT: orr w8, w8, w14
-; CHECK-NEXT: orr w8, w9, w8
-; CHECK-NEXT: orr w0, w8, w15
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: orr x8, x8, x8, lsr #32
+; CHECK-NEXT: orr x8, x8, x8, lsr #16
+; CHECK-NEXT: lsr x9, x8, #8
+; CHECK-NEXT: orr w0, w8, w9
; CHECK-NEXT: ret
;
; GISEL-LABEL: test_redor_v8i8:
@@ -356,20 +349,10 @@ define i8 @test_redor_v16i8(<16 x i8> %a) {
; CHECK: // %bb.0:
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: umov w8, v0.b[1]
-; CHECK-NEXT: umov w9, v0.b[0]
-; CHECK-NEXT: umov w10, v0.b[2]
-; CHECK-NEXT: umov w11, v0.b[3]
-; CHECK-NEXT: umov w12, v0.b[4]
-; CHECK-NEXT: umov w13, v0.b[5]
-; CHECK-NEXT: umov w14, v0.b[6]
-; CHECK-NEXT: orr w8, w9, w8
-; CHECK-NEXT: umov w9, v0.b[7]
-; CHECK-NEXT: orr w10, w10, w11
-; CHECK-NEXT: orr w11, w12, w13
-; CHECK-NEXT: orr w8, w8, w10
-; CHECK-NEXT: orr w10, w11, w14
-; CHECK-NEXT: orr w8, w8, w10
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: orr x8, x8, x8, lsr #32
+; CHECK-NEXT: orr x8, x8, x8, lsr #16
+; CHECK-NEXT: lsr x9, x8, #8
; CHECK-NEXT: orr w0, w8, w9
; CHECK-NEXT: ret
;
@@ -410,20 +393,10 @@ define i8 @test_redor_v32i8(<32 x i8> %a) {
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: umov w8, v0.b[1]
-; CHECK-NEXT: umov w9, v0.b[0]
-; CHECK-NEXT: umov w10, v0.b[2]
-; CHECK-NEXT: umov w11, v0.b[3]
-; CHECK-NEXT: umov w12, v0.b[4]
-; CHECK-NEXT: umov w13, v0.b[5]
-; CHECK-NEXT: umov w14, v0.b[6]
-; CHECK-NEXT: orr w8, w9, w8
-; CHECK-NEXT: umov w9, v0.b[7]
-; CHECK-NEXT: orr w10, w10, w11
-; CHECK-NEXT: orr w11, w12, w13
-; CHECK-NEXT: orr w8, w8, w10
-; CHECK-NEXT: orr w10, w11, w14
-; CHECK-NEXT: orr w8, w8, w10
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: orr x8, x8, x8, lsr #32
+; CHECK-NEXT: orr x8, x8, x8, lsr #16
+; CHECK-NEXT: lsr x9, x8, #8
; CHECK-NEXT: orr w0, w8, w9
; CHECK-NEXT: ret
;
@@ -462,14 +435,10 @@ define i8 @test_redor_v32i8(<32 x i8> %a) {
define i16 @test_redor_v4i16(<4 x i16> %a) {
; CHECK-LABEL: test_redor_v4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: umov w8, v0.h[3]
-; CHECK-NEXT: umov w9, v0.h[2]
-; CHECK-NEXT: umov w10, v0.h[1]
-; CHECK-NEXT: umov w11, v0.h[0]
-; CHECK-NEXT: orr w8, w9, w8
-; CHECK-NEXT: orr w10, w11, w10
-; CHECK-NEXT: orr w0, w10, w8
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: orr x8, x8, x8, lsr #32
+; CHECK-NEXT: lsr x9, x8, #16
+; CHECK-NEXT: orr w0, w8, w9
; CHECK-NEXT: ret
;
; GISEL-LABEL: test_redor_v4i16:
@@ -495,12 +464,9 @@ define i16 @test_redor_v8i16(<8 x i16> %a) {
; CHECK: // %bb.0:
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: umov w8, v0.h[1]
-; CHECK-NEXT: umov w9, v0.h[0]
-; CHECK-NEXT: umov w10, v0.h[2]
-; CHECK-NEXT: umov w11, v0.h[3]
-; CHECK-NEXT: orr w8, w9, w8
-; CHECK-NEXT: orr w9, w10, w11
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: orr x8, x8, x8, lsr #32
+; CHECK-NEXT: lsr x9, x8, #16
; CHECK-NEXT: orr w0, w8, w9
; CHECK-NEXT: ret
;
@@ -529,12 +495,9 @@ define i16 @test_redor_v16i16(<16 x i16> %a) {
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: umov w8, v0.h[1]
-; CHECK-NEXT: umov w9, v0.h[0]
-; CHECK-NEXT: umov w10, v0.h[2]
-; CHECK-NEXT: umov w11, v0.h[3]
-; CHECK-NEXT: orr w8, w9, w8
-; CHECK-NEXT: orr w9, w10, w11
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: orr x8, x8, x8, lsr #32
+; CHECK-NEXT: lsr x9, x8, #16
; CHECK-NEXT: orr w0, w8, w9
; CHECK-NEXT: ret
;
@@ -561,10 +524,9 @@ define i16 @test_redor_v16i16(<16 x i16> %a) {
define i32 @test_redor_v2i32(<2 x i32> %a) {
; CHECK-LABEL: test_redor_v2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: orr w0, w9, w8
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: lsr x9, x8, #32
+; CHECK-NEXT: orr w0, w8, w9
; CHECK-NEXT: ret
;
; GISEL-LABEL: test_redor_v2i32:
@@ -584,9 +546,9 @@ define i32 @test_redor_v4i32(<4 x i32> %a) {
; CHECK: // %bb.0:
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: orr w0, w9, w8
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: lsr x9, x8, #32
+; CHECK-NEXT: orr w0, w8, w9
; CHECK-NEXT: ret
;
; GISEL-LABEL: test_redor_v4i32:
@@ -608,9 +570,9 @@ define i32 @test_redor_v8i32(<8 x i32> %a) {
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: orr w0, w9, w8
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: lsr x9, x8, #32
+; CHECK-NEXT: orr w0, w8, w9
; CHECK-NEXT: ret
;
; GISEL-LABEL: test_redor_v8i32:
diff --git a/llvm/test/CodeGen/AArch64/reduce-xor.ll b/llvm/test/CodeGen/AArch64/reduce-xor.ll
index 7bed71dffb93..494399f80857 100644
--- a/llvm/test/CodeGen/AArch64/reduce-xor.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-xor.ll
@@ -245,8 +245,16 @@ define i8 @test_redxor_v1i8(<1 x i8> %a) {
define i8 @test_redxor_v3i8(<3 x i8> %a) {
; CHECK-LABEL: test_redxor_v3i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: eor w8, w0, w1
-; CHECK-NEXT: eor w0, w8, w2
+; CHECK-NEXT: movi v0.2d, #0000000000000000
+; CHECK-NEXT: mov v0.h[0], w0
+; CHECK-NEXT: mov v0.h[1], w1
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: mov v0.h[2], w2
+; CHECK-NEXT: fmov x9, d0
+; CHECK-NEXT: lsr x10, x9, #32
+; CHECK-NEXT: lsr x9, x9, #16
+; CHECK-NEXT: eor w8, w8, w10
+; CHECK-NEXT: eor w0, w8, w9
; CHECK-NEXT: ret
;
; GISEL-LABEL: test_redxor_v3i8:
@@ -261,14 +269,10 @@ define i8 @test_redxor_v3i8(<3 x i8> %a) {
define i8 @test_redxor_v4i8(<4 x i8> %a) {
; CHECK-LABEL: test_redxor_v4i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: umov w8, v0.h[3]
-; CHECK-NEXT: umov w9, v0.h[2]
-; CHECK-NEXT: umov w10, v0.h[1]
-; CHECK-NEXT: umov w11, v0.h[0]
-; CHECK-NEXT: eor w8, w9, w8
-; CHECK-NEXT: eor w10, w11, w10
-; CHECK-NEXT: eor w0, w10, w8
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: eor x8, x8, x8, lsr #32
+; CHECK-NEXT: lsr x9, x8, #16
+; CHECK-NEXT: eor w0, w8, w9
; CHECK-NEXT: ret
;
; GISEL-LABEL: test_redxor_v4i8:
@@ -292,22 +296,11 @@ define i8 @test_redxor_v4i8(<4 x i8> %a) {
define i8 @test_redxor_v8i8(<8 x i8> %a) {
; CHECK-LABEL: test_redxor_v8i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: umov w8, v0.b[5]
-; CHECK-NEXT: umov w9, v0.b[4]
-; CHECK-NEXT: umov w10, v0.b[1]
-; CHECK-NEXT: umov w11, v0.b[0]
-; CHECK-NEXT: umov w12, v0.b[3]
-; CHECK-NEXT: umov w13, v0.b[2]
-; CHECK-NEXT: umov w14, v0.b[6]
-; CHECK-NEXT: umov w15, v0.b[7]
-; CHECK-NEXT: eor w8, w9, w8
-; CHECK-NEXT: eor w10, w11, w10
-; CHECK-NEXT: eor w11, w13, w12
-; CHECK-NEXT: eor w9, w10, w11
-; CHECK-NEXT: eor w8, w8, w14
-; CHECK-NEXT: eor w8, w9, w8
-; CHECK-NEXT: eor w0, w8, w15
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: eor x8, x8, x8, lsr #32
+; CHECK-NEXT: eor x8, x8, x8, lsr #16
+; CHECK-NEXT: lsr x9, x8, #8
+; CHECK-NEXT: eor w0, w8, w9
; CHECK-NEXT: ret
;
; GISEL-LABEL: test_redxor_v8i8:
@@ -345,20 +338,10 @@ define i8 @test_redxor_v16i8(<16 x i8> %a) {
; CHECK: // %bb.0:
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: umov w8, v0.b[1]
-; CHECK-NEXT: umov w9, v0.b[0]
-; CHECK-NEXT: umov w10, v0.b[2]
-; CHECK-NEXT: umov w11, v0.b[3]
-; CHECK-NEXT: umov w12, v0.b[4]
-; CHECK-NEXT: umov w13, v0.b[5]
-; CHECK-NEXT: umov w14, v0.b[6]
-; CHECK-NEXT: eor w8, w9, w8
-; CHECK-NEXT: umov w9, v0.b[7]
-; CHECK-NEXT: eor w10, w10, w11
-; CHECK-NEXT: eor w11, w12, w13
-; CHECK-NEXT: eor w8, w8, w10
-; CHECK-NEXT: eor w10, w11, w14
-; CHECK-NEXT: eor w8, w8, w10
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: eor x8, x8, x8, lsr #32
+; CHECK-NEXT: eor x8, x8, x8, lsr #16
+; CHECK-NEXT: lsr x9, x8, #8
; CHECK-NEXT: eor w0, w8, w9
; CHECK-NEXT: ret
;
@@ -399,20 +382,10 @@ define i8 @test_redxor_v32i8(<32 x i8> %a) {
; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: umov w8, v0.b[1]
-; CHECK-NEXT: umov w9, v0.b[0]
-; CHECK-NEXT: umov w10, v0.b[2]
-; CHECK-NEXT: umov w11, v0.b[3]
-; CHECK-NEXT: umov w12, v0.b[4]
-; CHECK-NEXT: umov w13, v0.b[5]
-; CHECK-NEXT: umov w14, v0.b[6]
-; CHECK-NEXT: eor w8, w9, w8
-; CHECK-NEXT: umov w9, v0.b[7]
-; CHECK-NEXT: eor w10, w10, w11
-; CHECK-NEXT: eor w11, w12, w13
-; CHECK-NEXT: eor w8, w8, w10
-; CHECK-NEXT: eor w10, w11, w14
-; CHECK-NEXT: eor w8, w8, w10
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: eor x8, x8, x8, lsr #32
+; CHECK-NEXT: eor x8, x8, x8, lsr #16
+; CHECK-NEXT: lsr x9, x8, #8
; CHECK-NEXT: eor w0, w8, w9
; CHECK-NEXT: ret
;
@@ -451,14 +424,10 @@ define i8 @test_redxor_v32i8(<32 x i8> %a) {
define i16 @test_redxor_v4i16(<4 x i16> %a) {
; CHECK-LABEL: test_redxor_v4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: umov w8, v0.h[3]
-; CHECK-NEXT: umov w9, v0.h[2]
-; CHECK-NEXT: umov w10, v0.h[1]
-; CHECK-NEXT: umov w11, v0.h[0]
-; CHECK-NEXT: eor w8, w9, w8
-; CHECK-NEXT: eor w10, w11, w10
-; CHECK-NEXT: eor w0, w10, w8
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: eor x8, x8, x8, lsr #32
+; CHECK-NEXT: lsr x9, x8, #16
+; CHECK-NEXT: eor w0, w8, w9
; CHECK-NEXT: ret
;
; GISEL-LABEL: test_redxor_v4i16:
@@ -484,12 +453,9 @@ define i16 @test_redxor_v8i16(<8 x i16> %a) {
; CHECK: // %bb.0:
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: umov w8, v0.h[1]
-; CHECK-NEXT: umov w9, v0.h[0]
-; CHECK-NEXT: umov w10, v0.h[2]
-; CHECK-NEXT: umov w11, v0.h[3]
-; CHECK-NEXT: eor w8, w9, w8
-; CHECK-NEXT: eor w9, w10, w11
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: eor x8, x8, x8, lsr #32
+; CHECK-NEXT: lsr x9, x8, #16
; CHECK-NEXT: eor w0, w8, w9
; CHECK-NEXT: ret
;
@@ -518,12 +484,9 @@ define i16 @test_redxor_v16i16(<16 x i16> %a) {
; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: umov w8, v0.h[1]
-; CHECK-NEXT: umov w9, v0.h[0]
-; CHECK-NEXT: umov w10, v0.h[2]
-; CHECK-NEXT: umov w11, v0.h[3]
-; CHECK-NEXT: eor w8, w9, w8
-; CHECK-NEXT: eor w9, w10, w11
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: eor x8, x8, x8, lsr #32
+; CHECK-NEXT: lsr x9, x8, #16
; CHECK-NEXT: eor w0, w8, w9
; CHECK-NEXT: ret
;
@@ -550,10 +513,9 @@ define i16 @test_redxor_v16i16(<16 x i16> %a) {
define i32 @test_redxor_v2i32(<2 x i32> %a) {
; CHECK-LABEL: test_redxor_v2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: eor w0, w9, w8
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: lsr x9, x8, #32
+; CHECK-NEXT: eor w0, w8, w9
; CHECK-NEXT: ret
;
; GISEL-LABEL: test_redxor_v2i32:
@@ -573,9 +535,9 @@ define i32 @test_redxor_v4i32(<4 x i32> %a) {
; CHECK: // %bb.0:
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: eor w0, w9, w8
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: lsr x9, x8, #32
+; CHECK-NEXT: eor w0, w8, w9
; CHECK-NEXT: ret
;
; GISEL-LABEL: test_redxor_v4i32:
@@ -597,9 +559,9 @@ define i32 @test_redxor_v8i32(<8 x i32> %a) {
; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: eor w0, w9, w8
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: lsr x9, x8, #32
+; CHECK-NEXT: eor w0, w8, w9
; CHECK-NEXT: ret
;
; GISEL-LABEL: test_redxor_v8i32:
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-log-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-log-reduce.ll
index ed9b73c75314..61dbcbdb475d 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-log-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-log-reduce.ll
@@ -51,7 +51,7 @@ define i8 @andv_v32i8(ptr %a) vscale_range(2,0) #0 {
define i8 @andv_v64i8(ptr %a) #0 {
; VBITS_GE_256-LABEL: andv_v64i8:
; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: mov w8, #32 // =0x20
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
@@ -140,7 +140,7 @@ define i16 @andv_v16i16(ptr %a) vscale_range(2,0) #0 {
define i16 @andv_v32i16(ptr %a) #0 {
; VBITS_GE_256-LABEL: andv_v32i16:
; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
@@ -229,7 +229,7 @@ define i32 @andv_v8i32(ptr %a) vscale_range(2,0) #0 {
define i32 @andv_v16i32(ptr %a) #0 {
; VBITS_GE_256-LABEL: andv_v16i32:
; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -316,7 +316,7 @@ define i64 @andv_v4i64(ptr %a) vscale_range(2,0) #0 {
define i64 @andv_v8i64(ptr %a) #0 {
; VBITS_GE_256-LABEL: andv_v8i64:
; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
@@ -409,7 +409,7 @@ define i8 @eorv_v32i8(ptr %a) vscale_range(2,0) #0 {
define i8 @eorv_v64i8(ptr %a) #0 {
; VBITS_GE_256-LABEL: eorv_v64i8:
; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: mov w8, #32 // =0x20
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
@@ -498,7 +498,7 @@ define i16 @eorv_v16i16(ptr %a) vscale_range(2,0) #0 {
define i16 @eorv_v32i16(ptr %a) #0 {
; VBITS_GE_256-LABEL: eorv_v32i16:
; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
@@ -587,7 +587,7 @@ define i32 @eorv_v8i32(ptr %a) vscale_range(2,0) #0 {
define i32 @eorv_v16i32(ptr %a) #0 {
; VBITS_GE_256-LABEL: eorv_v16i32:
; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -674,7 +674,7 @@ define i64 @eorv_v4i64(ptr %a) vscale_range(2,0) #0 {
define i64 @eorv_v8i64(ptr %a) #0 {
; VBITS_GE_256-LABEL: eorv_v8i64:
; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
@@ -767,7 +767,7 @@ define i8 @orv_v32i8(ptr %a) vscale_range(2,0) #0 {
define i8 @orv_v64i8(ptr %a) #0 {
; VBITS_GE_256-LABEL: orv_v64i8:
; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov w8, #32
+; VBITS_GE_256-NEXT: mov w8, #32 // =0x20
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
@@ -856,7 +856,7 @@ define i16 @orv_v16i16(ptr %a) vscale_range(2,0) #0 {
define i16 @orv_v32i16(ptr %a) #0 {
; VBITS_GE_256-LABEL: orv_v32i16:
; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #16
+; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
@@ -945,7 +945,7 @@ define i32 @orv_v8i32(ptr %a) vscale_range(2,0) #0 {
define i32 @orv_v16i32(ptr %a) #0 {
; VBITS_GE_256-LABEL: orv_v16i32:
; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #8
+; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -1032,7 +1032,7 @@ define i64 @orv_v4i64(ptr %a) vscale_range(2,0) #0 {
define i64 @orv_v8i64(ptr %a) #0 {
; VBITS_GE_256-LABEL: orv_v8i64:
; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: mov x8, #4
+; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-ptest.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-ptest.ll
index 3c0407a82938..1304bb8bc696 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-ptest.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-ptest.ll
@@ -17,8 +17,7 @@ define i1 @ptest_v16i1_256bit_min_sve(ptr %a, ptr %b) vscale_range(2, 0) {
; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b
; CHECK-NEXT: mov v1.d[1], v0.d[0]
-; CHECK-NEXT: ptrue p0.b, vl16
-; CHECK-NEXT: orv b0, p0, z1.b
+; CHECK-NEXT: umaxv b0, v1.16b
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: and w0, w8, #0x1
; CHECK-NEXT: ret
@@ -35,10 +34,9 @@ define i1 @ptest_v16i1_512bit_min_sve(ptr %a, ptr %b) vscale_range(4, 0) {
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, #0.0
; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p0.b, vl16
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
-; CHECK-NEXT: orv b0, p0, z0.b
+; CHECK-NEXT: umaxv b0, v0.16b
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: and w0, w8, #0x1
; CHECK-NEXT: ret
@@ -55,10 +53,9 @@ define i1 @ptest_v16i1_512bit_sve(ptr %a, ptr %b) vscale_range(4, 4) {
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, #0.0
; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p0.b, vl16
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
-; CHECK-NEXT: orv b0, p0, z0.b
+; CHECK-NEXT: umaxv b0, v0.16b
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: and w0, w8, #0x1
; CHECK-NEXT: ret
@@ -78,10 +75,9 @@ define i1 @ptest_or_v16i1_512bit_min_sve(ptr %a, ptr %b) vscale_range(4, 0) {
; CHECK-NEXT: fcmne p0.s, p0/z, z1.s, #0.0
; CHECK-NEXT: mov p0.b, p1/m, p1.b
; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p0.b, vl16
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
-; CHECK-NEXT: orv b0, p0, z0.b
+; CHECK-NEXT: umaxv b0, v0.16b
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: and w0, w8, #0x1
; CHECK-NEXT: ret
@@ -109,10 +105,9 @@ define i1 @ptest_and_v16i1_512bit_sve(ptr %a, ptr %b) vscale_range(4, 4) {
; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, #0.0
; CHECK-NEXT: fcmne p0.s, p0/z, z1.s, #0.0
; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p0.b, vl16
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
-; CHECK-NEXT: andv b0, p0, z0.b
+; CHECK-NEXT: uminv b0, v0.16b
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: and w0, w8, #0x1
; CHECK-NEXT: ret
@@ -135,10 +130,9 @@ define i1 @ptest_and_v16i1_512bit_min_sve(ptr %a, ptr %b) vscale_range(4, 0) {
; CHECK-NEXT: fcmne p0.s, p0/z, z1.s, #0.0
; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p0.b, vl16
; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
-; CHECK-NEXT: andv b0, p0, z0.b
+; CHECK-NEXT: uminv b0, v0.16b
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: and w0, w8, #0x1
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll
index ca5c1e25781d..0841fbe1763c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll
@@ -29,7 +29,7 @@ define i1 @ptest_v16i1(ptr %a, ptr %b) #0 {
; CHECK-NEXT: ptrue p0.b, vl8
; CHECK-NEXT: splice z1.b, p0, z1.b, z0.b
; CHECK-NEXT: ptrue p0.b, vl16
-; CHECK-NEXT: orv b0, p0, z1.b
+; CHECK-NEXT: umaxv b0, p0, z1.b
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: and w0, w8, #0x1
; CHECK-NEXT: ret
@@ -86,7 +86,7 @@ define i1 @ptest_or_v16i1(ptr %a, ptr %b) #0 {
; CHECK-NEXT: splice z3.b, p0, z3.b, z2.b
; CHECK-NEXT: orr z0.d, z1.d, z3.d
; CHECK-NEXT: ptrue p0.b, vl16
-; CHECK-NEXT: orv b0, p0, z0.b
+; CHECK-NEXT: umaxv b0, p0, z0.b
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: and w0, w8, #0x1
; CHECK-NEXT: ret
@@ -153,7 +153,7 @@ define i1 @ptest_and_v16i1(ptr %a, ptr %b) #0 {
; CHECK-NEXT: splice z3.b, p0, z3.b, z2.b
; CHECK-NEXT: and z0.d, z1.d, z3.d
; CHECK-NEXT: ptrue p0.b, vl16
-; CHECK-NEXT: andv b0, p0, z0.b
+; CHECK-NEXT: uminv b0, p0, z0.b
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: and w0, w8, #0x1
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll
index e97ace9c3858..1b0706355679 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll
@@ -85,9 +85,14 @@ define i128 @test_v1i128(<1 x i128> %a) nounwind {
define i8 @test_v3i8(<3 x i8> %a) nounwind {
; CHECK-LABEL: test_v3i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: and w8, w0, w1
-; CHECK-NEXT: and w8, w8, w2
-; CHECK-NEXT: and w0, w8, #0xff
+; CHECK-NEXT: movi d0, #0xff00ff00ff00ff
+; CHECK-NEXT: mov v0.h[0], w0
+; CHECK-NEXT: mov v0.h[1], w1
+; CHECK-NEXT: mov v0.h[2], w2
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: and x8, x8, x8, lsr #32
+; CHECK-NEXT: lsr x9, x8, #16
+; CHECK-NEXT: and w0, w8, w9
; CHECK-NEXT: ret
%b = call i8 @llvm.vector.reduce.and.v3i8(<3 x i8> %a)
ret i8 %b
@@ -97,28 +102,21 @@ define i8 @test_v9i8(<9 x i8> %a) nounwind {
; CHECK-LABEL: test_v9i8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #-1 // =0xffffffff
-; CHECK-NEXT: umov w14, v0.b[6]
; CHECK-NEXT: mov v1.16b, v0.16b
; CHECK-NEXT: mov v1.b[9], w8
; CHECK-NEXT: mov v1.b[10], w8
; CHECK-NEXT: mov v1.b[11], w8
+; CHECK-NEXT: mov v1.b[12], w8
; CHECK-NEXT: mov v1.b[13], w8
-; CHECK-NEXT: umov w8, v0.b[4]
+; CHECK-NEXT: mov v1.b[14], w8
+; CHECK-NEXT: mov v1.b[15], w8
; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT: and v1.8b, v0.8b, v1.8b
-; CHECK-NEXT: umov w9, v1.b[1]
-; CHECK-NEXT: umov w10, v1.b[0]
-; CHECK-NEXT: umov w11, v1.b[2]
-; CHECK-NEXT: umov w12, v1.b[3]
-; CHECK-NEXT: umov w13, v1.b[5]
-; CHECK-NEXT: and w9, w10, w9
-; CHECK-NEXT: umov w10, v0.b[7]
-; CHECK-NEXT: and w11, w11, w12
-; CHECK-NEXT: and w8, w8, w13
-; CHECK-NEXT: and w9, w9, w11
-; CHECK-NEXT: and w8, w8, w14
-; CHECK-NEXT: and w8, w9, w8
-; CHECK-NEXT: and w0, w8, w10
+; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: and x8, x8, x8, lsr #32
+; CHECK-NEXT: and x8, x8, x8, lsr #16
+; CHECK-NEXT: lsr x9, x8, #8
+; CHECK-NEXT: and w0, w8, w9
; CHECK-NEXT: ret
%b = call i8 @llvm.vector.reduce.and.v9i8(<9 x i8> %a)
ret i8 %b
@@ -128,9 +126,10 @@ define i32 @test_v3i32(<3 x i32> %a) nounwind {
; CHECK-LABEL: test_v3i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: lsr x8, x8, #32
+; CHECK-NEXT: and v1.8b, v0.8b, v1.8b
+; CHECK-NEXT: fmov x9, d1
; CHECK-NEXT: and w0, w9, w8
; CHECK-NEXT: ret
%b = call i32 @llvm.vector.reduce.and.v3i32(<3 x i32> %a)
@@ -155,9 +154,9 @@ define i24 @test_v4i24(<4 x i24> %a) nounwind {
; CHECK: // %bb.0:
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: and w0, w9, w8
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: lsr x9, x8, #32
+; CHECK-NEXT: and w0, w8, w9
; CHECK-NEXT: ret
%b = call i24 @llvm.vector.reduce.and.v4i24(<4 x i24> %a)
ret i24 %b
@@ -181,9 +180,9 @@ define i32 @test_v16i32(<16 x i32> %a) nounwind {
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: and w0, w9, w8
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: lsr x9, x8, #32
+; CHECK-NEXT: and w0, w8, w9
; CHECK-NEXT: ret
%b = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %a)
ret i32 %b
More information about the llvm-commits
mailing list