[llvm] c089e29 - [AArch64][SVE] DAG combine SETCC_MERGE_ZERO of a SETCC_MERGE_ZERO
Bradley Smith via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 28 07:06:26 PDT 2021
Author: Bradley Smith
Date: 2021-06-28T15:06:06+01:00
New Revision: c089e29aa47f8833d4370ac1a87a17f7b3a585cf
URL: https://github.com/llvm/llvm-project/commit/c089e29aa47f8833d4370ac1a87a17f7b3a585cf
DIFF: https://github.com/llvm/llvm-project/commit/c089e29aa47f8833d4370ac1a87a17f7b3a585cf.diff
LOG: [AArch64][SVE] DAG combine SETCC_MERGE_ZERO of a SETCC_MERGE_ZERO
This helps remove extra comparisons when generating masks for fixed-length
masked operations.
Differential Revision: https://reviews.llvm.org/D104910
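For context, here is a minimal sketch of the kind of fixed-length masked load that hits
this combine, modelled on the tests updated below rather than copied from them (the
function name is illustrative). Built with e.g. -aarch64-sve-vector-bits-min=512, the
icmp provides the mask, and lowering previously re-derived the predicate through an
extra mov #-1 / cmpne pair on top of the cmpeq.

define <8 x i64> @masked_load_example(<8 x i64>* %ap, <8 x i64>* %bp) #0 {
  ; Load both operands, compare them, and use the comparison result as the mask.
  %a = load <8 x i64>, <8 x i64>* %ap
  %b = load <8 x i64>, <8 x i64>* %bp
  %mask = icmp eq <8 x i64> %a, %b
  %load = call <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* %ap, i32 8, <8 x i1> %mask, <8 x i64> undef)
  ret <8 x i64> %load
}

declare <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>*, i32, <8 x i1>, <8 x i64>)

attributes #0 = { "target-features"="+sve" }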
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 9886d6374665..16bb7eb22272 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15508,6 +15508,27 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
+static SDValue performSetccMergeZeroCombine(SDNode *N, SelectionDAG &DAG) {
+ assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
+ "Unexpected opcode!");
+
+ SDValue Pred = N->getOperand(0);
+ SDValue LHS = N->getOperand(1);
+ SDValue RHS = N->getOperand(2);
+ ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
+
+ // setcc_merge_zero pred (sign_extend (setcc_merge_zero ... pred ...)), 0, ne
+ // => inner setcc_merge_zero
+ if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
+ LHS->getOpcode() == ISD::SIGN_EXTEND &&
+ LHS->getOperand(0)->getValueType(0) == N->getValueType(0) &&
+ LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
+ LHS->getOperand(0)->getOperand(0) == Pred)
+ return LHS->getOperand(0);
+
+ return SDValue();
+}
+
// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
// as well as whether the test should be inverted. This code is required to
// catch these cases (as opposed to standard dag combines) because
@@ -16366,6 +16387,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performSpliceCombine(N, DAG);
case AArch64ISD::UZP1:
return performUzpCombine(N, DAG);
+ case AArch64ISD::SETCC_MERGE_ZERO:
+ return performSetccMergeZeroCombine(N, DAG);
case AArch64ISD::GLD1_MERGE_ZERO:
case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
index ecc2ca518df1..d8c040e3fba2 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
@@ -90,8 +90,6 @@ define <8 x float> @masked_load_v8f32(<8 x float>* %ap, <8 x float>* %bp) #0 {
; CHECK-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
; CHECK-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
; CHECK-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; CHECK-NEXT: mov [[Z0]].s, [[PG1]]/z, #-1
-; CHECK-NEXT: cmpne [[PG1]].s, [[PG0]]/z, [[Z0]].s, #0
; CHECK-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x0]
; CHECK-NEXT: st1w { [[Z0]].s }, [[PG0]], [x8]
; CHECK-NEXT: ret
@@ -108,8 +106,6 @@ define <16 x float> @masked_load_v16f32(<16 x float>* %ap, <16 x float>* %bp) #0
; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
; VBITS_GE_512-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_512-NEXT: mov [[Z0]].s, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG1]].s, [[PG0]]/z, [[Z0]].s, #0
; VBITS_GE_512-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x{{[0-9]+}}]
; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[PG0]], [x8]
; VBITS_GE_512-NEXT: ret
@@ -126,8 +122,6 @@ define <32 x float> @masked_load_v32f32(<32 x float>* %ap, <32 x float>* %bp) #0
; VBITS_GE_1024-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
; VBITS_GE_1024-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
; VBITS_GE_1024-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_1024-NEXT: mov [[Z0]].s, [[PG1]]/z, #-1
-; VBITS_GE_1024-NEXT: cmpne [[PG1]].s, [[PG0]]/z, [[Z0]].s, #0
; VBITS_GE_1024-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x{{[0-9]+}}]
; VBITS_GE_1024-NEXT: st1w { [[Z0]].s }, [[PG0]], [x8]
; VBITS_GE_1024-NEXT: ret
@@ -144,8 +138,6 @@ define <64 x float> @masked_load_v64f32(<64 x float>* %ap, <64 x float>* %bp) #0
; VBITS_GE_2048-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
; VBITS_GE_2048-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_2048-NEXT: mov [[Z0]].s, [[PG1]]/z, #-1
-; VBITS_GE_2048-NEXT: cmpne [[PG1]].s, [[PG0]]/z, [[Z0]].s, #0
; VBITS_GE_2048-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x{{[0-9]+}}]
; VBITS_GE_2048-NEXT: st1w { [[Z0]].s }, [[PG0]], [x8]
; VBITS_GE_2048-NEXT: ret
@@ -163,8 +155,6 @@ define <64 x i8> @masked_load_v64i8(<64 x i8>* %ap, <64 x i8>* %bp) #0 {
; VBITS_GE_512-NEXT: ld1b { [[Z0:z[0-9]+]].b }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1b { [[Z1:z[0-9]+]].b }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].b, [[PG0]]/z, [[Z0]].b, [[Z1]].b
-; VBITS_GE_512-NEXT: mov [[Z0]].b, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG1]].b, [[PG0]]/z, [[Z0]].b, #0
; VBITS_GE_512-NEXT: ld1b { [[Z0]].b }, [[PG1]]/z, [x{{[0-9]+}}]
; VBITS_GE_512-NEXT: st1b { [[Z0]].b }, [[PG0]], [x8]
; VBITS_GE_512-NEXT: ret
@@ -181,8 +171,6 @@ define <32 x i16> @masked_load_v32i16(<32 x i16>* %ap, <32 x i16>* %bp) #0 {
; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].h }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1h { [[Z1:z[0-9]+]].h }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].h, [[PG0]]/z, [[Z0]].h, [[Z1]].h
-; VBITS_GE_512-NEXT: mov [[Z0]].h, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG1]].h, [[PG0]]/z, [[Z0]].h, #0
; VBITS_GE_512-NEXT: ld1h { [[Z0]].h }, [[PG1]]/z, [x{{[0-9]+}}]
; VBITS_GE_512-NEXT: st1h { [[Z0]].h }, [[PG0]], [x8]
; VBITS_GE_512: ret
@@ -199,8 +187,6 @@ define <16 x i32> @masked_load_v16i32(<16 x i32>* %ap, <16 x i32>* %bp) #0 {
; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_512-NEXT: mov [[Z0]].s, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG1]].s, [[PG0]]/z, [[Z0]].s, #0
; VBITS_GE_512-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x{{[0-9]+}}]
; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[PG0]], [x8]
; VBITS_GE_512-NEXT: ret
@@ -217,8 +203,6 @@ define <8 x i64> @masked_load_v8i64(<8 x i64>* %ap, <8 x i64>* %bp) #0 {
; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].d, [[PG0]]/z, [[Z0]].d, [[Z1]].d
-; VBITS_GE_512-NEXT: mov [[Z0]].d, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG1]].d, [[PG0]]/z, [[Z0]].d, #0
; VBITS_GE_512-NEXT: ld1d { [[Z0]].d }, [[PG1]]/z, [x{{[0-9]+}}]
; VBITS_GE_512-NEXT: st1d { [[Z0]].d }, [[PG0]], [x8]
; VBITS_GE_512-NEXT: ret
@@ -235,8 +219,6 @@ define <8 x i64> @masked_load_passthru_v8i64(<8 x i64>* %ap, <8 x i64>* %bp) #0
; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].d, [[PG0]]/z, [[Z0]].d, [[Z1]].d
-; VBITS_GE_512-NEXT: mov [[Z0]].d, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG1]].d, [[PG0]]/z, [[Z0]].d, #0
; VBITS_GE_512-NEXT: ld1d { [[Z0]].d }, [[PG1]]/z, [x{{[0-9]+}}]
; VBITS_GE_512-NEXT: sel [[Z2:z[0-9]+]].d, [[PG1]], [[Z0]].d, [[Z1]].d
; VBITS_GE_512-NEXT: st1d { [[Z2]].d }, [[PG0]], [x8]
@@ -254,8 +236,6 @@ define <8 x double> @masked_load_passthru_v8f64(<8 x double>* %ap, <8 x double>*
; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, p0/z, [x1]
; VBITS_GE_512-NEXT: fcmeq [[PG1:p[0-9]+]].d, [[PG0]]/z, [[Z0]].d, [[Z1]].d
-; VBITS_GE_512-NEXT: mov [[Z0]].d, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG1]].d, [[PG0]]/z, [[Z0]].d, #0
; VBITS_GE_512-NEXT: ld1d { [[Z0]].d }, [[PG1]]/z, [x{{[0-9]+}}]
; VBITS_GE_512-NEXT: sel [[Z2:z[0-9]+]].d, [[PG1]], [[Z0]].d, [[Z1]].d
; VBITS_GE_512-NEXT: st1d { [[Z2]].d }, [[PG0]], [x8]
@@ -273,12 +253,10 @@ define <32 x i16> @masked_load_sext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0
; VBITS_GE_512-NEXT: ld1b { [[Z0:z[0-9]+]].b }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1b { [[Z1:z[0-9]+]].b }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].b, [[PG0]]/z, [[Z0]].b, [[Z1]].b
-; VBITS_GE_512-NEXT: mov [[Z0]].b, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG2:p[0-9]+]].b, [[PG0]]/z, [[Z0]].b, #0
-; VBITS_GE_512-NEXT: ld1b { [[Z0]].b }, [[PG2]]/z, [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ld1b { [[Z0]].b }, [[PG1]]/z, [x{{[0-9]+}}]
; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: sunpklo [[Z0]].h, [[Z0]].b
-; VBITS_GE_512-NEXT: st1h { [[Z0]].h }, [[PG2]], [x8]
+; VBITS_GE_512-NEXT: st1h { [[Z0]].h }, [[PG1]], [x8]
; VBITS_GE_512-NEXT: ret
%a = load <32 x i8>, <32 x i8>* %ap
%b = load <32 x i8>, <32 x i8>* %bp
@@ -337,12 +315,10 @@ define <16 x i32> @masked_load_sext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp)
; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].h }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1h { [[Z1:z[0-9]+]].h }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].h, [[PG0]]/z, [[Z0]].h, [[Z1]].h
-; VBITS_GE_512-NEXT: mov [[Z0]].h, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG2:p[0-9]+]].h, [[PG0]]/z, [[Z0]].h, #0
-; VBITS_GE_512-NEXT: ld1h { [[Z0]].h }, [[PG2]]/z, [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ld1h { [[Z0]].h }, [[PG1]]/z, [x{{[0-9]+}}]
; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: sunpklo [[Z0]].s, [[Z0]].h
-; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[PG2]], [x8]
+; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[PG1]], [x8]
; VBITS_GE_512-NEXT: ret
%a = load <16 x i16>, <16 x i16>* %ap
%b = load <16 x i16>, <16 x i16>* %bp
@@ -379,12 +355,10 @@ define <8 x i64> @masked_load_sext_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 {
; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_512-NEXT: mov [[Z0]].s, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG2:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, #0
-; VBITS_GE_512-NEXT: ld1w { [[Z0]].s }, [[PG2]]/z, [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x{{[0-9]+}}]
; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: sunpklo [[Z0]].d, [[Z0]].s
-; VBITS_GE_512-NEXT: st1d { [[Z0]].d }, [[PG2]], [x8]
+; VBITS_GE_512-NEXT: st1d { [[Z0]].d }, [[PG1]], [x8]
; VBITS_GE_512-NEXT: ret
%a = load <8 x i32>, <8 x i32>* %ap
%b = load <8 x i32>, <8 x i32>* %bp
@@ -400,12 +374,10 @@ define <32 x i16> @masked_load_zext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0
; VBITS_GE_512-NEXT: ld1b { [[Z0:z[0-9]+]].b }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1b { [[Z1:z[0-9]+]].b }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].b, [[PG0]]/z, [[Z0]].b, [[Z1]].b
-; VBITS_GE_512-NEXT: mov [[Z0]].b, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG2:p[0-9]+]].b, [[PG0]]/z, [[Z0]].b, #0
-; VBITS_GE_512-NEXT: ld1b { [[Z0]].b }, [[PG2]]/z, [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ld1b { [[Z0]].b }, [[PG1]]/z, [x{{[0-9]+}}]
; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: uunpklo [[Z0]].h, [[Z0]].b
-; VBITS_GE_512-NEXT: st1h { [[Z0]].h }, [[PG2]], [x8]
+; VBITS_GE_512-NEXT: st1h { [[Z0]].h }, [[PG1]], [x8]
; VBITS_GE_512-NEXT: ret
%a = load <32 x i8>, <32 x i8>* %ap
%b = load <32 x i8>, <32 x i8>* %bp
@@ -464,12 +436,10 @@ define <16 x i32> @masked_load_zext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp)
; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].h }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1h { [[Z1:z[0-9]+]].h }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].h, [[PG0]]/z, [[Z0]].h, [[Z1]].h
-; VBITS_GE_512-NEXT: mov [[Z0]].h, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG2:p[0-9]+]].h, [[PG0]]/z, [[Z0]].h, #0
-; VBITS_GE_512-NEXT: ld1h { [[Z0]].h }, [[PG2]]/z, [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ld1h { [[Z0]].h }, [[PG1]]/z, [x{{[0-9]+}}]
; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: uunpklo [[Z0]].s, [[Z0]].h
-; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[PG2]], [x8]
+; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[PG1]], [x8]
; VBITS_GE_512-NEXT: ret
%a = load <16 x i16>, <16 x i16>* %ap
%b = load <16 x i16>, <16 x i16>* %bp
@@ -506,12 +476,10 @@ define <8 x i64> @masked_load_zext_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 {
; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_512-NEXT: mov [[Z0]].s, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG2:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, #0
-; VBITS_GE_512-NEXT: ld1w { [[Z0]].s }, [[PG2]]/z, [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x{{[0-9]+}}]
; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: uunpklo [[Z0]].d, [[Z0]].s
-; VBITS_GE_512-NEXT: st1d { [[Z0]].d }, [[PG2]], [x8]
+; VBITS_GE_512-NEXT: st1d { [[Z0]].d }, [[PG1]], [x8]
; VBITS_GE_512-NEXT: ret
%a = load <8 x i32>, <8 x i32>* %ap
%b = load <8 x i32>, <8 x i32>* %bp
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
index bbba4336e0e6..6f5c5cee303c 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
@@ -91,9 +91,7 @@ define void @masked_store_v8f32(<8 x float>* %ap, <8 x float>* %bp) #0 {
; CHECK-NEXT: ld1w { [[Z0:z[0-9]+]].s }, [[PG0]]/z, [x0]
; CHECK-NEXT: ld1w { [[Z1:z[0-9]+]].s }, [[PG0]]/z, [x1]
; CHECK-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; CHECK-NEXT: mov [[Z2:z[0-9]+]].s, [[PG1]]/z, #-1
-; CHECK-NEXT: cmpne [[PG2:p[0-9]+]].s, [[PG0]]/z, [[Z2]].s, #0
-; CHECK-NEXT: st1w { z0.s }, [[PG2]], [x{{[0-9]+}}]
+; CHECK-NEXT: st1w { z0.s }, [[PG1]], [x{{[0-9]+}}]
; CHECK-NEXT: ret
%a = load <8 x float>, <8 x float>* %ap
%b = load <8 x float>, <8 x float>* %bp
@@ -108,9 +106,7 @@ define void @masked_store_v16f32(<16 x float>* %ap, <16 x float>* %bp) #0 {
; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, [[PG0]]/z, [x0]
; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, [[PG0]]/z, [x1]
; VBITS_GE_512-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_512-NEXT: mov [[Z2:z[0-9]+]].s, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG2:p[0-9]+]].s, [[PG0]]/z, [[Z1]].s, #0
-; VBITS_GE_512-NEXT: st1w { z0.s }, [[PG2]], [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: st1w { z0.s }, [[PG1]], [x{{[0-9]+}}]
; VBITS_GE_512-NEXT: ret
%a = load <16 x float>, <16 x float>* %ap
%b = load <16 x float>, <16 x float>* %bp
@@ -125,9 +121,7 @@ define void @masked_store_v32f32(<32 x float>* %ap, <32 x float>* %bp) #0 {
; VBITS_GE_1024-NEXT: ld1w { [[Z0:z[0-9]+]].s }, [[PG0]]/z, [x0]
; VBITS_GE_1024-NEXT: ld1w { [[Z1:z[0-9]+]].s }, [[PG0]]/z, [x1]
; VBITS_GE_1024-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_1024-NEXT: mov [[Z1:z[0-9]+]].s, [[PG1]]/z, #-1
-; VBITS_GE_1024-NEXT: cmpne [[PG2:p[0-9]+]].s, [[PG0]]/z, [[Z1]].s, #0
-; VBITS_GE_1024-NEXT: st1w { z0.s }, [[PG2]], [x{{[0-9]+}}]
+; VBITS_GE_1024-NEXT: st1w { z0.s }, [[PG1]], [x{{[0-9]+}}]
; VBITS_GE_1024-NEXT: ret
%a = load <32 x float>, <32 x float>* %ap
%b = load <32 x float>, <32 x float>* %bp
@@ -142,9 +136,7 @@ define void @masked_store_v64f32(<64 x float>* %ap, <64 x float>* %bp) #0 {
; VBITS_GE_2048-NEXT: ld1w { [[Z0:z[0-9]+]].s }, [[PG0]]/z, [x0]
; VBITS_GE_2048-NEXT: ld1w { [[Z1:z[0-9]+]].s }, [[PG0]]/z, [x1]
; VBITS_GE_2048-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_2048-NEXT: mov [[Z1:z[0-9]+]].s, [[PG1]]/z, #-1
-; VBITS_GE_2048-NEXT: cmpne [[PG2:p[0-9]+]].s, [[PG0]]/z, [[Z1]].s, #0
-; VBITS_GE_2048-NEXT: st1w { z0.s }, [[PG2]], [x{{[0-9]+}}]
+; VBITS_GE_2048-NEXT: st1w { z0.s }, [[PG1]], [x{{[0-9]+}}]
; VBITS_GE_2048-NEXT: ret
%a = load <64 x float>, <64 x float>* %ap
%b = load <64 x float>, <64 x float>* %bp
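Similarly, a hedged sketch of the masked-store form exercised by the tests above
(function name and exact types are illustrative, not taken from the patch): the fcmp
result feeds llvm.masked.store directly, so after the combine the fcmeq predicate can
be used as-is for the st1w.

define void @masked_store_example(<8 x float>* %ap, <8 x float>* %bp) #0 {
  ; Compare the two inputs and store %a only in the lanes where they are equal.
  %a = load <8 x float>, <8 x float>* %ap
  %b = load <8 x float>, <8 x float>* %bp
  %mask = fcmp oeq <8 x float> %a, %b
  call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> %a, <8 x float>* %ap, i32 8, <8 x i1> %mask)
  ret void
}

declare void @llvm.masked.store.v8f32.p0v8f32(<8 x float>, <8 x float>*, i32, <8 x i1>)

attributes #0 = { "target-features"="+sve" }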