[llvm] d4a6bf4 - Revert "[AArch64][SVE][VLS] Move extends into arguments of comparisons"
Peter Waller via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 20 04:02:10 PST 2022
Author: Peter Waller
Date: 2022-01-20T12:01:23Z
New Revision: d4a6bf4d1a915d3b0566f38e13ea90b597f6cfe3
URL: https://github.com/llvm/llvm-project/commit/d4a6bf4d1a915d3b0566f38e13ea90b597f6cfe3
DIFF: https://github.com/llvm/llvm-project/commit/d4a6bf4d1a915d3b0566f38e13ea90b597f6cfe3.diff
LOG: Revert "[AArch64][SVE][VLS] Move extends into arguments of comparisons"
This reverts commit db04d3e30b3878ae39ef64eb0b0a1538644c7f6a, which
causes a buildbot failure.
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 450ff60bad8b..8f85c93e1d5f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15338,40 +15338,6 @@ static SDValue performIntrinsicCombine(SDNode *N,
return SDValue();
}
-static bool isCheapToExtend(const SDValue &N) {
- unsigned OC = N->getOpcode();
- return OC == ISD::LOAD || OC == ISD::MLOAD ||
- ISD::isConstantSplatVectorAllZeros(N.getNode());
-}
-
-static SDValue
-performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
- SelectionDAG &DAG) {
- // If we have (sext (setcc A B)) and A and B are cheap to extend,
- // we can move the sext into the arguments and have the same result. For
- // example, if A and B are both loads, we can make those extending loads and
- // avoid an extra instruction. This pattern appears often in VLS code
- generation where the inputs to the setcc have a different size to the
- // instruction that wants to use the result of the setcc.
- assert(N->getOpcode() == ISD::SIGN_EXTEND &&
- N->getOperand(0)->getOpcode() == ISD::SETCC);
- const SDValue SetCC = N->getOperand(0);
-
- if (isCheapToExtend(SetCC.getOperand(0)) &&
- isCheapToExtend(SetCC.getOperand(1))) {
- const SDValue Ext1 = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N),
- N->getValueType(0), SetCC.getOperand(0));
- const SDValue Ext2 = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N),
- N->getValueType(0), SetCC.getOperand(1));
-
- return DAG.getSetCC(
- SDLoc(SetCC), N->getValueType(0), Ext1, Ext2,
- cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get());
- }
-
- return SDValue();
-}
-
static SDValue performExtendCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
@@ -15390,11 +15356,6 @@ static SDValue performExtendCombine(SDNode *N,
return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
}
-
- if (N->getOpcode() == ISD::SIGN_EXTEND &&
- N->getOperand(0)->getOpcode() == ISD::SETCC)
- return performSignExtendSetCCCombine(N, DCI, DAG);
-
return SDValue();
}
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
index 6bb85a815389..d3ad7df60e66 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
@@ -265,10 +265,12 @@ define <8 x double> @masked_load_passthru_v8f64(<8 x double>* %ap, <8 x double>*
define <32 x i16> @masked_load_sext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0 {
; VBITS_GE_512-LABEL: masked_load_sext_v32i8i16:
; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.b, vl32
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p0.b, p0/z, z0.b, #0
+; VBITS_GE_512-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_512-NEXT: ld1sb { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
-; VBITS_GE_512-NEXT: ld1sb { z0.h }, p0/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq p1.h, p0/z, z0.h, #0
-; VBITS_GE_512-NEXT: ld1sb { z0.h }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x8]
; VBITS_GE_512-NEXT: ret
%b = load <32 x i8>, <32 x i8>* %bp
@@ -281,9 +283,12 @@ define <32 x i16> @masked_load_sext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0
define <16 x i32> @masked_load_sext_v16i8i32(<16 x i8>* %ap, <16 x i8>* %bp) #0 {
; VBITS_GE_512-LABEL: masked_load_sext_v16i8i32:
; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ldr q0, [x1]
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
-; VBITS_GE_512-NEXT: ld1sb { z0.s }, p0/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0
+; VBITS_GE_512-NEXT: cmeq v0.16b, v0.16b, #0
+; VBITS_GE_512-NEXT: sunpklo z0.h, z0.b
+; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT: cmpne p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT: ld1sb { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_512-NEXT: ret
@@ -297,9 +302,13 @@ define <16 x i32> @masked_load_sext_v16i8i32(<16 x i8>* %ap, <16 x i8>* %bp) #0
define <8 x i64> @masked_load_sext_v8i8i64(<8 x i8>* %ap, <8 x i8>* %bp) #0 {
; VBITS_GE_512-LABEL: masked_load_sext_v8i8i64:
; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ldr d0, [x1]
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
-; VBITS_GE_512-NEXT: ld1sb { z0.d }, p0/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_512-NEXT: cmeq v0.8b, v0.8b, #0
+; VBITS_GE_512-NEXT: sunpklo z0.h, z0.b
+; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT: ld1sb { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_512-NEXT: ret
@@ -313,10 +322,12 @@ define <8 x i64> @masked_load_sext_v8i8i64(<8 x i8>* %ap, <8 x i8>* %bp) #0 {
define <16 x i32> @masked_load_sext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp) #0 {
; VBITS_GE_512-LABEL: masked_load_sext_v16i16i32:
; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl16
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p0.h, p0/z, z0.h, #0
+; VBITS_GE_512-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_512-NEXT: ld1sh { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
-; VBITS_GE_512-NEXT: ld1sh { z0.s }, p0/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0
-; VBITS_GE_512-NEXT: ld1sh { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_512-NEXT: ret
%b = load <16 x i16>, <16 x i16>* %bp
@@ -329,9 +340,12 @@ define <16 x i32> @masked_load_sext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp)
define <8 x i64> @masked_load_sext_v8i16i64(<8 x i16>* %ap, <8 x i16>* %bp) #0 {
; VBITS_GE_512-LABEL: masked_load_sext_v8i16i64:
; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ldr q0, [x1]
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
-; VBITS_GE_512-NEXT: ld1sh { z0.d }, p0/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_512-NEXT: cmeq v0.8h, v0.8h, #0
+; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT: ld1sh { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_512-NEXT: ret
@@ -345,10 +359,12 @@ define <8 x i64> @masked_load_sext_v8i16i64(<8 x i16>* %ap, <8 x i16>* %bp) #0 {
define <8 x i64> @masked_load_sext_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 {
; VBITS_GE_512-LABEL: masked_load_sext_v8i32i64:
; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl8
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p0.s, p0/z, z0.s, #0
+; VBITS_GE_512-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_512-NEXT: ld1sw { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
-; VBITS_GE_512-NEXT: ld1sw { z0.d }, p0/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
-; VBITS_GE_512-NEXT: ld1sw { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_512-NEXT: ret
%b = load <8 x i32>, <8 x i32>* %bp
@@ -361,10 +377,12 @@ define <8 x i64> @masked_load_sext_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 {
define <32 x i16> @masked_load_zext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0 {
; VBITS_GE_512-LABEL: masked_load_zext_v32i8i16:
; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.b, vl32
+; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p0.b, p0/z, z0.b, #0
+; VBITS_GE_512-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_512-NEXT: ld1b { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
-; VBITS_GE_512-NEXT: ld1sb { z0.h }, p0/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq p1.h, p0/z, z0.h, #0
-; VBITS_GE_512-NEXT: ld1b { z0.h }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x8]
; VBITS_GE_512-NEXT: ret
%b = load <32 x i8>, <32 x i8>* %bp
@@ -377,9 +395,12 @@ define <32 x i16> @masked_load_zext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0
define <16 x i32> @masked_load_zext_v16i8i32(<16 x i8>* %ap, <16 x i8>* %bp) #0 {
; VBITS_GE_512-LABEL: masked_load_zext_v16i8i32:
; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ldr q0, [x1]
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
-; VBITS_GE_512-NEXT: ld1sb { z0.s }, p0/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0
+; VBITS_GE_512-NEXT: cmeq v0.16b, v0.16b, #0
+; VBITS_GE_512-NEXT: sunpklo z0.h, z0.b
+; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT: cmpne p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT: ld1b { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_512-NEXT: ret
@@ -393,9 +414,13 @@ define <16 x i32> @masked_load_zext_v16i8i32(<16 x i8>* %ap, <16 x i8>* %bp) #0
define <8 x i64> @masked_load_zext_v8i8i64(<8 x i8>* %ap, <8 x i8>* %bp) #0 {
; VBITS_GE_512-LABEL: masked_load_zext_v8i8i64:
; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ldr d0, [x1]
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
-; VBITS_GE_512-NEXT: ld1sb { z0.d }, p0/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_512-NEXT: cmeq v0.8b, v0.8b, #0
+; VBITS_GE_512-NEXT: sunpklo z0.h, z0.b
+; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT: ld1b { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_512-NEXT: ret
@@ -409,10 +434,12 @@ define <8 x i64> @masked_load_zext_v8i8i64(<8 x i8>* %ap, <8 x i8>* %bp) #0 {
define <16 x i32> @masked_load_zext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp) #0 {
; VBITS_GE_512-LABEL: masked_load_zext_v16i16i32:
; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.h, vl16
+; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p0.h, p0/z, z0.h, #0
+; VBITS_GE_512-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_512-NEXT: ld1h { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
-; VBITS_GE_512-NEXT: ld1sh { z0.s }, p0/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0
-; VBITS_GE_512-NEXT: ld1h { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_512-NEXT: ret
%b = load <16 x i16>, <16 x i16>* %bp
@@ -425,9 +452,12 @@ define <16 x i32> @masked_load_zext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp)
define <8 x i64> @masked_load_zext_v8i16i64(<8 x i16>* %ap, <8 x i16>* %bp) #0 {
; VBITS_GE_512-LABEL: masked_load_zext_v8i16i64:
; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ldr q0, [x1]
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
-; VBITS_GE_512-NEXT: ld1sh { z0.d }, p0/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
+; VBITS_GE_512-NEXT: cmeq v0.8h, v0.8h, #0
+; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h
+; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s
+; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT: ld1h { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_512-NEXT: ret
@@ -441,10 +471,12 @@ define <8 x i64> @masked_load_zext_v8i16i64(<8 x i16>* %ap, <8 x i16>* %bp) #0 {
define <8 x i64> @masked_load_zext_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 {
; VBITS_GE_512-LABEL: masked_load_zext_v8i32i64:
; VBITS_GE_512: // %bb.0:
+; VBITS_GE_512-NEXT: ptrue p0.s, vl8
+; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x1]
+; VBITS_GE_512-NEXT: cmpeq p0.s, p0/z, z0.s, #0
+; VBITS_GE_512-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_512-NEXT: ld1w { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
-; VBITS_GE_512-NEXT: ld1sw { z0.d }, p0/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0
-; VBITS_GE_512-NEXT: ld1w { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_512-NEXT: ret
%b = load <8 x i32>, <8 x i32>* %bp
@@ -649,10 +681,12 @@ define <8 x i64> @masked_load_zext_v8i32i64_m64(<8 x i32>* %ap, <8 x i64>* %bp)
define <128 x i16> @masked_load_sext_v128i8i16(<128 x i8>* %ap, <128 x i8>* %bp) #0 {
; VBITS_GE_2048-LABEL: masked_load_sext_v128i8i16:
; VBITS_GE_2048: // %bb.0:
+; VBITS_GE_2048-NEXT: ptrue p0.b, vl128
+; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: cmpeq p0.b, p0/z, z0.b, #0
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: ld1sb { z0.h }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT: ld1sb { z0.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.h, p0/z, z0.h, #0
-; VBITS_GE_2048-NEXT: ld1sb { z0.h }, p1/z, [x0]
; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x8]
; VBITS_GE_2048-NEXT: ret
%b = load <128 x i8>, <128 x i8>* %bp
@@ -665,10 +699,13 @@ define <128 x i16> @masked_load_sext_v128i8i16(<128 x i8>* %ap, <128 x i8>* %bp)
define <64 x i32> @masked_load_sext_v64i8i32(<64 x i8>* %ap, <64 x i8>* %bp) #0 {
; VBITS_GE_2048-LABEL: masked_load_sext_v64i8i32:
; VBITS_GE_2048: // %bb.0:
+; VBITS_GE_2048-NEXT: ptrue p0.b, vl64
+; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: cmpeq p0.b, p0/z, z0.b, #0
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: ld1sb { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: ld1sb { z0.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.s, p0/z, z0.s, #0
-; VBITS_GE_2048-NEXT: ld1sb { z0.s }, p1/z, [x0]
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_2048-NEXT: ret
%b = load <64 x i8>, <64 x i8>* %bp
@@ -681,10 +718,14 @@ define <64 x i32> @masked_load_sext_v64i8i32(<64 x i8>* %ap, <64 x i8>* %bp) #0
define <32 x i64> @masked_load_sext_v32i8i64(<32 x i8>* %ap, <32 x i8>* %bp) #0 {
; VBITS_GE_2048-LABEL: masked_load_sext_v32i8i64:
; VBITS_GE_2048: // %bb.0:
+; VBITS_GE_2048-NEXT: ptrue p0.b, vl32
+; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: cmpeq p0.b, p0/z, z0.b, #0
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: ld1sb { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1sb { z0.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.d, p0/z, z0.d, #0
-; VBITS_GE_2048-NEXT: ld1sb { z0.d }, p1/z, [x0]
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_2048-NEXT: ret
%b = load <32 x i8>, <32 x i8>* %bp
@@ -697,10 +738,12 @@ define <32 x i64> @masked_load_sext_v32i8i64(<32 x i8>* %ap, <32 x i8>* %bp) #0
define <64 x i32> @masked_load_sext_v64i16i32(<64 x i16>* %ap, <64 x i16>* %bp) #0 {
; VBITS_GE_2048-LABEL: masked_load_sext_v64i16i32:
; VBITS_GE_2048: // %bb.0:
+; VBITS_GE_2048-NEXT: ptrue p0.h, vl64
+; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: cmpeq p0.h, p0/z, z0.h, #0
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: ld1sh { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: ld1sh { z0.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.s, p0/z, z0.s, #0
-; VBITS_GE_2048-NEXT: ld1sh { z0.s }, p1/z, [x0]
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_2048-NEXT: ret
%b = load <64 x i16>, <64 x i16>* %bp
@@ -713,10 +756,13 @@ define <64 x i32> @masked_load_sext_v64i16i32(<64 x i16>* %ap, <64 x i16>* %bp)
define <32 x i64> @masked_load_sext_v32i16i64(<32 x i16>* %ap, <32 x i16>* %bp) #0 {
; VBITS_GE_2048-LABEL: masked_load_sext_v32i16i64:
; VBITS_GE_2048: // %bb.0:
+; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
+; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: cmpeq p0.h, p0/z, z0.h, #0
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: ld1sh { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1sh { z0.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.d, p0/z, z0.d, #0
-; VBITS_GE_2048-NEXT: ld1sh { z0.d }, p1/z, [x0]
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_2048-NEXT: ret
%b = load <32 x i16>, <32 x i16>* %bp
@@ -729,10 +775,12 @@ define <32 x i64> @masked_load_sext_v32i16i64(<32 x i16>* %ap, <32 x i16>* %bp)
define <32 x i64> @masked_load_sext_v32i32i64(<32 x i32>* %ap, <32 x i32>* %bp) #0 {
; VBITS_GE_2048-LABEL: masked_load_sext_v32i32i64:
; VBITS_GE_2048: // %bb.0:
+; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
+; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: cmpeq p0.s, p0/z, z0.s, #0
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: ld1sw { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1sw { z0.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.d, p0/z, z0.d, #0
-; VBITS_GE_2048-NEXT: ld1sw { z0.d }, p1/z, [x0]
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_2048-NEXT: ret
%b = load <32 x i32>, <32 x i32>* %bp
@@ -745,10 +793,12 @@ define <32 x i64> @masked_load_sext_v32i32i64(<32 x i32>* %ap, <32 x i32>* %bp)
define <128 x i16> @masked_load_zext_v128i8i16(<128 x i8>* %ap, <128 x i8>* %bp) #0 {
; VBITS_GE_2048-LABEL: masked_load_zext_v128i8i16:
; VBITS_GE_2048: // %bb.0:
+; VBITS_GE_2048-NEXT: ptrue p0.b, vl128
+; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: cmpeq p0.b, p0/z, z0.b, #0
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: ld1b { z0.h }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.h, vl128
-; VBITS_GE_2048-NEXT: ld1sb { z0.h }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.h, p0/z, z0.h, #0
-; VBITS_GE_2048-NEXT: ld1b { z0.h }, p1/z, [x0]
; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x8]
; VBITS_GE_2048-NEXT: ret
%b = load <128 x i8>, <128 x i8>* %bp
@@ -761,10 +811,13 @@ define <128 x i16> @masked_load_zext_v128i8i16(<128 x i8>* %ap, <128 x i8>* %bp)
define <64 x i32> @masked_load_zext_v64i8i32(<64 x i8>* %ap, <64 x i8>* %bp) #0 {
; VBITS_GE_2048-LABEL: masked_load_zext_v64i8i32:
; VBITS_GE_2048: // %bb.0:
+; VBITS_GE_2048-NEXT: ptrue p0.b, vl64
+; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: cmpeq p0.b, p0/z, z0.b, #0
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: ld1b { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: ld1sb { z0.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.s, p0/z, z0.s, #0
-; VBITS_GE_2048-NEXT: ld1b { z0.s }, p1/z, [x0]
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_2048-NEXT: ret
%b = load <64 x i8>, <64 x i8>* %bp
@@ -777,10 +830,14 @@ define <64 x i32> @masked_load_zext_v64i8i32(<64 x i8>* %ap, <64 x i8>* %bp) #0
define <32 x i64> @masked_load_zext_v32i8i64(<32 x i8>* %ap, <32 x i8>* %bp) #0 {
; VBITS_GE_2048-LABEL: masked_load_zext_v32i8i64:
; VBITS_GE_2048: // %bb.0:
+; VBITS_GE_2048-NEXT: ptrue p0.b, vl32
+; VBITS_GE_2048-NEXT: ld1b { z0.b }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: cmpeq p0.b, p0/z, z0.b, #0
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: ld1b { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1sb { z0.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.d, p0/z, z0.d, #0
-; VBITS_GE_2048-NEXT: ld1b { z0.d }, p1/z, [x0]
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_2048-NEXT: ret
%b = load <32 x i8>, <32 x i8>* %bp
@@ -793,10 +850,12 @@ define <32 x i64> @masked_load_zext_v32i8i64(<32 x i8>* %ap, <32 x i8>* %bp) #0
define <64 x i32> @masked_load_zext_v64i16i32(<64 x i16>* %ap, <64 x i16>* %bp) #0 {
; VBITS_GE_2048-LABEL: masked_load_zext_v64i16i32:
; VBITS_GE_2048: // %bb.0:
+; VBITS_GE_2048-NEXT: ptrue p0.h, vl64
+; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: cmpeq p0.h, p0/z, z0.h, #0
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: ld1h { z0.s }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.s, vl64
-; VBITS_GE_2048-NEXT: ld1sh { z0.s }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.s, p0/z, z0.s, #0
-; VBITS_GE_2048-NEXT: ld1h { z0.s }, p1/z, [x0]
; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x8]
; VBITS_GE_2048-NEXT: ret
%b = load <64 x i16>, <64 x i16>* %bp
@@ -809,10 +868,13 @@ define <64 x i32> @masked_load_zext_v64i16i32(<64 x i16>* %ap, <64 x i16>* %bp)
define <32 x i64> @masked_load_zext_v32i16i64(<32 x i16>* %ap, <32 x i16>* %bp) #0 {
; VBITS_GE_2048-LABEL: masked_load_zext_v32i16i64:
; VBITS_GE_2048: // %bb.0:
+; VBITS_GE_2048-NEXT: ptrue p0.h, vl32
+; VBITS_GE_2048-NEXT: ld1h { z0.h }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: cmpeq p0.h, p0/z, z0.h, #0
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: ld1h { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1sh { z0.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.d, p0/z, z0.d, #0
-; VBITS_GE_2048-NEXT: ld1h { z0.d }, p1/z, [x0]
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_2048-NEXT: ret
%b = load <32 x i16>, <32 x i16>* %bp
@@ -825,10 +887,12 @@ define <32 x i64> @masked_load_zext_v32i16i64(<32 x i16>* %ap, <32 x i16>* %bp)
define <32 x i64> @masked_load_zext_v32i32i64(<32 x i32>* %ap, <32 x i32>* %bp) #0 {
; VBITS_GE_2048-LABEL: masked_load_zext_v32i32i64:
; VBITS_GE_2048: // %bb.0:
+; VBITS_GE_2048-NEXT: ptrue p0.s, vl32
+; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x1]
+; VBITS_GE_2048-NEXT: cmpeq p0.s, p0/z, z0.s, #0
+; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b
+; VBITS_GE_2048-NEXT: ld1w { z0.d }, p0/z, [x0]
; VBITS_GE_2048-NEXT: ptrue p0.d, vl32
-; VBITS_GE_2048-NEXT: ld1sw { z0.d }, p0/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq p1.d, p0/z, z0.d, #0
-; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [x0]
; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x8]
; VBITS_GE_2048-NEXT: ret
%b = load <32 x i32>, <32 x i32>* %bp