[llvm] afdedd4 - [AArch64] Try to re-use extended operand for SETCC with vector ops.
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 7 16:51:14 PDT 2022
Author: Florian Hahn
Date: 2022-07-07T16:50:00-07:00
New Revision: afdedd405e493dc80bd5ceb9133b9d3a8bc69f2c
URL: https://github.com/llvm/llvm-project/commit/afdedd405e493dc80bd5ceb9133b9d3a8bc69f2c
DIFF: https://github.com/llvm/llvm-project/commit/afdedd405e493dc80bd5ceb9133b9d3a8bc69f2c.diff
LOG: [AArch64] Try to re-use extended operand for SETCC with vector ops.
Try to re-use an already extended operand for SetCC with vector operands
feeding an extended select. Doing so avoids requiring another full
extension of the SET_CC result when lowering the select.
This improves lowering for certain extend/cmp/select patterns.
For example, with v16i8 this replaces the 6 instructions for the extra
extension with 4 separate selects.
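As a sketch of the targeted pattern in LLVM IR (modeled on the v4i32 case
from the updated test file below; the function name and the splat constant
10 are purely illustrative):

define <4 x i32> @cmp_sel_sketch(<4 x i16> %a) {
  %ext = zext <4 x i16> %a to <4 x i32>
  %cmp = icmp ugt <4 x i16> %a, <i16 10, i16 10, i16 10, i16 10>
  %sel = select <4 x i1> %cmp, <4 x i32> %ext, <4 x i32> zeroinitializer
  ret <4 x i32> %sel
}

Because %ext is already available, the compare can be performed at the wide
type on %ext and an extended splat, so the resulting mask feeds the select
directly instead of being re-extended.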
This improves the generated code for loops like the one below in
combination with D96522.
#include <stdint.h>

int foo(uint8_t *p, int N) {
  unsigned long long sum = 0;
  for (int i = 0; i < N; i++, p++) {
    unsigned int v = *p;
    sum += (v < 127) ? v : 256 - v;
  }
  return sum;
}
https://clang.godbolt.org/z/Wco866MjY
On the AArch64 cores I have access to, the patch improves performance of
the vector loop by ~10%.
This could be generalized in follow-ups, but the initial version
targets one of the more important cases in combination with D96522.
Alive2 modeling:
* sext EQ https://alive2.llvm.org/ce/z/5upBvb
* sext NE https://alive2.llvm.org/ce/z/zbEcJp
* zext EQ https://alive2.llvm.org/ce/z/_xMwof
* zext NE https://alive2.llvm.org/ce/z/5FwKfc
* zext unsigned predicate: https://alive2.llvm.org/ce/z/iEwLU3
* sext signed predicate: https://alive2.llvm.org/ce/z/aMBega
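Each entry models an equivalence of roughly the following shape, shown here
as a scalar sketch for the zext/unsigned-predicate case (the links above
verify the actual vector forms):

define i1 @src(i8 %a) {
  %c = icmp ugt i8 %a, 10
  ret i1 %c
}

define i1 @tgt(i8 %a) {
  %ext = zext i8 %a to i32
  %c = icmp ugt i32 %ext, 10
  ret i1 %c
}

Zero extension preserves unsigned order and sign extension preserves signed
order, which is why the combine pairs zext with unsigned predicates, sext
with signed predicates, and accepts either for equality.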
Reviewed By: dmgreen
Differential Revision: https://reviews.llvm.org/D120481
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/vselect-ext.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8ac99cfc42954..ff030569eb3c1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18065,6 +18065,54 @@ static SDValue performCSELCombine(SDNode *N,
return performCONDCombine(N, DCI, DAG, 2, 3);
}

+// Try to re-use an already extended operand of a vector SetCC feeding an
+// extended select. Doing so avoids requiring another full extension of the
+// SET_CC result when lowering the select.
+static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
+  EVT Op0MVT = Op->getOperand(0).getValueType();
+  if (!Op0MVT.isVector() || Op->use_empty())
+    return SDValue();
+
+  // Make sure that all uses of Op are VSELECTs with result matching types where
+  // the result type has a larger element type than the SetCC operand.
+  SDNode *FirstUse = *Op->use_begin();
+  if (FirstUse->getOpcode() != ISD::VSELECT)
+    return SDValue();
+  EVT UseMVT = FirstUse->getValueType(0);
+  if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
+    return SDValue();
+  if (any_of(Op->uses(), [&UseMVT](const SDNode *N) {
+        return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT;
+      }))
+    return SDValue();
+
+  APInt V;
+  if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V))
+    return SDValue();
+
+  SDLoc DL(Op);
+  SDValue Op0ExtV;
+  SDValue Op1ExtV;
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get();
+  // Check if the first operand of the SET_CC is already extended. If it is,
+  // split the SET_CC and re-use the extended version of the operand.
+  SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT),
+                                        Op->getOperand(0));
+  SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT),
+                                        Op->getOperand(0));
+  if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
+    Op0ExtV = SDValue(Op0SExt, 0);
+    Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1));
+  } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
+    Op0ExtV = SDValue(Op0ZExt, 0);
+    Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1));
+  } else
+    return SDValue();
+
+  return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1),
+                     Op0ExtV, Op1ExtV, Op->getOperand(2));
+}
+
static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
SDValue LHS = N->getOperand(0);
@@ -18073,6 +18121,9 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
EVT VT = N->getValueType(0);

+  if (SDValue V = tryToWidenSetCCOperands(N, DAG))
+    return V;
+
// setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
if (Cond == ISD::SETNE && isOneConstant(RHS) &&
LHS->getOpcode() == AArch64ISD::CSEL &&
diff --git a/llvm/test/CodeGen/AArch64/vselect-ext.ll b/llvm/test/CodeGen/AArch64/vselect-ext.ll
index b3066e5247901..fa0db82b7a9a4 100644
--- a/llvm/test/CodeGen/AArch64/vselect-ext.ll
+++ b/llvm/test/CodeGen/AArch64/vselect-ext.ll
@@ -85,26 +85,23 @@ entry:
define <8 x i64> @same_zext_used_in_cmp_unsigned_pred_and_select_v8i64(<8 x i8> %a) {
; CHECK-LABEL: same_zext_used_in_cmp_unsigned_pred_and_select_v8i64:
; CHECK: ; %bb.0:
-; CHECK-NEXT: movi.8b v1, #10
-; CHECK-NEXT: ushll.8h v2, v0, #0
-; CHECK-NEXT: cmhi.8b v0, v0, v1
-; CHECK-NEXT: ushll.4s v1, v2, #0
-; CHECK-NEXT: sshll.8h v0, v0, #0
-; CHECK-NEXT: ushll2.4s v2, v2, #0
-; CHECK-NEXT: sshll.4s v3, v0, #0
-; CHECK-NEXT: sshll2.4s v0, v0, #0
-; CHECK-NEXT: ushll.2d v4, v1, #0
-; CHECK-NEXT: ushll.2d v5, v2, #0
-; CHECK-NEXT: ushll2.2d v1, v1, #0
-; CHECK-NEXT: ushll2.2d v2, v2, #0
-; CHECK-NEXT: sshll.2d v6, v3, #0
-; CHECK-NEXT: sshll.2d v7, v0, #0
-; CHECK-NEXT: sshll2.2d v0, v0, #0
-; CHECK-NEXT: sshll2.2d v16, v3, #0
-; CHECK-NEXT: and.16b v3, v2, v0
-; CHECK-NEXT: and.16b v1, v1, v16
-; CHECK-NEXT: and.16b v2, v5, v7
-; CHECK-NEXT: and.16b v0, v4, v6
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: mov w8, #10
+; CHECK-NEXT: ushll2.4s v2, v0, #0
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: dup.2d v1, x8
+; CHECK-NEXT: ushll2.2d v3, v2, #0
+; CHECK-NEXT: ushll2.2d v4, v0, #0
+; CHECK-NEXT: ushll.2d v0, v0, #0
+; CHECK-NEXT: ushll.2d v2, v2, #0
+; CHECK-NEXT: cmhi.2d v5, v0, v1
+; CHECK-NEXT: cmhi.2d v6, v2, v1
+; CHECK-NEXT: cmhi.2d v7, v3, v1
+; CHECK-NEXT: cmhi.2d v1, v4, v1
+; CHECK-NEXT: and.16b v3, v3, v7
+; CHECK-NEXT: and.16b v1, v4, v1
+; CHECK-NEXT: and.16b v2, v2, v6
+; CHECK-NEXT: and.16b v0, v0, v5
; CHECK-NEXT: ret
%ext = zext <8 x i8> %a to <8 x i64>
%cmp = icmp ugt <8 x i8> %a, <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
@@ -116,24 +113,21 @@ define <8 x i64> @same_zext_used_in_cmp_unsigned_pred_and_select_v8i64(<8 x i8>
define <16 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_v16i32(<16 x i8> %a) {
; CHECK-LABEL: same_zext_used_in_cmp_unsigned_pred_and_select_v16i32:
; CHECK: ; %bb.0:
-; CHECK-NEXT: movi.16b v1, #10
-; CHECK-NEXT: ushll.8h v2, v0, #0
-; CHECK-NEXT: ushll2.8h v3, v0, #0
-; CHECK-NEXT: ushll.4s v4, v2, #0
-; CHECK-NEXT: cmhi.16b v0, v0, v1
-; CHECK-NEXT: ushll.4s v5, v3, #0
-; CHECK-NEXT: ushll2.4s v1, v3, #0
-; CHECK-NEXT: sshll.8h v3, v0, #0
-; CHECK-NEXT: sshll2.8h v0, v0, #0
-; CHECK-NEXT: ushll2.4s v2, v2, #0
-; CHECK-NEXT: sshll.4s v6, v3, #0
-; CHECK-NEXT: sshll.4s v7, v0, #0
-; CHECK-NEXT: sshll2.4s v0, v0, #0
-; CHECK-NEXT: sshll2.4s v16, v3, #0
-; CHECK-NEXT: and.16b v3, v1, v0
-; CHECK-NEXT: and.16b v1, v2, v16
-; CHECK-NEXT: and.16b v2, v5, v7
-; CHECK-NEXT: and.16b v0, v4, v6
+; CHECK-NEXT: movi.4s v1, #10
+; CHECK-NEXT: ushll2.8h v2, v0, #0
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: ushll2.4s v3, v2, #0
+; CHECK-NEXT: ushll2.4s v4, v0, #0
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: ushll.4s v2, v2, #0
+; CHECK-NEXT: cmhi.4s v5, v0, v1
+; CHECK-NEXT: cmhi.4s v6, v2, v1
+; CHECK-NEXT: cmhi.4s v7, v3, v1
+; CHECK-NEXT: cmhi.4s v1, v4, v1
+; CHECK-NEXT: and.16b v3, v3, v7
+; CHECK-NEXT: and.16b v1, v4, v1
+; CHECK-NEXT: and.16b v2, v2, v6
+; CHECK-NEXT: and.16b v0, v0, v5
; CHECK-NEXT: ret
%ext = zext <16 x i8> %a to <16 x i32>
%cmp = icmp ugt <16 x i8> %a, <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
@@ -144,16 +138,14 @@ define <16 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_v16i32(<16 x i
define <8 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_v8i32(<8 x i8> %a) {
; CHECK-LABEL: same_zext_used_in_cmp_unsigned_pred_and_select_v8i32:
; CHECK: ; %bb.0:
-; CHECK-NEXT: movi.8b v1, #10
-; CHECK-NEXT: ushll.8h v2, v0, #0
-; CHECK-NEXT: ushll.4s v3, v2, #0
-; CHECK-NEXT: cmhi.8b v0, v0, v1
-; CHECK-NEXT: ushll2.4s v1, v2, #0
-; CHECK-NEXT: sshll.8h v0, v0, #0
-; CHECK-NEXT: sshll2.4s v2, v0, #0
-; CHECK-NEXT: sshll.4s v0, v0, #0
-; CHECK-NEXT: and.16b v1, v1, v2
-; CHECK-NEXT: and.16b v0, v3, v0
+; CHECK-NEXT: movi.4s v1, #10
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: ushll2.4s v2, v0, #0
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: cmhi.4s v3, v2, v1
+; CHECK-NEXT: cmhi.4s v4, v0, v1
+; CHECK-NEXT: and.16b v1, v2, v3
+; CHECK-NEXT: and.16b v0, v0, v4
; CHECK-NEXT: ret
%ext = zext <8 x i8> %a to <8 x i32>
%cmp = icmp ugt <8 x i8> %a, <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
@@ -164,14 +156,13 @@ define <8 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_v8i32(<8 x i8>
define <8 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_v8i32_2(<8 x i16> %a) {
; CHECK-LABEL: same_zext_used_in_cmp_unsigned_pred_and_select_v8i32_2:
; CHECK: ; %bb.0:
-; CHECK-NEXT: movi.8h v1, #10
-; CHECK-NEXT: ushll.4s v2, v0, #0
-; CHECK-NEXT: cmhi.8h v1, v0, v1
-; CHECK-NEXT: ushll2.4s v0, v0, #0
-; CHECK-NEXT: sshll2.4s v3, v1, #0
-; CHECK-NEXT: sshll.4s v4, v1, #0
-; CHECK-NEXT: and.16b v1, v0, v3
-; CHECK-NEXT: and.16b v0, v2, v4
+; CHECK-NEXT: movi.4s v1, #10
+; CHECK-NEXT: ushll2.4s v2, v0, #0
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: cmhi.4s v3, v2, v1
+; CHECK-NEXT: cmhi.4s v4, v0, v1
+; CHECK-NEXT: and.16b v1, v2, v3
+; CHECK-NEXT: and.16b v0, v0, v4
; CHECK-NEXT: ret
%ext = zext <8 x i16> %a to <8 x i32>
%cmp = icmp ugt <8 x i16> %a, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
@@ -183,19 +174,14 @@ define <8 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_v8i32_2(<8 x i1
define <8 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_v8i32_from_v8i15(<8 x i15> %a) {
; CHECK-LABEL: same_zext_used_in_cmp_unsigned_pred_and_select_v8i32_from_v8i15:
; CHECK: ; %bb.0:
-; CHECK-NEXT: movi.8h v1, #10
+; CHECK-NEXT: movi.4s v1, #10
; CHECK-NEXT: bic.8h v0, #128, lsl #8
-; CHECK-NEXT: ushll.4s v2, v0, #0
-; CHECK-NEXT: cmhi.8h v1, v0, v1
-; CHECK-NEXT: ushll2.4s v0, v0, #0
-; CHECK-NEXT: ushll2.4s v3, v1, #0
-; CHECK-NEXT: ushll.4s v1, v1, #0
-; CHECK-NEXT: shl.4s v3, v3, #17
-; CHECK-NEXT: shl.4s v1, v1, #17
-; CHECK-NEXT: sshr.4s v3, v3, #17
-; CHECK-NEXT: sshr.4s v4, v1, #17
-; CHECK-NEXT: and.16b v1, v0, v3
-; CHECK-NEXT: and.16b v0, v2, v4
+; CHECK-NEXT: ushll2.4s v2, v0, #0
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: cmhi.4s v3, v2, v1
+; CHECK-NEXT: cmhi.4s v4, v0, v1
+; CHECK-NEXT: and.16b v1, v2, v3
+; CHECK-NEXT: and.16b v0, v0, v4
; CHECK-NEXT: ret
%ext = zext <8 x i15> %a to <8 x i32>
%cmp = icmp ugt <8 x i15> %a, <i15 10, i15 10, i15 10, i15 10, i15 10, i15 10, i15 10, i15 10>
@@ -256,11 +242,10 @@ define <3 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_v3i16(<3 x i8>
define <4 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_v4i32(<4 x i16> %a) {
; CHECK-LABEL: same_zext_used_in_cmp_unsigned_pred_and_select_v4i32:
; CHECK: ; %bb.0:
-; CHECK-NEXT: movi.4h v1, #10
-; CHECK-NEXT: ushll.4s v2, v0, #0
-; CHECK-NEXT: cmhi.4h v0, v0, v1
-; CHECK-NEXT: sshll.4s v0, v0, #0
-; CHECK-NEXT: and.16b v0, v2, v0
+; CHECK-NEXT: movi.4s v1, #10
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: cmhi.4s v1, v0, v1
+; CHECK-NEXT: and.16b v0, v0, v1
; CHECK-NEXT: ret
%ext = zext <4 x i16> %a to <4 x i32>
%cmp = icmp ugt <4 x i16> %a, <i16 10, i16 10, i16 10, i16 10>
@@ -286,14 +271,13 @@ define <2 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_v2i32(<2 x i16>
define <8 x i32> @same_zext_used_in_cmp_eq_and_select_v8i32(<8 x i16> %a) {
; CHECK-LABEL: same_zext_used_in_cmp_eq_and_select_v8i32:
; CHECK: ; %bb.0:
-; CHECK-NEXT: movi.8h v1, #10
-; CHECK-NEXT: ushll.4s v2, v0, #0
-; CHECK-NEXT: cmeq.8h v1, v0, v1
-; CHECK-NEXT: ushll2.4s v0, v0, #0
-; CHECK-NEXT: sshll2.4s v3, v1, #0
-; CHECK-NEXT: sshll.4s v4, v1, #0
-; CHECK-NEXT: and.16b v1, v0, v3
-; CHECK-NEXT: and.16b v0, v2, v4
+; CHECK-NEXT: movi.4s v1, #10
+; CHECK-NEXT: ushll2.4s v2, v0, #0
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: cmeq.4s v3, v2, v1
+; CHECK-NEXT: cmeq.4s v4, v0, v1
+; CHECK-NEXT: and.16b v1, v2, v3
+; CHECK-NEXT: and.16b v0, v0, v4
; CHECK-NEXT: ret
%ext = zext <8 x i16> %a to <8 x i32>
%cmp = icmp eq <8 x i16> %a, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
@@ -304,19 +288,14 @@ define <8 x i32> @same_zext_used_in_cmp_eq_and_select_v8i32(<8 x i16> %a) {
define <8 x i32> @same_zext_used_in_cmp_eq_and_select_v8i32_from_v8i13(<8 x i13> %a) {
; CHECK-LABEL: same_zext_used_in_cmp_eq_and_select_v8i32_from_v8i13:
; CHECK: ; %bb.0:
-; CHECK-NEXT: movi.8h v1, #10
+; CHECK-NEXT: movi.4s v1, #10
; CHECK-NEXT: bic.8h v0, #224, lsl #8
-; CHECK-NEXT: ushll.4s v2, v0, #0
-; CHECK-NEXT: cmeq.8h v1, v0, v1
-; CHECK-NEXT: ushll2.4s v0, v0, #0
-; CHECK-NEXT: ushll2.4s v3, v1, #0
-; CHECK-NEXT: ushll.4s v1, v1, #0
-; CHECK-NEXT: shl.4s v3, v3, #19
-; CHECK-NEXT: shl.4s v1, v1, #19
-; CHECK-NEXT: sshr.4s v3, v3, #19
-; CHECK-NEXT: sshr.4s v4, v1, #19
-; CHECK-NEXT: and.16b v1, v0, v3
-; CHECK-NEXT: and.16b v0, v2, v4
+; CHECK-NEXT: ushll2.4s v2, v0, #0
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: cmeq.4s v3, v2, v1
+; CHECK-NEXT: cmeq.4s v4, v0, v1
+; CHECK-NEXT: and.16b v1, v2, v3
+; CHECK-NEXT: and.16b v0, v0, v4
; CHECK-NEXT: ret
%ext = zext <8 x i13> %a to <8 x i32>
%cmp = icmp eq <8 x i13> %a, <i13 10, i13 10, i13 10, i13 10, i13 10, i13 10, i13 10, i13 10>
@@ -327,25 +306,21 @@ define <8 x i32> @same_zext_used_in_cmp_eq_and_select_v8i32_from_v8i13(<8 x i13>
define <16 x i32> @same_zext_used_in_cmp_ne_and_select_v8i32(<16 x i8> %a) {
; CHECK-LABEL: same_zext_used_in_cmp_ne_and_select_v8i32:
; CHECK: ; %bb.0:
-; CHECK-NEXT: movi.16b v1, #10
-; CHECK-NEXT: ushll.8h v2, v0, #0
-; CHECK-NEXT: ushll2.8h v3, v0, #0
-; CHECK-NEXT: ushll.4s v4, v2, #0
-; CHECK-NEXT: cmeq.16b v0, v0, v1
-; CHECK-NEXT: ushll.4s v5, v3, #0
-; CHECK-NEXT: mvn.16b v0, v0
-; CHECK-NEXT: ushll2.4s v1, v3, #0
-; CHECK-NEXT: sshll.8h v3, v0, #0
-; CHECK-NEXT: sshll2.8h v0, v0, #0
-; CHECK-NEXT: ushll2.4s v2, v2, #0
-; CHECK-NEXT: sshll.4s v6, v3, #0
-; CHECK-NEXT: sshll.4s v7, v0, #0
-; CHECK-NEXT: sshll2.4s v0, v0, #0
-; CHECK-NEXT: sshll2.4s v16, v3, #0
-; CHECK-NEXT: and.16b v3, v1, v0
-; CHECK-NEXT: and.16b v1, v2, v16
-; CHECK-NEXT: and.16b v2, v5, v7
-; CHECK-NEXT: and.16b v0, v4, v6
+; CHECK-NEXT: movi.4s v1, #10
+; CHECK-NEXT: ushll2.8h v2, v0, #0
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: ushll2.4s v3, v2, #0
+; CHECK-NEXT: ushll2.4s v4, v0, #0
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: ushll.4s v2, v2, #0
+; CHECK-NEXT: cmeq.4s v5, v0, v1
+; CHECK-NEXT: cmeq.4s v6, v2, v1
+; CHECK-NEXT: cmeq.4s v7, v3, v1
+; CHECK-NEXT: cmeq.4s v1, v4, v1
+; CHECK-NEXT: bic.16b v3, v3, v7
+; CHECK-NEXT: bic.16b v1, v4, v1
+; CHECK-NEXT: bic.16b v2, v2, v6
+; CHECK-NEXT: bic.16b v0, v0, v5
; CHECK-NEXT: ret
%ext = zext <16 x i8> %a to <16 x i32>
%cmp = icmp ne <16 x i8> %a, <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
@@ -412,24 +387,21 @@ entry:
define <16 x i32> @same_sext_used_in_cmp_signed_pred_and_select_v16i32(<16 x i8> %a) {
; CHECK-LABEL: same_sext_used_in_cmp_signed_pred_and_select_v16i32:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: movi.16b v1, #10
-; CHECK-NEXT: sshll.8h v3, v0, #0
+; CHECK-NEXT: movi.4s v1, #10
; CHECK-NEXT: sshll2.8h v2, v0, #0
-; CHECK-NEXT: cmgt.16b v0, v0, v1
-; CHECK-NEXT: ext.16b v1, v3, v3, #8
-; CHECK-NEXT: sshll.8h v5, v0, #0
-; CHECK-NEXT: sshll2.8h v0, v0, #0
-; CHECK-NEXT: ext.16b v4, v2, v2, #8
-; CHECK-NEXT: ext.16b v6, v5, v5, #8
-; CHECK-NEXT: ext.16b v7, v0, v0, #8
-; CHECK-NEXT: and.8b v0, v2, v0
-; CHECK-NEXT: sshll.4s v2, v0, #0
-; CHECK-NEXT: and.8b v0, v3, v5
-; CHECK-NEXT: and.8b v1, v1, v6
-; CHECK-NEXT: and.8b v3, v4, v7
+; CHECK-NEXT: sshll.8h v0, v0, #0
+; CHECK-NEXT: sshll2.4s v3, v2, #0
+; CHECK-NEXT: sshll2.4s v4, v0, #0
; CHECK-NEXT: sshll.4s v0, v0, #0
-; CHECK-NEXT: sshll.4s v1, v1, #0
-; CHECK-NEXT: sshll.4s v3, v3, #0
+; CHECK-NEXT: sshll.4s v2, v2, #0
+; CHECK-NEXT: cmgt.4s v5, v0, v1
+; CHECK-NEXT: cmgt.4s v6, v2, v1
+; CHECK-NEXT: cmgt.4s v7, v3, v1
+; CHECK-NEXT: cmgt.4s v1, v4, v1
+; CHECK-NEXT: and.16b v3, v3, v7
+; CHECK-NEXT: and.16b v1, v4, v1
+; CHECK-NEXT: and.16b v2, v2, v6
+; CHECK-NEXT: and.16b v0, v0, v5
; CHECK-NEXT: ret
entry:
%ext = sext <16 x i8> %a to <16 x i32>
@@ -441,14 +413,13 @@ entry:
define <8 x i32> @same_sext_used_in_cmp_eq_and_select_v8i32(<8 x i16> %a) {
; CHECK-LABEL: same_sext_used_in_cmp_eq_and_select_v8i32:
; CHECK: ; %bb.0:
-; CHECK-NEXT: movi.8h v1, #10
-; CHECK-NEXT: ext.16b v2, v0, v0, #8
-; CHECK-NEXT: cmeq.8h v1, v0, v1
-; CHECK-NEXT: ext.16b v3, v1, v1, #8
-; CHECK-NEXT: and.8b v0, v0, v1
+; CHECK-NEXT: movi.4s v1, #10
+; CHECK-NEXT: sshll2.4s v2, v0, #0
; CHECK-NEXT: sshll.4s v0, v0, #0
-; CHECK-NEXT: and.8b v1, v2, v3
-; CHECK-NEXT: sshll.4s v1, v1, #0
+; CHECK-NEXT: cmeq.4s v3, v2, v1
+; CHECK-NEXT: cmeq.4s v4, v0, v1
+; CHECK-NEXT: and.16b v1, v2, v3
+; CHECK-NEXT: and.16b v0, v0, v4
; CHECK-NEXT: ret
%ext = sext <8 x i16> %a to <8 x i32>
%cmp = icmp eq <8 x i16> %a, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
@@ -459,24 +430,17 @@ define <8 x i32> @same_sext_used_in_cmp_eq_and_select_v8i32(<8 x i16> %a) {
define <8 x i32> @same_sext_used_in_cmp_eq_and_select_v8i32_from_v8i13(<8 x i13> %a) {
; CHECK-LABEL: same_sext_used_in_cmp_eq_and_select_v8i32_from_v8i13:
; CHECK: ; %bb.0:
-; CHECK-NEXT: movi.8h v1, #10
-; CHECK-NEXT: mov.16b v2, v0
-; CHECK-NEXT: bic.8h v2, #224, lsl #8
-; CHECK-NEXT: ushll2.4s v3, v0, #0
-; CHECK-NEXT: cmeq.8h v1, v2, v1
+; CHECK-NEXT: ushll2.4s v2, v0, #0
; CHECK-NEXT: ushll.4s v0, v0, #0
-; CHECK-NEXT: ushll.4s v2, v1, #0
-; CHECK-NEXT: ushll2.4s v1, v1, #0
-; CHECK-NEXT: shl.4s v3, v3, #19
-; CHECK-NEXT: shl.4s v1, v1, #19
-; CHECK-NEXT: shl.4s v0, v0, #19
+; CHECK-NEXT: movi.4s v1, #10
; CHECK-NEXT: shl.4s v2, v2, #19
-; CHECK-NEXT: sshr.4s v3, v3, #19
-; CHECK-NEXT: sshr.4s v1, v1, #19
-; CHECK-NEXT: sshr.4s v0, v0, #19
+; CHECK-NEXT: shl.4s v0, v0, #19
; CHECK-NEXT: sshr.4s v2, v2, #19
-; CHECK-NEXT: and.16b v1, v3, v1
-; CHECK-NEXT: and.16b v0, v0, v2
+; CHECK-NEXT: sshr.4s v0, v0, #19
+; CHECK-NEXT: cmeq.4s v3, v2, v1
+; CHECK-NEXT: cmeq.4s v4, v0, v1
+; CHECK-NEXT: and.16b v1, v2, v3
+; CHECK-NEXT: and.16b v0, v0, v4
; CHECK-NEXT: ret
%ext = sext <8 x i13> %a to <8 x i32>
%cmp = icmp eq <8 x i13> %a, <i13 10, i13 10, i13 10, i13 10, i13 10, i13 10, i13 10, i13 10>
@@ -487,25 +451,21 @@ define <8 x i32> @same_sext_used_in_cmp_eq_and_select_v8i32_from_v8i13(<8 x i13>
define <16 x i32> @same_sext_used_in_cmp_ne_and_select_v8i32(<16 x i8> %a) {
; CHECK-LABEL: same_sext_used_in_cmp_ne_and_select_v8i32:
; CHECK: ; %bb.0:
-; CHECK-NEXT: movi.16b v1, #10
-; CHECK-NEXT: sshll.8h v3, v0, #0
+; CHECK-NEXT: movi.4s v1, #10
; CHECK-NEXT: sshll2.8h v2, v0, #0
-; CHECK-NEXT: cmeq.16b v0, v0, v1
-; CHECK-NEXT: ext.16b v1, v3, v3, #8
-; CHECK-NEXT: mvn.16b v0, v0
-; CHECK-NEXT: sshll.8h v5, v0, #0
-; CHECK-NEXT: sshll2.8h v0, v0, #0
-; CHECK-NEXT: ext.16b v4, v2, v2, #8
-; CHECK-NEXT: ext.16b v6, v5, v5, #8
-; CHECK-NEXT: ext.16b v7, v0, v0, #8
-; CHECK-NEXT: and.8b v0, v2, v0
-; CHECK-NEXT: sshll.4s v2, v0, #0
-; CHECK-NEXT: and.8b v0, v3, v5
-; CHECK-NEXT: and.8b v1, v1, v6
-; CHECK-NEXT: and.8b v3, v4, v7
+; CHECK-NEXT: sshll.8h v0, v0, #0
+; CHECK-NEXT: sshll2.4s v3, v2, #0
+; CHECK-NEXT: sshll2.4s v4, v0, #0
; CHECK-NEXT: sshll.4s v0, v0, #0
-; CHECK-NEXT: sshll.4s v1, v1, #0
-; CHECK-NEXT: sshll.4s v3, v3, #0
+; CHECK-NEXT: sshll.4s v2, v2, #0
+; CHECK-NEXT: cmeq.4s v5, v0, v1
+; CHECK-NEXT: cmeq.4s v6, v2, v1
+; CHECK-NEXT: cmeq.4s v7, v3, v1
+; CHECK-NEXT: cmeq.4s v1, v4, v1
+; CHECK-NEXT: bic.16b v3, v3, v7
+; CHECK-NEXT: bic.16b v1, v4, v1
+; CHECK-NEXT: bic.16b v2, v2, v6
+; CHECK-NEXT: bic.16b v0, v0, v5
; CHECK-NEXT: ret
%ext = sext <16 x i8> %a to <16 x i32>
%cmp = icmp ne <16 x i8> %a, <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
@@ -516,14 +476,13 @@ define <16 x i32> @same_sext_used_in_cmp_ne_and_select_v8i32(<16 x i8> %a) {
define <8 x i32> @same_sext_used_in_cmp_signed_pred_and_select_v8i32(<8 x i16> %a) {
; CHECK-LABEL: same_sext_used_in_cmp_signed_pred_and_select_v8i32:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: movi.8h v1, #10
-; CHECK-NEXT: ext.16b v2, v0, v0, #8
-; CHECK-NEXT: cmgt.8h v1, v0, v1
-; CHECK-NEXT: ext.16b v3, v1, v1, #8
-; CHECK-NEXT: and.8b v0, v0, v1
+; CHECK-NEXT: movi.4s v1, #10
+; CHECK-NEXT: sshll2.4s v2, v0, #0
; CHECK-NEXT: sshll.4s v0, v0, #0
-; CHECK-NEXT: and.8b v1, v2, v3
-; CHECK-NEXT: sshll.4s v1, v1, #0
+; CHECK-NEXT: cmgt.4s v3, v2, v1
+; CHECK-NEXT: cmgt.4s v4, v0, v1
+; CHECK-NEXT: and.16b v1, v2, v3
+; CHECK-NEXT: and.16b v0, v0, v4
; CHECK-NEXT: ret
entry:
%ext = sext <8 x i16> %a to <8 x i32>
@@ -535,24 +494,17 @@ entry:
define <8 x i32> @same_sext_used_in_cmp_unsigned_pred_and_select_v8i32_from_v8i15(<8 x i15> %a) {
; CHECK-LABEL: same_sext_used_in_cmp_unsigned_pred_and_select_v8i32_from_v8i15:
; CHECK: ; %bb.0:
-; CHECK-NEXT: movi.8h v1, #10
-; CHECK-NEXT: shl.8h v2, v0, #1
-; CHECK-NEXT: ushll2.4s v3, v0, #0
-; CHECK-NEXT: sshr.8h v2, v2, #1
+; CHECK-NEXT: ushll2.4s v2, v0, #0
; CHECK-NEXT: ushll.4s v0, v0, #0
-; CHECK-NEXT: shl.4s v3, v3, #17
-; CHECK-NEXT: cmge.8h v1, v2, v1
-; CHECK-NEXT: shl.4s v0, v0, #17
-; CHECK-NEXT: ushll.4s v2, v1, #0
-; CHECK-NEXT: sshr.4s v3, v3, #17
-; CHECK-NEXT: ushll2.4s v1, v1, #0
+; CHECK-NEXT: movi.4s v1, #10
; CHECK-NEXT: shl.4s v2, v2, #17
-; CHECK-NEXT: shl.4s v1, v1, #17
-; CHECK-NEXT: sshr.4s v0, v0, #17
+; CHECK-NEXT: shl.4s v0, v0, #17
; CHECK-NEXT: sshr.4s v2, v2, #17
-; CHECK-NEXT: sshr.4s v1, v1, #17
-; CHECK-NEXT: and.16b v0, v0, v2
-; CHECK-NEXT: and.16b v1, v3, v1
+; CHECK-NEXT: sshr.4s v0, v0, #17
+; CHECK-NEXT: cmge.4s v3, v2, v1
+; CHECK-NEXT: cmge.4s v4, v0, v1
+; CHECK-NEXT: and.16b v1, v2, v3
+; CHECK-NEXT: and.16b v0, v0, v4
; CHECK-NEXT: ret
%ext = sext <8 x i15> %a to <8 x i32>
%cmp = icmp sge <8 x i15> %a, <i15 10, i15 10, i15 10, i15 10, i15 10, i15 10, i15 10, i15 10>