[llvm] afdedd4 - [AArch64] Try to re-use extended operand for SETCC with vector ops.

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Thu Jul 7 16:51:14 PDT 2022


Author: Florian Hahn
Date: 2022-07-07T16:50:00-07:00
New Revision: afdedd405e493dc80bd5ceb9133b9d3a8bc69f2c

URL: https://github.com/llvm/llvm-project/commit/afdedd405e493dc80bd5ceb9133b9d3a8bc69f2c
DIFF: https://github.com/llvm/llvm-project/commit/afdedd405e493dc80bd5ceb9133b9d3a8bc69f2c.diff

LOG: [AArch64] Try to re-use extended operand for SETCC with vector ops.

Try to re-use an already extended operand for SetCC with vector operands
feeding an extended select. Doing so avoids requiring another full
extension of the SET_CC result when lowering the select.

This improves lowering for certain extend/cmp/select patterns operating on
vectors. For example, with v16i8 this replaces the 6 instructions for the
extra extension with 4 separate selects.
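
For illustration, here is an IR-level sketch of the targeted pattern, based on
the same_zext_used_in_cmp_unsigned_pred_and_select_v8i32 test updated below
(the combine itself runs on the SETCC/VSELECT SelectionDAG nodes):

    %ext = zext <8 x i8> %a to <8 x i32>
    %cmp = icmp ugt <8 x i8> %a, <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
    %sel = select <8 x i1> %cmp, <8 x i32> %ext, <8 x i32> zeroinitializer

The compare is instead performed on the already-extended %ext and an extended
splat, so the compare mask is produced directly at the wide element type and
no separate extension of the mask is needed:

    %cmp.wide = icmp ugt <8 x i32> %ext, <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
    %sel = select <8 x i1> %cmp.wide, <8 x i32> %ext, <8 x i32> zeroinitializer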

This improves the generated code for loops like the one below in
combination with D96522.

    #include <stdint.h>

    int foo(uint8_t *p, int N) {
      unsigned long long sum = 0;
      for (int i = 0; i < N; i++, p++) {
        unsigned int v = *p;
        sum += (v < 127) ? v : 256 - v;
      }
      return sum;
    }

https://clang.godbolt.org/z/Wco866MjY

On the AArch64 cores I have access to, the patch improves performance of
the vector loop by ~10%.

This could be generalized in follow-ups, but the initial version
targets one of the more important cases in combination with D96522.

Alive2 modeling:
* sext EQ https://alive2.llvm.org/ce/z/5upBvb
* sext NE https://alive2.llvm.org/ce/z/zbEcJp
* zext EQ https://alive2.llvm.org/ce/z/_xMwof
* zext NE https://alive2.llvm.org/ce/z/5FwKfc
* zext unsigned predicate: https://alive2.llvm.org/ce/z/iEwLU3
* sext signed predicate: https://alive2.llvm.org/ce/z/aMBega
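
As a rough sketch of the equivalence the zext proofs above establish (this is
not the exact Alive2 input), the narrow unsigned compare can be replaced by a
compare of the zero-extended values:

    define <8 x i1> @src(<8 x i8> %a) {
      %c = icmp ugt <8 x i8> %a, <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
      ret <8 x i1> %c
    }

    define <8 x i1> @tgt(<8 x i8> %a) {
      %ext = zext <8 x i8> %a to <8 x i32>
      %c = icmp ugt <8 x i32> %ext, <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
      ret <8 x i1> %c
    }

The sext cases are analogous, using sign extension with signed or equality
predicates.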

Reviewed By: dmgreen

Differential Revision: https://reviews.llvm.org/D120481

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/AArch64/vselect-ext.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8ac99cfc42954..ff030569eb3c1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18065,6 +18065,54 @@ static SDValue performCSELCombine(SDNode *N,
   return performCONDCombine(N, DCI, DAG, 2, 3);
 }
 
+// Try to re-use an already extended operand of a vector SetCC feeding an
+// extended select. Doing so avoids requiring another full extension of the
+// SET_CC result when lowering the select.
+static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
+  EVT Op0MVT = Op->getOperand(0).getValueType();
+  if (!Op0MVT.isVector() || Op->use_empty())
+    return SDValue();
+
+  // Make sure that all uses of Op are VSELECTs with matching result types where
+  // the result type has a larger element type than the SetCC operand.
+  SDNode *FirstUse = *Op->use_begin();
+  if (FirstUse->getOpcode() != ISD::VSELECT)
+    return SDValue();
+  EVT UseMVT = FirstUse->getValueType(0);
+  if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
+    return SDValue();
+  if (any_of(Op->uses(), [&UseMVT](const SDNode *N) {
+        return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT;
+      }))
+    return SDValue();
+
+  APInt V;
+  if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V))
+    return SDValue();
+
+  SDLoc DL(Op);
+  SDValue Op0ExtV;
+  SDValue Op1ExtV;
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get();
+  // Check if the first operand of the SET_CC is already extended. If it is,
+  // split the SET_CC and re-use the extended version of the operand.
+  SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT),
+                                        Op->getOperand(0));
+  SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT),
+                                        Op->getOperand(0));
+  if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
+    Op0ExtV = SDValue(Op0SExt, 0);
+    Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1));
+  } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
+    Op0ExtV = SDValue(Op0ZExt, 0);
+    Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1));
+  } else
+    return SDValue();
+
+  return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1),
+                     Op0ExtV, Op1ExtV, Op->getOperand(2));
+}
+
 static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) {
   assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
   SDValue LHS = N->getOperand(0);
@@ -18073,6 +18121,9 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) {
   SDLoc DL(N);
   EVT VT = N->getValueType(0);
 
+  if (SDValue V = tryToWidenSetCCOperands(N, DAG))
+    return V;
+
   // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
   if (Cond == ISD::SETNE && isOneConstant(RHS) &&
       LHS->getOpcode() == AArch64ISD::CSEL &&

diff  --git a/llvm/test/CodeGen/AArch64/vselect-ext.ll b/llvm/test/CodeGen/AArch64/vselect-ext.ll
index b3066e5247901..fa0db82b7a9a4 100644
--- a/llvm/test/CodeGen/AArch64/vselect-ext.ll
+++ b/llvm/test/CodeGen/AArch64/vselect-ext.ll
@@ -85,26 +85,23 @@ entry:
 define <8 x i64> @same_zext_used_in_cmp_unsigned_pred_and_select_v8i64(<8 x i8> %a) {
 ; CHECK-LABEL: same_zext_used_in_cmp_unsigned_pred_and_select_v8i64:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    movi.8b v1, #10
-; CHECK-NEXT:    ushll.8h v2, v0, #0
-; CHECK-NEXT:    cmhi.8b v0, v0, v1
-; CHECK-NEXT:    ushll.4s v1, v2, #0
-; CHECK-NEXT:    sshll.8h v0, v0, #0
-; CHECK-NEXT:    ushll2.4s v2, v2, #0
-; CHECK-NEXT:    sshll.4s v3, v0, #0
-; CHECK-NEXT:    sshll2.4s v0, v0, #0
-; CHECK-NEXT:    ushll.2d v4, v1, #0
-; CHECK-NEXT:    ushll.2d v5, v2, #0
-; CHECK-NEXT:    ushll2.2d v1, v1, #0
-; CHECK-NEXT:    ushll2.2d v2, v2, #0
-; CHECK-NEXT:    sshll.2d v6, v3, #0
-; CHECK-NEXT:    sshll.2d v7, v0, #0
-; CHECK-NEXT:    sshll2.2d v0, v0, #0
-; CHECK-NEXT:    sshll2.2d v16, v3, #0
-; CHECK-NEXT:    and.16b v3, v2, v0
-; CHECK-NEXT:    and.16b v1, v1, v16
-; CHECK-NEXT:    and.16b v2, v5, v7
-; CHECK-NEXT:    and.16b v0, v4, v6
+; CHECK-NEXT:    ushll.8h v0, v0, #0
+; CHECK-NEXT:    mov w8, #10
+; CHECK-NEXT:    ushll2.4s v2, v0, #0
+; CHECK-NEXT:    ushll.4s v0, v0, #0
+; CHECK-NEXT:    dup.2d v1, x8
+; CHECK-NEXT:    ushll2.2d v3, v2, #0
+; CHECK-NEXT:    ushll2.2d v4, v0, #0
+; CHECK-NEXT:    ushll.2d v0, v0, #0
+; CHECK-NEXT:    ushll.2d v2, v2, #0
+; CHECK-NEXT:    cmhi.2d v5, v0, v1
+; CHECK-NEXT:    cmhi.2d v6, v2, v1
+; CHECK-NEXT:    cmhi.2d v7, v3, v1
+; CHECK-NEXT:    cmhi.2d v1, v4, v1
+; CHECK-NEXT:    and.16b v3, v3, v7
+; CHECK-NEXT:    and.16b v1, v4, v1
+; CHECK-NEXT:    and.16b v2, v2, v6
+; CHECK-NEXT:    and.16b v0, v0, v5
 ; CHECK-NEXT:    ret
   %ext = zext <8 x i8> %a to <8 x i64>
   %cmp = icmp ugt <8 x i8> %a, <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
@@ -116,24 +113,21 @@ define <8 x i64> @same_zext_used_in_cmp_unsigned_pred_and_select_v8i64(<8 x i8>
 define <16 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_v16i32(<16 x i8> %a) {
 ; CHECK-LABEL: same_zext_used_in_cmp_unsigned_pred_and_select_v16i32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    movi.16b v1, #10
-; CHECK-NEXT:    ushll.8h v2, v0, #0
-; CHECK-NEXT:    ushll2.8h v3, v0, #0
-; CHECK-NEXT:    ushll.4s v4, v2, #0
-; CHECK-NEXT:    cmhi.16b v0, v0, v1
-; CHECK-NEXT:    ushll.4s v5, v3, #0
-; CHECK-NEXT:    ushll2.4s v1, v3, #0
-; CHECK-NEXT:    sshll.8h v3, v0, #0
-; CHECK-NEXT:    sshll2.8h v0, v0, #0
-; CHECK-NEXT:    ushll2.4s v2, v2, #0
-; CHECK-NEXT:    sshll.4s v6, v3, #0
-; CHECK-NEXT:    sshll.4s v7, v0, #0
-; CHECK-NEXT:    sshll2.4s v0, v0, #0
-; CHECK-NEXT:    sshll2.4s v16, v3, #0
-; CHECK-NEXT:    and.16b v3, v1, v0
-; CHECK-NEXT:    and.16b v1, v2, v16
-; CHECK-NEXT:    and.16b v2, v5, v7
-; CHECK-NEXT:    and.16b v0, v4, v6
+; CHECK-NEXT:    movi.4s v1, #10
+; CHECK-NEXT:    ushll2.8h v2, v0, #0
+; CHECK-NEXT:    ushll.8h v0, v0, #0
+; CHECK-NEXT:    ushll2.4s v3, v2, #0
+; CHECK-NEXT:    ushll2.4s v4, v0, #0
+; CHECK-NEXT:    ushll.4s v0, v0, #0
+; CHECK-NEXT:    ushll.4s v2, v2, #0
+; CHECK-NEXT:    cmhi.4s v5, v0, v1
+; CHECK-NEXT:    cmhi.4s v6, v2, v1
+; CHECK-NEXT:    cmhi.4s v7, v3, v1
+; CHECK-NEXT:    cmhi.4s v1, v4, v1
+; CHECK-NEXT:    and.16b v3, v3, v7
+; CHECK-NEXT:    and.16b v1, v4, v1
+; CHECK-NEXT:    and.16b v2, v2, v6
+; CHECK-NEXT:    and.16b v0, v0, v5
 ; CHECK-NEXT:    ret
   %ext = zext <16 x i8> %a to <16 x i32>
   %cmp = icmp ugt <16 x i8> %a, <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
@@ -144,16 +138,14 @@ define <16 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_v16i32(<16 x i
 define <8 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_v8i32(<8 x i8> %a) {
 ; CHECK-LABEL: same_zext_used_in_cmp_unsigned_pred_and_select_v8i32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    movi.8b v1, #10
-; CHECK-NEXT:    ushll.8h v2, v0, #0
-; CHECK-NEXT:    ushll.4s v3, v2, #0
-; CHECK-NEXT:    cmhi.8b v0, v0, v1
-; CHECK-NEXT:    ushll2.4s v1, v2, #0
-; CHECK-NEXT:    sshll.8h v0, v0, #0
-; CHECK-NEXT:    sshll2.4s v2, v0, #0
-; CHECK-NEXT:    sshll.4s v0, v0, #0
-; CHECK-NEXT:    and.16b v1, v1, v2
-; CHECK-NEXT:    and.16b v0, v3, v0
+; CHECK-NEXT:    movi.4s v1, #10
+; CHECK-NEXT:    ushll.8h v0, v0, #0
+; CHECK-NEXT:    ushll2.4s v2, v0, #0
+; CHECK-NEXT:    ushll.4s v0, v0, #0
+; CHECK-NEXT:    cmhi.4s v3, v2, v1
+; CHECK-NEXT:    cmhi.4s v4, v0, v1
+; CHECK-NEXT:    and.16b v1, v2, v3
+; CHECK-NEXT:    and.16b v0, v0, v4
 ; CHECK-NEXT:    ret
   %ext = zext <8 x i8> %a to <8 x i32>
   %cmp = icmp ugt <8 x i8> %a, <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
@@ -164,14 +156,13 @@ define <8 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_v8i32(<8 x i8>
 define <8 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_v8i32_2(<8 x i16> %a) {
 ; CHECK-LABEL: same_zext_used_in_cmp_unsigned_pred_and_select_v8i32_2:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    movi.8h v1, #10
-; CHECK-NEXT:    ushll.4s v2, v0, #0
-; CHECK-NEXT:    cmhi.8h v1, v0, v1
-; CHECK-NEXT:    ushll2.4s v0, v0, #0
-; CHECK-NEXT:    sshll2.4s v3, v1, #0
-; CHECK-NEXT:    sshll.4s v4, v1, #0
-; CHECK-NEXT:    and.16b v1, v0, v3
-; CHECK-NEXT:    and.16b v0, v2, v4
+; CHECK-NEXT:    movi.4s v1, #10
+; CHECK-NEXT:    ushll2.4s v2, v0, #0
+; CHECK-NEXT:    ushll.4s v0, v0, #0
+; CHECK-NEXT:    cmhi.4s v3, v2, v1
+; CHECK-NEXT:    cmhi.4s v4, v0, v1
+; CHECK-NEXT:    and.16b v1, v2, v3
+; CHECK-NEXT:    and.16b v0, v0, v4
 ; CHECK-NEXT:    ret
   %ext = zext <8 x i16> %a to <8 x i32>
   %cmp = icmp ugt <8 x i16> %a, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
@@ -183,19 +174,14 @@ define <8 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_v8i32_2(<8 x i1
 define <8 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_v8i32_from_v8i15(<8 x i15> %a) {
 ; CHECK-LABEL: same_zext_used_in_cmp_unsigned_pred_and_select_v8i32_from_v8i15:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    movi.8h v1, #10
+; CHECK-NEXT:    movi.4s v1, #10
 ; CHECK-NEXT:    bic.8h v0, #128, lsl #8
-; CHECK-NEXT:    ushll.4s v2, v0, #0
-; CHECK-NEXT:    cmhi.8h v1, v0, v1
-; CHECK-NEXT:    ushll2.4s v0, v0, #0
-; CHECK-NEXT:    ushll2.4s v3, v1, #0
-; CHECK-NEXT:    ushll.4s v1, v1, #0
-; CHECK-NEXT:    shl.4s v3, v3, #17
-; CHECK-NEXT:    shl.4s v1, v1, #17
-; CHECK-NEXT:    sshr.4s v3, v3, #17
-; CHECK-NEXT:    sshr.4s v4, v1, #17
-; CHECK-NEXT:    and.16b v1, v0, v3
-; CHECK-NEXT:    and.16b v0, v2, v4
+; CHECK-NEXT:    ushll2.4s v2, v0, #0
+; CHECK-NEXT:    ushll.4s v0, v0, #0
+; CHECK-NEXT:    cmhi.4s v3, v2, v1
+; CHECK-NEXT:    cmhi.4s v4, v0, v1
+; CHECK-NEXT:    and.16b v1, v2, v3
+; CHECK-NEXT:    and.16b v0, v0, v4
 ; CHECK-NEXT:    ret
   %ext = zext <8 x i15> %a to <8 x i32>
   %cmp = icmp ugt <8 x i15> %a, <i15 10, i15 10, i15 10, i15 10, i15 10, i15 10, i15 10, i15 10>
@@ -256,11 +242,10 @@ define <3 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_v3i16(<3 x i8>
 define <4 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_v4i32(<4 x i16> %a) {
 ; CHECK-LABEL: same_zext_used_in_cmp_unsigned_pred_and_select_v4i32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    movi.4h v1, #10
-; CHECK-NEXT:    ushll.4s v2, v0, #0
-; CHECK-NEXT:    cmhi.4h v0, v0, v1
-; CHECK-NEXT:    sshll.4s v0, v0, #0
-; CHECK-NEXT:    and.16b v0, v2, v0
+; CHECK-NEXT:    movi.4s v1, #10
+; CHECK-NEXT:    ushll.4s v0, v0, #0
+; CHECK-NEXT:    cmhi.4s v1, v0, v1
+; CHECK-NEXT:    and.16b v0, v0, v1
 ; CHECK-NEXT:    ret
   %ext = zext <4 x i16> %a to <4 x i32>
   %cmp = icmp ugt <4 x i16> %a, <i16 10, i16 10, i16 10, i16 10>
@@ -286,14 +271,13 @@ define <2 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_v2i32(<2 x i16>
 define <8 x i32> @same_zext_used_in_cmp_eq_and_select_v8i32(<8 x i16> %a) {
 ; CHECK-LABEL: same_zext_used_in_cmp_eq_and_select_v8i32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    movi.8h v1, #10
-; CHECK-NEXT:    ushll.4s v2, v0, #0
-; CHECK-NEXT:    cmeq.8h v1, v0, v1
-; CHECK-NEXT:    ushll2.4s v0, v0, #0
-; CHECK-NEXT:    sshll2.4s v3, v1, #0
-; CHECK-NEXT:    sshll.4s v4, v1, #0
-; CHECK-NEXT:    and.16b v1, v0, v3
-; CHECK-NEXT:    and.16b v0, v2, v4
+; CHECK-NEXT:    movi.4s v1, #10
+; CHECK-NEXT:    ushll2.4s v2, v0, #0
+; CHECK-NEXT:    ushll.4s v0, v0, #0
+; CHECK-NEXT:    cmeq.4s v3, v2, v1
+; CHECK-NEXT:    cmeq.4s v4, v0, v1
+; CHECK-NEXT:    and.16b v1, v2, v3
+; CHECK-NEXT:    and.16b v0, v0, v4
 ; CHECK-NEXT:    ret
   %ext = zext <8 x i16> %a to <8 x i32>
   %cmp = icmp eq <8 x i16> %a, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
@@ -304,19 +288,14 @@ define <8 x i32> @same_zext_used_in_cmp_eq_and_select_v8i32(<8 x i16> %a) {
 define <8 x i32> @same_zext_used_in_cmp_eq_and_select_v8i32_from_v8i13(<8 x i13> %a) {
 ; CHECK-LABEL: same_zext_used_in_cmp_eq_and_select_v8i32_from_v8i13:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    movi.8h v1, #10
+; CHECK-NEXT:    movi.4s v1, #10
 ; CHECK-NEXT:    bic.8h v0, #224, lsl #8
-; CHECK-NEXT:    ushll.4s v2, v0, #0
-; CHECK-NEXT:    cmeq.8h v1, v0, v1
-; CHECK-NEXT:    ushll2.4s v0, v0, #0
-; CHECK-NEXT:    ushll2.4s v3, v1, #0
-; CHECK-NEXT:    ushll.4s v1, v1, #0
-; CHECK-NEXT:    shl.4s v3, v3, #19
-; CHECK-NEXT:    shl.4s v1, v1, #19
-; CHECK-NEXT:    sshr.4s v3, v3, #19
-; CHECK-NEXT:    sshr.4s v4, v1, #19
-; CHECK-NEXT:    and.16b v1, v0, v3
-; CHECK-NEXT:    and.16b v0, v2, v4
+; CHECK-NEXT:    ushll2.4s v2, v0, #0
+; CHECK-NEXT:    ushll.4s v0, v0, #0
+; CHECK-NEXT:    cmeq.4s v3, v2, v1
+; CHECK-NEXT:    cmeq.4s v4, v0, v1
+; CHECK-NEXT:    and.16b v1, v2, v3
+; CHECK-NEXT:    and.16b v0, v0, v4
 ; CHECK-NEXT:    ret
   %ext = zext <8 x i13> %a to <8 x i32>
   %cmp = icmp eq <8 x i13> %a, <i13 10, i13 10, i13 10, i13 10, i13 10, i13 10, i13 10, i13 10>
@@ -327,25 +306,21 @@ define <8 x i32> @same_zext_used_in_cmp_eq_and_select_v8i32_from_v8i13(<8 x i13>
 define <16 x i32> @same_zext_used_in_cmp_ne_and_select_v8i32(<16 x i8> %a) {
 ; CHECK-LABEL: same_zext_used_in_cmp_ne_and_select_v8i32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    movi.16b v1, #10
-; CHECK-NEXT:    ushll.8h v2, v0, #0
-; CHECK-NEXT:    ushll2.8h v3, v0, #0
-; CHECK-NEXT:    ushll.4s v4, v2, #0
-; CHECK-NEXT:    cmeq.16b v0, v0, v1
-; CHECK-NEXT:    ushll.4s v5, v3, #0
-; CHECK-NEXT:    mvn.16b v0, v0
-; CHECK-NEXT:    ushll2.4s v1, v3, #0
-; CHECK-NEXT:    sshll.8h v3, v0, #0
-; CHECK-NEXT:    sshll2.8h v0, v0, #0
-; CHECK-NEXT:    ushll2.4s v2, v2, #0
-; CHECK-NEXT:    sshll.4s v6, v3, #0
-; CHECK-NEXT:    sshll.4s v7, v0, #0
-; CHECK-NEXT:    sshll2.4s v0, v0, #0
-; CHECK-NEXT:    sshll2.4s v16, v3, #0
-; CHECK-NEXT:    and.16b v3, v1, v0
-; CHECK-NEXT:    and.16b v1, v2, v16
-; CHECK-NEXT:    and.16b v2, v5, v7
-; CHECK-NEXT:    and.16b v0, v4, v6
+; CHECK-NEXT:    movi.4s v1, #10
+; CHECK-NEXT:    ushll2.8h v2, v0, #0
+; CHECK-NEXT:    ushll.8h v0, v0, #0
+; CHECK-NEXT:    ushll2.4s v3, v2, #0
+; CHECK-NEXT:    ushll2.4s v4, v0, #0
+; CHECK-NEXT:    ushll.4s v0, v0, #0
+; CHECK-NEXT:    ushll.4s v2, v2, #0
+; CHECK-NEXT:    cmeq.4s v5, v0, v1
+; CHECK-NEXT:    cmeq.4s v6, v2, v1
+; CHECK-NEXT:    cmeq.4s v7, v3, v1
+; CHECK-NEXT:    cmeq.4s v1, v4, v1
+; CHECK-NEXT:    bic.16b v3, v3, v7
+; CHECK-NEXT:    bic.16b v1, v4, v1
+; CHECK-NEXT:    bic.16b v2, v2, v6
+; CHECK-NEXT:    bic.16b v0, v0, v5
 ; CHECK-NEXT:    ret
   %ext = zext <16 x i8> %a to <16 x i32>
   %cmp = icmp ne <16 x i8> %a, <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
@@ -412,24 +387,21 @@ entry:
 define <16 x i32> @same_sext_used_in_cmp_signed_pred_and_select_v16i32(<16 x i8> %a) {
 ; CHECK-LABEL: same_sext_used_in_cmp_signed_pred_and_select_v16i32:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    movi.16b v1, #10
-; CHECK-NEXT:    sshll.8h v3, v0, #0
+; CHECK-NEXT:    movi.4s v1, #10
 ; CHECK-NEXT:    sshll2.8h v2, v0, #0
-; CHECK-NEXT:    cmgt.16b v0, v0, v1
-; CHECK-NEXT:    ext.16b v1, v3, v3, #8
-; CHECK-NEXT:    sshll.8h v5, v0, #0
-; CHECK-NEXT:    sshll2.8h v0, v0, #0
-; CHECK-NEXT:    ext.16b v4, v2, v2, #8
-; CHECK-NEXT:    ext.16b v6, v5, v5, #8
-; CHECK-NEXT:    ext.16b v7, v0, v0, #8
-; CHECK-NEXT:    and.8b v0, v2, v0
-; CHECK-NEXT:    sshll.4s v2, v0, #0
-; CHECK-NEXT:    and.8b v0, v3, v5
-; CHECK-NEXT:    and.8b v1, v1, v6
-; CHECK-NEXT:    and.8b v3, v4, v7
+; CHECK-NEXT:    sshll.8h v0, v0, #0
+; CHECK-NEXT:    sshll2.4s v3, v2, #0
+; CHECK-NEXT:    sshll2.4s v4, v0, #0
 ; CHECK-NEXT:    sshll.4s v0, v0, #0
-; CHECK-NEXT:    sshll.4s v1, v1, #0
-; CHECK-NEXT:    sshll.4s v3, v3, #0
+; CHECK-NEXT:    sshll.4s v2, v2, #0
+; CHECK-NEXT:    cmgt.4s v5, v0, v1
+; CHECK-NEXT:    cmgt.4s v6, v2, v1
+; CHECK-NEXT:    cmgt.4s v7, v3, v1
+; CHECK-NEXT:    cmgt.4s v1, v4, v1
+; CHECK-NEXT:    and.16b v3, v3, v7
+; CHECK-NEXT:    and.16b v1, v4, v1
+; CHECK-NEXT:    and.16b v2, v2, v6
+; CHECK-NEXT:    and.16b v0, v0, v5
 ; CHECK-NEXT:    ret
 entry:
   %ext = sext <16 x i8> %a to <16 x i32>
@@ -441,14 +413,13 @@ entry:
 define <8 x i32> @same_sext_used_in_cmp_eq_and_select_v8i32(<8 x i16> %a) {
 ; CHECK-LABEL: same_sext_used_in_cmp_eq_and_select_v8i32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    movi.8h v1, #10
-; CHECK-NEXT:    ext.16b v2, v0, v0, #8
-; CHECK-NEXT:    cmeq.8h v1, v0, v1
-; CHECK-NEXT:    ext.16b v3, v1, v1, #8
-; CHECK-NEXT:    and.8b v0, v0, v1
+; CHECK-NEXT:    movi.4s v1, #10
+; CHECK-NEXT:    sshll2.4s v2, v0, #0
 ; CHECK-NEXT:    sshll.4s v0, v0, #0
-; CHECK-NEXT:    and.8b v1, v2, v3
-; CHECK-NEXT:    sshll.4s v1, v1, #0
+; CHECK-NEXT:    cmeq.4s v3, v2, v1
+; CHECK-NEXT:    cmeq.4s v4, v0, v1
+; CHECK-NEXT:    and.16b v1, v2, v3
+; CHECK-NEXT:    and.16b v0, v0, v4
 ; CHECK-NEXT:    ret
   %ext = sext <8 x i16> %a to <8 x i32>
   %cmp = icmp eq <8 x i16> %a, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
@@ -459,24 +430,17 @@ define <8 x i32> @same_sext_used_in_cmp_eq_and_select_v8i32(<8 x i16> %a) {
 define <8 x i32> @same_sext_used_in_cmp_eq_and_select_v8i32_from_v8i13(<8 x i13> %a) {
 ; CHECK-LABEL: same_sext_used_in_cmp_eq_and_select_v8i32_from_v8i13:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    movi.8h v1, #10
-; CHECK-NEXT:    mov.16b v2, v0
-; CHECK-NEXT:    bic.8h v2, #224, lsl #8
-; CHECK-NEXT:    ushll2.4s v3, v0, #0
-; CHECK-NEXT:    cmeq.8h v1, v2, v1
+; CHECK-NEXT:    ushll2.4s v2, v0, #0
 ; CHECK-NEXT:    ushll.4s v0, v0, #0
-; CHECK-NEXT:    ushll.4s v2, v1, #0
-; CHECK-NEXT:    ushll2.4s v1, v1, #0
-; CHECK-NEXT:    shl.4s v3, v3, #19
-; CHECK-NEXT:    shl.4s v1, v1, #19
-; CHECK-NEXT:    shl.4s v0, v0, #19
+; CHECK-NEXT:    movi.4s v1, #10
 ; CHECK-NEXT:    shl.4s v2, v2, #19
-; CHECK-NEXT:    sshr.4s v3, v3, #19
-; CHECK-NEXT:    sshr.4s v1, v1, #19
-; CHECK-NEXT:    sshr.4s v0, v0, #19
+; CHECK-NEXT:    shl.4s v0, v0, #19
 ; CHECK-NEXT:    sshr.4s v2, v2, #19
-; CHECK-NEXT:    and.16b v1, v3, v1
-; CHECK-NEXT:    and.16b v0, v0, v2
+; CHECK-NEXT:    sshr.4s v0, v0, #19
+; CHECK-NEXT:    cmeq.4s v3, v2, v1
+; CHECK-NEXT:    cmeq.4s v4, v0, v1
+; CHECK-NEXT:    and.16b v1, v2, v3
+; CHECK-NEXT:    and.16b v0, v0, v4
 ; CHECK-NEXT:    ret
   %ext = sext <8 x i13> %a to <8 x i32>
   %cmp = icmp eq <8 x i13> %a, <i13 10, i13 10, i13 10, i13 10, i13 10, i13 10, i13 10, i13 10>
@@ -487,25 +451,21 @@ define <8 x i32> @same_sext_used_in_cmp_eq_and_select_v8i32_from_v8i13(<8 x i13>
 define <16 x i32> @same_sext_used_in_cmp_ne_and_select_v8i32(<16 x i8> %a) {
 ; CHECK-LABEL: same_sext_used_in_cmp_ne_and_select_v8i32:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    movi.16b v1, #10
-; CHECK-NEXT:    sshll.8h v3, v0, #0
+; CHECK-NEXT:    movi.4s v1, #10
 ; CHECK-NEXT:    sshll2.8h v2, v0, #0
-; CHECK-NEXT:    cmeq.16b v0, v0, v1
-; CHECK-NEXT:    ext.16b v1, v3, v3, #8
-; CHECK-NEXT:    mvn.16b v0, v0
-; CHECK-NEXT:    sshll.8h v5, v0, #0
-; CHECK-NEXT:    sshll2.8h v0, v0, #0
-; CHECK-NEXT:    ext.16b v4, v2, v2, #8
-; CHECK-NEXT:    ext.16b v6, v5, v5, #8
-; CHECK-NEXT:    ext.16b v7, v0, v0, #8
-; CHECK-NEXT:    and.8b v0, v2, v0
-; CHECK-NEXT:    sshll.4s v2, v0, #0
-; CHECK-NEXT:    and.8b v0, v3, v5
-; CHECK-NEXT:    and.8b v1, v1, v6
-; CHECK-NEXT:    and.8b v3, v4, v7
+; CHECK-NEXT:    sshll.8h v0, v0, #0
+; CHECK-NEXT:    sshll2.4s v3, v2, #0
+; CHECK-NEXT:    sshll2.4s v4, v0, #0
 ; CHECK-NEXT:    sshll.4s v0, v0, #0
-; CHECK-NEXT:    sshll.4s v1, v1, #0
-; CHECK-NEXT:    sshll.4s v3, v3, #0
+; CHECK-NEXT:    sshll.4s v2, v2, #0
+; CHECK-NEXT:    cmeq.4s v5, v0, v1
+; CHECK-NEXT:    cmeq.4s v6, v2, v1
+; CHECK-NEXT:    cmeq.4s v7, v3, v1
+; CHECK-NEXT:    cmeq.4s v1, v4, v1
+; CHECK-NEXT:    bic.16b v3, v3, v7
+; CHECK-NEXT:    bic.16b v1, v4, v1
+; CHECK-NEXT:    bic.16b v2, v2, v6
+; CHECK-NEXT:    bic.16b v0, v0, v5
 ; CHECK-NEXT:    ret
   %ext = sext <16 x i8> %a to <16 x i32>
   %cmp = icmp ne <16 x i8> %a, <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10>
@@ -516,14 +476,13 @@ define <16 x i32> @same_sext_used_in_cmp_ne_and_select_v8i32(<16 x i8> %a) {
 define <8 x i32> @same_sext_used_in_cmp_signed_pred_and_select_v8i32(<8 x i16> %a) {
 ; CHECK-LABEL: same_sext_used_in_cmp_signed_pred_and_select_v8i32:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    movi.8h v1, #10
-; CHECK-NEXT:    ext.16b v2, v0, v0, #8
-; CHECK-NEXT:    cmgt.8h v1, v0, v1
-; CHECK-NEXT:    ext.16b v3, v1, v1, #8
-; CHECK-NEXT:    and.8b v0, v0, v1
+; CHECK-NEXT:    movi.4s v1, #10
+; CHECK-NEXT:    sshll2.4s v2, v0, #0
 ; CHECK-NEXT:    sshll.4s v0, v0, #0
-; CHECK-NEXT:    and.8b v1, v2, v3
-; CHECK-NEXT:    sshll.4s v1, v1, #0
+; CHECK-NEXT:    cmgt.4s v3, v2, v1
+; CHECK-NEXT:    cmgt.4s v4, v0, v1
+; CHECK-NEXT:    and.16b v1, v2, v3
+; CHECK-NEXT:    and.16b v0, v0, v4
 ; CHECK-NEXT:    ret
 entry:
   %ext = sext <8 x i16> %a to <8 x i32>
@@ -535,24 +494,17 @@ entry:
 define <8 x i32> @same_sext_used_in_cmp_unsigned_pred_and_select_v8i32_from_v8i15(<8 x i15> %a) {
 ; CHECK-LABEL: same_sext_used_in_cmp_unsigned_pred_and_select_v8i32_from_v8i15:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    movi.8h v1, #10
-; CHECK-NEXT:    shl.8h v2, v0, #1
-; CHECK-NEXT:    ushll2.4s v3, v0, #0
-; CHECK-NEXT:    sshr.8h v2, v2, #1
+; CHECK-NEXT:    ushll2.4s v2, v0, #0
 ; CHECK-NEXT:    ushll.4s v0, v0, #0
-; CHECK-NEXT:    shl.4s v3, v3, #17
-; CHECK-NEXT:    cmge.8h v1, v2, v1
-; CHECK-NEXT:    shl.4s v0, v0, #17
-; CHECK-NEXT:    ushll.4s v2, v1, #0
-; CHECK-NEXT:    sshr.4s v3, v3, #17
-; CHECK-NEXT:    ushll2.4s v1, v1, #0
+; CHECK-NEXT:    movi.4s v1, #10
 ; CHECK-NEXT:    shl.4s v2, v2, #17
-; CHECK-NEXT:    shl.4s v1, v1, #17
-; CHECK-NEXT:    sshr.4s v0, v0, #17
+; CHECK-NEXT:    shl.4s v0, v0, #17
 ; CHECK-NEXT:    sshr.4s v2, v2, #17
-; CHECK-NEXT:    sshr.4s v1, v1, #17
-; CHECK-NEXT:    and.16b v0, v0, v2
-; CHECK-NEXT:    and.16b v1, v3, v1
+; CHECK-NEXT:    sshr.4s v0, v0, #17
+; CHECK-NEXT:    cmge.4s v3, v2, v1
+; CHECK-NEXT:    cmge.4s v4, v0, v1
+; CHECK-NEXT:    and.16b v1, v2, v3
+; CHECK-NEXT:    and.16b v0, v0, v4
 ; CHECK-NEXT:    ret
   %ext = sext <8 x i15> %a to <8 x i32>
   %cmp = icmp sge <8 x i15> %a, <i15 10, i15 10, i15 10, i15 10, i15 10, i15 10, i15 10, i15 10>
