[llvm] 1206f72 - [AArch64] Fold Mul(And(Srl(X, 15), 0x10001), 0xffff) to CMLTz

David Green via llvm-commits llvm-commits at lists.llvm.org
Tue Aug 2 05:02:03 PDT 2022


Author: David Green
Date: 2022-08-02T13:01:59+01:00
New Revision: 1206f72e31f6d67069f1c90c4871b229923008a4

URL: https://github.com/llvm/llvm-project/commit/1206f72e31f6d67069f1c90c4871b229923008a4
DIFF: https://github.com/llvm/llvm-project/commit/1206f72e31f6d67069f1c90c4871b229923008a4.diff

LOG: [AArch64] Fold Mul(And(Srl(X, 15), 0x10001), 0xffff) to CMLTz

This folds a v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) into a v8i16
CMLTz instruction. The Srl and And extract the top bit of each i16 half
(whether that half is negative), and the Mul then sets every bit of the
i16 half to 1 or 0 depending on whether that top bit was set. This is
equivalent to a v8i16 CMLTz instruction. The same applies to other
sizes with equivalent constants.

Differential Revision: https://reviews.llvm.org/D130874
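
For reference, a minimal standalone C++ sketch (illustration only, not part
of this commit; the helper names are made up) that checks the per-lane
identity the fold relies on for the v4i32 -> v8i16 case: for a 32-bit lane X,
((X >> 15) & 0x10001) * 0xffff sets each 16-bit half to 0xffff when the
corresponding i16 half of X is negative and to 0 otherwise, which is what
CMLTz (compare signed less than zero) produces per i16 element.

    // Sketch: compare the shift/and/mul pattern against per-half CMLTz.
    #include <cassert>
    #include <cstdint>

    static uint32_t mulPattern(uint32_t X) {
      // lshr #15, and 0x10001, mul 0xffff -- the pattern matched by the combine.
      return ((X >> 15) & 0x10001u) * 0xffffu;
    }

    static uint32_t cmltzPerHalf(uint32_t X) {
      // CMLTz on each i16 half: all-ones if its sign bit is set, else zero.
      uint32_t Lo = (X & 0x00008000u) ? 0x0000ffffu : 0u;
      uint32_t Hi = (X & 0x80000000u) ? 0xffff0000u : 0u;
      return Hi | Lo;
    }

    int main() {
      assert(mulPattern(0x00008000u) == 0x0000ffffu); // low half negative
      assert(mulPattern(0x80000000u) == 0xffff0000u); // high half negative
      assert(mulPattern(0x7fff7fffu) == 0u);          // both halves non-negative
      for (uint64_t X = 0; X <= 0xffffffffull; X += 0x9239u) // sampled lanes
        assert(mulPattern(static_cast<uint32_t>(X)) ==
               cmltzPerHalf(static_cast<uint32_t>(X)));
      return 0;
    }

The same reasoning carries over to the other handled types, where the shift
amount is half the element size minus one and the constants are the
corresponding half-width masks.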

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/AArch64/insert-extend.ll
    llvm/test/CodeGen/AArch64/mulcmle.ll
    llvm/test/CodeGen/AArch64/reduce-shuffle.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ef2edf288149..fd845fadb375 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14268,12 +14268,49 @@ static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
                      Op1 ? Op1 : Mul->getOperand(1));
 }
 
+// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
+// Same for other types with equivalent constants.
+static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
+      VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
+    return SDValue();
+  if (N->getOperand(0).getOpcode() != ISD::AND ||
+      N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
+    return SDValue();
+
+  SDValue And = N->getOperand(0);
+  SDValue Srl = And.getOperand(0);
+
+  APInt V1, V2, V3;
+  if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
+      !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
+      !ISD::isConstantSplatVector(Srl.getOperand(1).getNode(), V3))
+    return SDValue();
+
+  unsigned HalfSize = VT.getScalarSizeInBits() / 2;
+  if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
+      V3 != (HalfSize - 1))
+    return SDValue();
+
+  EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
+                                EVT::getIntegerVT(*DAG.getContext(), HalfSize),
+                                VT.getVectorElementCount() * 2);
+
+  SDLoc DL(N);
+  SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
+  SDValue CM = DAG.getNode(AArch64ISD::CMLTz, DL, HalfVT, In);
+  return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
+}
+
 static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const AArch64Subtarget *Subtarget) {
 
   if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
     return Ext;
+  if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG))
+    return Ext;
 
   if (DCI.isBeforeLegalizeOps())
     return SDValue();

diff --git a/llvm/test/CodeGen/AArch64/insert-extend.ll b/llvm/test/CodeGen/AArch64/insert-extend.ll
index 7cd4c6537a58..c27ed21e558e 100644
--- a/llvm/test/CodeGen/AArch64/insert-extend.ll
+++ b/llvm/test/CodeGen/AArch64/insert-extend.ll
@@ -118,58 +118,48 @@ define i32 @large(i8* nocapture noundef readonly %p1, i32 noundef %st1, i8* noca
 ; CHECK-NEXT:    zip1 v16.4s, v6.4s, v6.4s
 ; CHECK-NEXT:    sub v2.4s, v2.4s, v7.4s
 ; CHECK-NEXT:    ext v17.16b, v1.16b, v3.16b, #8
-; CHECK-NEXT:    ext v5.16b, v3.16b, v2.16b, #4
-; CHECK-NEXT:    ext v7.16b, v6.16b, v4.16b, #4
+; CHECK-NEXT:    ext v5.16b, v6.16b, v4.16b, #4
+; CHECK-NEXT:    ext v7.16b, v3.16b, v2.16b, #4
 ; CHECK-NEXT:    ext v18.16b, v0.16b, v6.16b, #4
 ; CHECK-NEXT:    trn2 v0.4s, v16.4s, v0.4s
 ; CHECK-NEXT:    ext v16.16b, v17.16b, v1.16b, #4
-; CHECK-NEXT:    zip2 v5.4s, v5.4s, v3.4s
-; CHECK-NEXT:    zip2 v7.4s, v7.4s, v6.4s
+; CHECK-NEXT:    zip2 v7.4s, v7.4s, v3.4s
+; CHECK-NEXT:    zip2 v5.4s, v5.4s, v6.4s
 ; CHECK-NEXT:    ext v18.16b, v18.16b, v18.16b, #4
 ; CHECK-NEXT:    mov v1.s[2], v3.s[1]
-; CHECK-NEXT:    ext v5.16b, v2.16b, v5.16b, #12
-; CHECK-NEXT:    ext v7.16b, v4.16b, v7.16b, #12
+; CHECK-NEXT:    uzp2 v16.4s, v17.4s, v16.4s
+; CHECK-NEXT:    ext v7.16b, v2.16b, v7.16b, #12
+; CHECK-NEXT:    ext v5.16b, v4.16b, v5.16b, #12
 ; CHECK-NEXT:    mov v2.s[2], v3.s[3]
 ; CHECK-NEXT:    mov v4.s[2], v6.s[3]
-; CHECK-NEXT:    uzp2 v16.4s, v17.4s, v16.4s
-; CHECK-NEXT:    sub v19.4s, v0.4s, v18.4s
+; CHECK-NEXT:    sub v17.4s, v0.4s, v18.4s
 ; CHECK-NEXT:    mov v18.s[0], v6.s[1]
-; CHECK-NEXT:    sub v17.4s, v2.4s, v5.4s
-; CHECK-NEXT:    sub v20.4s, v4.4s, v7.4s
-; CHECK-NEXT:    sub v21.4s, v1.4s, v16.4s
+; CHECK-NEXT:    sub v19.4s, v1.4s, v16.4s
+; CHECK-NEXT:    sub v20.4s, v2.4s, v7.4s
+; CHECK-NEXT:    sub v21.4s, v4.4s, v5.4s
+; CHECK-NEXT:    mov v1.s[1], v3.s[0]
 ; CHECK-NEXT:    mov v2.s[1], v3.s[2]
 ; CHECK-NEXT:    mov v4.s[1], v6.s[2]
-; CHECK-NEXT:    mov v1.s[1], v3.s[0]
 ; CHECK-NEXT:    add v0.4s, v0.4s, v18.4s
-; CHECK-NEXT:    add v2.4s, v2.4s, v5.4s
-; CHECK-NEXT:    add v3.4s, v4.4s, v7.4s
 ; CHECK-NEXT:    add v1.4s, v1.4s, v16.4s
-; CHECK-NEXT:    mov v0.d[1], v19.d[1]
-; CHECK-NEXT:    mov v1.d[1], v21.d[1]
-; CHECK-NEXT:    mov v2.d[1], v17.d[1]
-; CHECK-NEXT:    mov v3.d[1], v20.d[1]
-; CHECK-NEXT:    movi v4.8h, #1
-; CHECK-NEXT:    movi v17.2d, #0x00ffff0000ffff
-; CHECK-NEXT:    ushr v5.4s, v2.4s, #15
-; CHECK-NEXT:    ushr v6.4s, v0.4s, #15
-; CHECK-NEXT:    ushr v7.4s, v3.4s, #15
-; CHECK-NEXT:    ushr v16.4s, v1.4s, #15
-; CHECK-NEXT:    and v6.16b, v6.16b, v4.16b
-; CHECK-NEXT:    and v16.16b, v16.16b, v4.16b
-; CHECK-NEXT:    and v7.16b, v7.16b, v4.16b
-; CHECK-NEXT:    and v4.16b, v5.16b, v4.16b
-; CHECK-NEXT:    mul v5.4s, v6.4s, v17.4s
-; CHECK-NEXT:    mul v6.4s, v16.4s, v17.4s
-; CHECK-NEXT:    mul v4.4s, v4.4s, v17.4s
-; CHECK-NEXT:    mul v7.4s, v7.4s, v17.4s
-; CHECK-NEXT:    add v0.4s, v5.4s, v0.4s
-; CHECK-NEXT:    add v1.4s, v6.4s, v1.4s
+; CHECK-NEXT:    add v2.4s, v2.4s, v7.4s
+; CHECK-NEXT:    add v3.4s, v4.4s, v5.4s
+; CHECK-NEXT:    mov v2.d[1], v20.d[1]
+; CHECK-NEXT:    mov v3.d[1], v21.d[1]
+; CHECK-NEXT:    mov v0.d[1], v17.d[1]
+; CHECK-NEXT:    mov v1.d[1], v19.d[1]
+; CHECK-NEXT:    cmlt v4.8h, v2.8h, #0
+; CHECK-NEXT:    cmlt v5.8h, v3.8h, #0
+; CHECK-NEXT:    cmlt v6.8h, v0.8h, #0
+; CHECK-NEXT:    cmlt v7.8h, v1.8h, #0
+; CHECK-NEXT:    add v0.4s, v6.4s, v0.4s
+; CHECK-NEXT:    add v1.4s, v7.4s, v1.4s
 ; CHECK-NEXT:    add v2.4s, v4.4s, v2.4s
-; CHECK-NEXT:    add v3.4s, v7.4s, v3.4s
+; CHECK-NEXT:    add v3.4s, v5.4s, v3.4s
 ; CHECK-NEXT:    eor v2.16b, v2.16b, v4.16b
-; CHECK-NEXT:    eor v3.16b, v3.16b, v7.16b
-; CHECK-NEXT:    eor v1.16b, v1.16b, v6.16b
-; CHECK-NEXT:    eor v0.16b, v0.16b, v5.16b
+; CHECK-NEXT:    eor v3.16b, v3.16b, v5.16b
+; CHECK-NEXT:    eor v1.16b, v1.16b, v7.16b
+; CHECK-NEXT:    eor v0.16b, v0.16b, v6.16b
 ; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    add v1.4s, v3.4s, v2.4s
 ; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s

diff --git a/llvm/test/CodeGen/AArch64/mulcmle.ll b/llvm/test/CodeGen/AArch64/mulcmle.ll
index 3b2c09cef851..8a359dfc232f 100644
--- a/llvm/test/CodeGen/AArch64/mulcmle.ll
+++ b/llvm/test/CodeGen/AArch64/mulcmle.ll
@@ -4,13 +4,7 @@
 define <1 x i64> @v1i64(<1 x i64> %a) {
 ; CHECK-LABEL: v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.2s, #1
-; CHECK-NEXT:    ushr d0, d0, #31
-; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    lsl x9, x8, #32
-; CHECK-NEXT:    sub x8, x9, x8
-; CHECK-NEXT:    fmov d0, x8
+; CHECK-NEXT:    cmlt v0.2s, v0.2s, #0
 ; CHECK-NEXT:    ret
   %b = lshr <1 x i64> %a, <i64 31>
   %c = and <1 x i64> %b, <i64 4294967297>
@@ -21,17 +15,7 @@ define <1 x i64> @v1i64(<1 x i64> %a) {
 define <2 x i64> @v2i64(<2 x i64> %a) {
 ; CHECK-LABEL: v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.4s, #1
-; CHECK-NEXT:    ushr v0.2d, v0.2d, #31
-; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    fmov x9, d0
-; CHECK-NEXT:    mov x8, v0.d[1]
-; CHECK-NEXT:    lsl x10, x9, #32
-; CHECK-NEXT:    sub x9, x10, x9
-; CHECK-NEXT:    lsl x10, x8, #32
-; CHECK-NEXT:    sub x8, x10, x8
-; CHECK-NEXT:    fmov d0, x9
-; CHECK-NEXT:    mov v0.d[1], x8
+; CHECK-NEXT:    cmlt v0.4s, v0.4s, #0
 ; CHECK-NEXT:    ret
   %b = lshr <2 x i64> %a, <i64 31, i64 31>
   %c = and <2 x i64> %b, <i64 4294967297, i64 4294967297>
@@ -42,11 +26,7 @@ define <2 x i64> @v2i64(<2 x i64> %a) {
 define <2 x i32> @v2i32(<2 x i32> %a) {
 ; CHECK-LABEL: v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.4h, #1
-; CHECK-NEXT:    movi d2, #0x00ffff0000ffff
-; CHECK-NEXT:    ushr v0.2s, v0.2s, #15
-; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    mul v0.2s, v0.2s, v2.2s
+; CHECK-NEXT:    cmlt v0.4h, v0.4h, #0
 ; CHECK-NEXT:    ret
   %b = lshr <2 x i32> %a, <i32 15, i32 15>
   %c = and <2 x i32> %b, <i32 65537, i32 65537>
@@ -57,11 +37,7 @@ define <2 x i32> @v2i32(<2 x i32> %a) {
 define <4 x i32> @v4i32(<4 x i32> %a) {
 ; CHECK-LABEL: v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.8h, #1
-; CHECK-NEXT:    ushr v0.4s, v0.4s, #15
-; CHECK-NEXT:    movi v2.2d, #0x00ffff0000ffff
-; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    mul v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
 ; CHECK-NEXT:    ret
   %b = lshr <4 x i32> %a, <i32 15, i32 15, i32 15, i32 15>
   %c = and <4 x i32> %b, <i32 65537, i32 65537, i32 65537, i32 65537>
@@ -72,14 +48,8 @@ define <4 x i32> @v4i32(<4 x i32> %a) {
 define <8 x i32> @v8i32(<8 x i32> %a) {
 ; CHECK-LABEL: v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.8h, #1
-; CHECK-NEXT:    ushr v1.4s, v1.4s, #15
-; CHECK-NEXT:    movi v3.2d, #0x00ffff0000ffff
-; CHECK-NEXT:    ushr v0.4s, v0.4s, #15
-; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    mul v0.4s, v0.4s, v3.4s
-; CHECK-NEXT:    mul v1.4s, v1.4s, v3.4s
+; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT:    cmlt v1.8h, v1.8h, #0
 ; CHECK-NEXT:    ret
   %b = lshr <8 x i32> %a, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
   %c = and <8 x i32> %b, <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
@@ -90,11 +60,7 @@ define <8 x i32> @v8i32(<8 x i32> %a) {
 define <4 x i16> @v4i16(<4 x i16> %a) {
 ; CHECK-LABEL: v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.8b, #1
-; CHECK-NEXT:    movi d2, #0xff00ff00ff00ff
-; CHECK-NEXT:    ushr v0.4h, v0.4h, #7
-; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    mul v0.4h, v0.4h, v2.4h
+; CHECK-NEXT:    cmlt v0.8b, v0.8b, #0
 ; CHECK-NEXT:    ret
   %b = lshr <4 x i16> %a, <i16 7, i16 7, i16 7, i16 7>
   %c = and <4 x i16> %b, <i16 257, i16 257, i16 257, i16 257>
@@ -105,11 +71,7 @@ define <4 x i16> @v4i16(<4 x i16> %a) {
 define <8 x i16> @v8i16(<8 x i16> %a) {
 ; CHECK-LABEL: v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.16b, #1
-; CHECK-NEXT:    ushr v0.8h, v0.8h, #7
-; CHECK-NEXT:    movi v2.2d, #0xff00ff00ff00ff
-; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    mul v0.8h, v0.8h, v2.8h
+; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
 ; CHECK-NEXT:    ret
   %b = lshr <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
   %c = and <8 x i16> %b, <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257>

diff --git a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll
index caba5a61890b..797f3724c98a 100644
--- a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll
@@ -93,47 +93,37 @@ define i32 @v1(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur
 ; CHECK-NEXT:    ext v17.16b, v3.16b, v17.16b, #12
 ; CHECK-NEXT:    mov v0.s[2], v2.s[1]
 ; CHECK-NEXT:    uzp2 v4.4s, v4.4s, v18.4s
-; CHECK-NEXT:    mov v7.s[2], v2.s[3]
 ; CHECK-NEXT:    mov v3.s[2], v5.s[3]
+; CHECK-NEXT:    mov v7.s[2], v2.s[3]
 ; CHECK-NEXT:    sub v18.4s, v1.4s, v6.4s
 ; CHECK-NEXT:    mov v6.s[0], v5.s[1]
 ; CHECK-NEXT:    sub v19.4s, v0.4s, v4.4s
-; CHECK-NEXT:    sub v20.4s, v7.4s, v16.4s
-; CHECK-NEXT:    sub v21.4s, v3.4s, v17.4s
+; CHECK-NEXT:    sub v20.4s, v3.4s, v17.4s
+; CHECK-NEXT:    sub v21.4s, v7.4s, v16.4s
 ; CHECK-NEXT:    mov v0.s[1], v2.s[0]
-; CHECK-NEXT:    mov v7.s[1], v2.s[2]
 ; CHECK-NEXT:    mov v3.s[1], v5.s[2]
+; CHECK-NEXT:    mov v7.s[1], v2.s[2]
 ; CHECK-NEXT:    add v1.4s, v1.4s, v6.4s
 ; CHECK-NEXT:    add v0.4s, v0.4s, v4.4s
-; CHECK-NEXT:    add v2.4s, v7.4s, v16.4s
-; CHECK-NEXT:    add v3.4s, v3.4s, v17.4s
-; CHECK-NEXT:    mov v2.d[1], v20.d[1]
-; CHECK-NEXT:    mov v3.d[1], v21.d[1]
+; CHECK-NEXT:    add v2.4s, v3.4s, v17.4s
+; CHECK-NEXT:    add v3.4s, v7.4s, v16.4s
 ; CHECK-NEXT:    mov v1.d[1], v18.d[1]
 ; CHECK-NEXT:    mov v0.d[1], v19.d[1]
-; CHECK-NEXT:    movi v4.8h, #1
-; CHECK-NEXT:    movi v17.2d, #0x00ffff0000ffff
-; CHECK-NEXT:    ushr v5.4s, v1.4s, #15
-; CHECK-NEXT:    ushr v6.4s, v2.4s, #15
-; CHECK-NEXT:    ushr v7.4s, v0.4s, #15
-; CHECK-NEXT:    ushr v16.4s, v3.4s, #15
-; CHECK-NEXT:    and v6.16b, v6.16b, v4.16b
-; CHECK-NEXT:    and v16.16b, v16.16b, v4.16b
-; CHECK-NEXT:    and v7.16b, v7.16b, v4.16b
-; CHECK-NEXT:    and v4.16b, v5.16b, v4.16b
-; CHECK-NEXT:    mul v5.4s, v6.4s, v17.4s
-; CHECK-NEXT:    mul v6.4s, v16.4s, v17.4s
-; CHECK-NEXT:    mul v4.4s, v4.4s, v17.4s
-; CHECK-NEXT:    mul v7.4s, v7.4s, v17.4s
-; CHECK-NEXT:    add v2.4s, v5.4s, v2.4s
+; CHECK-NEXT:    mov v3.d[1], v21.d[1]
+; CHECK-NEXT:    mov v2.d[1], v20.d[1]
+; CHECK-NEXT:    cmlt v4.8h, v1.8h, #0
+; CHECK-NEXT:    cmlt v5.8h, v0.8h, #0
+; CHECK-NEXT:    cmlt v6.8h, v3.8h, #0
+; CHECK-NEXT:    cmlt v7.8h, v2.8h, #0
 ; CHECK-NEXT:    add v3.4s, v6.4s, v3.4s
+; CHECK-NEXT:    add v2.4s, v7.4s, v2.4s
 ; CHECK-NEXT:    add v1.4s, v4.4s, v1.4s
-; CHECK-NEXT:    add v0.4s, v7.4s, v0.4s
+; CHECK-NEXT:    add v0.4s, v5.4s, v0.4s
 ; CHECK-NEXT:    eor v1.16b, v1.16b, v4.16b
-; CHECK-NEXT:    eor v0.16b, v0.16b, v7.16b
+; CHECK-NEXT:    eor v0.16b, v0.16b, v5.16b
+; CHECK-NEXT:    eor v2.16b, v2.16b, v7.16b
 ; CHECK-NEXT:    eor v3.16b, v3.16b, v6.16b
-; CHECK-NEXT:    eor v2.16b, v2.16b, v5.16b
-; CHECK-NEXT:    add v2.4s, v3.4s, v2.4s
+; CHECK-NEXT:    add v2.4s, v2.4s, v3.4s
 ; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    addv s0, v0.4s
@@ -316,50 +306,40 @@ define i32 @v2(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur
 ; CHECK-NEXT:    mov v5.d[1], v1.d[1]
 ; CHECK-NEXT:    mov v7.d[1], v17.d[1]
 ; CHECK-NEXT:    mov v0.d[1], v3.d[1]
-; CHECK-NEXT:    movi v1.8h, #1
-; CHECK-NEXT:    add v2.4s, v7.4s, v5.4s
-; CHECK-NEXT:    add v3.4s, v0.4s, v4.4s
-; CHECK-NEXT:    sub v5.4s, v5.4s, v7.4s
+; CHECK-NEXT:    add v1.4s, v7.4s, v5.4s
+; CHECK-NEXT:    add v2.4s, v0.4s, v4.4s
 ; CHECK-NEXT:    sub v0.4s, v4.4s, v0.4s
-; CHECK-NEXT:    ext v4.16b, v3.16b, v3.16b, #4
-; CHECK-NEXT:    zip2 v6.4s, v0.4s, v3.4s
-; CHECK-NEXT:    zip2 v7.4s, v5.4s, v2.4s
-; CHECK-NEXT:    zip1 v16.4s, v2.4s, v5.4s
-; CHECK-NEXT:    zip2 v17.4s, v2.4s, v5.4s
-; CHECK-NEXT:    ext v2.16b, v2.16b, v2.16b, #4
-; CHECK-NEXT:    add v6.4s, v6.4s, v7.4s
-; CHECK-NEXT:    zip2 v7.4s, v3.4s, v0.4s
-; CHECK-NEXT:    zip1 v3.4s, v3.4s, v0.4s
+; CHECK-NEXT:    ext v4.16b, v2.16b, v2.16b, #4
+; CHECK-NEXT:    ext v16.16b, v1.16b, v1.16b, #4
+; CHECK-NEXT:    sub v3.4s, v5.4s, v7.4s
+; CHECK-NEXT:    zip2 v5.4s, v0.4s, v2.4s
+; CHECK-NEXT:    zip1 v6.4s, v1.4s, v3.4s
+; CHECK-NEXT:    zip2 v7.4s, v1.4s, v3.4s
+; CHECK-NEXT:    zip2 v1.4s, v3.4s, v1.4s
+; CHECK-NEXT:    zip1 v17.4s, v2.4s, v0.4s
+; CHECK-NEXT:    zip2 v2.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    ext v0.16b, v4.16b, v0.16b, #8
-; CHECK-NEXT:    ext v5.16b, v2.16b, v5.16b, #8
-; CHECK-NEXT:    sub v7.4s, v17.4s, v7.4s
-; CHECK-NEXT:    sub v3.4s, v16.4s, v3.4s
+; CHECK-NEXT:    ext v3.16b, v16.16b, v3.16b, #8
+; CHECK-NEXT:    add v1.4s, v5.4s, v1.4s
+; CHECK-NEXT:    sub v5.4s, v6.4s, v17.4s
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v4.16b, #4
-; CHECK-NEXT:    ext v2.16b, v5.16b, v2.16b, #4
-; CHECK-NEXT:    movi v17.2d, #0x00ffff0000ffff
-; CHECK-NEXT:    ushr v5.4s, v3.4s, #15
-; CHECK-NEXT:    ushr v4.4s, v6.4s, #15
-; CHECK-NEXT:    ushr v16.4s, v7.4s, #15
-; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    and v5.16b, v5.16b, v1.16b
-; CHECK-NEXT:    mul v2.4s, v5.4s, v17.4s
-; CHECK-NEXT:    ushr v5.4s, v0.4s, #15
-; CHECK-NEXT:    and v4.16b, v4.16b, v1.16b
-; CHECK-NEXT:    and v16.16b, v16.16b, v1.16b
-; CHECK-NEXT:    and v1.16b, v5.16b, v1.16b
-; CHECK-NEXT:    mul v4.4s, v4.4s, v17.4s
-; CHECK-NEXT:    mul v16.4s, v16.4s, v17.4s
-; CHECK-NEXT:    mul v1.4s, v1.4s, v17.4s
-; CHECK-NEXT:    add v3.4s, v2.4s, v3.4s
-; CHECK-NEXT:    add v5.4s, v4.4s, v6.4s
-; CHECK-NEXT:    add v6.4s, v16.4s, v7.4s
-; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    eor v2.16b, v3.16b, v2.16b
-; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    eor v1.16b, v6.16b, v16.16b
-; CHECK-NEXT:    eor v3.16b, v5.16b, v4.16b
-; CHECK-NEXT:    add v1.4s, v3.4s, v1.4s
-; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    ext v3.16b, v3.16b, v16.16b, #4
+; CHECK-NEXT:    cmlt v6.8h, v5.8h, #0
+; CHECK-NEXT:    sub v2.4s, v7.4s, v2.4s
+; CHECK-NEXT:    add v4.4s, v6.4s, v5.4s
+; CHECK-NEXT:    add v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    cmlt v7.8h, v2.8h, #0
+; CHECK-NEXT:    cmlt v17.8h, v1.8h, #0
+; CHECK-NEXT:    eor v3.16b, v4.16b, v6.16b
+; CHECK-NEXT:    cmlt v4.8h, v0.8h, #0
+; CHECK-NEXT:    add v1.4s, v17.4s, v1.4s
+; CHECK-NEXT:    add v2.4s, v7.4s, v2.4s
+; CHECK-NEXT:    add v0.4s, v4.4s, v0.4s
+; CHECK-NEXT:    eor v2.16b, v2.16b, v7.16b
+; CHECK-NEXT:    eor v1.16b, v1.16b, v17.16b
+; CHECK-NEXT:    eor v0.16b, v0.16b, v4.16b
+; CHECK-NEXT:    add v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    add v0.4s, v0.4s, v3.4s
 ; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w8, s0
@@ -557,25 +537,15 @@ define i32 @v3(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur
 ; CHECK-NEXT:    ext v2.16b, v6.16b, v2.16b, #8
 ; CHECK-NEXT:    ext v0.16b, v3.16b, v0.16b, #8
 ; CHECK-NEXT:    add v1.4s, v16.4s, v1.4s
-; CHECK-NEXT:    movi v16.8h, #1
+; CHECK-NEXT:    sub v4.4s, v4.4s, v17.4s
 ; CHECK-NEXT:    ext v2.16b, v2.16b, v6.16b, #4
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v3.16b, #4
 ; CHECK-NEXT:    sub v3.4s, v5.4s, v7.4s
-; CHECK-NEXT:    sub v4.4s, v4.4s, v17.4s
-; CHECK-NEXT:    ushr v5.4s, v1.4s, #15
+; CHECK-NEXT:    cmlt v5.8h, v4.8h, #0
+; CHECK-NEXT:    cmlt v6.8h, v3.8h, #0
 ; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    ushr v6.4s, v3.4s, #15
-; CHECK-NEXT:    movi v17.2d, #0x00ffff0000ffff
-; CHECK-NEXT:    and v2.16b, v5.16b, v16.16b
-; CHECK-NEXT:    ushr v5.4s, v4.4s, #15
-; CHECK-NEXT:    ushr v7.4s, v0.4s, #15
-; CHECK-NEXT:    and v6.16b, v6.16b, v16.16b
-; CHECK-NEXT:    and v7.16b, v7.16b, v16.16b
-; CHECK-NEXT:    and v5.16b, v5.16b, v16.16b
-; CHECK-NEXT:    mul v2.4s, v2.4s, v17.4s
-; CHECK-NEXT:    mul v6.4s, v6.4s, v17.4s
-; CHECK-NEXT:    mul v5.4s, v5.4s, v17.4s
-; CHECK-NEXT:    mul v7.4s, v7.4s, v17.4s
+; CHECK-NEXT:    cmlt v2.8h, v1.8h, #0
+; CHECK-NEXT:    cmlt v7.8h, v0.8h, #0
 ; CHECK-NEXT:    add v1.4s, v2.4s, v1.4s
 ; CHECK-NEXT:    add v3.4s, v6.4s, v3.4s
 ; CHECK-NEXT:    add v4.4s, v5.4s, v4.4s


        

