[llvm] 13db749 - [AArch64] Peephole optimization: merge AND and TST instructions

Sjoerd Meijer via llvm-commits llvm-commits@lists.llvm.org
Thu Feb 27 01:28:18 PST 2020


Author: Sjoerd Meijer
Date: 2020-02-27T09:23:47Z
New Revision: 13db7490fa67e22605dec4ab824121230b0fd928

URL: https://github.com/llvm/llvm-project/commit/13db7490fa67e22605dec4ab824121230b0fd928
DIFF: https://github.com/llvm/llvm-project/commit/13db7490fa67e22605dec4ab824121230b0fd928.diff

LOG: [AArch64] Peephole optimization: merge AND and TST instructions

In some cases Clang does not merge an AND and a TST instruction into a single
ANDS (TST is an alias for ANDS with XZR as the destination register).

Example:

  tst x2, x1
  and x3, x2, x1

can be merged into:

  ands x3, x2, x1

This patch adds such merging during instruction selection: when the AND is
replaced with an ANDS instruction in LowerSELECT_CC, all users of the AND are
also rewritten to use the result of this ANDS instruction.
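
For illustration, here is a hypothetical C function (not taken from the patch)
of the kind that can produce this pattern: the masked value is needed both as
data and for a comparison against zero.

  long f(long in, long bit) {
    long masked = in & bit;  // AND: the result is used as data below
    if (masked == 0)         // TST: the same result compared against zero
      return 0;
    return masked;
  }

With the merging in place, a single ANDS both produces the value and sets the
flags for the branch.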

Short discussion on mailing list:
http://llvm.1065342.n5.nabble.com/llvm-dev-ARM-Peephole-optimization-instructions-tst-add-tp133109.html

Patch by Pavel Kosov.

Differential Revision: https://reviews.llvm.org/D71701

Added: 
    llvm/test/CodeGen/AArch64/peephole-and-tst.ll

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/AArch64/funnel-shift.ll
    llvm/test/CodeGen/AArch64/shift-by-signext.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 43ddd975a9cb..376688825c06 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -2702,7 +2702,8 @@ bool AArch64DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
     // bits that are implicitly ANDed off by the above opcodes and if so, skip
     // the AND.
     uint64_t MaskImm;
-    if (!isOpcWithIntImmediate(ShiftAmt.getNode(), ISD::AND, MaskImm))
+    if (!isOpcWithIntImmediate(ShiftAmt.getNode(), ISD::AND, MaskImm) &&
+        !isOpcWithIntImmediate(ShiftAmt.getNode(), AArch64ISD::ANDS, MaskImm))
       return false;
 
     if (countTrailingOnes(MaskImm) < Bits)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 6f2c2bc2817c..a3305db99558 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1754,14 +1754,22 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
     // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ?
     Opcode = AArch64ISD::ADDS;
     LHS = LHS.getOperand(1);
-  } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) &&
-             !isUnsignedIntSetCC(CC)) {
-    // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
-    // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
-    // of the signed comparisons.
-    Opcode = AArch64ISD::ANDS;
-    RHS = LHS.getOperand(1);
-    LHS = LHS.getOperand(0);
+  } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
+    if (LHS.getOpcode() == ISD::AND) {
+      // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
+      // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
+      // of the signed comparisons.
+      const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
+                                           DAG.getVTList(VT, MVT_CC),
+                                           LHS.getOperand(0),
+                                           LHS.getOperand(1));
+      // Replace all users of (and X, Y) with newly generated (ands X, Y)
+      DAG.ReplaceAllUsesWith(LHS, ANDSNode);
+      return ANDSNode.getValue(1);
+    } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
+      // Use result of ANDS
+      return LHS.getValue(1);
+    }
   }
 
   return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)

diff --git a/llvm/test/CodeGen/AArch64/funnel-shift.ll b/llvm/test/CodeGen/AArch64/funnel-shift.ll
index 67ca729894e5..98815fe69559 100644
--- a/llvm/test/CodeGen/AArch64/funnel-shift.ll
+++ b/llvm/test/CodeGen/AArch64/funnel-shift.ll
@@ -18,12 +18,11 @@ declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
 define i32 @fshl_i32(i32 %x, i32 %y, i32 %z) {
 ; CHECK-LABEL: fshl_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w9, w2, #0x1f
+; CHECK-NEXT:    ands w9, w2, #0x1f
 ; CHECK-NEXT:    neg w9, w9
 ; CHECK-NEXT:    lsl w8, w0, w2
 ; CHECK-NEXT:    lsr w9, w1, w9
 ; CHECK-NEXT:    orr w8, w8, w9
-; CHECK-NEXT:    tst w2, #0x1f
 ; CHECK-NEXT:    csel w0, w0, w8, eq
 ; CHECK-NEXT:    ret
   %f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
@@ -146,12 +145,11 @@ define i8 @fshl_i8_const_fold() {
 define i32 @fshr_i32(i32 %x, i32 %y, i32 %z) {
 ; CHECK-LABEL: fshr_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w9, w2, #0x1f
+; CHECK-NEXT:    ands w9, w2, #0x1f
 ; CHECK-NEXT:    neg w9, w9
 ; CHECK-NEXT:    lsr w8, w1, w2
 ; CHECK-NEXT:    lsl w9, w0, w9
 ; CHECK-NEXT:    orr w8, w9, w8
-; CHECK-NEXT:    tst w2, #0x1f
 ; CHECK-NEXT:    csel w0, w1, w8, eq
 ; CHECK-NEXT:    ret
   %f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)

diff --git a/llvm/test/CodeGen/AArch64/peephole-and-tst.ll b/llvm/test/CodeGen/AArch64/peephole-and-tst.ll
new file mode 100644
index 000000000000..4feee1f4fdfb
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/peephole-and-tst.ll
@@ -0,0 +1,81 @@
+; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s
+
+%struct.anon = type { i32*, i32* }
+
+@ptr_wrapper = common dso_local local_unnamed_addr global %struct.anon* null, align 8
+
+define dso_local i32 @test_func_i32_two_uses(i32 %in, i32 %bit, i32 %mask) local_unnamed_addr {
+entry:
+  %0 = load %struct.anon*, %struct.anon** @ptr_wrapper, align 8
+  %result = getelementptr inbounds %struct.anon, %struct.anon* %0, i64 0, i32 1
+  %tobool2 = icmp ne i32 %mask, 0
+  br label %do.body
+
+do.body:                                          ; preds = %4, %entry
+; CHECK-LABEL: test_func_i32_two_uses:
+; CHECK: ands [[DSTREG:w[0-9]+]]
+; Usage #1
+; CHECK: cmp [[DSTREG]]
+; Usage #2
+; CHECK: cbz [[DSTREG]]
+  %bit.addr.0 = phi i32 [ %bit, %entry ], [ %shl, %4 ]
+  %retval1.0 = phi i32 [ 0, %entry ], [ %retval1.1, %4 ]
+  %and = and i32 %bit.addr.0, %in
+  %tobool = icmp eq i32 %and, 0
+  %not.tobool = xor i1 %tobool, true
+  %inc = zext i1 %not.tobool to i32
+  %retval1.1 = add nuw nsw i32 %retval1.0, %inc
+  %1 = xor i1 %tobool, true
+  %2 = or i1 %tobool2, %1
+  %dummy = and i32 %mask, %in
+  %use_and = icmp eq i32 %and, %dummy
+  %dummy_or = or i1 %use_and, %2
+  br i1 %dummy_or, label %3, label %4
+
+3:                                                ; preds = %do.body
+  store i32* null, i32** %result, align 8
+  br label %4
+
+4:                                                ; preds = %do.body, %3
+  %shl = shl i32 %bit.addr.0, 1
+  %tobool6 = icmp eq i32 %shl, 0
+  br i1 %tobool6, label %do.end, label %do.body
+
+do.end:                                           ; preds = %4
+  ret i32 %retval1.1
+}
+
+define dso_local i32 @test_func_i64_one_use(i64 %in, i64 %bit, i64 %mask) local_unnamed_addr #0 {
+entry:
+  %0 = load %struct.anon*, %struct.anon** @ptr_wrapper, align 8
+  %result = getelementptr inbounds %struct.anon, %struct.anon* %0, i64 0, i32 1
+  %tobool2 = icmp ne i64 %mask, 0
+  br label %do.body
+
+do.body:                                          ; preds = %4, %entry
+; CHECK-LABEL: test_func_i64_one_use:
+; CHECK: ands [[DSTREG:x[0-9]+]], [[SRCREG1:x[0-9]+]], [[SRCREG2:x[0-9]+]]
+; CHECK-NEXT: orr [[DSTREG]], [[SRCREG_ORR:x[0-9]+]], [[DSTREG]]
+  %bit.addr.0 = phi i64 [ %bit, %entry ], [ %shl, %4 ]
+  %retval1.0 = phi i32 [ 0, %entry ], [ %retval1.1, %4 ]
+  %and = and i64 %bit.addr.0, %in
+  %tobool = icmp eq i64 %and, 0
+  %not.tobool = xor i1 %tobool, true
+  %inc = zext i1 %not.tobool to i32
+  %retval1.1 = add nuw nsw i32 %retval1.0, %inc
+  %1 = xor i1 %tobool, true
+  %2 = or i1 %tobool2, %1
+  br i1 %2, label %3, label %4
+
+3:                                                ; preds = %do.body
+  store i32* null, i32** %result, align 8
+  br label %4
+
+4:                                                ; preds = %do.body, %3
+  %shl = shl i64 %bit.addr.0, 1
+  %tobool6 = icmp eq i64 %shl, 0
+  br i1 %tobool6, label %do.end, label %do.body
+
+do.end:                                           ; preds = %4
+  ret i32 %retval1.1
+}

diff --git a/llvm/test/CodeGen/AArch64/shift-by-signext.ll b/llvm/test/CodeGen/AArch64/shift-by-signext.ll
index 8e6a4d2500c5..2c2abe8e7bc7 100644
--- a/llvm/test/CodeGen/AArch64/shift-by-signext.ll
+++ b/llvm/test/CodeGen/AArch64/shift-by-signext.ll
@@ -80,12 +80,11 @@ declare i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 %c)
 define i32 @n6_fshl(i32 %x, i32 %y, i8 %shamt) nounwind {
 ; CHECK-LABEL: n6_fshl:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w9, w2, #0x1f
+; CHECK-NEXT:    ands w9, w2, #0x1f
 ; CHECK-NEXT:    neg w9, w9
 ; CHECK-NEXT:    lsl w8, w0, w2
 ; CHECK-NEXT:    lsr w9, w1, w9
 ; CHECK-NEXT:    orr w8, w8, w9
-; CHECK-NEXT:    tst w2, #0x1f
 ; CHECK-NEXT:    csel w0, w0, w8, eq
 ; CHECK-NEXT:    ret
   %shamt_wide = sext i8 %shamt to i32
@@ -95,12 +94,11 @@ define i32 @n6_fshl(i32 %x, i32 %y, i8 %shamt) nounwind {
 define i32 @n7_fshr(i32 %x, i32 %y, i8 %shamt) nounwind {
 ; CHECK-LABEL: n7_fshr:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w9, w2, #0x1f
+; CHECK-NEXT:    ands w9, w2, #0x1f
 ; CHECK-NEXT:    neg w9, w9
 ; CHECK-NEXT:    lsr w8, w1, w2
 ; CHECK-NEXT:    lsl w9, w0, w9
 ; CHECK-NEXT:    orr w8, w9, w8
-; CHECK-NEXT:    tst w2, #0x1f
 ; CHECK-NEXT:    csel w0, w1, w8, eq
 ; CHECK-NEXT:    ret
   %shamt_wide = sext i8 %shamt to i32


        

