[llvm] 894f9e5 - [AArch64] Optimize more memcmp when the result is tested for [in]equality with 0

via llvm-commits llvm-commits at lists.llvm.org
Sat Nov 12 19:06:17 PST 2022


Author: zhongyunde
Date: 2022-11-13T11:02:34+08:00
New Revision: 894f9e54f59522f57b7a106b0c2f031260964ce8

URL: https://github.com/llvm/llvm-project/commit/894f9e54f59522f57b7a106b0c2f031260964ce8
DIFF: https://github.com/llvm/llvm-project/commit/894f9e54f59522f57b7a106b0c2f031260964ce8.diff

LOG: [AArch64] Optimize more memcmp when the result is tested for [in]equality with 0

We already support `or (xor a, b), (xor c, d)` after D136244, but that fold
should capture more cases than just bcmp according to the comment on
https://reviews.llvm.org/D136672, so this patch tries to fold longer chains of
such comparisons.
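
An illustrative sketch (not taken from the patch): the typical source shape this
targets is a memcmp/bcmp call whose result is only tested for [in]equality with 0;
the inline expansion of such a call is what produces the or-of-xor chain that the
combine rewrites into a cmp followed by a series of ccmp instructions. The function
below uses a made-up name and assumes an AArch64 target where a 24-byte memcmp is
expanded inline.

    // Minimal C++ sketch, illustrative only.
    #include <cstring>

    bool blocks_equal(const char *a, const char *b) {
      // Only the [in]equality of the result is used, so the call is expanded
      // into wide loads combined with an or-of-xor chain; with this patch the
      // chain is expected to lower to cmp/ccmp/ccmp/cset eq rather than a
      // series of eor/orr reductions followed by cmp #0.
      return std::memcmp(a, b, 24) == 0;
    }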

Also add a new call site in LowerSETCC to handle cases that have been folded into
an AND during the `Optimized type-legalized selection` stage.
Depends on D136244

Reviewed By: dmgreen, bcl5980
Differential Revision: https://reviews.llvm.org/D137721

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/AArch64/bcmp-inline-small.ll
    llvm/test/CodeGen/AArch64/bcmp.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index f5ea029626d88..34f9d6a837839 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -129,6 +129,12 @@ EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
                                          "gather intrinsics"),
                                 cl::init(true));
 
+// The XOR, OR and CMP instructions all use ALU ports, and the data dependency
+// chain can become the bottleneck after this transform on high-end CPUs, so this
+// limit on the number of leaf nodes guards that cmp+ccmp stays profitable.
+static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
+                                 cl::desc("Maximum of xors"));
+
 /// Value type used for condition codes.
 static const MVT MVT_CC = MVT::i32;
 
@@ -8551,6 +8557,77 @@ SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
                      DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
 }
 
+// Check whether N forms a chain of ORs whose leaves are XOR comparisons.
+static bool
+isOrXorChain(SDValue N, unsigned &Num,
+             SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
+  if (Num == MaxXors)
+    return false;
+
+  // The leaf node must be XOR
+  if (N->getOpcode() == ISD::XOR) {
+    WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
+    Num++;
+    return true;
+  }
+
+  // All the non-leaf nodes must be OR.
+  if (N->getOpcode() != ISD::OR || !N->hasOneUse())
+    return false;
+
+  if (isOrXorChain(N->getOperand(0), Num, WorkList) &&
+      isOrXorChain(N->getOperand(1), Num, WorkList))
+    return true;
+  return false;
+}
+
+// Transform chains of ORs and XORs, usually produced by memcmp/bcmp expansion.
+static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) {
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  SmallVector<std::pair<SDValue, SDValue>, 16> WorkList;
+
+  // Only handle integer compares.
+  if (N->getOpcode() != ISD::SETCC)
+    return SDValue();
+
+  ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
+  // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
+  unsigned NumXors = 0;
+  if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
+      LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
+      isOrXorChain(LHS, NumXors, WorkList)) {
+    SDValue CCVal = DAG.getConstant(AArch64CC::EQ, DL, MVT_CC);
+    EVT TstVT = LHS->getValueType(0);
+    SDValue XOR0, XOR1;
+    std::tie(XOR0, XOR1) = WorkList[0];
+    SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL,
+                              DAG.getVTList(TstVT, MVT::i32), XOR0, XOR1);
+    SDValue Overflow = Cmp.getValue(1);
+    SDValue CCmp;
+    for (unsigned I = 1; I < WorkList.size(); I++) {
+      std::tie(XOR0, XOR1) = WorkList[I];
+      SDValue NZCVOp = DAG.getConstant(0, DL, MVT::i32);
+      CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, XOR0, XOR1, NZCVOp,
+                         CCVal, Overflow);
+      Overflow = CCmp;
+    }
+
+    // Exit early by inverting the condition, which helps reduce indentation.
+    SDValue TVal = DAG.getConstant(1, DL, VT);
+    SDValue FVal = DAG.getConstant(0, DL, VT);
+    AArch64CC::CondCode CC = changeIntCCToAArch64CC(Cond);
+    AArch64CC::CondCode InvCC = AArch64CC::getInvertedCondCode(CC);
+    return DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal,
+                       DAG.getConstant(InvCC, DL, MVT::i32), CCmp);
+  }
+
+  return SDValue();
+}
+
 SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
 
   if (Op.getValueType().isVector())
@@ -8586,6 +8663,11 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
     }
   }
 
+  // Handle cases that have been folded into an AND during the `Optimized
+  // type-legalized selection` stage.
+  if (SDValue V = performOrXorChainCombine(Op.getNode(), DAG))
+    return V;
+
   if (LHS.getValueType().isInteger()) {
     SDValue CCVal;
     SDValue Cmp = getAArch64Cmp(
@@ -19625,34 +19707,10 @@ static SDValue performSETCCCombine(SDNode *N,
     }
   }
 
-  // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
-  // cmp A0, A0; ccmp A0, B1, 0, eq; cmp inv(Cond) flag
-  if (!DCI.isBeforeLegalize() && VT.isScalarInteger() &&
-      (Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
-      LHS->getOpcode() == ISD::OR &&
-      (LHS.getOperand(0)->getOpcode() == ISD::XOR &&
-       LHS.getOperand(1)->getOpcode() == ISD::XOR) &&
-      LHS.hasOneUse() && LHS.getOperand(0)->hasOneUse() &&
-      LHS.getOperand(1)->hasOneUse()) {
-    SDValue XOR0 = LHS.getOperand(0);
-    SDValue XOR1 = LHS.getOperand(1);
-    SDValue CCVal = DAG.getConstant(AArch64CC::EQ, DL, MVT_CC);
-    EVT TstVT = LHS->getValueType(0);
-    SDValue Cmp =
-        DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(TstVT, MVT::i32),
-                    XOR0.getOperand(0), XOR0.getOperand(1));
-    SDValue Overflow = Cmp.getValue(1);
-    SDValue NZCVOp = DAG.getConstant(0, DL, MVT::i32);
-    SDValue CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, XOR1.getOperand(0),
-                               XOR1.getOperand(1), NZCVOp, CCVal, Overflow);
-    // Invert CSEL's operands.
-    SDValue TVal = DAG.getConstant(1, DL, VT);
-    SDValue FVal = DAG.getConstant(0, DL, VT);
-    AArch64CC::CondCode CC = changeIntCCToAArch64CC(Cond);
-    AArch64CC::CondCode InvCC = AArch64CC::getInvertedCondCode(CC);
-    return DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal,
-                       DAG.getConstant(InvCC, DL, MVT::i32), CCmp);
-  }
+  // Try to fold memcmp chains when the result is tested for [in]equality with 0.
+  if (!DCI.isBeforeLegalize())
+    if (SDValue V = performOrXorChainCombine(N, DAG))
+      return V;
 
   return SDValue();
 }

diff  --git a/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll b/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll
index 60b0c37cb8535..e032db265e6f2 100644
--- a/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll
+++ b/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll
@@ -70,17 +70,13 @@ define i1 @test_bs(i8* %s1, i8* %s2) optsize {
 ; CHECKN-NEXT:    ldp x8, x9, [x0]
 ; CHECKN-NEXT:    ldp x10, x11, [x1]
 ; CHECKN-NEXT:    ldr x12, [x0, #16]
-; CHECKN-NEXT:    ldr x13, [x1, #16]
-; CHECKN-NEXT:    ldur x14, [x0, #23]
-; CHECKN-NEXT:    eor x8, x8, x10
-; CHECKN-NEXT:    ldur x15, [x1, #23]
-; CHECKN-NEXT:    eor x9, x9, x11
-; CHECKN-NEXT:    eor x10, x12, x13
-; CHECKN-NEXT:    orr x8, x8, x9
-; CHECKN-NEXT:    eor x11, x14, x15
-; CHECKN-NEXT:    orr x9, x10, x11
-; CHECKN-NEXT:    orr x8, x8, x9
-; CHECKN-NEXT:    cmp x8, #0
+; CHECKN-NEXT:    cmp x8, x10
+; CHECKN-NEXT:    ldr x8, [x1, #16]
+; CHECKN-NEXT:    ccmp x9, x11, #0, eq
+; CHECKN-NEXT:    ldur x9, [x0, #23]
+; CHECKN-NEXT:    ldur x10, [x1, #23]
+; CHECKN-NEXT:    ccmp x12, x8, #0, eq
+; CHECKN-NEXT:    ccmp x9, x10, #0, eq
 ; CHECKN-NEXT:    cset w0, eq
 ; CHECKN-NEXT:    ret
 ;

diff  --git a/llvm/test/CodeGen/AArch64/bcmp.ll b/llvm/test/CodeGen/AArch64/bcmp.ll
index 510c64ee1111a..208ef4895795b 100644
--- a/llvm/test/CodeGen/AArch64/bcmp.ll
+++ b/llvm/test/CodeGen/AArch64/bcmp.ll
@@ -39,6 +39,7 @@ define i1 @bcmp2(ptr %a, ptr %b) {
   ret i1 %r
 }
 
+; or (and (xor a, b), C1), (and (xor c, d), C2)
 define i1 @bcmp3(ptr %a, ptr %b) {
 ; CHECK-LABEL: bcmp3:
 ; CHECK:       // %bb.0:
@@ -46,10 +47,8 @@ define i1 @bcmp3(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldrh w9, [x1]
 ; CHECK-NEXT:    ldrb w10, [x0, #2]
 ; CHECK-NEXT:    ldrb w11, [x1, #2]
-; CHECK-NEXT:    eor w8, w8, w9
-; CHECK-NEXT:    eor w9, w10, w11
-; CHECK-NEXT:    orr w8, w8, w9
-; CHECK-NEXT:    cmp w8, #0
+; CHECK-NEXT:    cmp w8, w9
+; CHECK-NEXT:    ccmp w10, w11, #0, eq
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %cr = call i32 @bcmp(ptr %a, ptr %b, i64 3)
@@ -70,6 +69,7 @@ define i1 @bcmp4(ptr %a, ptr %b) {
   ret i1 %r
 }
 
+; or (xor a, b), (and (xor c, d), C2)
 define i1 @bcmp5(ptr %a, ptr %b) {
 ; CHECK-LABEL: bcmp5:
 ; CHECK:       // %bb.0:
@@ -77,10 +77,8 @@ define i1 @bcmp5(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr w9, [x1]
 ; CHECK-NEXT:    ldrb w10, [x0, #4]
 ; CHECK-NEXT:    ldrb w11, [x1, #4]
-; CHECK-NEXT:    eor w8, w8, w9
-; CHECK-NEXT:    eor w9, w10, w11
-; CHECK-NEXT:    orr w8, w8, w9
-; CHECK-NEXT:    cmp w8, #0
+; CHECK-NEXT:    cmp w8, w9
+; CHECK-NEXT:    ccmp w10, w11, #0, eq
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %cr = call i32 @bcmp(ptr %a, ptr %b, i64 5)
@@ -88,6 +86,7 @@ define i1 @bcmp5(ptr %a, ptr %b) {
   ret i1 %r
 }
 
+; or (xor a, b), (and (xor c, d), C2)
 define i1 @bcmp6(ptr %a, ptr %b) {
 ; CHECK-LABEL: bcmp6:
 ; CHECK:       // %bb.0:
@@ -95,10 +94,8 @@ define i1 @bcmp6(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr w9, [x1]
 ; CHECK-NEXT:    ldrh w10, [x0, #4]
 ; CHECK-NEXT:    ldrh w11, [x1, #4]
-; CHECK-NEXT:    eor w8, w8, w9
-; CHECK-NEXT:    eor w9, w10, w11
-; CHECK-NEXT:    orr w8, w8, w9
-; CHECK-NEXT:    cmp w8, #0
+; CHECK-NEXT:    cmp w8, w9
+; CHECK-NEXT:    ccmp w10, w11, #0, eq
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %cr = call i32 @bcmp(ptr %a, ptr %b, i64 6)
@@ -106,6 +103,7 @@ define i1 @bcmp6(ptr %a, ptr %b) {
   ret i1 %r
 }
 
+; or (xor a, b), (xor c, d)
 define i1 @bcmp7(ptr %a, ptr %b) {
 ; CHECK-LABEL: bcmp7:
 ; CHECK:       // %bb.0:
@@ -135,6 +133,7 @@ define i1 @bcmp8(ptr %a, ptr %b) {
   ret i1 %r
 }
 
+; TODO: or (xor a, b), (and (xor c, d), C2)
 define i1 @bcmp9(ptr %a, ptr %b) {
 ; CHECK-LABEL: bcmp9:
 ; CHECK:       // %bb.0:
@@ -295,13 +294,10 @@ define i1 @bcmp24(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldp x8, x9, [x0]
 ; CHECK-NEXT:    ldp x10, x11, [x1]
 ; CHECK-NEXT:    ldr x12, [x0, #16]
-; CHECK-NEXT:    ldr x13, [x1, #16]
-; CHECK-NEXT:    eor x8, x8, x10
-; CHECK-NEXT:    eor x9, x9, x11
-; CHECK-NEXT:    eor x10, x12, x13
-; CHECK-NEXT:    orr x8, x8, x9
-; CHECK-NEXT:    orr x8, x8, x10
-; CHECK-NEXT:    cmp x8, #0
+; CHECK-NEXT:    cmp x8, x10
+; CHECK-NEXT:    ldr x8, [x1, #16]
+; CHECK-NEXT:    ccmp x9, x11, #0, eq
+; CHECK-NEXT:    ccmp x12, x8, #0, eq
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %cr = call i32 @bcmp(ptr %a, ptr %b, i64 24)
@@ -365,20 +361,15 @@ define i1 @bcmp38(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp x8, x9, [x0]
 ; CHECK-NEXT:    ldp x10, x11, [x1]
-; CHECK-NEXT:    ldp x12, x13, [x0, #16]
-; CHECK-NEXT:    ldp x14, x15, [x1, #16]
-; CHECK-NEXT:    eor x8, x8, x10
-; CHECK-NEXT:    eor x9, x9, x11
-; CHECK-NEXT:    ldur x10, [x0, #30]
-; CHECK-NEXT:    orr x8, x8, x9
+; CHECK-NEXT:    cmp x8, x10
+; CHECK-NEXT:    ccmp x9, x11, #0, eq
 ; CHECK-NEXT:    ldur x11, [x1, #30]
-; CHECK-NEXT:    eor x12, x12, x14
-; CHECK-NEXT:    eor x13, x13, x15
-; CHECK-NEXT:    orr x9, x12, x13
-; CHECK-NEXT:    eor x10, x10, x11
-; CHECK-NEXT:    orr x8, x8, x9
-; CHECK-NEXT:    orr x8, x8, x10
-; CHECK-NEXT:    cmp x8, #0
+; CHECK-NEXT:    ldp x8, x9, [x0, #16]
+; CHECK-NEXT:    ldp x12, x10, [x1, #16]
+; CHECK-NEXT:    ccmp x8, x12, #0, eq
+; CHECK-NEXT:    ldur x8, [x0, #30]
+; CHECK-NEXT:    ccmp x9, x10, #0, eq
+; CHECK-NEXT:    ccmp x8, x11, #0, eq
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %cr = call i32 @bcmp(ptr %a, ptr %b, i64 38)
@@ -391,24 +382,18 @@ define i1 @bcmp45(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp x8, x9, [x0]
 ; CHECK-NEXT:    ldp x10, x11, [x1]
-; CHECK-NEXT:    ldp x12, x13, [x0, #16]
-; CHECK-NEXT:    ldp x14, x15, [x1, #16]
-; CHECK-NEXT:    eor x8, x8, x10
-; CHECK-NEXT:    eor x9, x9, x11
-; CHECK-NEXT:    ldr x16, [x0, #32]
-; CHECK-NEXT:    orr x8, x8, x9
-; CHECK-NEXT:    ldr x17, [x1, #32]
-; CHECK-NEXT:    ldur x18, [x0, #37]
-; CHECK-NEXT:    eor x10, x12, x14
-; CHECK-NEXT:    ldur x0, [x1, #37]
-; CHECK-NEXT:    eor x11, x13, x15
-; CHECK-NEXT:    eor x12, x16, x17
-; CHECK-NEXT:    orr x9, x10, x11
-; CHECK-NEXT:    orr x8, x8, x9
-; CHECK-NEXT:    eor x13, x18, x0
-; CHECK-NEXT:    orr x10, x12, x13
-; CHECK-NEXT:    orr x8, x8, x10
-; CHECK-NEXT:    cmp x8, #0
+; CHECK-NEXT:    cmp x8, x10
+; CHECK-NEXT:    ccmp x9, x11, #0, eq
+; CHECK-NEXT:    ldr x11, [x1, #32]
+; CHECK-NEXT:    ldp x8, x9, [x0, #16]
+; CHECK-NEXT:    ldp x12, x10, [x1, #16]
+; CHECK-NEXT:    ccmp x8, x12, #0, eq
+; CHECK-NEXT:    ldr x8, [x0, #32]
+; CHECK-NEXT:    ccmp x9, x10, #0, eq
+; CHECK-NEXT:    ldur x9, [x0, #37]
+; CHECK-NEXT:    ldur x10, [x1, #37]
+; CHECK-NEXT:    ccmp x8, x11, #0, eq
+; CHECK-NEXT:    ccmp x9, x10, #0, eq
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %cr = call i32 @bcmp(ptr %a, ptr %b, i64 45)
@@ -416,33 +401,31 @@ define i1 @bcmp45(ptr %a, ptr %b) {
   ret i1 %r
 }
 
+; Although the large cmp chain may not be profitable on high-end CPUs, we
+; believe it is better on most CPUs, so perform the transform now.
+; 8 xors + 7 ors + 1 cmp need only 6 cycles on a machine with 4 ALU ports:
+;   2 cycles for the xors
+;   3 cycles for the ors
+;   1 cycle for the cmp
 define i1 @bcmp64(ptr %a, ptr %b) {
 ; CHECK-LABEL: bcmp64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp x8, x9, [x0]
 ; CHECK-NEXT:    ldp x10, x11, [x1]
-; CHECK-NEXT:    ldp x12, x13, [x0, #16]
-; CHECK-NEXT:    ldp x14, x15, [x1, #16]
-; CHECK-NEXT:    eor x8, x8, x10
-; CHECK-NEXT:    eor x9, x9, x11
-; CHECK-NEXT:    ldp x16, x17, [x0, #32]
-; CHECK-NEXT:    orr x8, x8, x9
-; CHECK-NEXT:    ldp x18, x2, [x1, #32]
-; CHECK-NEXT:    eor x12, x12, x14
-; CHECK-NEXT:    eor x13, x13, x15
-; CHECK-NEXT:    ldp x3, x0, [x0, #48]
-; CHECK-NEXT:    orr x9, x12, x13
-; CHECK-NEXT:    ldp x10, x11, [x1, #48]
-; CHECK-NEXT:    eor x14, x16, x18
-; CHECK-NEXT:    eor x15, x17, x2
-; CHECK-NEXT:    orr x12, x14, x15
-; CHECK-NEXT:    orr x8, x8, x9
-; CHECK-NEXT:    eor x10, x3, x10
-; CHECK-NEXT:    eor x11, x0, x11
-; CHECK-NEXT:    orr x10, x10, x11
-; CHECK-NEXT:    orr x9, x12, x10
-; CHECK-NEXT:    orr x8, x8, x9
-; CHECK-NEXT:    cmp x8, #0
+; CHECK-NEXT:    cmp x8, x10
+; CHECK-NEXT:    ccmp x9, x11, #0, eq
+; CHECK-NEXT:    ldp x8, x9, [x0, #16]
+; CHECK-NEXT:    ldp x12, x10, [x1, #16]
+; CHECK-NEXT:    ccmp x8, x12, #0, eq
+; CHECK-NEXT:    ldp x8, x11, [x1, #32]
+; CHECK-NEXT:    ccmp x9, x10, #0, eq
+; CHECK-NEXT:    ldp x9, x10, [x0, #32]
+; CHECK-NEXT:    ccmp x9, x8, #0, eq
+; CHECK-NEXT:    ccmp x10, x11, #0, eq
+; CHECK-NEXT:    ldp x9, x10, [x0, #48]
+; CHECK-NEXT:    ldp x8, x11, [x1, #48]
+; CHECK-NEXT:    ccmp x9, x8, #0, eq
+; CHECK-NEXT:    ccmp x10, x11, #0, eq
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %cr = call i32 @bcmp(ptr %a, ptr %b, i64 64)


        

