[llvm] 63a4638 - Recommit [AArch64] Optimize memcmp when the result is tested for [in]equality with 0

via llvm-commits <llvm-commits at lists.llvm.org>
Sat Oct 29 11:05:01 PDT 2022


Author: zhongyunde
Date: 2022-10-30T02:04:02+08:00
New Revision: 63a46385f2c6dd39cf68d9811548c53e8d460cd9

URL: https://github.com/llvm/llvm-project/commit/63a46385f2c6dd39cf68d9811548c53e8d460cd9
DIFF: https://github.com/llvm/llvm-project/commit/63a46385f2c6dd39cf68d9811548c53e8d460cd9.diff

LOG: Recommit [AArch64] Optimize memcmp when the result is tested for [in]equality with 0

Fixes the first issue in https://github.com/llvm/llvm-project/issues/58061
Fixes the crash reported in https://github.com/llvm/llvm-project/issues/58675

Reviewed By: dmgreen, efriedma
Differential Revision: https://reviews.llvm.org/D136244

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
    llvm/test/CodeGen/AArch64/bcmp-inline-small.ll
    llvm/test/CodeGen/AArch64/bcmp.ll
    llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
    llvm/test/CodeGen/AArch64/i128-cmp.ll
    llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 3194f54aab702..a97a24c435510 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -19490,6 +19490,35 @@ static SDValue performSETCCCombine(SDNode *N,
     }
   }
 
+  // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
+  // cmp A0, A1; ccmp B0, B1, 0, eq; cset Cond
+  if (!DCI.isBeforeLegalize() && VT.isScalarInteger() &&
+      (Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
+      LHS->getOpcode() == ISD::OR &&
+      (LHS.getOperand(0)->getOpcode() == ISD::XOR &&
+       LHS.getOperand(1)->getOpcode() == ISD::XOR) &&
+      LHS.hasOneUse() && LHS.getOperand(0)->hasOneUse() &&
+      LHS.getOperand(1)->hasOneUse()) {
+    SDValue XOR0 = LHS.getOperand(0);
+    SDValue XOR1 = LHS.getOperand(1);
+    SDValue CCVal = DAG.getConstant(AArch64CC::EQ, DL, MVT_CC);
+    EVT TstVT = LHS->getValueType(0);
+    SDValue Cmp =
+        DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(TstVT, MVT::i32),
+                    XOR0.getOperand(0), XOR0.getOperand(1));
+    SDValue Overflow = Cmp.getValue(1);
+    SDValue NZCVOp = DAG.getConstant(0, DL, MVT::i32);
+    SDValue CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, XOR1.getOperand(0),
+                               XOR1.getOperand(1), NZCVOp, CCVal, Overflow);
+    // Invert CSEL's operands.
+    SDValue TVal = DAG.getConstant(1, DL, VT);
+    SDValue FVal = DAG.getConstant(0, DL, VT);
+    AArch64CC::CondCode CC = changeIntCCToAArch64CC(Cond);
+    AArch64CC::CondCode InvCC = AArch64CC::getInvertedCondCode(CC);
+    return DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal,
+                       DAG.getConstant(InvCC, DL, MVT::i32), CCmp);
+  }
+
   return SDValue();
 }
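For context: the DAG shape matched above arises when an equality test has
been split into two register-sized halves, e.g. an inlined memcmp()/bcmp()
of 16 bytes. A minimal C sketch of a trigger (illustrative only, not part
of this patch; the function name is made up):

    #include <string.h>

    /* Each buffer is loaded as two 64-bit halves; matching halves are
       XORed, the XORs are ORed, and the result is tested against zero --
       exactly "cmp 0 (or (xor A0 A1) (xor B0 B1))".  With this patch the
       test lowers to cmp + ccmp + cset instead of eor/eor/orr/cmp. */
    int equal16(const void *a, const void *b) {
      return memcmp(a, b, 16) == 0;
    }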
 

diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll b/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
index d16c8aaff1899..ec5f8e2524994 100644
--- a/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
@@ -216,38 +216,40 @@ define i128 @test_rmw_add_128(i128* %dst)   {
 ; NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; NOLSE-NEXT:    // Child Loop BB4_2 Depth 2
 ; NOLSE-NEXT:    ldr x11, [sp, #40] // 8-byte Folded Reload
-; NOLSE-NEXT:    ldr x8, [sp, #32] // 8-byte Folded Reload
-; NOLSE-NEXT:    ldr x13, [sp, #24] // 8-byte Folded Reload
-; NOLSE-NEXT:    adds x14, x8, #1
+; NOLSE-NEXT:    ldr x13, [sp, #32] // 8-byte Folded Reload
+; NOLSE-NEXT:    ldr x10, [sp, #24] // 8-byte Folded Reload
+; NOLSE-NEXT:    adds x14, x13, #1
 ; NOLSE-NEXT:    cinc x15, x11, hs
 ; NOLSE-NEXT:  .LBB4_2: // %atomicrmw.start
 ; NOLSE-NEXT:    // Parent Loop BB4_1 Depth=1
 ; NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT:    ldaxp x10, x9, [x13]
-; NOLSE-NEXT:    cmp x10, x8
-; NOLSE-NEXT:    cset w12, ne
-; NOLSE-NEXT:    cmp x9, x11
-; NOLSE-NEXT:    cinc w12, w12, ne
-; NOLSE-NEXT:    cbnz w12, .LBB4_4
+; NOLSE-NEXT:    ldaxp x12, x8, [x10]
+; NOLSE-NEXT:    cmp x12, x13
+; NOLSE-NEXT:    cset w9, ne
+; NOLSE-NEXT:    cmp x8, x11
+; NOLSE-NEXT:    cinc w9, w9, ne
+; NOLSE-NEXT:    cbnz w9, .LBB4_4
 ; NOLSE-NEXT:  // %bb.3: // %atomicrmw.start
 ; NOLSE-NEXT:    // in Loop: Header=BB4_2 Depth=2
-; NOLSE-NEXT:    stlxp w12, x14, x15, [x13]
-; NOLSE-NEXT:    cbnz w12, .LBB4_2
+; NOLSE-NEXT:    stlxp w9, x14, x15, [x10]
+; NOLSE-NEXT:    cbnz w9, .LBB4_2
 ; NOLSE-NEXT:    b .LBB4_5
 ; NOLSE-NEXT:  .LBB4_4: // %atomicrmw.start
 ; NOLSE-NEXT:    // in Loop: Header=BB4_2 Depth=2
-; NOLSE-NEXT:    stlxp w12, x10, x9, [x13]
-; NOLSE-NEXT:    cbnz w12, .LBB4_2
+; NOLSE-NEXT:    stlxp w9, x12, x8, [x10]
+; NOLSE-NEXT:    cbnz w9, .LBB4_2
 ; NOLSE-NEXT:  .LBB4_5: // %atomicrmw.start
 ; NOLSE-NEXT:    // in Loop: Header=BB4_1 Depth=1
-; NOLSE-NEXT:    eor x11, x9, x11
-; NOLSE-NEXT:    eor x8, x10, x8
-; NOLSE-NEXT:    orr x8, x8, x11
+; NOLSE-NEXT:    mov x9, x8
 ; NOLSE-NEXT:    str x9, [sp, #8] // 8-byte Folded Spill
+; NOLSE-NEXT:    mov x10, x12
 ; NOLSE-NEXT:    str x10, [sp, #16] // 8-byte Folded Spill
+; NOLSE-NEXT:    subs x12, x12, x13
+; NOLSE-NEXT:    ccmp x8, x11, #0, eq
+; NOLSE-NEXT:    cset w8, ne
 ; NOLSE-NEXT:    str x10, [sp, #32] // 8-byte Folded Spill
 ; NOLSE-NEXT:    str x9, [sp, #40] // 8-byte Folded Spill
-; NOLSE-NEXT:    cbnz x8, .LBB4_1
+; NOLSE-NEXT:    tbnz w8, #0, .LBB4_1
 ; NOLSE-NEXT:    b .LBB4_6
 ; NOLSE-NEXT:  .LBB4_6: // %atomicrmw.end
 ; NOLSE-NEXT:    ldr x1, [sp, #8] // 8-byte Folded Reload
@@ -267,26 +269,26 @@ define i128 @test_rmw_add_128(i128* %dst)   {
 ; LSE-NEXT:    b .LBB4_1
 ; LSE-NEXT:  .LBB4_1: // %atomicrmw.start
 ; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; LSE-NEXT:    ldr x10, [sp, #40] // 8-byte Folded Reload
-; LSE-NEXT:    ldr x8, [sp, #32] // 8-byte Folded Reload
+; LSE-NEXT:    ldr x8, [sp, #40] // 8-byte Folded Reload
+; LSE-NEXT:    ldr x11, [sp, #32] // 8-byte Folded Reload
 ; LSE-NEXT:    ldr x9, [sp, #24] // 8-byte Folded Reload
-; LSE-NEXT:    mov x0, x8
-; LSE-NEXT:    mov x1, x10
-; LSE-NEXT:    adds x2, x8, #1
-; LSE-NEXT:    cinc x11, x10, hs
+; LSE-NEXT:    mov x0, x11
+; LSE-NEXT:    mov x1, x8
+; LSE-NEXT:    adds x2, x11, #1
+; LSE-NEXT:    cinc x10, x8, hs
 ; LSE-NEXT:    // kill: def $x2 killed $x2 def $x2_x3
-; LSE-NEXT:    mov x3, x11
+; LSE-NEXT:    mov x3, x10
 ; LSE-NEXT:    caspal x0, x1, x2, x3, [x9]
 ; LSE-NEXT:    mov x9, x1
 ; LSE-NEXT:    str x9, [sp, #8] // 8-byte Folded Spill
-; LSE-NEXT:    eor x11, x9, x10
 ; LSE-NEXT:    mov x10, x0
 ; LSE-NEXT:    str x10, [sp, #16] // 8-byte Folded Spill
-; LSE-NEXT:    eor x8, x10, x8
-; LSE-NEXT:    orr x8, x8, x11
+; LSE-NEXT:    subs x11, x10, x11
+; LSE-NEXT:    ccmp x9, x8, #0, eq
+; LSE-NEXT:    cset w8, ne
 ; LSE-NEXT:    str x10, [sp, #32] // 8-byte Folded Spill
 ; LSE-NEXT:    str x9, [sp, #40] // 8-byte Folded Spill
-; LSE-NEXT:    cbnz x8, .LBB4_1
+; LSE-NEXT:    tbnz w8, #0, .LBB4_1
 ; LSE-NEXT:    b .LBB4_2
 ; LSE-NEXT:  .LBB4_2: // %atomicrmw.end
 ; LSE-NEXT:    ldr x1, [sp, #8] // 8-byte Folded Reload
@@ -606,42 +608,44 @@ define i128 @test_rmw_nand_128(i128* %dst)   {
 ; NOLSE-NEXT:    // =>This Loop Header: Depth=1
 ; NOLSE-NEXT:    // Child Loop BB9_2 Depth 2
 ; NOLSE-NEXT:    ldr x11, [sp, #40] // 8-byte Folded Reload
-; NOLSE-NEXT:    ldr x8, [sp, #32] // 8-byte Folded Reload
-; NOLSE-NEXT:    ldr x13, [sp, #24] // 8-byte Folded Reload
-; NOLSE-NEXT:    mov w9, w8
-; NOLSE-NEXT:    mvn w10, w9
-; NOLSE-NEXT:    // implicit-def: $x9
-; NOLSE-NEXT:    mov w9, w10
-; NOLSE-NEXT:    orr x14, x9, #0xfffffffffffffffe
+; NOLSE-NEXT:    ldr x13, [sp, #32] // 8-byte Folded Reload
+; NOLSE-NEXT:    ldr x10, [sp, #24] // 8-byte Folded Reload
+; NOLSE-NEXT:    mov w8, w13
+; NOLSE-NEXT:    mvn w9, w8
+; NOLSE-NEXT:    // implicit-def: $x8
+; NOLSE-NEXT:    mov w8, w9
+; NOLSE-NEXT:    orr x14, x8, #0xfffffffffffffffe
 ; NOLSE-NEXT:    mov x15, #-1
 ; NOLSE-NEXT:  .LBB9_2: // %atomicrmw.start
 ; NOLSE-NEXT:    // Parent Loop BB9_1 Depth=1
 ; NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT:    ldaxp x10, x9, [x13]
-; NOLSE-NEXT:    cmp x10, x8
-; NOLSE-NEXT:    cset w12, ne
-; NOLSE-NEXT:    cmp x9, x11
-; NOLSE-NEXT:    cinc w12, w12, ne
-; NOLSE-NEXT:    cbnz w12, .LBB9_4
+; NOLSE-NEXT:    ldaxp x12, x8, [x10]
+; NOLSE-NEXT:    cmp x12, x13
+; NOLSE-NEXT:    cset w9, ne
+; NOLSE-NEXT:    cmp x8, x11
+; NOLSE-NEXT:    cinc w9, w9, ne
+; NOLSE-NEXT:    cbnz w9, .LBB9_4
 ; NOLSE-NEXT:  // %bb.3: // %atomicrmw.start
 ; NOLSE-NEXT:    // in Loop: Header=BB9_2 Depth=2
-; NOLSE-NEXT:    stlxp w12, x14, x15, [x13]
-; NOLSE-NEXT:    cbnz w12, .LBB9_2
+; NOLSE-NEXT:    stlxp w9, x14, x15, [x10]
+; NOLSE-NEXT:    cbnz w9, .LBB9_2
 ; NOLSE-NEXT:    b .LBB9_5
 ; NOLSE-NEXT:  .LBB9_4: // %atomicrmw.start
 ; NOLSE-NEXT:    // in Loop: Header=BB9_2 Depth=2
-; NOLSE-NEXT:    stlxp w12, x10, x9, [x13]
-; NOLSE-NEXT:    cbnz w12, .LBB9_2
+; NOLSE-NEXT:    stlxp w9, x12, x8, [x10]
+; NOLSE-NEXT:    cbnz w9, .LBB9_2
 ; NOLSE-NEXT:  .LBB9_5: // %atomicrmw.start
 ; NOLSE-NEXT:    // in Loop: Header=BB9_1 Depth=1
-; NOLSE-NEXT:    eor x11, x9, x11
-; NOLSE-NEXT:    eor x8, x10, x8
-; NOLSE-NEXT:    orr x8, x8, x11
+; NOLSE-NEXT:    mov x9, x8
 ; NOLSE-NEXT:    str x9, [sp, #8] // 8-byte Folded Spill
+; NOLSE-NEXT:    mov x10, x12
 ; NOLSE-NEXT:    str x10, [sp, #16] // 8-byte Folded Spill
+; NOLSE-NEXT:    subs x12, x12, x13
+; NOLSE-NEXT:    ccmp x8, x11, #0, eq
+; NOLSE-NEXT:    cset w8, ne
 ; NOLSE-NEXT:    str x10, [sp, #32] // 8-byte Folded Spill
 ; NOLSE-NEXT:    str x9, [sp, #40] // 8-byte Folded Spill
-; NOLSE-NEXT:    cbnz x8, .LBB9_1
+; NOLSE-NEXT:    tbnz w8, #0, .LBB9_1
 ; NOLSE-NEXT:    b .LBB9_6
 ; NOLSE-NEXT:  .LBB9_6: // %atomicrmw.end
 ; NOLSE-NEXT:    ldr x1, [sp, #8] // 8-byte Folded Reload
@@ -661,30 +665,30 @@ define i128 @test_rmw_nand_128(i128* %dst)   {
 ; LSE-NEXT:    b .LBB9_1
 ; LSE-NEXT:  .LBB9_1: // %atomicrmw.start
 ; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; LSE-NEXT:    ldr x10, [sp, #40] // 8-byte Folded Reload
-; LSE-NEXT:    ldr x8, [sp, #32] // 8-byte Folded Reload
+; LSE-NEXT:    ldr x8, [sp, #40] // 8-byte Folded Reload
+; LSE-NEXT:    ldr x11, [sp, #32] // 8-byte Folded Reload
 ; LSE-NEXT:    ldr x9, [sp, #24] // 8-byte Folded Reload
-; LSE-NEXT:    mov x0, x8
-; LSE-NEXT:    mov x1, x10
-; LSE-NEXT:    mov w11, w8
-; LSE-NEXT:    mvn w12, w11
-; LSE-NEXT:    // implicit-def: $x11
-; LSE-NEXT:    mov w11, w12
-; LSE-NEXT:    orr x2, x11, #0xfffffffffffffffe
-; LSE-NEXT:    mov x11, #-1
+; LSE-NEXT:    mov x0, x11
+; LSE-NEXT:    mov x1, x8
+; LSE-NEXT:    mov w10, w11
+; LSE-NEXT:    mvn w12, w10
+; LSE-NEXT:    // implicit-def: $x10
+; LSE-NEXT:    mov w10, w12
+; LSE-NEXT:    orr x2, x10, #0xfffffffffffffffe
+; LSE-NEXT:    mov x10, #-1
 ; LSE-NEXT:    // kill: def $x2 killed $x2 def $x2_x3
-; LSE-NEXT:    mov x3, x11
+; LSE-NEXT:    mov x3, x10
 ; LSE-NEXT:    caspal x0, x1, x2, x3, [x9]
 ; LSE-NEXT:    mov x9, x1
 ; LSE-NEXT:    str x9, [sp, #8] // 8-byte Folded Spill
-; LSE-NEXT:    eor x11, x9, x10
 ; LSE-NEXT:    mov x10, x0
 ; LSE-NEXT:    str x10, [sp, #16] // 8-byte Folded Spill
-; LSE-NEXT:    eor x8, x10, x8
-; LSE-NEXT:    orr x8, x8, x11
+; LSE-NEXT:    subs x11, x10, x11
+; LSE-NEXT:    ccmp x9, x8, #0, eq
+; LSE-NEXT:    cset w8, ne
 ; LSE-NEXT:    str x10, [sp, #32] // 8-byte Folded Spill
 ; LSE-NEXT:    str x9, [sp, #40] // 8-byte Folded Spill
-; LSE-NEXT:    cbnz x8, .LBB9_1
+; LSE-NEXT:    tbnz w8, #0, .LBB9_1
 ; LSE-NEXT:    b .LBB9_2
 ; LSE-NEXT:  .LBB9_2: // %atomicrmw.end
 ; LSE-NEXT:    ldr x1, [sp, #8] // 8-byte Folded Reload
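The atomicrmw test updates above are the same combine firing on the i128
success check inside the compare-and-swap loop that atomic expansion
produces. A hedged C11 sketch of the loop shape (names are illustrative;
this is not the test's IR):

    #include <stdatomic.h>

    /* atomicrmw add on a 128-bit value expands to a CAS loop; the
       "did we observe the expected value" test compares two 64-bit
       register pairs, which now lowers to subs + ccmp + cset and is
       branched on with tbnz in the checks above. */
    __int128 rmw_add_128(_Atomic __int128 *p) {
      __int128 old = atomic_load(p);
      while (!atomic_compare_exchange_weak(p, &old, old + 1))
        ; /* retry with the freshly observed value in old */
      return old;
    }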

diff --git a/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll b/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll
index 8a2429b064adc..60b0c37cb8535 100644
--- a/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll
+++ b/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll
@@ -12,10 +12,8 @@ define i1 @test_b2(i8* %s1, i8* %s2) {
 ; CHECKN-NEXT:    ldr x9, [x1]
 ; CHECKN-NEXT:    ldur x10, [x0, #7]
 ; CHECKN-NEXT:    ldur x11, [x1, #7]
-; CHECKN-NEXT:    eor x8, x8, x9
-; CHECKN-NEXT:    eor x9, x10, x11
-; CHECKN-NEXT:    orr x8, x8, x9
-; CHECKN-NEXT:    cmp x8, #0
+; CHECKN-NEXT:    cmp x8, x9
+; CHECKN-NEXT:    ccmp x10, x11, #0, eq
 ; CHECKN-NEXT:    cset w0, eq
 ; CHECKN-NEXT:    ret
 ;
@@ -44,10 +42,8 @@ define i1 @test_b2_align8(i8* align 8 %s1, i8* align 8 %s2) {
 ; CHECKN-NEXT:    ldr x9, [x1]
 ; CHECKN-NEXT:    ldur x10, [x0, #7]
 ; CHECKN-NEXT:    ldur x11, [x1, #7]
-; CHECKN-NEXT:    eor x8, x8, x9
-; CHECKN-NEXT:    eor x9, x10, x11
-; CHECKN-NEXT:    orr x8, x8, x9
-; CHECKN-NEXT:    cmp x8, #0
+; CHECKN-NEXT:    cmp x8, x9
+; CHECKN-NEXT:    ccmp x10, x11, #0, eq
 ; CHECKN-NEXT:    cset w0, eq
 ; CHECKN-NEXT:    ret
 ;

diff --git a/llvm/test/CodeGen/AArch64/bcmp.ll b/llvm/test/CodeGen/AArch64/bcmp.ll
index ff94642857e63..510c64ee1111a 100644
--- a/llvm/test/CodeGen/AArch64/bcmp.ll
+++ b/llvm/test/CodeGen/AArch64/bcmp.ll
@@ -113,10 +113,8 @@ define i1 @bcmp7(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr w9, [x1]
 ; CHECK-NEXT:    ldur w10, [x0, #3]
 ; CHECK-NEXT:    ldur w11, [x1, #3]
-; CHECK-NEXT:    eor w8, w8, w9
-; CHECK-NEXT:    eor w9, w10, w11
-; CHECK-NEXT:    orr w8, w8, w9
-; CHECK-NEXT:    cmp w8, #0
+; CHECK-NEXT:    cmp w8, w9
+; CHECK-NEXT:    ccmp w10, w11, #0, eq
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %cr = call i32 @bcmp(ptr %a, ptr %b, i64 7)
@@ -182,10 +180,8 @@ define i1 @bcmp11(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr x9, [x1]
 ; CHECK-NEXT:    ldur x10, [x0, #3]
 ; CHECK-NEXT:    ldur x11, [x1, #3]
-; CHECK-NEXT:    eor x8, x8, x9
-; CHECK-NEXT:    eor x9, x10, x11
-; CHECK-NEXT:    orr x8, x8, x9
-; CHECK-NEXT:    cmp x8, #0
+; CHECK-NEXT:    cmp x8, x9
+; CHECK-NEXT:    ccmp x10, x11, #0, eq
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %cr = call i32 @bcmp(ptr %a, ptr %b, i64 11)
@@ -218,10 +214,8 @@ define i1 @bcmp13(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr x9, [x1]
 ; CHECK-NEXT:    ldur x10, [x0, #5]
 ; CHECK-NEXT:    ldur x11, [x1, #5]
-; CHECK-NEXT:    eor x8, x8, x9
-; CHECK-NEXT:    eor x9, x10, x11
-; CHECK-NEXT:    orr x8, x8, x9
-; CHECK-NEXT:    cmp x8, #0
+; CHECK-NEXT:    cmp x8, x9
+; CHECK-NEXT:    ccmp x10, x11, #0, eq
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %cr = call i32 @bcmp(ptr %a, ptr %b, i64 13)
@@ -236,10 +230,8 @@ define i1 @bcmp14(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr x9, [x1]
 ; CHECK-NEXT:    ldur x10, [x0, #6]
 ; CHECK-NEXT:    ldur x11, [x1, #6]
-; CHECK-NEXT:    eor x8, x8, x9
-; CHECK-NEXT:    eor x9, x10, x11
-; CHECK-NEXT:    orr x8, x8, x9
-; CHECK-NEXT:    cmp x8, #0
+; CHECK-NEXT:    cmp x8, x9
+; CHECK-NEXT:    ccmp x10, x11, #0, eq
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %cr = call i32 @bcmp(ptr %a, ptr %b, i64 14)
@@ -254,10 +246,8 @@ define i1 @bcmp15(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ldr x9, [x1]
 ; CHECK-NEXT:    ldur x10, [x0, #7]
 ; CHECK-NEXT:    ldur x11, [x1, #7]
-; CHECK-NEXT:    eor x8, x8, x9
-; CHECK-NEXT:    eor x9, x10, x11
-; CHECK-NEXT:    orr x8, x8, x9
-; CHECK-NEXT:    cmp x8, #0
+; CHECK-NEXT:    cmp x8, x9
+; CHECK-NEXT:    ccmp x10, x11, #0, eq
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %cr = call i32 @bcmp(ptr %a, ptr %b, i64 15)
@@ -270,10 +260,8 @@ define i1 @bcmp16(ptr %a, ptr %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp x8, x9, [x0]
 ; CHECK-NEXT:    ldp x10, x11, [x1]
-; CHECK-NEXT:    eor x8, x8, x10
-; CHECK-NEXT:    eor x9, x9, x11
-; CHECK-NEXT:    orr x8, x8, x9
-; CHECK-NEXT:    cmp x8, #0
+; CHECK-NEXT:    cmp x8, x10
+; CHECK-NEXT:    ccmp x9, x11, #0, eq
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %cr = call i32 @bcmp(ptr %a, ptr %b, i64 16)
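For the odd sizes above (7, 11, 13, 14 and 15 bytes) the expansion covers
the buffer with two overlapping loads, so one cmp/ccmp pair still decides
equality. A sketch of the 7-byte case in C (the helper name is made up):

    #include <stdint.h>
    #include <string.h>

    /* bcmp(a, b, 7) == 0 via two overlapping 4-byte loads: bytes 0-3
       (the ldr) and bytes 3-6 (the ldur), mirroring the CHECK lines in
       bcmp7 above. */
    int bcmp7_eq(const unsigned char *a, const unsigned char *b) {
      uint32_t a0, a1, b0, b1;
      memcpy(&a0, a, 4);
      memcpy(&b0, b, 4);
      memcpy(&a1, a + 3, 4);
      memcpy(&b1, b + 3, 4);
      return (a0 == b0) & (a1 == b1);
    }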

diff --git a/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll b/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
index f826a80940468..40794ee1627e2 100644
--- a/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
+++ b/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
@@ -128,3 +128,123 @@ define i1 @combine_setcc_ne_vecreduce_or_v64i1(<64 x i8> %a) {
   %cmp2 = icmp ne i64 %cast, zeroinitializer
   ret i1 %cmp2
 }
+
+define i1 @combine_setcc_eq0_conjunction_xor_or(ptr %a, ptr %b) {
+; CHECK-LABEL: combine_setcc_eq0_conjunction_xor_or:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp x8, x9, [x0]
+; CHECK-NEXT:    ldp x10, x11, [x1]
+; CHECK-NEXT:    cmp x8, x10
+; CHECK-NEXT:    ccmp x9, x11, #0, eq
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %bcmp = tail call i32 @bcmp(ptr dereferenceable(16) %a, ptr dereferenceable(16) %b, i64 16)
+  %cmp = icmp eq i32 %bcmp, 0
+  ret i1 %cmp
+}
+
+define i1 @combine_setcc_ne0_conjunction_xor_or(ptr %a, ptr %b) {
+; CHECK-LABEL: combine_setcc_ne0_conjunction_xor_or:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp x8, x9, [x0]
+; CHECK-NEXT:    ldp x10, x11, [x1]
+; CHECK-NEXT:    cmp x8, x10
+; CHECK-NEXT:    ccmp x9, x11, #0, eq
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+  %bcmp = tail call i32 @bcmp(ptr dereferenceable(16) %a, ptr dereferenceable(16) %b, i64 16)
+  %cmp = icmp ne i32 %bcmp, 0
+  ret i1 %cmp
+}
+
+; Don't apply the combine when the LHS has multiple uses; it would increase the instruction count
+define i32 @combine_setcc_multiuse(i32 %0, i32 %1, i32 %2, i32 %3) {
+; CHECK-LABEL: combine_setcc_multiuse:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor w8, w1, w0
+; CHECK-NEXT:    eor w9, w3, w2
+; CHECK-NEXT:    orr w8, w9, w8
+; CHECK-NEXT:    cbz w8, .LBB10_2
+; CHECK-NEXT:  // %bb.1:
+; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    b use
+; CHECK-NEXT:  .LBB10_2:
+; CHECK-NEXT:    ret
+  %5 = xor i32 %1, %0
+  %6 = xor i32 %3, %2
+  %7 = or i32 %6, %5
+  %8 = icmp eq i32 %7, 0
+  br i1 %8, label %11, label %9
+
+9:                                                ; preds = %4
+  %10 = tail call i32 @use(i32 %7) #2
+  br label %11
+
+11:                                               ; preds = %4, %9
+  %12 = phi i32 [ %10, %9 ], [ %0, %4 ]
+  ret i32 %12
+}
+
+; There may be scheduling issues between the CMP/CCMP pair and the other
+; instructions that ISel creates out of the DAG
+define i32 @combine_setcc_glue(i128 noundef %x, i128 noundef %y) {
+; CHECK-LABEL: combine_setcc_glue:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmp x0, x2
+; CHECK-NEXT:    cset w8, eq
+; CHECK-NEXT:    ccmp x1, x3, #0, eq
+; CHECK-NEXT:    cset w9, eq
+; CHECK-NEXT:    orr w0, w9, w8
+; CHECK-NEXT:    ret
+entry:
+  %cmp3 = icmp eq i128 %x, %y
+  %conv = trunc i128 %x to i64
+  %conv1 = trunc i128 %y to i64
+  %cmp = icmp eq i64 %conv, %conv1
+  %or7 = or i1 %cmp3, %cmp
+  %or = zext i1 %or7 to i32
+  ret i32 %or
+}
+
+; Reduced test from https://github.com/llvm/llvm-project/issues/58675
+define [2 x i64] @PR58675(i128 %a.addr, i128 %b.addr) {
+; CHECK-LABEL: PR58675:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    mov x9, xzr
+; CHECK-NEXT:  .LBB12_1: // %do.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    cmp x0, x8
+; CHECK-NEXT:    csel x10, x0, x8, lo
+; CHECK-NEXT:    cmp x1, x9
+; CHECK-NEXT:    csel x8, x0, x8, lo
+; CHECK-NEXT:    csel x8, x10, x8, eq
+; CHECK-NEXT:    csel x10, x1, x9, lo
+; CHECK-NEXT:    subs x8, x2, x8
+; CHECK-NEXT:    sbc x9, x3, x10
+; CHECK-NEXT:    ccmp x3, x10, #0, eq
+; CHECK-NEXT:    b.ne .LBB12_1
+; CHECK-NEXT:  // %bb.2: // %do.end
+; CHECK-NEXT:    mov x0, xzr
+; CHECK-NEXT:    mov x1, xzr
+; CHECK-NEXT:    ret
+entry:
+  br label %do.body
+
+do.body:                                      ; preds = %do.body, %entry
+  %a.addr.i1 = phi i128 [ 1, %do.body ], [ 0, %entry ]
+  %b.addr.i2 = phi i128 [ %sub, %do.body ], [ 0, %entry ]
+  %0 = tail call i128 @llvm.umin.i128(i128 %a.addr, i128 %b.addr.i2)
+  %1 = tail call i128 @llvm.umax.i128(i128 0, i128 %a.addr)
+  %sub = sub i128 %b.addr, %0
+  %cmp18.not = icmp eq i128 %b.addr, %0
+  br i1 %cmp18.not, label %do.end, label %do.body
+
+do.end:                                       ; preds = %do.body
+  ret [2 x i64] zeroinitializer
+}
+
+declare i128 @llvm.umin.i128(i128, i128)
+declare i128 @llvm.umax.i128(i128, i128)
+declare i32 @bcmp(ptr nocapture, ptr nocapture, i64)
+declare i32 @use(i32 noundef)

diff --git a/llvm/test/CodeGen/AArch64/i128-cmp.ll b/llvm/test/CodeGen/AArch64/i128-cmp.ll
index 7cc3e843ba247..b50a559434302 100644
--- a/llvm/test/CodeGen/AArch64/i128-cmp.ll
+++ b/llvm/test/CodeGen/AArch64/i128-cmp.ll
@@ -6,10 +6,8 @@ declare void @call()
 define i1 @cmp_i128_eq(i128 %a, i128 %b) {
 ; CHECK-LABEL: cmp_i128_eq:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    eor x8, x1, x3
-; CHECK-NEXT:    eor x9, x0, x2
-; CHECK-NEXT:    orr x8, x9, x8
-; CHECK-NEXT:    cmp x8, #0
+; CHECK-NEXT:    cmp x0, x2
+; CHECK-NEXT:    ccmp x1, x3, #0, eq
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
     %cmp = icmp eq i128 %a, %b
@@ -19,10 +17,8 @@ define i1 @cmp_i128_eq(i128 %a, i128 %b) {
 define i1 @cmp_i128_ne(i128 %a, i128 %b) {
 ; CHECK-LABEL: cmp_i128_ne:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    eor x8, x1, x3
-; CHECK-NEXT:    eor x9, x0, x2
-; CHECK-NEXT:    orr x8, x9, x8
-; CHECK-NEXT:    cmp x8, #0
+; CHECK-NEXT:    cmp x0, x2
+; CHECK-NEXT:    ccmp x1, x3, #0, eq
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
     %cmp = icmp ne i128 %a, %b
@@ -120,10 +116,9 @@ define i1 @cmp_i128_sle(i128 %a, i128 %b) {
 define void @br_on_cmp_i128_eq(i128 %a, i128 %b) nounwind {
 ; CHECK-LABEL: br_on_cmp_i128_eq:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    eor x8, x1, x3
-; CHECK-NEXT:    eor x9, x0, x2
-; CHECK-NEXT:    orr x8, x9, x8
-; CHECK-NEXT:    cbnz x8, .LBB10_2
+; CHECK-NEXT:    cmp x0, x2
+; CHECK-NEXT:    ccmp x1, x3, #0, eq
+; CHECK-NEXT:    b.ne .LBB10_2
 ; CHECK-NEXT:  // %bb.1: // %call
 ; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    bl call
@@ -142,10 +137,9 @@ exit:
 define void @br_on_cmp_i128_ne(i128 %a, i128 %b) nounwind {
 ; CHECK-LABEL: br_on_cmp_i128_ne:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    eor x8, x1, x3
-; CHECK-NEXT:    eor x9, x0, x2
-; CHECK-NEXT:    orr x8, x9, x8
-; CHECK-NEXT:    cbz x8, .LBB11_2
+; CHECK-NEXT:    cmp x0, x2
+; CHECK-NEXT:    ccmp x1, x3, #0, eq
+; CHECK-NEXT:    b.eq .LBB11_2
 ; CHECK-NEXT:  // %bb.1: // %call
 ; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    bl call
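The i128 compares are the simplest instance of the pattern: equality holds
iff both 64-bit halves match. An illustrative C example (relies on the
Clang/GCC __int128 extension):

    /* With this patch the comparison compiles to
       cmp x0, x2; ccmp x1, x3, #0, eq; cset w0, eq
       rather than eor/eor/orr/cmp. */
    int eq128(unsigned __int128 a, unsigned __int128 b) {
      return a == b;
    }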

diff --git a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
index e955014371525..e298748e8ec26 100644
--- a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
@@ -68,12 +68,10 @@ define i128 @__muloti4(i128 %0, i128 %1, i32* nocapture nonnull writeonly align
 ; AARCH-NEXT:    adds x11, x12, x11
 ; AARCH-NEXT:    adc x12, x13, x14
 ; AARCH-NEXT:    adds x10, x11, x10
-; AARCH-NEXT:    adc x9, x12, x9
 ; AARCH-NEXT:    asr x11, x1, #63
-; AARCH-NEXT:    eor x9, x9, x11
-; AARCH-NEXT:    eor x10, x10, x11
-; AARCH-NEXT:    orr x9, x10, x9
-; AARCH-NEXT:    cmp x9, #0
+; AARCH-NEXT:    adc x9, x12, x9
+; AARCH-NEXT:    cmp x10, x11
+; AARCH-NEXT:    ccmp x9, x11, #0, eq
 ; AARCH-NEXT:    cset w9, ne
 ; AARCH-NEXT:    tbz x8, #63, .LBB1_2
 ; AARCH-NEXT:  // %bb.1: // %Entry


        

