[llvm] 63a4638 - Recommit [AArch64] Optimize memcmp when the result is tested for [in]equality with 0
Author: zhongyunde
Date: 2022-10-30T02:04:02+08:00
New Revision: 63a46385f2c6dd39cf68d9811548c53e8d460cd9
URL: https://github.com/llvm/llvm-project/commit/63a46385f2c6dd39cf68d9811548c53e8d460cd9
DIFF: https://github.com/llvm/llvm-project/commit/63a46385f2c6dd39cf68d9811548c53e8d460cd9.diff
LOG: Recommit [AArch64] Optimize memcmp when the result is tested for [in]equality with 0
Fixes the first issue of https://github.com/llvm/llvm-project/issues/58061
Fixes the crash of https://github.com/llvm/llvm-project/issues/58675
Reviewed By: dmgreen, efriedma
Differential Revision: https://reviews.llvm.org/D136244
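For context, an illustrative sketch of the source pattern this combine targets (the function name and code below are hypothetical, not from the commit): a memcmp/bcmp whose result is only tested for [in]equality with zero, so only the flags matter:

    #include <cstring>
    // Hypothetical example: only the ==0 result of the memcmp is consumed.
    bool eq16(const void *a, const void *b) {
      return std::memcmp(a, b, 16) == 0;
    }

As the test updates below show, such a 16-byte equality check previously lowered to an eor/eor/orr/cmp sequence and now lowers to a cmp/ccmp pair followed by a single cset.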
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
llvm/test/CodeGen/AArch64/bcmp-inline-small.ll
llvm/test/CodeGen/AArch64/bcmp.ll
llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
llvm/test/CodeGen/AArch64/i128-cmp.ll
llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 3194f54aab702..a97a24c435510 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -19490,6 +19490,35 @@ static SDValue performSETCCCombine(SDNode *N,
}
}
+ // Try to express the conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
+ // cmp A0, A1; ccmp B0, B1, 0, eq; csel 0, 1, inv(Cond)
+ if (!DCI.isBeforeLegalize() && VT.isScalarInteger() &&
+ (Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
+ LHS->getOpcode() == ISD::OR &&
+ (LHS.getOperand(0)->getOpcode() == ISD::XOR &&
+ LHS.getOperand(1)->getOpcode() == ISD::XOR) &&
+ LHS.hasOneUse() && LHS.getOperand(0)->hasOneUse() &&
+ LHS.getOperand(1)->hasOneUse()) {
+ SDValue XOR0 = LHS.getOperand(0);
+ SDValue XOR1 = LHS.getOperand(1);
+ SDValue CCVal = DAG.getConstant(AArch64CC::EQ, DL, MVT_CC);
+ EVT TstVT = LHS->getValueType(0);
+ SDValue Cmp =
+ DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(TstVT, MVT::i32),
+ XOR0.getOperand(0), XOR0.getOperand(1));
+ SDValue Overflow = Cmp.getValue(1);
+ SDValue NZCVOp = DAG.getConstant(0, DL, MVT::i32);
+ SDValue CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, XOR1.getOperand(0),
+ XOR1.getOperand(1), NZCVOp, CCVal, Overflow);
+ // Invert CSEL's operands.
+ SDValue TVal = DAG.getConstant(1, DL, VT);
+ SDValue FVal = DAG.getConstant(0, DL, VT);
+ AArch64CC::CondCode CC = changeIntCCToAArch64CC(Cond);
+ AArch64CC::CondCode InvCC = AArch64CC::getInvertedCondCode(CC);
+ return DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal,
+ DAG.getConstant(InvCC, DL, MVT::i32), CCmp);
+ }
+
return SDValue();
}
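Why the replacement sequence computes the same predicate: ccmp performs its comparison only when the preceding condition held, and otherwise sets NZCV to its immediate operand (#0 here, in which Z is clear, i.e. "ne"). A minimal C++ model of "cmp A0, A1; ccmp B0, B1, #0, eq; cset eq" under that reading (an illustration of the AArch64 flag behaviour, not part of the patch):

    #include <cstdint>
    // Models the flag behaviour of the emitted cmp/ccmp/cset triple.
    bool both_equal(uint64_t A0, uint64_t A1, uint64_t B0, uint64_t B1) {
      bool Z = (A0 == A1); // cmp A0, A1: Z is set iff A0 == A1
      if (Z)
        Z = (B0 == B1);    // ccmp B0, B1, #0, eq: "eq" held, so compare
      else
        Z = false;         // "eq" failed: NZCV = #0 leaves Z clear
      return Z;            // cset eq: 1 iff Z is set
    }

This agrees with the original test, since (A0 ^ A1) | (B0 ^ B1) is zero exactly when both pairs are equal.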
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll b/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
index d16c8aaff1899..ec5f8e2524994 100644
--- a/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
@@ -216,38 +216,40 @@ define i128 @test_rmw_add_128(i128* %dst) {
; NOLSE-NEXT: // =>This Loop Header: Depth=1
; NOLSE-NEXT: // Child Loop BB4_2 Depth 2
; NOLSE-NEXT: ldr x11, [sp, #40] // 8-byte Folded Reload
-; NOLSE-NEXT: ldr x8, [sp, #32] // 8-byte Folded Reload
-; NOLSE-NEXT: ldr x13, [sp, #24] // 8-byte Folded Reload
-; NOLSE-NEXT: adds x14, x8, #1
+; NOLSE-NEXT: ldr x13, [sp, #32] // 8-byte Folded Reload
+; NOLSE-NEXT: ldr x10, [sp, #24] // 8-byte Folded Reload
+; NOLSE-NEXT: adds x14, x13, #1
; NOLSE-NEXT: cinc x15, x11, hs
; NOLSE-NEXT: .LBB4_2: // %atomicrmw.start
; NOLSE-NEXT: // Parent Loop BB4_1 Depth=1
; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT: ldaxp x10, x9, [x13]
-; NOLSE-NEXT: cmp x10, x8
-; NOLSE-NEXT: cset w12, ne
-; NOLSE-NEXT: cmp x9, x11
-; NOLSE-NEXT: cinc w12, w12, ne
-; NOLSE-NEXT: cbnz w12, .LBB4_4
+; NOLSE-NEXT: ldaxp x12, x8, [x10]
+; NOLSE-NEXT: cmp x12, x13
+; NOLSE-NEXT: cset w9, ne
+; NOLSE-NEXT: cmp x8, x11
+; NOLSE-NEXT: cinc w9, w9, ne
+; NOLSE-NEXT: cbnz w9, .LBB4_4
; NOLSE-NEXT: // %bb.3: // %atomicrmw.start
; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=2
-; NOLSE-NEXT: stlxp w12, x14, x15, [x13]
-; NOLSE-NEXT: cbnz w12, .LBB4_2
+; NOLSE-NEXT: stlxp w9, x14, x15, [x10]
+; NOLSE-NEXT: cbnz w9, .LBB4_2
; NOLSE-NEXT: b .LBB4_5
; NOLSE-NEXT: .LBB4_4: // %atomicrmw.start
; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=2
-; NOLSE-NEXT: stlxp w12, x10, x9, [x13]
-; NOLSE-NEXT: cbnz w12, .LBB4_2
+; NOLSE-NEXT: stlxp w9, x12, x8, [x10]
+; NOLSE-NEXT: cbnz w9, .LBB4_2
; NOLSE-NEXT: .LBB4_5: // %atomicrmw.start
; NOLSE-NEXT: // in Loop: Header=BB4_1 Depth=1
-; NOLSE-NEXT: eor x11, x9, x11
-; NOLSE-NEXT: eor x8, x10, x8
-; NOLSE-NEXT: orr x8, x8, x11
+; NOLSE-NEXT: mov x9, x8
; NOLSE-NEXT: str x9, [sp, #8] // 8-byte Folded Spill
+; NOLSE-NEXT: mov x10, x12
; NOLSE-NEXT: str x10, [sp, #16] // 8-byte Folded Spill
+; NOLSE-NEXT: subs x12, x12, x13
+; NOLSE-NEXT: ccmp x8, x11, #0, eq
+; NOLSE-NEXT: cset w8, ne
; NOLSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill
; NOLSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill
-; NOLSE-NEXT: cbnz x8, .LBB4_1
+; NOLSE-NEXT: tbnz w8, #0, .LBB4_1
; NOLSE-NEXT: b .LBB4_6
; NOLSE-NEXT: .LBB4_6: // %atomicrmw.end
; NOLSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload
@@ -267,26 +269,26 @@ define i128 @test_rmw_add_128(i128* %dst) {
; LSE-NEXT: b .LBB4_1
; LSE-NEXT: .LBB4_1: // %atomicrmw.start
; LSE-NEXT: // =>This Inner Loop Header: Depth=1
-; LSE-NEXT: ldr x10, [sp, #40] // 8-byte Folded Reload
-; LSE-NEXT: ldr x8, [sp, #32] // 8-byte Folded Reload
+; LSE-NEXT: ldr x8, [sp, #40] // 8-byte Folded Reload
+; LSE-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload
; LSE-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload
-; LSE-NEXT: mov x0, x8
-; LSE-NEXT: mov x1, x10
-; LSE-NEXT: adds x2, x8, #1
-; LSE-NEXT: cinc x11, x10, hs
+; LSE-NEXT: mov x0, x11
+; LSE-NEXT: mov x1, x8
+; LSE-NEXT: adds x2, x11, #1
+; LSE-NEXT: cinc x10, x8, hs
; LSE-NEXT: // kill: def $x2 killed $x2 def $x2_x3
-; LSE-NEXT: mov x3, x11
+; LSE-NEXT: mov x3, x10
; LSE-NEXT: caspal x0, x1, x2, x3, [x9]
; LSE-NEXT: mov x9, x1
; LSE-NEXT: str x9, [sp, #8] // 8-byte Folded Spill
-; LSE-NEXT: eor x11, x9, x10
; LSE-NEXT: mov x10, x0
; LSE-NEXT: str x10, [sp, #16] // 8-byte Folded Spill
-; LSE-NEXT: eor x8, x10, x8
-; LSE-NEXT: orr x8, x8, x11
+; LSE-NEXT: subs x11, x10, x11
+; LSE-NEXT: ccmp x9, x8, #0, eq
+; LSE-NEXT: cset w8, ne
; LSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill
; LSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill
-; LSE-NEXT: cbnz x8, .LBB4_1
+; LSE-NEXT: tbnz w8, #0, .LBB4_1
; LSE-NEXT: b .LBB4_2
; LSE-NEXT: .LBB4_2: // %atomicrmw.end
; LSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload
@@ -606,42 +608,44 @@ define i128 @test_rmw_nand_128(i128* %dst) {
; NOLSE-NEXT: // =>This Loop Header: Depth=1
; NOLSE-NEXT: // Child Loop BB9_2 Depth 2
; NOLSE-NEXT: ldr x11, [sp, #40] // 8-byte Folded Reload
-; NOLSE-NEXT: ldr x8, [sp, #32] // 8-byte Folded Reload
-; NOLSE-NEXT: ldr x13, [sp, #24] // 8-byte Folded Reload
-; NOLSE-NEXT: mov w9, w8
-; NOLSE-NEXT: mvn w10, w9
-; NOLSE-NEXT: // implicit-def: $x9
-; NOLSE-NEXT: mov w9, w10
-; NOLSE-NEXT: orr x14, x9, #0xfffffffffffffffe
+; NOLSE-NEXT: ldr x13, [sp, #32] // 8-byte Folded Reload
+; NOLSE-NEXT: ldr x10, [sp, #24] // 8-byte Folded Reload
+; NOLSE-NEXT: mov w8, w13
+; NOLSE-NEXT: mvn w9, w8
+; NOLSE-NEXT: // implicit-def: $x8
+; NOLSE-NEXT: mov w8, w9
+; NOLSE-NEXT: orr x14, x8, #0xfffffffffffffffe
; NOLSE-NEXT: mov x15, #-1
; NOLSE-NEXT: .LBB9_2: // %atomicrmw.start
; NOLSE-NEXT: // Parent Loop BB9_1 Depth=1
; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT: ldaxp x10, x9, [x13]
-; NOLSE-NEXT: cmp x10, x8
-; NOLSE-NEXT: cset w12, ne
-; NOLSE-NEXT: cmp x9, x11
-; NOLSE-NEXT: cinc w12, w12, ne
-; NOLSE-NEXT: cbnz w12, .LBB9_4
+; NOLSE-NEXT: ldaxp x12, x8, [x10]
+; NOLSE-NEXT: cmp x12, x13
+; NOLSE-NEXT: cset w9, ne
+; NOLSE-NEXT: cmp x8, x11
+; NOLSE-NEXT: cinc w9, w9, ne
+; NOLSE-NEXT: cbnz w9, .LBB9_4
; NOLSE-NEXT: // %bb.3: // %atomicrmw.start
; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=2
-; NOLSE-NEXT: stlxp w12, x14, x15, [x13]
-; NOLSE-NEXT: cbnz w12, .LBB9_2
+; NOLSE-NEXT: stlxp w9, x14, x15, [x10]
+; NOLSE-NEXT: cbnz w9, .LBB9_2
; NOLSE-NEXT: b .LBB9_5
; NOLSE-NEXT: .LBB9_4: // %atomicrmw.start
; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=2
-; NOLSE-NEXT: stlxp w12, x10, x9, [x13]
-; NOLSE-NEXT: cbnz w12, .LBB9_2
+; NOLSE-NEXT: stlxp w9, x12, x8, [x10]
+; NOLSE-NEXT: cbnz w9, .LBB9_2
; NOLSE-NEXT: .LBB9_5: // %atomicrmw.start
; NOLSE-NEXT: // in Loop: Header=BB9_1 Depth=1
-; NOLSE-NEXT: eor x11, x9, x11
-; NOLSE-NEXT: eor x8, x10, x8
-; NOLSE-NEXT: orr x8, x8, x11
+; NOLSE-NEXT: mov x9, x8
; NOLSE-NEXT: str x9, [sp, #8] // 8-byte Folded Spill
+; NOLSE-NEXT: mov x10, x12
; NOLSE-NEXT: str x10, [sp, #16] // 8-byte Folded Spill
+; NOLSE-NEXT: subs x12, x12, x13
+; NOLSE-NEXT: ccmp x8, x11, #0, eq
+; NOLSE-NEXT: cset w8, ne
; NOLSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill
; NOLSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill
-; NOLSE-NEXT: cbnz x8, .LBB9_1
+; NOLSE-NEXT: tbnz w8, #0, .LBB9_1
; NOLSE-NEXT: b .LBB9_6
; NOLSE-NEXT: .LBB9_6: // %atomicrmw.end
; NOLSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload
@@ -661,30 +665,30 @@ define i128 @test_rmw_nand_128(i128* %dst) {
; LSE-NEXT: b .LBB9_1
; LSE-NEXT: .LBB9_1: // %atomicrmw.start
; LSE-NEXT: // =>This Inner Loop Header: Depth=1
-; LSE-NEXT: ldr x10, [sp, #40] // 8-byte Folded Reload
-; LSE-NEXT: ldr x8, [sp, #32] // 8-byte Folded Reload
+; LSE-NEXT: ldr x8, [sp, #40] // 8-byte Folded Reload
+; LSE-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload
; LSE-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload
-; LSE-NEXT: mov x0, x8
-; LSE-NEXT: mov x1, x10
-; LSE-NEXT: mov w11, w8
-; LSE-NEXT: mvn w12, w11
-; LSE-NEXT: // implicit-def: $x11
-; LSE-NEXT: mov w11, w12
-; LSE-NEXT: orr x2, x11, #0xfffffffffffffffe
-; LSE-NEXT: mov x11, #-1
+; LSE-NEXT: mov x0, x11
+; LSE-NEXT: mov x1, x8
+; LSE-NEXT: mov w10, w11
+; LSE-NEXT: mvn w12, w10
+; LSE-NEXT: // implicit-def: $x10
+; LSE-NEXT: mov w10, w12
+; LSE-NEXT: orr x2, x10, #0xfffffffffffffffe
+; LSE-NEXT: mov x10, #-1
; LSE-NEXT: // kill: def $x2 killed $x2 def $x2_x3
-; LSE-NEXT: mov x3, x11
+; LSE-NEXT: mov x3, x10
; LSE-NEXT: caspal x0, x1, x2, x3, [x9]
; LSE-NEXT: mov x9, x1
; LSE-NEXT: str x9, [sp, #8] // 8-byte Folded Spill
-; LSE-NEXT: eor x11, x9, x10
; LSE-NEXT: mov x10, x0
; LSE-NEXT: str x10, [sp, #16] // 8-byte Folded Spill
-; LSE-NEXT: eor x8, x10, x8
-; LSE-NEXT: orr x8, x8, x11
+; LSE-NEXT: subs x11, x10, x11
+; LSE-NEXT: ccmp x9, x8, #0, eq
+; LSE-NEXT: cset w8, ne
; LSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill
; LSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill
-; LSE-NEXT: cbnz x8, .LBB9_1
+; LSE-NEXT: tbnz w8, #0, .LBB9_1
; LSE-NEXT: b .LBB9_2
; LSE-NEXT: .LBB9_2: // %atomicrmw.end
; LSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll b/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll
index 8a2429b064adc..60b0c37cb8535 100644
--- a/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll
+++ b/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll
@@ -12,10 +12,8 @@ define i1 @test_b2(i8* %s1, i8* %s2) {
; CHECKN-NEXT: ldr x9, [x1]
; CHECKN-NEXT: ldur x10, [x0, #7]
; CHECKN-NEXT: ldur x11, [x1, #7]
-; CHECKN-NEXT: eor x8, x8, x9
-; CHECKN-NEXT: eor x9, x10, x11
-; CHECKN-NEXT: orr x8, x8, x9
-; CHECKN-NEXT: cmp x8, #0
+; CHECKN-NEXT: cmp x8, x9
+; CHECKN-NEXT: ccmp x10, x11, #0, eq
; CHECKN-NEXT: cset w0, eq
; CHECKN-NEXT: ret
;
@@ -44,10 +42,8 @@ define i1 @test_b2_align8(i8* align 8 %s1, i8* align 8 %s2) {
; CHECKN-NEXT: ldr x9, [x1]
; CHECKN-NEXT: ldur x10, [x0, #7]
; CHECKN-NEXT: ldur x11, [x1, #7]
-; CHECKN-NEXT: eor x8, x8, x9
-; CHECKN-NEXT: eor x9, x10, x11
-; CHECKN-NEXT: orr x8, x8, x9
-; CHECKN-NEXT: cmp x8, #0
+; CHECKN-NEXT: cmp x8, x9
+; CHECKN-NEXT: ccmp x10, x11, #0, eq
; CHECKN-NEXT: cset w0, eq
; CHECKN-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/bcmp.ll b/llvm/test/CodeGen/AArch64/bcmp.ll
index ff94642857e63..510c64ee1111a 100644
--- a/llvm/test/CodeGen/AArch64/bcmp.ll
+++ b/llvm/test/CodeGen/AArch64/bcmp.ll
@@ -113,10 +113,8 @@ define i1 @bcmp7(ptr %a, ptr %b) {
; CHECK-NEXT: ldr w9, [x1]
; CHECK-NEXT: ldur w10, [x0, #3]
; CHECK-NEXT: ldur w11, [x1, #3]
-; CHECK-NEXT: eor w8, w8, w9
-; CHECK-NEXT: eor w9, w10, w11
-; CHECK-NEXT: orr w8, w8, w9
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: ccmp w10, w11, #0, eq
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%cr = call i32 @bcmp(ptr %a, ptr %b, i64 7)
@@ -182,10 +180,8 @@ define i1 @bcmp11(ptr %a, ptr %b) {
; CHECK-NEXT: ldr x9, [x1]
; CHECK-NEXT: ldur x10, [x0, #3]
; CHECK-NEXT: ldur x11, [x1, #3]
-; CHECK-NEXT: eor x8, x8, x9
-; CHECK-NEXT: eor x9, x10, x11
-; CHECK-NEXT: orr x8, x8, x9
-; CHECK-NEXT: cmp x8, #0
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: ccmp x10, x11, #0, eq
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%cr = call i32 @bcmp(ptr %a, ptr %b, i64 11)
@@ -218,10 +214,8 @@ define i1 @bcmp13(ptr %a, ptr %b) {
; CHECK-NEXT: ldr x9, [x1]
; CHECK-NEXT: ldur x10, [x0, #5]
; CHECK-NEXT: ldur x11, [x1, #5]
-; CHECK-NEXT: eor x8, x8, x9
-; CHECK-NEXT: eor x9, x10, x11
-; CHECK-NEXT: orr x8, x8, x9
-; CHECK-NEXT: cmp x8, #0
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: ccmp x10, x11, #0, eq
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%cr = call i32 @bcmp(ptr %a, ptr %b, i64 13)
@@ -236,10 +230,8 @@ define i1 @bcmp14(ptr %a, ptr %b) {
; CHECK-NEXT: ldr x9, [x1]
; CHECK-NEXT: ldur x10, [x0, #6]
; CHECK-NEXT: ldur x11, [x1, #6]
-; CHECK-NEXT: eor x8, x8, x9
-; CHECK-NEXT: eor x9, x10, x11
-; CHECK-NEXT: orr x8, x8, x9
-; CHECK-NEXT: cmp x8, #0
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: ccmp x10, x11, #0, eq
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%cr = call i32 @bcmp(ptr %a, ptr %b, i64 14)
@@ -254,10 +246,8 @@ define i1 @bcmp15(ptr %a, ptr %b) {
; CHECK-NEXT: ldr x9, [x1]
; CHECK-NEXT: ldur x10, [x0, #7]
; CHECK-NEXT: ldur x11, [x1, #7]
-; CHECK-NEXT: eor x8, x8, x9
-; CHECK-NEXT: eor x9, x10, x11
-; CHECK-NEXT: orr x8, x8, x9
-; CHECK-NEXT: cmp x8, #0
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: ccmp x10, x11, #0, eq
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%cr = call i32 @bcmp(ptr %a, ptr %b, i64 15)
@@ -270,10 +260,8 @@ define i1 @bcmp16(ptr %a, ptr %b) {
; CHECK: // %bb.0:
; CHECK-NEXT: ldp x8, x9, [x0]
; CHECK-NEXT: ldp x10, x11, [x1]
-; CHECK-NEXT: eor x8, x8, x10
-; CHECK-NEXT: eor x9, x9, x11
-; CHECK-NEXT: orr x8, x8, x9
-; CHECK-NEXT: cmp x8, #0
+; CHECK-NEXT: cmp x8, x10
+; CHECK-NEXT: ccmp x9, x11, #0, eq
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%cr = call i32 @bcmp(ptr %a, ptr %b, i64 16)
diff --git a/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll b/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
index f826a80940468..40794ee1627e2 100644
--- a/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
+++ b/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
@@ -128,3 +128,123 @@ define i1 @combine_setcc_ne_vecreduce_or_v64i1(<64 x i8> %a) {
%cmp2 = icmp ne i64 %cast, zeroinitializer
ret i1 %cmp2
}
+
+define i1 @combine_setcc_eq0_conjunction_xor_or(ptr %a, ptr %b) {
+; CHECK-LABEL: combine_setcc_eq0_conjunction_xor_or:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp x8, x9, [x0]
+; CHECK-NEXT: ldp x10, x11, [x1]
+; CHECK-NEXT: cmp x8, x10
+; CHECK-NEXT: ccmp x9, x11, #0, eq
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ %bcmp = tail call i32 @bcmp(ptr dereferenceable(16) %a, ptr dereferenceable(16) %b, i64 16)
+ %cmp = icmp eq i32 %bcmp, 0
+ ret i1 %cmp
+}
+
+define i1 @combine_setcc_ne0_conjunction_xor_or(ptr %a, ptr %b) {
+; CHECK-LABEL: combine_setcc_ne0_conjunction_xor_or:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp x8, x9, [x0]
+; CHECK-NEXT: ldp x10, x11, [x1]
+; CHECK-NEXT: cmp x8, x10
+; CHECK-NEXT: ccmp x9, x11, #0, eq
+; CHECK-NEXT: cset w0, ne
+; CHECK-NEXT: ret
+ %bcmp = tail call i32 @bcmp(ptr dereferenceable(16) %a, ptr dereferenceable(16) %b, i64 16)
+ %cmp = icmp ne i32 %bcmp, 0
+ ret i1 %cmp
+}
+
+; Check that the combine is not applied when the LHS has multiple uses,
+; as that would increase the instruction count.
+define i32 @combine_setcc_multiuse(i32 %0, i32 %1, i32 %2, i32 %3) {
+; CHECK-LABEL: combine_setcc_multiuse:
+; CHECK: // %bb.0:
+; CHECK-NEXT: eor w8, w1, w0
+; CHECK-NEXT: eor w9, w3, w2
+; CHECK-NEXT: orr w8, w9, w8
+; CHECK-NEXT: cbz w8, .LBB10_2
+; CHECK-NEXT: // %bb.1:
+; CHECK-NEXT: mov w0, w8
+; CHECK-NEXT: b use
+; CHECK-NEXT: .LBB10_2:
+; CHECK-NEXT: ret
+ %5 = xor i32 %1, %0
+ %6 = xor i32 %3, %2
+ %7 = or i32 %6, %5
+ %8 = icmp eq i32 %7, 0
+ br i1 %8, label %11, label %9
+
+9: ; preds = %4
+ %10 = tail call i32 @use(i32 %7) #2
+ br label %11
+
+11: ; preds = %4, %9
+ %12 = phi i32 [ %10, %9 ], [ %0, %4 ]
+ ret i32 %12
+}
+
+; There may be scheduling issues between the glued CMP/CCMP pair and the
+; instructions that ISel creates out of the DAG.
+define i32 @combine_setcc_glue(i128 noundef %x, i128 noundef %y) {
+; CHECK-LABEL: combine_setcc_glue:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cmp x0, x2
+; CHECK-NEXT: cset w8, eq
+; CHECK-NEXT: ccmp x1, x3, #0, eq
+; CHECK-NEXT: cset w9, eq
+; CHECK-NEXT: orr w0, w9, w8
+; CHECK-NEXT: ret
+entry:
+ %cmp3 = icmp eq i128 %x, %y
+ %conv = trunc i128 %x to i64
+ %conv1 = trunc i128 %y to i64
+ %cmp = icmp eq i64 %conv, %conv1
+ %or7 = or i1 %cmp3, %cmp
+ %or = zext i1 %or7 to i32
+ ret i32 %or
+}
+
+; Reduced test from https://github.com/llvm/llvm-project/issues/58675
+define [2 x i64] @PR58675(i128 %a.addr, i128 %b.addr) {
+; CHECK-LABEL: PR58675:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: mov x9, xzr
+; CHECK-NEXT: .LBB12_1: // %do.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: cmp x0, x8
+; CHECK-NEXT: csel x10, x0, x8, lo
+; CHECK-NEXT: cmp x1, x9
+; CHECK-NEXT: csel x8, x0, x8, lo
+; CHECK-NEXT: csel x8, x10, x8, eq
+; CHECK-NEXT: csel x10, x1, x9, lo
+; CHECK-NEXT: subs x8, x2, x8
+; CHECK-NEXT: sbc x9, x3, x10
+; CHECK-NEXT: ccmp x3, x10, #0, eq
+; CHECK-NEXT: b.ne .LBB12_1
+; CHECK-NEXT: // %bb.2: // %do.end
+; CHECK-NEXT: mov x0, xzr
+; CHECK-NEXT: mov x1, xzr
+; CHECK-NEXT: ret
+entry:
+ br label %do.body
+
+do.body: ; preds = %do.body, %entry
+ %a.addr.i1 = phi i128 [ 1, %do.body ], [ 0, %entry ]
+ %b.addr.i2 = phi i128 [ %sub, %do.body ], [ 0, %entry ]
+ %0 = tail call i128 @llvm.umin.i128(i128 %a.addr, i128 %b.addr.i2)
+ %1 = tail call i128 @llvm.umax.i128(i128 0, i128 %a.addr)
+ %sub = sub i128 %b.addr, %0
+ %cmp18.not = icmp eq i128 %b.addr, %0
+ br i1 %cmp18.not, label %do.end, label %do.body
+
+do.end: ; preds = %do.body
+ ret [2 x i64] zeroinitializer
+}
+
+declare i128 @llvm.umin.i128(i128, i128)
+declare i128 @llvm.umax.i128(i128, i128)
+declare i32 @bcmp(ptr nocapture, ptr nocapture, i64)
+declare i32 @use(i32 noundef)
diff --git a/llvm/test/CodeGen/AArch64/i128-cmp.ll b/llvm/test/CodeGen/AArch64/i128-cmp.ll
index 7cc3e843ba247..b50a559434302 100644
--- a/llvm/test/CodeGen/AArch64/i128-cmp.ll
+++ b/llvm/test/CodeGen/AArch64/i128-cmp.ll
@@ -6,10 +6,8 @@ declare void @call()
define i1 @cmp_i128_eq(i128 %a, i128 %b) {
; CHECK-LABEL: cmp_i128_eq:
; CHECK: // %bb.0:
-; CHECK-NEXT: eor x8, x1, x3
-; CHECK-NEXT: eor x9, x0, x2
-; CHECK-NEXT: orr x8, x9, x8
-; CHECK-NEXT: cmp x8, #0
+; CHECK-NEXT: cmp x0, x2
+; CHECK-NEXT: ccmp x1, x3, #0, eq
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%cmp = icmp eq i128 %a, %b
@@ -19,10 +17,8 @@ define i1 @cmp_i128_eq(i128 %a, i128 %b) {
define i1 @cmp_i128_ne(i128 %a, i128 %b) {
; CHECK-LABEL: cmp_i128_ne:
; CHECK: // %bb.0:
-; CHECK-NEXT: eor x8, x1, x3
-; CHECK-NEXT: eor x9, x0, x2
-; CHECK-NEXT: orr x8, x9, x8
-; CHECK-NEXT: cmp x8, #0
+; CHECK-NEXT: cmp x0, x2
+; CHECK-NEXT: ccmp x1, x3, #0, eq
; CHECK-NEXT: cset w0, ne
; CHECK-NEXT: ret
%cmp = icmp ne i128 %a, %b
@@ -120,10 +116,9 @@ define i1 @cmp_i128_sle(i128 %a, i128 %b) {
define void @br_on_cmp_i128_eq(i128 %a, i128 %b) nounwind {
; CHECK-LABEL: br_on_cmp_i128_eq:
; CHECK: // %bb.0:
-; CHECK-NEXT: eor x8, x1, x3
-; CHECK-NEXT: eor x9, x0, x2
-; CHECK-NEXT: orr x8, x9, x8
-; CHECK-NEXT: cbnz x8, .LBB10_2
+; CHECK-NEXT: cmp x0, x2
+; CHECK-NEXT: ccmp x1, x3, #0, eq
+; CHECK-NEXT: b.ne .LBB10_2
; CHECK-NEXT: // %bb.1: // %call
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: bl call
@@ -142,10 +137,9 @@ exit:
define void @br_on_cmp_i128_ne(i128 %a, i128 %b) nounwind {
; CHECK-LABEL: br_on_cmp_i128_ne:
; CHECK: // %bb.0:
-; CHECK-NEXT: eor x8, x1, x3
-; CHECK-NEXT: eor x9, x0, x2
-; CHECK-NEXT: orr x8, x9, x8
-; CHECK-NEXT: cbz x8, .LBB11_2
+; CHECK-NEXT: cmp x0, x2
+; CHECK-NEXT: ccmp x1, x3, #0, eq
+; CHECK-NEXT: b.eq .LBB11_2
; CHECK-NEXT: // %bb.1: // %call
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: bl call
diff --git a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
index e955014371525..e298748e8ec26 100644
--- a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
@@ -68,12 +68,10 @@ define i128 @__muloti4(i128 %0, i128 %1, i32* nocapture nonnull writeonly align
; AARCH-NEXT: adds x11, x12, x11
; AARCH-NEXT: adc x12, x13, x14
; AARCH-NEXT: adds x10, x11, x10
-; AARCH-NEXT: adc x9, x12, x9
; AARCH-NEXT: asr x11, x1, #63
-; AARCH-NEXT: eor x9, x9, x11
-; AARCH-NEXT: eor x10, x10, x11
-; AARCH-NEXT: orr x9, x10, x9
-; AARCH-NEXT: cmp x9, #0
+; AARCH-NEXT: adc x9, x12, x9
+; AARCH-NEXT: cmp x10, x11
+; AARCH-NEXT: ccmp x9, x11, #0, eq
; AARCH-NEXT: cset w9, ne
; AARCH-NEXT: tbz x8, #63, .LBB1_2
; AARCH-NEXT: // %bb.1: // %Entry