[llvm] 3651bc8 - [AArch64] Optimize cmp chain before legalization
via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 23 03:48:25 PST 2022
Author: zhongyunde
Date: 2022-11-23T19:43:44+08:00
New Revision: 3651bc83b6f28d2ae2d6b32db661f0d62cb600d7
URL: https://github.com/llvm/llvm-project/commit/3651bc83b6f28d2ae2d6b32db661f0d62cb600d7
DIFF: https://github.com/llvm/llvm-project/commit/3651bc83b6f28d2ae2d6b32db661f0d62cb600d7.diff
LOG: [AArch64] Optimize cmp chain before legalization
* For case bcmp9, there are extra AND and EXTEND nodes in the chain of OR/XOR,
which prevent the transform, so enable the optimization before legalization.
* The key related IR fragment (a sketch of the updated matcher follows it):
t37: i32,ch = load<(load (s8) from %ir.4), anyext from i8> t0, t11, undef:i64
t12: i64 = add t4, Constant:i64<8>
t38: i32,ch = load<(load (s8) from %ir.5), anyext from i8> t0, t12, undef:i64
t39: i32 = xor t37, t38
t40: i64 = any_extend t39
t42: i64 = and t40, Constant:i64<255>
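For illustration, a sketch of the complete isOrXorChain helper after this
change. Only the zext handling is new (see the first hunk below); the
surrounding recursion is reconstructed here for context and is assumed to
match the in-tree code:

static bool isOrXorChain(SDValue N, unsigned &Num,
                         SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
  if (Num == MaxXors)
    return false;

  // Look through a one-use zero-extend so the XOR leaf underneath stays
  // visible; before legalization the narrow sub-compares are wrapped this way.
  if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
    N = N->getOperand(0);

  // A leaf is an XOR of the two values being compared: record the pair.
  if (N->getOpcode() == ISD::XOR) {
    WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
    Num++;
    return true;
  }

  // Every interior node of the chain must be a one-use OR.
  if (N->getOpcode() != ISD::OR || !N->hasOneUse())
    return false;

  return isOrXorChain(N->getOperand(0), Num, WorkList) &&
         isOrXorChain(N->getOperand(1), Num, WorkList);
}

Each collected (XOR0, XOR1) pair is then emitted as a generic SETCC, and the
results are joined with AND (for SETEQ) or OR (for SETNE); later AArch64
combines fold that chain into the cmp/ccmp/cset sequences seen in the updated
tests.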
Depends on D138398 to fix combine_setcc_glue
Reviewed By: dmgreen, bcl5980
Differential Revision: https://reviews.llvm.org/D137936
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
llvm/test/CodeGen/AArch64/bcmp.ll
llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
llvm/test/CodeGen/AArch64/i128-cmp.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 49f84a07a8820..6fca41fe7124c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8579,6 +8579,10 @@ isOrXorChain(SDValue N, unsigned &Num,
if (Num == MaxXors)
return false;
+ // Skip the one-use zext
+ if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
+ N = N->getOperand(0);
+
// The leaf node must be XOR
if (N->getOpcode() == ISD::XOR) {
WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
@@ -8615,29 +8619,18 @@ static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) {
if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
isOrXorChain(LHS, NumXors, WorkList)) {
- SDValue CCVal = DAG.getConstant(AArch64CC::EQ, DL, MVT_CC);
- EVT TstVT = LHS->getValueType(0);
SDValue XOR0, XOR1;
std::tie(XOR0, XOR1) = WorkList[0];
- SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL,
- DAG.getVTList(TstVT, MVT::i32), XOR0, XOR1);
- SDValue Overflow = Cmp.getValue(1);
- SDValue CCmp;
+ unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
+ SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
for (unsigned I = 1; I < WorkList.size(); I++) {
std::tie(XOR0, XOR1) = WorkList[I];
- SDValue NZCVOp = DAG.getConstant(0, DL, MVT::i32);
- CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, XOR0, XOR1, NZCVOp,
- CCVal, Overflow);
- Overflow = CCmp;
+ SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
+ Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain);
}
// Exit early by inverting the condition, which helps reduce indentation.
- SDValue TVal = DAG.getConstant(1, DL, VT);
- SDValue FVal = DAG.getConstant(0, DL, VT);
- AArch64CC::CondCode CC = changeIntCCToAArch64CC(Cond);
- AArch64CC::CondCode InvCC = AArch64CC::getInvertedCondCode(CC);
- return DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal,
- DAG.getConstant(InvCC, DL, MVT::i32), CCmp);
+ return Cmp;
}
return SDValue();
@@ -8678,11 +8671,6 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
}
}
- // Address some cases folded And in the stage of `Optimized type-legalized
- // selection`
- if (SDValue V = performOrXorChainCombine(Op.getNode(), DAG))
- return V;
-
if (LHS.getValueType().isInteger()) {
SDValue CCVal;
SDValue Cmp = getAArch64Cmp(
@@ -19755,9 +19743,8 @@ static SDValue performSETCCCombine(SDNode *N,
}
// Try to perform the memcmp when the result is tested for [in]equality with 0
- if (!DCI.isBeforeLegalize())
- if (SDValue V = performOrXorChainCombine(N, DAG))
- return V;
+ if (SDValue V = performOrXorChainCombine(N, DAG))
+ return V;
return SDValue();
}
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll b/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
index ec5f8e2524994..3fd7d9addf8c0 100644
--- a/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-O0.ll
@@ -215,41 +215,40 @@ define i128 @test_rmw_add_128(i128* %dst) {
; NOLSE-NEXT: .LBB4_1: // %atomicrmw.start
; NOLSE-NEXT: // =>This Loop Header: Depth=1
; NOLSE-NEXT: // Child Loop BB4_2 Depth 2
-; NOLSE-NEXT: ldr x11, [sp, #40] // 8-byte Folded Reload
-; NOLSE-NEXT: ldr x13, [sp, #32] // 8-byte Folded Reload
-; NOLSE-NEXT: ldr x10, [sp, #24] // 8-byte Folded Reload
-; NOLSE-NEXT: adds x14, x13, #1
-; NOLSE-NEXT: cinc x15, x11, hs
+; NOLSE-NEXT: ldr x13, [sp, #40] // 8-byte Folded Reload
+; NOLSE-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload
+; NOLSE-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload
+; NOLSE-NEXT: adds x14, x11, #1
+; NOLSE-NEXT: cinc x15, x13, hs
; NOLSE-NEXT: .LBB4_2: // %atomicrmw.start
; NOLSE-NEXT: // Parent Loop BB4_1 Depth=1
; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT: ldaxp x12, x8, [x10]
+; NOLSE-NEXT: ldaxp x10, x12, [x9]
+; NOLSE-NEXT: cmp x10, x11
+; NOLSE-NEXT: cset w8, ne
; NOLSE-NEXT: cmp x12, x13
-; NOLSE-NEXT: cset w9, ne
-; NOLSE-NEXT: cmp x8, x11
-; NOLSE-NEXT: cinc w9, w9, ne
-; NOLSE-NEXT: cbnz w9, .LBB4_4
+; NOLSE-NEXT: cinc w8, w8, ne
+; NOLSE-NEXT: cbnz w8, .LBB4_4
; NOLSE-NEXT: // %bb.3: // %atomicrmw.start
; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=2
-; NOLSE-NEXT: stlxp w9, x14, x15, [x10]
-; NOLSE-NEXT: cbnz w9, .LBB4_2
+; NOLSE-NEXT: stlxp w8, x14, x15, [x9]
+; NOLSE-NEXT: cbnz w8, .LBB4_2
; NOLSE-NEXT: b .LBB4_5
; NOLSE-NEXT: .LBB4_4: // %atomicrmw.start
; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=2
-; NOLSE-NEXT: stlxp w9, x12, x8, [x10]
-; NOLSE-NEXT: cbnz w9, .LBB4_2
+; NOLSE-NEXT: stlxp w8, x10, x12, [x9]
+; NOLSE-NEXT: cbnz w8, .LBB4_2
; NOLSE-NEXT: .LBB4_5: // %atomicrmw.start
; NOLSE-NEXT: // in Loop: Header=BB4_1 Depth=1
-; NOLSE-NEXT: mov x9, x8
-; NOLSE-NEXT: str x9, [sp, #8] // 8-byte Folded Spill
-; NOLSE-NEXT: mov x10, x12
-; NOLSE-NEXT: str x10, [sp, #16] // 8-byte Folded Spill
+; NOLSE-NEXT: mov x8, x12
+; NOLSE-NEXT: str x8, [sp, #8] // 8-byte Folded Spill
+; NOLSE-NEXT: mov x9, x10
+; NOLSE-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
; NOLSE-NEXT: subs x12, x12, x13
-; NOLSE-NEXT: ccmp x8, x11, #0, eq
-; NOLSE-NEXT: cset w8, ne
-; NOLSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill
-; NOLSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill
-; NOLSE-NEXT: tbnz w8, #0, .LBB4_1
+; NOLSE-NEXT: ccmp x10, x11, #0, eq
+; NOLSE-NEXT: str x9, [sp, #32] // 8-byte Folded Spill
+; NOLSE-NEXT: str x8, [sp, #40] // 8-byte Folded Spill
+; NOLSE-NEXT: b.ne .LBB4_1
; NOLSE-NEXT: b .LBB4_6
; NOLSE-NEXT: .LBB4_6: // %atomicrmw.end
; NOLSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload
@@ -269,30 +268,29 @@ define i128 @test_rmw_add_128(i128* %dst) {
; LSE-NEXT: b .LBB4_1
; LSE-NEXT: .LBB4_1: // %atomicrmw.start
; LSE-NEXT: // =>This Inner Loop Header: Depth=1
-; LSE-NEXT: ldr x8, [sp, #40] // 8-byte Folded Reload
-; LSE-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload
-; LSE-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload
-; LSE-NEXT: mov x0, x11
-; LSE-NEXT: mov x1, x8
-; LSE-NEXT: adds x2, x11, #1
-; LSE-NEXT: cinc x10, x8, hs
+; LSE-NEXT: ldr x11, [sp, #40] // 8-byte Folded Reload
+; LSE-NEXT: ldr x10, [sp, #32] // 8-byte Folded Reload
+; LSE-NEXT: ldr x8, [sp, #24] // 8-byte Folded Reload
+; LSE-NEXT: mov x0, x10
+; LSE-NEXT: mov x1, x11
+; LSE-NEXT: adds x2, x10, #1
+; LSE-NEXT: cinc x9, x11, hs
; LSE-NEXT: // kill: def $x2 killed $x2 def $x2_x3
-; LSE-NEXT: mov x3, x10
-; LSE-NEXT: caspal x0, x1, x2, x3, [x9]
-; LSE-NEXT: mov x9, x1
+; LSE-NEXT: mov x3, x9
+; LSE-NEXT: caspal x0, x1, x2, x3, [x8]
+; LSE-NEXT: mov x9, x0
; LSE-NEXT: str x9, [sp, #8] // 8-byte Folded Spill
-; LSE-NEXT: mov x10, x0
-; LSE-NEXT: str x10, [sp, #16] // 8-byte Folded Spill
-; LSE-NEXT: subs x11, x10, x11
-; LSE-NEXT: ccmp x9, x8, #0, eq
-; LSE-NEXT: cset w8, ne
-; LSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill
-; LSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill
-; LSE-NEXT: tbnz w8, #0, .LBB4_1
+; LSE-NEXT: mov x8, x1
+; LSE-NEXT: str x8, [sp, #16] // 8-byte Folded Spill
+; LSE-NEXT: subs x11, x8, x11
+; LSE-NEXT: ccmp x9, x10, #0, eq
+; LSE-NEXT: str x9, [sp, #32] // 8-byte Folded Spill
+; LSE-NEXT: str x8, [sp, #40] // 8-byte Folded Spill
+; LSE-NEXT: b.ne .LBB4_1
; LSE-NEXT: b .LBB4_2
; LSE-NEXT: .LBB4_2: // %atomicrmw.end
-; LSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload
-; LSE-NEXT: ldr x0, [sp, #16] // 8-byte Folded Reload
+; LSE-NEXT: ldr x1, [sp, #16] // 8-byte Folded Reload
+; LSE-NEXT: ldr x0, [sp, #8] // 8-byte Folded Reload
; LSE-NEXT: add sp, sp, #48
; LSE-NEXT: ret
entry:
@@ -607,45 +605,44 @@ define i128 @test_rmw_nand_128(i128* %dst) {
; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start
; NOLSE-NEXT: // =>This Loop Header: Depth=1
; NOLSE-NEXT: // Child Loop BB9_2 Depth 2
-; NOLSE-NEXT: ldr x11, [sp, #40] // 8-byte Folded Reload
-; NOLSE-NEXT: ldr x13, [sp, #32] // 8-byte Folded Reload
-; NOLSE-NEXT: ldr x10, [sp, #24] // 8-byte Folded Reload
-; NOLSE-NEXT: mov w8, w13
-; NOLSE-NEXT: mvn w9, w8
+; NOLSE-NEXT: ldr x13, [sp, #40] // 8-byte Folded Reload
+; NOLSE-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload
+; NOLSE-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload
+; NOLSE-NEXT: mov w8, w11
+; NOLSE-NEXT: mvn w10, w8
; NOLSE-NEXT: // implicit-def: $x8
-; NOLSE-NEXT: mov w8, w9
+; NOLSE-NEXT: mov w8, w10
; NOLSE-NEXT: orr x14, x8, #0xfffffffffffffffe
; NOLSE-NEXT: mov x15, #-1
; NOLSE-NEXT: .LBB9_2: // %atomicrmw.start
; NOLSE-NEXT: // Parent Loop BB9_1 Depth=1
; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT: ldaxp x12, x8, [x10]
+; NOLSE-NEXT: ldaxp x10, x12, [x9]
+; NOLSE-NEXT: cmp x10, x11
+; NOLSE-NEXT: cset w8, ne
; NOLSE-NEXT: cmp x12, x13
-; NOLSE-NEXT: cset w9, ne
-; NOLSE-NEXT: cmp x8, x11
-; NOLSE-NEXT: cinc w9, w9, ne
-; NOLSE-NEXT: cbnz w9, .LBB9_4
+; NOLSE-NEXT: cinc w8, w8, ne
+; NOLSE-NEXT: cbnz w8, .LBB9_4
; NOLSE-NEXT: // %bb.3: // %atomicrmw.start
; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=2
-; NOLSE-NEXT: stlxp w9, x14, x15, [x10]
-; NOLSE-NEXT: cbnz w9, .LBB9_2
+; NOLSE-NEXT: stlxp w8, x14, x15, [x9]
+; NOLSE-NEXT: cbnz w8, .LBB9_2
; NOLSE-NEXT: b .LBB9_5
; NOLSE-NEXT: .LBB9_4: // %atomicrmw.start
; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=2
-; NOLSE-NEXT: stlxp w9, x12, x8, [x10]
-; NOLSE-NEXT: cbnz w9, .LBB9_2
+; NOLSE-NEXT: stlxp w8, x10, x12, [x9]
+; NOLSE-NEXT: cbnz w8, .LBB9_2
; NOLSE-NEXT: .LBB9_5: // %atomicrmw.start
; NOLSE-NEXT: // in Loop: Header=BB9_1 Depth=1
-; NOLSE-NEXT: mov x9, x8
-; NOLSE-NEXT: str x9, [sp, #8] // 8-byte Folded Spill
-; NOLSE-NEXT: mov x10, x12
-; NOLSE-NEXT: str x10, [sp, #16] // 8-byte Folded Spill
+; NOLSE-NEXT: mov x8, x12
+; NOLSE-NEXT: str x8, [sp, #8] // 8-byte Folded Spill
+; NOLSE-NEXT: mov x9, x10
+; NOLSE-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
; NOLSE-NEXT: subs x12, x12, x13
-; NOLSE-NEXT: ccmp x8, x11, #0, eq
-; NOLSE-NEXT: cset w8, ne
-; NOLSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill
-; NOLSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill
-; NOLSE-NEXT: tbnz w8, #0, .LBB9_1
+; NOLSE-NEXT: ccmp x10, x11, #0, eq
+; NOLSE-NEXT: str x9, [sp, #32] // 8-byte Folded Spill
+; NOLSE-NEXT: str x8, [sp, #40] // 8-byte Folded Spill
+; NOLSE-NEXT: b.ne .LBB9_1
; NOLSE-NEXT: b .LBB9_6
; NOLSE-NEXT: .LBB9_6: // %atomicrmw.end
; NOLSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload
@@ -665,34 +662,33 @@ define i128 @test_rmw_nand_128(i128* %dst) {
; LSE-NEXT: b .LBB9_1
; LSE-NEXT: .LBB9_1: // %atomicrmw.start
; LSE-NEXT: // =>This Inner Loop Header: Depth=1
-; LSE-NEXT: ldr x8, [sp, #40] // 8-byte Folded Reload
-; LSE-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload
-; LSE-NEXT: ldr x9, [sp, #24] // 8-byte Folded Reload
-; LSE-NEXT: mov x0, x11
-; LSE-NEXT: mov x1, x8
-; LSE-NEXT: mov w10, w11
-; LSE-NEXT: mvn w12, w10
-; LSE-NEXT: // implicit-def: $x10
-; LSE-NEXT: mov w10, w12
-; LSE-NEXT: orr x2, x10, #0xfffffffffffffffe
-; LSE-NEXT: mov x10, #-1
+; LSE-NEXT: ldr x11, [sp, #40] // 8-byte Folded Reload
+; LSE-NEXT: ldr x10, [sp, #32] // 8-byte Folded Reload
+; LSE-NEXT: ldr x8, [sp, #24] // 8-byte Folded Reload
+; LSE-NEXT: mov x0, x10
+; LSE-NEXT: mov x1, x11
+; LSE-NEXT: mov w9, w10
+; LSE-NEXT: mvn w12, w9
+; LSE-NEXT: // implicit-def: $x9
+; LSE-NEXT: mov w9, w12
+; LSE-NEXT: orr x2, x9, #0xfffffffffffffffe
+; LSE-NEXT: mov x9, #-1
; LSE-NEXT: // kill: def $x2 killed $x2 def $x2_x3
-; LSE-NEXT: mov x3, x10
-; LSE-NEXT: caspal x0, x1, x2, x3, [x9]
-; LSE-NEXT: mov x9, x1
+; LSE-NEXT: mov x3, x9
+; LSE-NEXT: caspal x0, x1, x2, x3, [x8]
+; LSE-NEXT: mov x9, x0
; LSE-NEXT: str x9, [sp, #8] // 8-byte Folded Spill
-; LSE-NEXT: mov x10, x0
-; LSE-NEXT: str x10, [sp, #16] // 8-byte Folded Spill
-; LSE-NEXT: subs x11, x10, x11
-; LSE-NEXT: ccmp x9, x8, #0, eq
-; LSE-NEXT: cset w8, ne
-; LSE-NEXT: str x10, [sp, #32] // 8-byte Folded Spill
-; LSE-NEXT: str x9, [sp, #40] // 8-byte Folded Spill
-; LSE-NEXT: tbnz w8, #0, .LBB9_1
+; LSE-NEXT: mov x8, x1
+; LSE-NEXT: str x8, [sp, #16] // 8-byte Folded Spill
+; LSE-NEXT: subs x11, x8, x11
+; LSE-NEXT: ccmp x9, x10, #0, eq
+; LSE-NEXT: str x9, [sp, #32] // 8-byte Folded Spill
+; LSE-NEXT: str x8, [sp, #40] // 8-byte Folded Spill
+; LSE-NEXT: b.ne .LBB9_1
; LSE-NEXT: b .LBB9_2
; LSE-NEXT: .LBB9_2: // %atomicrmw.end
-; LSE-NEXT: ldr x1, [sp, #8] // 8-byte Folded Reload
-; LSE-NEXT: ldr x0, [sp, #16] // 8-byte Folded Reload
+; LSE-NEXT: ldr x1, [sp, #16] // 8-byte Folded Reload
+; LSE-NEXT: ldr x0, [sp, #8] // 8-byte Folded Reload
; LSE-NEXT: add sp, sp, #48
; LSE-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/bcmp.ll b/llvm/test/CodeGen/AArch64/bcmp.ll
index 208ef4895795b..bf760d3b2eca6 100644
--- a/llvm/test/CodeGen/AArch64/bcmp.ll
+++ b/llvm/test/CodeGen/AArch64/bcmp.ll
@@ -133,19 +133,16 @@ define i1 @bcmp8(ptr %a, ptr %b) {
ret i1 %r
}
-; TODO: or (xor a, b), (and (xor c, d), C2)
+; or (xor a, b), (and (xor c, d), C2)
define i1 @bcmp9(ptr %a, ptr %b) {
; CHECK-LABEL: bcmp9:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldrb w9, [x0, #8]
-; CHECK-NEXT: ldrb w10, [x1, #8]
; CHECK-NEXT: ldr x8, [x0]
-; CHECK-NEXT: ldr x11, [x1]
-; CHECK-NEXT: eor w9, w9, w10
-; CHECK-NEXT: and x9, x9, #0xff
-; CHECK-NEXT: eor x8, x8, x11
-; CHECK-NEXT: orr x8, x8, x9
-; CHECK-NEXT: cmp x8, #0
+; CHECK-NEXT: ldr x9, [x1]
+; CHECK-NEXT: ldrb w10, [x0, #8]
+; CHECK-NEXT: ldrb w11, [x1, #8]
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: ccmp x10, x11, #0, eq
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%cr = call i32 @bcmp(ptr %a, ptr %b, i64 9)
@@ -156,15 +153,12 @@ define i1 @bcmp9(ptr %a, ptr %b) {
define i1 @bcmp10(ptr %a, ptr %b) {
; CHECK-LABEL: bcmp10:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldrh w9, [x0, #8]
-; CHECK-NEXT: ldrh w10, [x1, #8]
; CHECK-NEXT: ldr x8, [x0]
-; CHECK-NEXT: ldr x11, [x1]
-; CHECK-NEXT: eor w9, w9, w10
-; CHECK-NEXT: and x9, x9, #0xffff
-; CHECK-NEXT: eor x8, x8, x11
-; CHECK-NEXT: orr x8, x8, x9
-; CHECK-NEXT: cmp x8, #0
+; CHECK-NEXT: ldr x9, [x1]
+; CHECK-NEXT: ldrh w10, [x0, #8]
+; CHECK-NEXT: ldrh w11, [x1, #8]
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: ccmp x10, x11, #0, eq
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%cr = call i32 @bcmp(ptr %a, ptr %b, i64 10)
@@ -195,10 +189,8 @@ define i1 @bcmp12(ptr %a, ptr %b) {
; CHECK-NEXT: ldr x9, [x1]
; CHECK-NEXT: ldr w10, [x0, #8]
; CHECK-NEXT: ldr w11, [x1, #8]
-; CHECK-NEXT: eor x8, x8, x9
-; CHECK-NEXT: eor w9, w10, w11
-; CHECK-NEXT: orr x8, x8, x9
-; CHECK-NEXT: cmp x8, #0
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: ccmp x10, x11, #0, eq
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%cr = call i32 @bcmp(ptr %a, ptr %b, i64 12)
@@ -274,13 +266,10 @@ define i1 @bcmp20(ptr %a, ptr %b) {
; CHECK-NEXT: ldp x8, x9, [x0]
; CHECK-NEXT: ldp x10, x11, [x1]
; CHECK-NEXT: ldr w12, [x0, #16]
-; CHECK-NEXT: ldr w13, [x1, #16]
-; CHECK-NEXT: eor x8, x8, x10
-; CHECK-NEXT: eor x9, x9, x11
-; CHECK-NEXT: eor w10, w12, w13
-; CHECK-NEXT: orr x8, x8, x9
-; CHECK-NEXT: orr x8, x8, x10
-; CHECK-NEXT: cmp x8, #0
+; CHECK-NEXT: cmp x8, x10
+; CHECK-NEXT: ldr w8, [x1, #16]
+; CHECK-NEXT: ccmp x9, x11, #0, eq
+; CHECK-NEXT: ccmp x12, x8, #0, eq
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%cr = call i32 @bcmp(ptr %a, ptr %b, i64 20)
@@ -311,17 +300,13 @@ define i1 @bcmp28(ptr %a, ptr %b) {
; CHECK-NEXT: ldp x8, x9, [x0]
; CHECK-NEXT: ldp x10, x11, [x1]
; CHECK-NEXT: ldr x12, [x0, #16]
-; CHECK-NEXT: ldr x13, [x1, #16]
-; CHECK-NEXT: ldr w14, [x0, #24]
-; CHECK-NEXT: eor x8, x8, x10
-; CHECK-NEXT: ldr w15, [x1, #24]
-; CHECK-NEXT: eor x9, x9, x11
-; CHECK-NEXT: eor x10, x12, x13
-; CHECK-NEXT: orr x8, x8, x9
-; CHECK-NEXT: eor w11, w14, w15
-; CHECK-NEXT: orr x9, x10, x11
-; CHECK-NEXT: orr x8, x8, x9
-; CHECK-NEXT: cmp x8, #0
+; CHECK-NEXT: cmp x8, x10
+; CHECK-NEXT: ldr x8, [x1, #16]
+; CHECK-NEXT: ccmp x9, x11, #0, eq
+; CHECK-NEXT: ldr w9, [x0, #24]
+; CHECK-NEXT: ldr w10, [x1, #24]
+; CHECK-NEXT: ccmp x12, x8, #0, eq
+; CHECK-NEXT: ccmp x9, x10, #0, eq
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%cr = call i32 @bcmp(ptr %a, ptr %b, i64 28)
@@ -334,21 +319,15 @@ define i1 @bcmp33(ptr %a, ptr %b) {
; CHECK: // %bb.0:
; CHECK-NEXT: ldp x8, x9, [x0]
; CHECK-NEXT: ldp x10, x11, [x1]
-; CHECK-NEXT: ldp x12, x13, [x0, #16]
-; CHECK-NEXT: ldp x14, x15, [x1, #16]
-; CHECK-NEXT: eor x8, x8, x10
-; CHECK-NEXT: eor x9, x9, x11
-; CHECK-NEXT: ldrb w16, [x0, #32]
-; CHECK-NEXT: orr x8, x8, x9
-; CHECK-NEXT: ldrb w17, [x1, #32]
-; CHECK-NEXT: eor x10, x12, x14
-; CHECK-NEXT: eor x11, x13, x15
-; CHECK-NEXT: eor w12, w16, w17
-; CHECK-NEXT: orr x9, x10, x11
-; CHECK-NEXT: and x10, x12, #0xff
-; CHECK-NEXT: orr x8, x8, x9
-; CHECK-NEXT: orr x8, x8, x10
-; CHECK-NEXT: cmp x8, #0
+; CHECK-NEXT: cmp x8, x10
+; CHECK-NEXT: ccmp x9, x11, #0, eq
+; CHECK-NEXT: ldrb w11, [x1, #32]
+; CHECK-NEXT: ldp x8, x9, [x0, #16]
+; CHECK-NEXT: ldp x12, x10, [x1, #16]
+; CHECK-NEXT: ccmp x8, x12, #0, eq
+; CHECK-NEXT: ldrb w8, [x0, #32]
+; CHECK-NEXT: ccmp x9, x10, #0, eq
+; CHECK-NEXT: ccmp x8, x11, #0, eq
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%cr = call i32 @bcmp(ptr %a, ptr %b, i64 33)
@@ -450,3 +429,110 @@ define i1 @bcmp89(ptr %a, ptr %b) {
ret i1 %r
}
+define i1 @bcmp_zext(i32 %0, i32 %1, i8 %2, i8 %3) {
+; CHECK-LABEL: bcmp_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and w8, w2, #0xff
+; CHECK-NEXT: and w9, w3, #0xff
+; CHECK-NEXT: cmp w1, w0
+; CHECK-NEXT: ccmp w9, w8, #0, eq
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ %5 = xor i32 %1, %0
+ %6 = xor i8 %3, %2
+ %7 = zext i8 %6 to i32
+ %8 = or i32 %5, %7
+ %9 = icmp eq i32 %8, 0
+ ret i1 %9
+}
+
+define i1 @bcmp_i8(i8 %a0, i8 %b0, i8 %a1, i8 %b1, i8 %a2, i8 %b2) {
+; CHECK-LABEL: bcmp_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and w9, w1, #0xff
+; CHECK-NEXT: and w8, w2, #0xff
+; CHECK-NEXT: and w10, w3, #0xff
+; CHECK-NEXT: cmp w9, w0, uxtb
+; CHECK-NEXT: ccmp w10, w8, #0, eq
+; CHECK-NEXT: and w8, w4, #0xff
+; CHECK-NEXT: and w9, w5, #0xff
+; CHECK-NEXT: ccmp w9, w8, #0, eq
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ %xor0 = xor i8 %b0, %a0
+ %xor1 = xor i8 %b1, %a1
+ %xor2 = xor i8 %b2, %a2
+ %or0 = or i8 %xor0, %xor1
+ %or1 = or i8 %or0, %xor2
+ %r = icmp eq i8 %or1, 0
+ ret i1 %r
+}
+
+define i1 @bcmp_i16(i16 %a0, i16 %b0, i16 %a1, i16 %b1, i16 %a2, i16 %b2) {
+; CHECK-LABEL: bcmp_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and w9, w1, #0xffff
+; CHECK-NEXT: and w8, w2, #0xffff
+; CHECK-NEXT: and w10, w3, #0xffff
+; CHECK-NEXT: cmp w9, w0, uxth
+; CHECK-NEXT: ccmp w10, w8, #0, eq
+; CHECK-NEXT: and w8, w4, #0xffff
+; CHECK-NEXT: and w9, w5, #0xffff
+; CHECK-NEXT: ccmp w9, w8, #0, eq
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ %xor0 = xor i16 %b0, %a0
+ %xor1 = xor i16 %b1, %a1
+ %xor2 = xor i16 %b2, %a2
+ %or0 = or i16 %xor0, %xor1
+ %or1 = or i16 %or0, %xor2
+ %r = icmp eq i16 %or1, 0
+ ret i1 %r
+}
+
+define i1 @bcmp_i128(i128 %a0, i128 %b0, i128 %a1, i128 %b1, i128 %a2, i128 %b2) {
+; CHECK-LABEL: bcmp_i128:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldp x9, x8, [sp]
+; CHECK-NEXT: ldp x10, x11, [sp, #16]
+; CHECK-NEXT: cmp x10, x9
+; CHECK-NEXT: ccmp x11, x8, #0, eq
+; CHECK-NEXT: cset w8, ne
+; CHECK-NEXT: cmp x2, x0
+; CHECK-NEXT: ccmp x3, x1, #0, eq
+; CHECK-NEXT: ccmp x6, x4, #0, eq
+; CHECK-NEXT: ccmp x7, x5, #0, eq
+; CHECK-NEXT: cset w9, ne
+; CHECK-NEXT: orr w0, w9, w8
+; CHECK-NEXT: ret
+ %xor0 = xor i128 %b0, %a0
+ %xor1 = xor i128 %b1, %a1
+ %xor2 = xor i128 %b2, %a2
+ %or0 = or i128 %xor0, %xor1
+ %or1 = or i128 %or0, %xor2
+ %r = icmp ne i128 %or1, 0
+ ret i1 %r
+}
+
+define i1 @bcmp_i42(i42 %a0, i42 %b0, i42 %a1, i42 %b1, i42 %a2, i42 %b2) {
+; CHECK-LABEL: bcmp_i42:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and x9, x0, #0x3ffffffffff
+; CHECK-NEXT: and x10, x1, #0x3ffffffffff
+; CHECK-NEXT: and x8, x2, #0x3ffffffffff
+; CHECK-NEXT: and x11, x3, #0x3ffffffffff
+; CHECK-NEXT: cmp x10, x9
+; CHECK-NEXT: and x9, x5, #0x3ffffffffff
+; CHECK-NEXT: ccmp x11, x8, #0, eq
+; CHECK-NEXT: and x8, x4, #0x3ffffffffff
+; CHECK-NEXT: ccmp x9, x8, #0, eq
+; CHECK-NEXT: cset w0, ne
+; CHECK-NEXT: ret
+ %xor0 = xor i42 %b0, %a0
+ %xor1 = xor i42 %b1, %a1
+ %xor2 = xor i42 %b2, %a2
+ %or0 = or i42 %xor0, %xor1
+ %or1 = or i42 %or0, %xor2
+ %r = icmp ne i42 %or1, 0
+ ret i1 %r
+}
diff --git a/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll b/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
index 40794ee1627e2..60bd2a373a2c1 100644
--- a/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
+++ b/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
@@ -191,10 +191,7 @@ define i32 @combine_setcc_glue(i128 noundef %x, i128 noundef %y) {
; CHECK-LABEL: combine_setcc_glue:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: cmp x0, x2
-; CHECK-NEXT: cset w8, eq
-; CHECK-NEXT: ccmp x1, x3, #0, eq
-; CHECK-NEXT: cset w9, eq
-; CHECK-NEXT: orr w0, w9, w8
+; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
entry:
%cmp3 = icmp eq i128 %x, %y
@@ -218,11 +215,12 @@ define [2 x i64] @PR58675(i128 %a.addr, i128 %b.addr) {
; CHECK-NEXT: csel x10, x0, x8, lo
; CHECK-NEXT: cmp x1, x9
; CHECK-NEXT: csel x8, x0, x8, lo
-; CHECK-NEXT: csel x8, x10, x8, eq
-; CHECK-NEXT: csel x10, x1, x9, lo
-; CHECK-NEXT: subs x8, x2, x8
-; CHECK-NEXT: sbc x9, x3, x10
-; CHECK-NEXT: ccmp x3, x10, #0, eq
+; CHECK-NEXT: csel x11, x1, x9, lo
+; CHECK-NEXT: csel x10, x10, x8, eq
+; CHECK-NEXT: subs x8, x2, x10
+; CHECK-NEXT: sbc x9, x3, x11
+; CHECK-NEXT: cmp x3, x11
+; CHECK-NEXT: ccmp x2, x10, #0, eq
; CHECK-NEXT: b.ne .LBB12_1
; CHECK-NEXT: // %bb.2: // %do.end
; CHECK-NEXT: mov x0, xzr
diff --git a/llvm/test/CodeGen/AArch64/i128-cmp.ll b/llvm/test/CodeGen/AArch64/i128-cmp.ll
index b50a559434302..9c23b3a603d10 100644
--- a/llvm/test/CodeGen/AArch64/i128-cmp.ll
+++ b/llvm/test/CodeGen/AArch64/i128-cmp.ll
@@ -116,8 +116,8 @@ define i1 @cmp_i128_sle(i128 %a, i128 %b) {
define void @br_on_cmp_i128_eq(i128 %a, i128 %b) nounwind {
; CHECK-LABEL: br_on_cmp_i128_eq:
; CHECK: // %bb.0:
-; CHECK-NEXT: cmp x0, x2
-; CHECK-NEXT: ccmp x1, x3, #0, eq
+; CHECK-NEXT: cmp x1, x3
+; CHECK-NEXT: ccmp x0, x2, #0, eq
; CHECK-NEXT: b.ne .LBB10_2
; CHECK-NEXT: // %bb.1: // %call
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
@@ -137,8 +137,8 @@ exit:
define void @br_on_cmp_i128_ne(i128 %a, i128 %b) nounwind {
; CHECK-LABEL: br_on_cmp_i128_ne:
; CHECK: // %bb.0:
-; CHECK-NEXT: cmp x0, x2
-; CHECK-NEXT: ccmp x1, x3, #0, eq
+; CHECK-NEXT: cmp x1, x3
+; CHECK-NEXT: ccmp x0, x2, #0, eq
; CHECK-NEXT: b.eq .LBB11_2
; CHECK-NEXT: // %bb.1: // %call
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill