[llvm] a4516da - [AArch64] - Fold `and` and `cmp` into `tst` (#110347)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 3 09:56:08 PDT 2024
Author: Jorge Botto
Date: 2024-10-03T17:56:01+01:00
New Revision: a4516da49f8bda1b99d21dae7e1caba772d7182c
URL: https://github.com/llvm/llvm-project/commit/a4516da49f8bda1b99d21dae7e1caba772d7182c
DIFF: https://github.com/llvm/llvm-project/commit/a4516da49f8bda1b99d21dae7e1caba772d7182c.diff
LOG: [AArch64] - Fold `and` and `cmp` into `tst` (#110347)
Fixes https://github.com/llvm/llvm-project/issues/102703.
https://godbolt.org/z/nfj8xsb1Y
The following pattern:
```
%2 = and i32 %0, 254
%3 = icmp eq i32 %2, 0
```
is optimised by instcombine into:
```%3 = icmp ult i32 %0, 2```
However, the post-instcombine form leads to worse AArch64 code than the unoptimised version.
Pre instcombine:
```
tst w0, #0xfe
cset w0, eq
ret
```
Post instcombine:
```
and w8, w0, #0xff
cmp w8, #2
cset w0, lo
ret
```
In the unoptimised version, SelectionDAG converts `SETCC (AND X 254) 0 EQ` into `CSEL 0 1 1 (ANDS X 254)`, which gets emitted as a `tst`.
In the optimised version, SelectionDAG converts `SETCC (AND X 255) 2 ULT` into `CSEL 0 1 2 (SUBS (AND X 255) 2)`, which gets emitted as an `and`/`cmp`.
This PR adds an optimisation to `AArch64ISelLowering`, converting `SETCC (AND X Y) Z ULT` into `SETCC (AND X (Y & ~(Z - 1))) 0 EQ` when `Z` is a power of two. This makes SelectionDAG/Codegen produce the same optimised code for both examples.
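As an illustration of the bit trick behind the fold, here is a minimal standalone C++ sketch (not part of the patch; the helper names are invented): when `Z` is a power of two, the bits of `X & Y` below `Z` can never make the value reach `Z`, so only the mask bits at or above `Z` matter, and `(X & Y) < Z` is equivalent to `(X & (Y & ~(Z - 1))) == 0`.
```cpp
#include <cassert>
#include <cstdint>

// (X & Y) ult Z rewritten as an equality test against zero, for Z a power of 2.
static bool foldedCompare(uint64_t X, uint64_t Y, uint64_t Z) {
  uint64_t NewMask = Y & ~(Z - 1); // e.g. Y = 0xff, Z = 2 -> NewMask = 0xfe
  return (X & NewMask) == 0;
}

static bool originalCompare(uint64_t X, uint64_t Y, uint64_t Z) {
  return (X & Y) < Z;
}

int main() {
  // Z must be a power of two for the equivalence to hold.
  const uint64_t Y = 0xff, Z = 2;
  for (uint64_t X = 0; X < 512; ++X)
    assert(originalCompare(X, Y, Z) == foldedCompare(X, Y, Z));
  return 0;
}
```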
Added:
llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/signed-truncation-check.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e55e9989e6565c..48e1b96d841efb 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4301,6 +4301,29 @@ static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
Op.getOperand(1));
}
+// Converts SETCC (AND X Y) Z ULT -> SETCC (AND X (Y & ~(Z - 1))) 0 EQ when Z
+// is a power of 2. This is then lowered to ANDS X (Y & ~(Z - 1)) instead of
+// SUBS (AND X Y) Z, which EmitComparison turns into a better comparison (tst).
+static void simplifySetCCIntoEq(ISD::CondCode &CC, SDValue &LHS, SDValue &RHS,
+ SelectionDAG &DAG, const SDLoc dl) {
+ if (CC == ISD::SETULT && LHS.getOpcode() == ISD::AND && LHS->hasOneUse()) {
+ ConstantSDNode *LHSConstOp = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
+ ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
+ if (LHSConstOp && RHSConst) {
+ uint64_t LHSConstValue = LHSConstOp->getZExtValue();
+ uint64_t RHSConstant = RHSConst->getZExtValue();
+ if (isPowerOf2_64(RHSConstant)) {
+ uint64_t NewMaskValue = LHSConstValue & ~(RHSConstant - 1);
+ LHS =
+ DAG.getNode(ISD::AND, dl, LHS.getValueType(), LHS.getOperand(0),
+ DAG.getConstant(NewMaskValue, dl, LHS.getValueType()));
+ RHS = DAG.getConstant(0, dl, RHS.getValueType());
+ CC = ISD::SETEQ;
+ }
+ }
+ }
+}
+
SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
@@ -10596,6 +10619,9 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
}
if (LHS.getValueType().isInteger()) {
+
+ simplifySetCCIntoEq(CC, LHS, RHS, DAG, dl);
+
SDValue CCVal;
SDValue Cmp = getAArch64Cmp(
LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);
diff --git a/llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll b/llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll
new file mode 100644
index 00000000000000..33c5ba7987974a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/icmp-ult-eq-fold.ll
@@ -0,0 +1,216 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
+
+
+define i1 @lt8_u8(i8 %0) {
+; CHECK-LABEL: lt8_u8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: tst w0, #0xf8
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ %2 = icmp ult i8 %0, 8
+ ret i1 %2
+}
+
+define i1 @lt32_u8(i8 %0) {
+; CHECK-LABEL: lt32_u8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: tst w0, #0xe0
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ %2 = icmp ult i8 %0, 32
+ ret i1 %2
+}
+
+define i1 @lt64_u8(i8 %0) {
+; CHECK-LABEL: lt64_u8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: tst w0, #0xc0
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ %2 = icmp ult i8 %0, 64
+ ret i1 %2
+}
+
+define i1 @lt8_u32(i32 %0) {
+; CHECK-LABEL: lt8_u32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmp w0, #8
+; CHECK-NEXT: cset w0, lo
+; CHECK-NEXT: ret
+ %2 = icmp ult i32 %0, 8
+ ret i1 %2
+}
+
+define i1 @lt32_u32(i32 %0) {
+; CHECK-LABEL: lt32_u32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmp w0, #32
+; CHECK-NEXT: cset w0, lo
+; CHECK-NEXT: ret
+ %2 = icmp ult i32 %0, 32
+ ret i1 %2
+}
+
+define i1 @lt64_u32(i32 %0) {
+; CHECK-LABEL: lt64_u32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmp w0, #64
+; CHECK-NEXT: cset w0, lo
+; CHECK-NEXT: ret
+ %2 = icmp ult i32 %0, 64
+ ret i1 %2
+}
+
+define i1 @lt8_u64(i64 %0) {
+; CHECK-LABEL: lt8_u64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmp x0, #8
+; CHECK-NEXT: cset w0, lo
+; CHECK-NEXT: ret
+ %2 = icmp ult i64 %0, 8
+ ret i1 %2
+}
+
+define i1 @lt32_u64(i64 %0) {
+; CHECK-LABEL: lt32_u64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmp x0, #32
+; CHECK-NEXT: cset w0, lo
+; CHECK-NEXT: ret
+ %2 = icmp ult i64 %0, 32
+ ret i1 %2
+}
+
+define i1 @lt64_u64(i64 %0) {
+; CHECK-LABEL: lt64_u64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmp x0, #64
+; CHECK-NEXT: cset w0, lo
+; CHECK-NEXT: ret
+ %2 = icmp ult i64 %0, 64
+ ret i1 %2
+}
+
+define i1 @lt8_u16_and_5(i8 %0) {
+; CHECK-LABEL: lt8_u16_and_5:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ %2 = and i8 %0, 5
+ %3 = icmp ult i8 %2, 16
+ ret i1 %3
+}
+
+define i1 @lt8_u16_and_19(i8 %0) {
+; CHECK-LABEL: lt8_u16_and_19:
+; CHECK: // %bb.0:
+; CHECK-NEXT: tst w0, #0x10
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ %2 = and i8 %0, 19
+ %3 = icmp ult i8 %2, 16
+ ret i1 %3
+}
+
+define i1 @lt32_u16_and_7(i32 %0) {
+; CHECK-LABEL: lt32_u16_and_7:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ %2 = and i32 %0, 7
+ %3 = icmp ult i32 %2, 16
+ ret i1 %3
+}
+
+define i1 @lt32_u16_and_21(i32 %0) {
+; CHECK-LABEL: lt32_u16_and_21:
+; CHECK: // %bb.0:
+; CHECK-NEXT: tst w0, #0x10
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ %2 = and i32 %0, 21
+ %3 = icmp ult i32 %2, 16
+ ret i1 %3
+}
+
+define i1 @lt64_u16_and_9(i64 %0) {
+; CHECK-LABEL: lt64_u16_and_9:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: cmp x8, #0
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ %2 = and i64 %0, 9
+ %3 = icmp ult i64 %2, 16
+ ret i1 %3
+}
+
+define i1 @lt64_u16_and_23(i64 %0) {
+; CHECK-LABEL: lt64_u16_and_23:
+; CHECK: // %bb.0:
+; CHECK-NEXT: tst x0, #0x10
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ %2 = and i64 %0, 23
+ %3 = icmp ult i64 %2, 16
+ ret i1 %3
+}
+
+; negative test
+define i1 @lt3_u8(i8 %0) {
+; CHECK-LABEL: lt3_u8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and w8, w0, #0xff
+; CHECK-NEXT: cmp w8, #3
+; CHECK-NEXT: cset w0, lo
+; CHECK-NEXT: ret
+ %2 = icmp ult i8 %0, 3
+ ret i1 %2
+}
+
+; negative test
+define i1 @lt3_u32(i32 %0) {
+; CHECK-LABEL: lt3_u32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmp w0, #3
+; CHECK-NEXT: cset w0, lo
+; CHECK-NEXT: ret
+ %2 = icmp ult i32 %0, 3
+ ret i1 %2
+}
+
+; negative test
+define i1 @lt3_u64(i64 %0) {
+; CHECK-LABEL: lt3_u64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmp x0, #3
+; CHECK-NEXT: cset w0, lo
+; CHECK-NEXT: ret
+ %2 = icmp ult i64 %0, 3
+ ret i1 %2
+}
+
+; negative test
+define i32 @lt32_u16_multiple_use(i32 %0) {
+; CHECK-LABEL: lt32_u16_multiple_use:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #21 // =0x15
+; CHECK-NEXT: mov w9, #10 // =0xa
+; CHECK-NEXT: and w8, w0, w8
+; CHECK-NEXT: cmp w8, #16
+; CHECK-NEXT: orr w8, w8, w9
+; CHECK-NEXT: cset w10, lo
+; CHECK-NEXT: mul w0, w8, w10
+; CHECK-NEXT: ret
+ %2 = and i32 %0, 21
+ %3 = icmp ult i32 %2, 16
+ %4 = add i32 %2, 10
+ %5 = zext i1 %3 to i32
+ %6 = mul i32 %4, %5
+ ret i32 %6
+}
diff --git a/llvm/test/CodeGen/AArch64/signed-truncation-check.ll b/llvm/test/CodeGen/AArch64/signed-truncation-check.ll
index bb4df6d8935b1b..7c80f9320faec1 100644
--- a/llvm/test/CodeGen/AArch64/signed-truncation-check.ll
+++ b/llvm/test/CodeGen/AArch64/signed-truncation-check.ll
@@ -287,9 +287,8 @@ define i1 @add_ultcmp_bad_i16_i8_add(i16 %x, i16 %y) nounwind {
; CHECK-LABEL: add_ultcmp_bad_i16_i8_add:
; CHECK: // %bb.0:
; CHECK-NEXT: add w8, w0, w1
-; CHECK-NEXT: and w8, w8, #0xffff
-; CHECK-NEXT: cmp w8, #256
-; CHECK-NEXT: cset w0, lo
+; CHECK-NEXT: tst w8, #0xff00
+; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%tmp0 = add i16 %x, %y
%tmp1 = icmp ult i16 %tmp0, 256 ; 1U << 8
@@ -328,9 +327,8 @@ define i1 @add_ultcmp_bad_i16_i8_c0notpoweroftwo(i16 %x) nounwind {
; CHECK-LABEL: add_ultcmp_bad_i16_i8_c0notpoweroftwo:
; CHECK: // %bb.0:
; CHECK-NEXT: add w8, w0, #192
-; CHECK-NEXT: and w8, w8, #0xffff
-; CHECK-NEXT: cmp w8, #256
-; CHECK-NEXT: cset w0, lo
+; CHECK-NEXT: tst w8, #0xff00
+; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%tmp0 = add i16 %x, 192 ; (1U << (8-1)) + (1U << (8-1-1))
%tmp1 = icmp ult i16 %tmp0, 256 ; 1U << 8
@@ -356,9 +354,8 @@ define i1 @add_ultcmp_bad_i16_i8_magic(i16 %x) nounwind {
; CHECK-LABEL: add_ultcmp_bad_i16_i8_magic:
; CHECK: // %bb.0:
; CHECK-NEXT: add w8, w0, #64
-; CHECK-NEXT: and w8, w8, #0xffff
-; CHECK-NEXT: cmp w8, #256
-; CHECK-NEXT: cset w0, lo
+; CHECK-NEXT: tst w8, #0xff00
+; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%tmp0 = add i16 %x, 64 ; 1U << (8-1-1)
%tmp1 = icmp ult i16 %tmp0, 256 ; 1U << 8
@@ -370,9 +367,8 @@ define i1 @add_ultcmp_bad_i16_i4(i16 %x) nounwind {
; CHECK-LABEL: add_ultcmp_bad_i16_i4:
; CHECK: // %bb.0:
; CHECK-NEXT: add w8, w0, #8
-; CHECK-NEXT: and w8, w8, #0xffff
-; CHECK-NEXT: cmp w8, #16
-; CHECK-NEXT: cset w0, lo
+; CHECK-NEXT: tst w8, #0xfff0
+; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%tmp0 = add i16 %x, 8 ; 1U << (4-1)
%tmp1 = icmp ult i16 %tmp0, 16 ; 1U << 4
@@ -384,9 +380,8 @@ define i1 @add_ultcmp_bad_i24_i8(i24 %x) nounwind {
; CHECK-LABEL: add_ultcmp_bad_i24_i8:
; CHECK: // %bb.0:
; CHECK-NEXT: add w8, w0, #128
-; CHECK-NEXT: and w8, w8, #0xffffff
-; CHECK-NEXT: cmp w8, #256
-; CHECK-NEXT: cset w0, lo
+; CHECK-NEXT: tst w8, #0xffff00
+; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%tmp0 = add i24 %x, 128 ; 1U << (8-1)
%tmp1 = icmp ult i24 %tmp0, 256 ; 1U << 8