[llvm] [AArch64] Improve lowering of scalar abs(sub(a, b)). (PR #151180)
Ricardo Jesus via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 30 09:16:43 PDT 2025
https://github.com/rj-jesus updated https://github.com/llvm/llvm-project/pull/151180
>From 5f4c086c253b8e1974d253b1e51d925578fa1731 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Mon, 28 Jul 2025 07:32:49 -0700
Subject: [PATCH 1/2] [AArch64] Improve lowering of scalar abs(sub(a, b)).
This patch avoids a comparison against zero when lowering abs(sub(a, b))
patterns, instead reusing the condition codes generated by a subs of the
operands directly.
For example, currently:
```
sxtb w8, w0
sub w8, w8, w1, sxtb
cmp w8, #0
cneg w0, w8, pl
```
becomes:
```
sxtb w8, w0
sxtb w9, w1
subs w8, w9, w8
cneg w0, w8, gt
```
Whilst this doesn't decrease the number of instructions utilised, the
new version exposes more ILP and uses ``cheaper'' instructions,
typically having lower latency and/or higher throughput.
This patch also includes a somewhat orthogonal change in
performNegCSelCombine, which I included to avoid a code generation
regression in CodeGen/AArch64/abd[su]-neg.ll due to the combine
negating the operands of the csel, leading to an extra sub instruction.
If preferable, I can open a separate PR for this.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 33 ++++++---
llvm/test/CodeGen/AArch64/abds-neg.ll | 30 ++++----
llvm/test/CodeGen/AArch64/abds.ll | 74 +++++++++----------
llvm/test/CodeGen/AArch64/abdu-neg.ll | 30 ++++----
llvm/test/CodeGen/AArch64/abdu.ll | 74 +++++++++----------
5 files changed, 128 insertions(+), 113 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7b49754ee7e1f..8b515fb649ee7 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7118,12 +7118,21 @@ SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
SDLoc DL(Op);
- SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
- Op.getOperand(0));
- // Generate SUBS & CSEL.
- SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT),
- Op.getOperand(0), DAG.getConstant(0, DL, VT));
- return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
+ SDValue Val = Op.getOperand(0);
+ SDValue Neg = DAG.getNegative(Val, DL, VT);
+ SDValue Cmp;
+
+ // For abs(sub(lhs, rhs)), we can compare lhs and rhs directly. This allows
+ // reusing the subs operation for the calculation and comparison.
+ if (Val.getOpcode() == ISD::SUB)
+ Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT),
+ Val.getOperand(0), Val.getOperand(1));
+ else
+ // Otherwise, compare with zero.
+ Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT), Val,
+ DAG.getConstant(0, DL, VT));
+
+ return DAG.getNode(AArch64ISD::CSEL, DL, VT, Val, Neg,
DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
Cmp.getValue(1));
}
@@ -20835,11 +20844,17 @@ static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) {
if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
return SDValue();
- SDValue N0N = getNegatedInteger(N0, DAG);
- SDValue N1N = getNegatedInteger(N1, DAG);
-
SDLoc DL(N);
EVT VT = CSel.getValueType();
+
+ // If the operands are negations of each other, reverse them.
+ if ((isNegatedInteger(N0) && N0.getOperand(1) == N1) ||
+ (isNegatedInteger(N1) && N1.getOperand(1) == N0))
+ return DAG.getNode(AArch64ISD::CSEL, DL, VT, N1, N0, CSel.getOperand(2),
+ CSel.getOperand(3));
+
+ SDValue N0N = getNegatedInteger(N0, DAG);
+ SDValue N1N = getNegatedInteger(N1, DAG);
return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
CSel.getOperand(3));
}
diff --git a/llvm/test/CodeGen/AArch64/abds-neg.ll b/llvm/test/CodeGen/AArch64/abds-neg.ll
index 432ffc30eec5e..a7c27894e6125 100644
--- a/llvm/test/CodeGen/AArch64/abds-neg.ll
+++ b/llvm/test/CodeGen/AArch64/abds-neg.ll
@@ -8,9 +8,9 @@
define i8 @abd_ext_i8(i8 %a, i8 %b) nounwind {
; CHECK-LABEL: abd_ext_i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: sxtb w8, w0
-; CHECK-NEXT: sub w8, w8, w1, sxtb
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: sxtb w8, w1
+; CHECK-NEXT: sxtb w9, w0
+; CHECK-NEXT: subs w8, w9, w8
; CHECK-NEXT: cneg w0, w8, pl
; CHECK-NEXT: ret
%aext = sext i8 %a to i64
@@ -25,9 +25,9 @@ define i8 @abd_ext_i8(i8 %a, i8 %b) nounwind {
define i8 @abd_ext_i8_i16(i8 %a, i16 %b) nounwind {
; CHECK-LABEL: abd_ext_i8_i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sxtb w8, w0
-; CHECK-NEXT: sub w8, w8, w1, sxth
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: sxth w8, w1
+; CHECK-NEXT: sxtb w9, w0
+; CHECK-NEXT: subs w8, w9, w8
; CHECK-NEXT: cneg w0, w8, pl
; CHECK-NEXT: ret
%aext = sext i8 %a to i64
@@ -42,9 +42,9 @@ define i8 @abd_ext_i8_i16(i8 %a, i16 %b) nounwind {
define i8 @abd_ext_i8_undef(i8 %a, i8 %b) nounwind {
; CHECK-LABEL: abd_ext_i8_undef:
; CHECK: // %bb.0:
-; CHECK-NEXT: sxtb w8, w0
-; CHECK-NEXT: sub w8, w8, w1, sxtb
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: sxtb w8, w1
+; CHECK-NEXT: sxtb w9, w0
+; CHECK-NEXT: subs w8, w9, w8
; CHECK-NEXT: cneg w0, w8, pl
; CHECK-NEXT: ret
%aext = sext i8 %a to i64
@@ -59,9 +59,9 @@ define i8 @abd_ext_i8_undef(i8 %a, i8 %b) nounwind {
define i16 @abd_ext_i16(i16 %a, i16 %b) nounwind {
; CHECK-LABEL: abd_ext_i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sxth w8, w0
-; CHECK-NEXT: sub w8, w8, w1, sxth
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: sxth w8, w1
+; CHECK-NEXT: sxth w9, w0
+; CHECK-NEXT: subs w8, w9, w8
; CHECK-NEXT: cneg w0, w8, pl
; CHECK-NEXT: ret
%aext = sext i16 %a to i64
@@ -94,9 +94,9 @@ define i16 @abd_ext_i16_i32(i16 %a, i32 %b) nounwind {
define i16 @abd_ext_i16_undef(i16 %a, i16 %b) nounwind {
; CHECK-LABEL: abd_ext_i16_undef:
; CHECK: // %bb.0:
-; CHECK-NEXT: sxth w8, w0
-; CHECK-NEXT: sub w8, w8, w1, sxth
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: sxth w8, w1
+; CHECK-NEXT: sxth w9, w0
+; CHECK-NEXT: subs w8, w9, w8
; CHECK-NEXT: cneg w0, w8, pl
; CHECK-NEXT: ret
%aext = sext i16 %a to i64
diff --git a/llvm/test/CodeGen/AArch64/abds.ll b/llvm/test/CodeGen/AArch64/abds.ll
index ed1e6077948ee..3e1d8522aa9d8 100644
--- a/llvm/test/CodeGen/AArch64/abds.ll
+++ b/llvm/test/CodeGen/AArch64/abds.ll
@@ -8,9 +8,9 @@
define i8 @abd_ext_i8(i8 %a, i8 %b) nounwind {
; CHECK-LABEL: abd_ext_i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: sxtb w8, w0
-; CHECK-NEXT: sub w8, w8, w1, sxtb
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: sxtb w8, w1
+; CHECK-NEXT: sxtb w9, w0
+; CHECK-NEXT: subs w8, w9, w8
; CHECK-NEXT: cneg w0, w8, mi
; CHECK-NEXT: ret
%aext = sext i8 %a to i64
@@ -24,9 +24,9 @@ define i8 @abd_ext_i8(i8 %a, i8 %b) nounwind {
define i8 @abd_ext_i8_i16(i8 %a, i16 %b) nounwind {
; CHECK-LABEL: abd_ext_i8_i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sxtb w8, w0
-; CHECK-NEXT: sub w8, w8, w1, sxth
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: sxth w8, w1
+; CHECK-NEXT: sxtb w9, w0
+; CHECK-NEXT: subs w8, w9, w8
; CHECK-NEXT: cneg w0, w8, mi
; CHECK-NEXT: ret
%aext = sext i8 %a to i64
@@ -40,9 +40,9 @@ define i8 @abd_ext_i8_i16(i8 %a, i16 %b) nounwind {
define i8 @abd_ext_i8_undef(i8 %a, i8 %b) nounwind {
; CHECK-LABEL: abd_ext_i8_undef:
; CHECK: // %bb.0:
-; CHECK-NEXT: sxtb w8, w0
-; CHECK-NEXT: sub w8, w8, w1, sxtb
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: sxtb w8, w1
+; CHECK-NEXT: sxtb w9, w0
+; CHECK-NEXT: subs w8, w9, w8
; CHECK-NEXT: cneg w0, w8, mi
; CHECK-NEXT: ret
%aext = sext i8 %a to i64
@@ -56,9 +56,9 @@ define i8 @abd_ext_i8_undef(i8 %a, i8 %b) nounwind {
define i16 @abd_ext_i16(i16 %a, i16 %b) nounwind {
; CHECK-LABEL: abd_ext_i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sxth w8, w0
-; CHECK-NEXT: sub w8, w8, w1, sxth
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: sxth w8, w1
+; CHECK-NEXT: sxth w9, w0
+; CHECK-NEXT: subs w8, w9, w8
; CHECK-NEXT: cneg w0, w8, mi
; CHECK-NEXT: ret
%aext = sext i16 %a to i64
@@ -88,9 +88,9 @@ define i16 @abd_ext_i16_i32(i16 %a, i32 %b) nounwind {
define i16 @abd_ext_i16_undef(i16 %a, i16 %b) nounwind {
; CHECK-LABEL: abd_ext_i16_undef:
; CHECK: // %bb.0:
-; CHECK-NEXT: sxth w8, w0
-; CHECK-NEXT: sub w8, w8, w1, sxth
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: sxth w8, w1
+; CHECK-NEXT: sxth w9, w0
+; CHECK-NEXT: subs w8, w9, w8
; CHECK-NEXT: cneg w0, w8, mi
; CHECK-NEXT: ret
%aext = sext i16 %a to i64
@@ -220,9 +220,9 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
define i8 @abd_minmax_i8(i8 %a, i8 %b) nounwind {
; CHECK-LABEL: abd_minmax_i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: sxtb w8, w0
-; CHECK-NEXT: sub w8, w8, w1, sxtb
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: sxtb w8, w1
+; CHECK-NEXT: sxtb w9, w0
+; CHECK-NEXT: subs w8, w9, w8
; CHECK-NEXT: cneg w0, w8, mi
; CHECK-NEXT: ret
%min = call i8 @llvm.smin.i8(i8 %a, i8 %b)
@@ -234,9 +234,9 @@ define i8 @abd_minmax_i8(i8 %a, i8 %b) nounwind {
define i16 @abd_minmax_i16(i16 %a, i16 %b) nounwind {
; CHECK-LABEL: abd_minmax_i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sxth w8, w0
-; CHECK-NEXT: sub w8, w8, w1, sxth
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: sxth w8, w1
+; CHECK-NEXT: sxth w9, w0
+; CHECK-NEXT: subs w8, w9, w8
; CHECK-NEXT: cneg w0, w8, mi
; CHECK-NEXT: ret
%min = call i16 @llvm.smin.i16(i16 %a, i16 %b)
@@ -294,9 +294,9 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
define i8 @abd_cmp_i8(i8 %a, i8 %b) nounwind {
; CHECK-LABEL: abd_cmp_i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: sxtb w8, w0
-; CHECK-NEXT: sub w8, w8, w1, sxtb
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: sxtb w8, w1
+; CHECK-NEXT: sxtb w9, w0
+; CHECK-NEXT: subs w8, w9, w8
; CHECK-NEXT: cneg w0, w8, mi
; CHECK-NEXT: ret
%cmp = icmp sgt i8 %a, %b
@@ -309,9 +309,9 @@ define i8 @abd_cmp_i8(i8 %a, i8 %b) nounwind {
define i16 @abd_cmp_i16(i16 %a, i16 %b) nounwind {
; CHECK-LABEL: abd_cmp_i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sxth w8, w0
-; CHECK-NEXT: sub w8, w8, w1, sxth
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: sxth w8, w1
+; CHECK-NEXT: sxth w9, w0
+; CHECK-NEXT: subs w8, w9, w8
; CHECK-NEXT: cneg w0, w8, mi
; CHECK-NEXT: ret
%cmp = icmp sge i16 %a, %b
@@ -517,11 +517,11 @@ define i64 @vector_legalized(i16 %a, i16 %b) {
; CHECK-LABEL: vector_legalized:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: sxth w8, w0
-; CHECK-NEXT: sub w8, w8, w1, sxth
-; CHECK-NEXT: addp d0, v0.2d
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: sxth w8, w1
+; CHECK-NEXT: sxth w9, w0
+; CHECK-NEXT: subs w8, w9, w8
; CHECK-NEXT: cneg w8, w8, mi
+; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x9, d0
; CHECK-NEXT: add x0, x9, x8
; CHECK-NEXT: ret
@@ -542,9 +542,9 @@ define i64 @vector_legalized(i16 %a, i16 %b) {
define i8 @abd_select_i8(i8 %a, i8 %b) nounwind {
; CHECK-LABEL: abd_select_i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: sxtb w8, w0
-; CHECK-NEXT: sub w8, w8, w1, sxtb
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: sxtb w8, w1
+; CHECK-NEXT: sxtb w9, w0
+; CHECK-NEXT: subs w8, w9, w8
; CHECK-NEXT: cneg w0, w8, mi
; CHECK-NEXT: ret
%cmp = icmp slt i8 %a, %b
@@ -557,9 +557,9 @@ define i8 @abd_select_i8(i8 %a, i8 %b) nounwind {
define i16 @abd_select_i16(i16 %a, i16 %b) nounwind {
; CHECK-LABEL: abd_select_i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sxth w8, w0
-; CHECK-NEXT: sub w8, w8, w1, sxth
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: sxth w8, w1
+; CHECK-NEXT: sxth w9, w0
+; CHECK-NEXT: subs w8, w9, w8
; CHECK-NEXT: cneg w0, w8, mi
; CHECK-NEXT: ret
%cmp = icmp sle i16 %a, %b
diff --git a/llvm/test/CodeGen/AArch64/abdu-neg.ll b/llvm/test/CodeGen/AArch64/abdu-neg.ll
index 8fb106e92866e..a83323650330b 100644
--- a/llvm/test/CodeGen/AArch64/abdu-neg.ll
+++ b/llvm/test/CodeGen/AArch64/abdu-neg.ll
@@ -8,9 +8,9 @@
define i8 @abd_ext_i8(i8 %a, i8 %b) nounwind {
; CHECK-LABEL: abd_ext_i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: and w8, w0, #0xff
-; CHECK-NEXT: sub w8, w8, w1, uxtb
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: and w8, w1, #0xff
+; CHECK-NEXT: and w9, w0, #0xff
+; CHECK-NEXT: subs w8, w9, w8
; CHECK-NEXT: cneg w0, w8, pl
; CHECK-NEXT: ret
%aext = zext i8 %a to i64
@@ -25,9 +25,9 @@ define i8 @abd_ext_i8(i8 %a, i8 %b) nounwind {
define i8 @abd_ext_i8_i16(i8 %a, i16 %b) nounwind {
; CHECK-LABEL: abd_ext_i8_i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: and w8, w0, #0xff
-; CHECK-NEXT: sub w8, w8, w1, uxth
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: and w8, w1, #0xffff
+; CHECK-NEXT: and w9, w0, #0xff
+; CHECK-NEXT: subs w8, w9, w8
; CHECK-NEXT: cneg w0, w8, pl
; CHECK-NEXT: ret
%aext = zext i8 %a to i64
@@ -42,9 +42,9 @@ define i8 @abd_ext_i8_i16(i8 %a, i16 %b) nounwind {
define i8 @abd_ext_i8_undef(i8 %a, i8 %b) nounwind {
; CHECK-LABEL: abd_ext_i8_undef:
; CHECK: // %bb.0:
-; CHECK-NEXT: and w8, w0, #0xff
-; CHECK-NEXT: sub w8, w8, w1, uxtb
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: and w8, w1, #0xff
+; CHECK-NEXT: and w9, w0, #0xff
+; CHECK-NEXT: subs w8, w9, w8
; CHECK-NEXT: cneg w0, w8, pl
; CHECK-NEXT: ret
%aext = zext i8 %a to i64
@@ -59,9 +59,9 @@ define i8 @abd_ext_i8_undef(i8 %a, i8 %b) nounwind {
define i16 @abd_ext_i16(i16 %a, i16 %b) nounwind {
; CHECK-LABEL: abd_ext_i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: and w8, w0, #0xffff
-; CHECK-NEXT: sub w8, w8, w1, uxth
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: and w8, w1, #0xffff
+; CHECK-NEXT: and w9, w0, #0xffff
+; CHECK-NEXT: subs w8, w9, w8
; CHECK-NEXT: cneg w0, w8, pl
; CHECK-NEXT: ret
%aext = zext i16 %a to i64
@@ -94,9 +94,9 @@ define i16 @abd_ext_i16_i32(i16 %a, i32 %b) nounwind {
define i16 @abd_ext_i16_undef(i16 %a, i16 %b) nounwind {
; CHECK-LABEL: abd_ext_i16_undef:
; CHECK: // %bb.0:
-; CHECK-NEXT: and w8, w0, #0xffff
-; CHECK-NEXT: sub w8, w8, w1, uxth
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: and w8, w1, #0xffff
+; CHECK-NEXT: and w9, w0, #0xffff
+; CHECK-NEXT: subs w8, w9, w8
; CHECK-NEXT: cneg w0, w8, pl
; CHECK-NEXT: ret
%aext = zext i16 %a to i64
diff --git a/llvm/test/CodeGen/AArch64/abdu.ll b/llvm/test/CodeGen/AArch64/abdu.ll
index 4585de96c848f..a058121236a5b 100644
--- a/llvm/test/CodeGen/AArch64/abdu.ll
+++ b/llvm/test/CodeGen/AArch64/abdu.ll
@@ -8,9 +8,9 @@
define i8 @abd_ext_i8(i8 %a, i8 %b) nounwind {
; CHECK-LABEL: abd_ext_i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: and w8, w0, #0xff
-; CHECK-NEXT: sub w8, w8, w1, uxtb
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: and w8, w1, #0xff
+; CHECK-NEXT: and w9, w0, #0xff
+; CHECK-NEXT: subs w8, w9, w8
; CHECK-NEXT: cneg w0, w8, mi
; CHECK-NEXT: ret
%aext = zext i8 %a to i64
@@ -24,9 +24,9 @@ define i8 @abd_ext_i8(i8 %a, i8 %b) nounwind {
define i8 @abd_ext_i8_i16(i8 %a, i16 %b) nounwind {
; CHECK-LABEL: abd_ext_i8_i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: and w8, w0, #0xff
-; CHECK-NEXT: sub w8, w8, w1, uxth
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: and w8, w1, #0xffff
+; CHECK-NEXT: and w9, w0, #0xff
+; CHECK-NEXT: subs w8, w9, w8
; CHECK-NEXT: cneg w0, w8, mi
; CHECK-NEXT: ret
%aext = zext i8 %a to i64
@@ -40,9 +40,9 @@ define i8 @abd_ext_i8_i16(i8 %a, i16 %b) nounwind {
define i8 @abd_ext_i8_undef(i8 %a, i8 %b) nounwind {
; CHECK-LABEL: abd_ext_i8_undef:
; CHECK: // %bb.0:
-; CHECK-NEXT: and w8, w0, #0xff
-; CHECK-NEXT: sub w8, w8, w1, uxtb
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: and w8, w1, #0xff
+; CHECK-NEXT: and w9, w0, #0xff
+; CHECK-NEXT: subs w8, w9, w8
; CHECK-NEXT: cneg w0, w8, mi
; CHECK-NEXT: ret
%aext = zext i8 %a to i64
@@ -56,9 +56,9 @@ define i8 @abd_ext_i8_undef(i8 %a, i8 %b) nounwind {
define i16 @abd_ext_i16(i16 %a, i16 %b) nounwind {
; CHECK-LABEL: abd_ext_i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: and w8, w0, #0xffff
-; CHECK-NEXT: sub w8, w8, w1, uxth
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: and w8, w1, #0xffff
+; CHECK-NEXT: and w9, w0, #0xffff
+; CHECK-NEXT: subs w8, w9, w8
; CHECK-NEXT: cneg w0, w8, mi
; CHECK-NEXT: ret
%aext = zext i16 %a to i64
@@ -88,9 +88,9 @@ define i16 @abd_ext_i16_i32(i16 %a, i32 %b) nounwind {
define i16 @abd_ext_i16_undef(i16 %a, i16 %b) nounwind {
; CHECK-LABEL: abd_ext_i16_undef:
; CHECK: // %bb.0:
-; CHECK-NEXT: and w8, w0, #0xffff
-; CHECK-NEXT: sub w8, w8, w1, uxth
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: and w8, w1, #0xffff
+; CHECK-NEXT: and w9, w0, #0xffff
+; CHECK-NEXT: subs w8, w9, w8
; CHECK-NEXT: cneg w0, w8, mi
; CHECK-NEXT: ret
%aext = zext i16 %a to i64
@@ -224,9 +224,9 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
define i8 @abd_minmax_i8(i8 %a, i8 %b) nounwind {
; CHECK-LABEL: abd_minmax_i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: and w8, w0, #0xff
-; CHECK-NEXT: sub w8, w8, w1, uxtb
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: and w8, w1, #0xff
+; CHECK-NEXT: and w9, w0, #0xff
+; CHECK-NEXT: subs w8, w9, w8
; CHECK-NEXT: cneg w0, w8, mi
; CHECK-NEXT: ret
%min = call i8 @llvm.umin.i8(i8 %a, i8 %b)
@@ -238,9 +238,9 @@ define i8 @abd_minmax_i8(i8 %a, i8 %b) nounwind {
define i16 @abd_minmax_i16(i16 %a, i16 %b) nounwind {
; CHECK-LABEL: abd_minmax_i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: and w8, w0, #0xffff
-; CHECK-NEXT: sub w8, w8, w1, uxth
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: and w8, w1, #0xffff
+; CHECK-NEXT: and w9, w0, #0xffff
+; CHECK-NEXT: subs w8, w9, w8
; CHECK-NEXT: cneg w0, w8, mi
; CHECK-NEXT: ret
%min = call i16 @llvm.umin.i16(i16 %a, i16 %b)
@@ -300,9 +300,9 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
define i8 @abd_cmp_i8(i8 %a, i8 %b) nounwind {
; CHECK-LABEL: abd_cmp_i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: and w8, w0, #0xff
-; CHECK-NEXT: sub w8, w8, w1, uxtb
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: and w8, w1, #0xff
+; CHECK-NEXT: and w9, w0, #0xff
+; CHECK-NEXT: subs w8, w9, w8
; CHECK-NEXT: cneg w0, w8, mi
; CHECK-NEXT: ret
%cmp = icmp ugt i8 %a, %b
@@ -315,9 +315,9 @@ define i8 @abd_cmp_i8(i8 %a, i8 %b) nounwind {
define i16 @abd_cmp_i16(i16 %a, i16 %b) nounwind {
; CHECK-LABEL: abd_cmp_i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: and w8, w0, #0xffff
-; CHECK-NEXT: sub w8, w8, w1, uxth
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: and w8, w1, #0xffff
+; CHECK-NEXT: and w9, w0, #0xffff
+; CHECK-NEXT: subs w8, w9, w8
; CHECK-NEXT: cneg w0, w8, mi
; CHECK-NEXT: ret
%cmp = icmp uge i16 %a, %b
@@ -382,11 +382,11 @@ define i64 @vector_legalized(i16 %a, i16 %b) {
; CHECK-LABEL: vector_legalized:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: and w8, w0, #0xffff
-; CHECK-NEXT: sub w8, w8, w1, uxth
-; CHECK-NEXT: cmp w8, #0
-; CHECK-NEXT: addp d0, v0.2d
+; CHECK-NEXT: and w8, w1, #0xffff
+; CHECK-NEXT: and w9, w0, #0xffff
+; CHECK-NEXT: subs w8, w9, w8
; CHECK-NEXT: cneg w8, w8, mi
+; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x9, d0
; CHECK-NEXT: add x0, x9, x8
; CHECK-NEXT: ret
@@ -407,9 +407,9 @@ define i64 @vector_legalized(i16 %a, i16 %b) {
define i8 @abd_select_i8(i8 %a, i8 %b) nounwind {
; CHECK-LABEL: abd_select_i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: and w8, w0, #0xff
-; CHECK-NEXT: sub w8, w8, w1, uxtb
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: and w8, w1, #0xff
+; CHECK-NEXT: and w9, w0, #0xff
+; CHECK-NEXT: subs w8, w9, w8
; CHECK-NEXT: cneg w0, w8, mi
; CHECK-NEXT: ret
%cmp = icmp ult i8 %a, %b
@@ -422,9 +422,9 @@ define i8 @abd_select_i8(i8 %a, i8 %b) nounwind {
define i16 @abd_select_i16(i16 %a, i16 %b) nounwind {
; CHECK-LABEL: abd_select_i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: and w8, w0, #0xffff
-; CHECK-NEXT: sub w8, w8, w1, uxth
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: and w8, w1, #0xffff
+; CHECK-NEXT: and w9, w0, #0xffff
+; CHECK-NEXT: subs w8, w9, w8
; CHECK-NEXT: cneg w0, w8, mi
; CHECK-NEXT: ret
%cmp = icmp ule i16 %a, %b
>From ce097ff69a2383d0db47526c2cb7d90144a8c4ad Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Wed, 30 Jul 2025 08:48:23 -0700
Subject: [PATCH 2/2] Combine subs(sub(a,b), 0) -> subs(a,b) in
performCSELCombine.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 39 ++++++++++++-------
1 file changed, 24 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8b515fb649ee7..8ea16bb1004a6 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7118,21 +7118,12 @@ SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
SDLoc DL(Op);
- SDValue Val = Op.getOperand(0);
- SDValue Neg = DAG.getNegative(Val, DL, VT);
- SDValue Cmp;
-
- // For abs(sub(lhs, rhs)), we can compare lhs and rhs directly. This allows
- // reusing the subs operation for the calculation and comparison.
- if (Val.getOpcode() == ISD::SUB)
- Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT),
- Val.getOperand(0), Val.getOperand(1));
- else
- // Otherwise, compare with zero.
- Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT), Val,
- DAG.getConstant(0, DL, VT));
-
- return DAG.getNode(AArch64ISD::CSEL, DL, VT, Val, Neg,
+ SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
+ Op.getOperand(0));
+ // Generate SUBS & CSEL.
+ SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT),
+ Op.getOperand(0), DAG.getConstant(0, DL, VT));
+ return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
Cmp.getValue(1));
}
@@ -25399,6 +25390,24 @@ static SDValue performCSELCombine(SDNode *N,
}
}
+ // CSEL a, b, cc, SUBS(SUB(x,y), 0) -> CSEL a, b, cc, SUBS(x,y) if cc doesn't
+ // use overflow flags to avoid the comparison with zero.
+ if (Cond.getOpcode() == AArch64ISD::SUBS &&
+ isNullConstant(Cond.getOperand(1))) {
+ SDValue Sub = Cond.getOperand(0);
+ AArch64CC::CondCode CC =
+ static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
+ if (Sub.getOpcode() == ISD::SUB &&
+ (CC == AArch64CC::EQ || CC == AArch64CC::NE || CC == AArch64CC::MI ||
+ CC == AArch64CC::PL)) {
+ SDLoc DL(N);
+ SDValue Subs = DAG.getNode(AArch64ISD::SUBS, DL, Cond->getVTList(),
+ Sub.getOperand(0), Sub.getOperand(1));
+ return DAG.getNode(AArch64ISD::CSEL, DL, N->getVTList(), N->getOperand(0),
+ N->getOperand(1), N->getOperand(2), Subs.getValue(1));
+ }
+ }
+
// CSEL (LASTB P, Z), X, NE(ANY P) -> CLASTB P, X, Z
if (SDValue CondLast = foldCSELofLASTB(N, DAG))
return CondLast;
More information about the llvm-commits
mailing list