[llvm] 6aab826 - [DAGCombiner] add fold (xor (smin(x, C), C)) and fold (xor (smax(x, C), C)) (#155141)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 16 08:31:02 PDT 2025
Author: guan jian
Date: 2025-09-16T15:30:57Z
New Revision: 6aab826e2334fe454c5e2349d871f16d7d011a4c
URL: https://github.com/llvm/llvm-project/commit/6aab826e2334fe454c5e2349d871f16d7d011a4c
DIFF: https://github.com/llvm/llvm-project/commit/6aab826e2334fe454c5e2349d871f16d7d011a4c.diff
LOG: [DAGCombiner] add fold (xor (smin(x, C), C)) and fold (xor (smax(x, C), C)) (#155141)
Hi, I compared the code generated by GCC and Clang for the computation expressed by the following LLVM IR, and there is a small difference between the two. The LLVM IR is:
```
define i64 @test_smin_neg_one(i64 %a) {
%1 = tail call i64 @llvm.smin.i64(i64 %a, i64 -1)
%retval.0 = xor i64 %1, -1
ret i64 %retval.0
}
```
GCC generates:
```
cmp x0, 0
csinv x0, xzr, x0, ge
ret
```
Clang generates:
```
cmn x0, #1
csinv x8, x0, xzr, lt
mvn x0, x8
ret
```
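Clang needs one more instruction than GCC: it materializes the smin result in x8 with `csinv` and then inverts it with a separate `mvn`, whereas GCC folds the inversion into a single `csinv`.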
So I added the following folds to DAGCombiner:
fold (xor (smax(x, C), C)) -> select (x > C), xor(x, C), 0
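fold (xor (smin(x, C), C)) -> select (x < C), xor(x, C), 0
The patch also applies the same fold to umin and umax, using the unsigned comparisons (x u< C and x u> C respectively).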
alive2: https://alive2.llvm.org/ce/z/gffoir
---------
Co-authored-by: Yui5427 <785369607 at qq.com>
Co-authored-by: Matt Arsenault <arsenm2 at gmail.com>
Co-authored-by: Simon Pilgrim <llvm-dev at redking.me.uk>
Added:
llvm/test/CodeGen/AArch64/xor-min-max.ll
Modified:
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index d130efe96b56b..4b20b756f8a15 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -10092,6 +10092,55 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
if (SDValue Combined = combineCarryDiamond(DAG, TLI, N0, N1, N))
return Combined;
+ // fold (xor (smin(x, C), C)) -> select (x < C), xor(x, C), 0
+ // fold (xor (smax(x, C), C)) -> select (x > C), xor(x, C), 0
+ // fold (xor (umin(x, C), C)) -> select (x < C), xor(x, C), 0
+ // fold (xor (umax(x, C), C)) -> select (x > C), xor(x, C), 0
+ SDValue Op0;
+ if (sd_match(N0, m_OneUse(m_AnyOf(m_SMin(m_Value(Op0), m_Specific(N1)),
+ m_SMax(m_Value(Op0), m_Specific(N1)),
+ m_UMin(m_Value(Op0), m_Specific(N1)),
+ m_UMax(m_Value(Op0), m_Specific(N1)))))) {
+
+ if (isa<ConstantSDNode>(N1) ||
+ ISD::isBuildVectorOfConstantSDNodes(N1.getNode())) {
+ // For vectors, only optimize when the constant is zero or all-ones to
+ // avoid generating more instructions
+ if (VT.isVector()) {
+ ConstantSDNode *N1C = isConstOrConstSplat(N1);
+ if (!N1C || (!N1C->isZero() && !N1C->isAllOnes()))
+ return SDValue();
+ }
+
+ // Avoid the fold if the minmax operation is legal and select is expensive
+ if (TLI.isOperationLegal(N0.getOpcode(), VT) &&
+ TLI.isPredictableSelectExpensive())
+ return SDValue();
+
+ EVT CCVT = getSetCCResultType(VT);
+ ISD::CondCode CC;
+ switch (N0.getOpcode()) {
+ case ISD::SMIN:
+ CC = ISD::SETLT;
+ break;
+ case ISD::SMAX:
+ CC = ISD::SETGT;
+ break;
+ case ISD::UMIN:
+ CC = ISD::SETULT;
+ break;
+ case ISD::UMAX:
+ CC = ISD::SETUGT;
+ break;
+ }
+ SDValue FN1 = DAG.getFreeze(N1);
+ SDValue Cmp = DAG.getSetCC(DL, CCVT, Op0, FN1, CC);
+ SDValue XorXC = DAG.getNode(ISD::XOR, DL, VT, Op0, FN1);
+ SDValue Zero = DAG.getConstant(0, DL, VT);
+ return DAG.getSelect(DL, VT, Cmp, XorXC, Zero);
+ }
+ }
+
return SDValue();
}
diff --git a/llvm/test/CodeGen/AArch64/xor-min-max.ll b/llvm/test/CodeGen/AArch64/xor-min-max.ll
new file mode 100644
index 0000000000000..2d6696e1c556e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/xor-min-max.ll
@@ -0,0 +1,196 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mcpu=cortex-a53 | FileCheck %s
+
+; Test for DAGCombiner optimization: fold (xor (smin(x, C), C)) -> select (x < C), xor (x, C), 0
+
+define i64 @test_smin_neg_one(i64 %a) {
+; CHECK-LABEL: test_smin_neg_one:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmn x0, #1
+; CHECK-NEXT: csinv x0, xzr, x0, ge
+; CHECK-NEXT: ret
+ %1 = tail call i64 @llvm.smin.i64(i64 %a, i64 -1)
+ %retval.0 = xor i64 %1, -1
+ ret i64 %retval.0
+}
+
+define i64 @test_smin_constant(i64 %a) {
+; CHECK-LABEL: test_smin_constant:
+; CHECK: // %bb.0:
+; CHECK-NEXT: eor x8, x0, #0x8
+; CHECK-NEXT: cmp x0, #8
+; CHECK-NEXT: csel x0, x8, xzr, lt
+; CHECK-NEXT: ret
+ %1 = tail call i64 @llvm.smin.i64(i64 %a, i64 8)
+ %retval.0 = xor i64 %1, 8
+ ret i64 %retval.0
+}
+
+; Test for DAGCombiner optimization: fold (xor (smax(x, C), C)) -> select (x > C), xor (x, C), 0
+define i64 @test_smax_neg_one(i64 %a) {
+; CHECK-LABEL: test_smax_neg_one:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mvn x8, x0
+; CHECK-NEXT: bic x0, x8, x0, asr #63
+; CHECK-NEXT: ret
+ %1 = tail call i64 @llvm.smax.i64(i64 %a, i64 -1)
+ %retval.0 = xor i64 %1, -1
+ ret i64 %retval.0
+}
+
+define i64 @test_smax_constant(i64 %a) {
+; CHECK-LABEL: test_smax_constant:
+; CHECK: // %bb.0:
+; CHECK-NEXT: eor x8, x0, #0x8
+; CHECK-NEXT: cmp x0, #8
+; CHECK-NEXT: csel x0, x8, xzr, gt
+; CHECK-NEXT: ret
+ %1 = tail call i64 @llvm.smax.i64(i64 %a, i64 8)
+ %retval.0 = xor i64 %1, 8
+ ret i64 %retval.0
+}
+
+define i64 @test_umin_neg_one(i64 %a) {
+; CHECK-LABEL: test_umin_neg_one:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mvn x0, x0
+; CHECK-NEXT: ret
+ %1 = tail call i64 @llvm.umin.i64(i64 %a, i64 -1)
+ %retval.0 = xor i64 %1, -1
+ ret i64 %retval.0
+}
+
+define i64 @test_umin_constant(i64 %a) {
+; CHECK-LABEL: test_umin_constant:
+; CHECK: // %bb.0:
+; CHECK-NEXT: eor x8, x0, #0x8
+; CHECK-NEXT: cmp x0, #8
+; CHECK-NEXT: csel x0, x8, xzr, lo
+; CHECK-NEXT: ret
+ %1 = tail call i64 @llvm.umin.i64(i64 %a, i64 8)
+ %retval.0 = xor i64 %1, 8
+ ret i64 %retval.0
+}
+
+define i64 @test_umax_neg_one(i64 %a) {
+; CHECK-LABEL: test_umax_neg_one:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov x0, xzr
+; CHECK-NEXT: ret
+ %1 = tail call i64 @llvm.umax.i64(i64 %a, i64 -1)
+ %retval.0 = xor i64 %1, -1
+ ret i64 %retval.0
+}
+
+define i64 @test_umax_constant(i64 %a) {
+; CHECK-LABEL: test_umax_constant:
+; CHECK: // %bb.0:
+; CHECK-NEXT: eor x8, x0, #0x8
+; CHECK-NEXT: cmp x0, #8
+; CHECK-NEXT: csel x0, x8, xzr, hi
+; CHECK-NEXT: ret
+ %1 = tail call i64 @llvm.umax.i64(i64 %a, i64 8)
+ %retval.0 = xor i64 %1, 8
+ ret i64 %retval.0
+}
+
+; Test vector cases
+define <4 x i32> @test_smin_vector_neg_one(<4 x i32> %a) {
+; CHECK-LABEL: test_smin_vector_neg_one:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v1.2d, #0xffffffffffffffff
+; CHECK-NEXT: cmgt v1.4s, v1.4s, v0.4s
+; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: ret
+ %1 = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %a, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+ %retval.0 = xor <4 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1>
+ ret <4 x i32> %retval.0
+}
+
+define <4 x i32> @test_smin_vector_constant(<4 x i32> %a) {
+; CHECK-LABEL: test_smin_vector_constant:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v1.4s, #8
+; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %1 = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %a, <4 x i32> <i32 8, i32 8, i32 8, i32 8>)
+ %retval.0 = xor <4 x i32> %1, <i32 8, i32 8, i32 8, i32 8>
+ ret <4 x i32> %retval.0
+}
+
+define <4 x i32> @test_smax_vector_neg_one(<4 x i32> %a) {
+; CHECK-LABEL: test_smax_vector_neg_one:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmge v1.4s, v0.4s, #0
+; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: ret
+ %1 = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> %a, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+ %retval.0 = xor <4 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1>
+ ret <4 x i32> %retval.0
+}
+
+define <4 x i32> @test_smax_vector_constant(<4 x i32> %a) {
+; CHECK-LABEL: test_smax_vector_constant:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v1.4s, #8
+; CHECK-NEXT: smax v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %1 = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> %a, <4 x i32> <i32 8, i32 8, i32 8, i32 8>)
+ %retval.0 = xor <4 x i32> %1, <i32 8, i32 8, i32 8, i32 8>
+ ret <4 x i32> %retval.0
+}
+
+define <4 x i32> @test_umin_vector_neg_one(<4 x i32> %a) {
+; CHECK-LABEL: test_umin_vector_neg_one:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: ret
+ %1 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %a, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+ %retval.0 = xor <4 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1>
+ ret <4 x i32> %retval.0
+}
+
+define <4 x i32> @test_umin_vector_constant(<4 x i32> %a) {
+; CHECK-LABEL: test_umin_vector_constant:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v1.4s, #8
+; CHECK-NEXT: umin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %1 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %a, <4 x i32> <i32 8, i32 8, i32 8, i32 8>)
+ %retval.0 = xor <4 x i32> %1, <i32 8, i32 8, i32 8, i32 8>
+ ret <4 x i32> %retval.0
+}
+
+define <4 x i32> @test_umax_vector_neg_one(<4 x i32> %a) {
+; CHECK-LABEL: test_umax_vector_neg_one:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v0.2d, #0000000000000000
+; CHECK-NEXT: ret
+ %1 = tail call <4 x i32> @llvm.umax.v4i32(<4 x i32> %a, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+ %retval.0 = xor <4 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1>
+ ret <4 x i32> %retval.0
+}
+
+define <4 x i32> @test_umax_vector_constant(<4 x i32> %a) {
+; CHECK-LABEL: test_umax_vector_constant:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v1.4s, #8
+; CHECK-NEXT: umax v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %1 = tail call <4 x i32> @llvm.umax.v4i32(<4 x i32> %a, <4 x i32> <i32 8, i32 8, i32 8, i32 8>)
+ %retval.0 = xor <4 x i32> %1, <i32 8, i32 8, i32 8, i32 8>
+ ret <4 x i32> %retval.0
+}
+
+declare i64 @llvm.smin.i64(i64, i64)
+declare i64 @llvm.smax.i64(i64, i64)
+declare i64 @llvm.umin.i64(i64, i64)
+declare i64 @llvm.umax.i64(i64, i64)
+declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.umin.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.umax.v4i32(<4 x i32>, <4 x i32>)
More information about the llvm-commits
mailing list