[llvm] e981e6d - Add transform for `(and/or (icmp eq/ne A,-1),(icmp eq/ne A,-1+C))`->`(and/or (icmp eq/ne (and ~A,-1+C),0))`
Noah Goldstein via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 24 13:22:36 PST 2023
Author: Noah Goldstein
Date: 2023-02-24T15:22:09-06:00
New Revision: e981e6d10ed323b8cb9ea1f82723bf3009f0bfc3
URL: https://github.com/llvm/llvm-project/commit/e981e6d10ed323b8cb9ea1f82723bf3009f0bfc3
DIFF: https://github.com/llvm/llvm-project/commit/e981e6d10ed323b8cb9ea1f82723bf3009f0bfc3.diff
LOG: Add transform for `(and/or (icmp eq/ne A,-1),(icmp eq/ne A,-1+C))`->`(and/or (icmp eq/ne (and ~A,-1+C),0))`
This works if `C` is a negative power of 2 (so `-1 - (-1+C)` is a power of 2).
This can be more useful than the `AddAnd` case because `~A` does not
necessarily require materializing a constant, which makes the transform
worthwhile for X86 vector types.
Alive2 Links:
EQ: https://alive2.llvm.org/ce/z/P6u8cq
NE: https://alive2.llvm.org/ce/z/_Kkqp1
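A quick scalar sanity check of the same equivalence (illustrative only, using
C = -8 so that -1+C = -9; the Alive2 links above are the actual proofs):

#include <cassert>
#include <cstdint>

// (A == -1) | (A == -9)  <=>  ((~A & 0xF7) == 0), shown at 8 bits.
// ~A must be a subset of ~0xF7 = 0x08, i.e. ~A is 0 or 8, i.e. A is -1 or -9.
static bool orig(uint8_t A) { return A == 0xFF || A == 0xF7; }
static bool fold(uint8_t A) { return (~A & 0xF7) == 0; }

int main() {
  for (unsigned A = 0; A <= 0xFF; ++A)
    assert(orig(uint8_t(A)) == fold(uint8_t(A)));
}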
Reviewed By: RKSimon
Differential Revision: https://reviews.llvm.org/D144284
Added:
Modified:
llvm/include/llvm/CodeGen/TargetLowering.h
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/icmp-pow2-diff.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 2afb05a8c14c3..f0e250c9fcbce 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -284,10 +284,11 @@ class TargetLoweringBase {
/// Enum of different potentially desirable ways to fold (and/or (setcc ...),
/// (setcc ...)).
- enum class AndOrSETCCFoldKind {
- None,
- AddAnd,
- ABS,
+ enum AndOrSETCCFoldKind : uint8_t {
+ None = 0, // No fold is preferable.
+ AddAnd = 1, // Fold with `Add` op and `And` op is preferable.
+ NotAnd = 2, // Fold with `Not` op and `And` op is preferable.
+ ABS = 4, // Fold with `llvm.abs` op is preferable.
};
class ArgListEntry {
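With the enum now a plain bitmask, a target can advertise more than one
acceptable fold and callers can test membership with `&`. A rough sketch of
the intended usage (illustrative only; the real hooks are in the DAGCombiner
and X86 changes below):

#include <cstdint>

enum AndOrSETCCFoldKind : uint8_t {
  None = 0,
  AddAnd = 1,
  NotAnd = 2,
  ABS = 4,
};

// A target may now report several acceptable folds at once...
AndOrSETCCFoldKind getPreference(bool AbsIsLegal) {
  return AndOrSETCCFoldKind(NotAnd | (AbsIsLegal ? ABS : None));
}

// ...and the combiner picks one by testing individual bits.
bool prefersNotAnd(AndOrSETCCFoldKind Pref) {
  return (Pref & NotAnd) != 0;
}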
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index f610e1c96a273..63a3061cb1b77 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5926,7 +5926,7 @@ static SDValue foldAndOrOfSETCC(SDNode *LogicOp, SelectionDAG &DAG) {
// Preference is to use ISD::ABS or we already have an ISD::ABS (in which
// case this is just a compare).
if (APLhs == (-APRhs) &&
- (TargetPreference == AndOrSETCCFoldKind::ABS ||
+ ((TargetPreference & AndOrSETCCFoldKind::ABS) ||
DAG.doesNodeExist(ISD::ABS, DAG.getVTList(OpVT), {LHS0}))) {
const APInt &C = APLhs.isNegative() ? APRhs : APLhs;
// (icmp eq A, C) | (icmp eq A, -C)
@@ -5936,23 +5936,45 @@ static SDValue foldAndOrOfSETCC(SDNode *LogicOp, SelectionDAG &DAG) {
SDValue AbsOp = DAG.getNode(ISD::ABS, DL, OpVT, LHS0);
return DAG.getNode(ISD::SETCC, DL, VT, AbsOp,
DAG.getConstant(C, DL, OpVT), LHS.getOperand(2));
- } else if (TargetPreference == AndOrSETCCFoldKind::AddAnd) {
+ } else if (TargetPreference &
+ (AndOrSETCCFoldKind::AddAnd | AndOrSETCCFoldKind::NotAnd)) {
+
+ // AndOrSETCCFoldKind::AddAnd:
// A == C0 | A == C1
// IF IsPow2(smax(C0, C1)-smin(C0, C1))
// -> ((A - smin(C0, C1)) & ~(smax(C0, C1)-smin(C0, C1))) == 0
// A != C0 & A != C1
// IF IsPow2(smax(C0, C1)-smin(C0, C1))
// -> ((A - smin(C0, C1)) & ~(smax(C0, C1)-smin(C0, C1))) != 0
+
+ // AndOrSETCCFoldKind::NotAnd:
+ // A == C0 | A == C1
+ // IF smax(C0, C1) == -1 AND IsPow2(smax(C0, C1) - smin(C0, C1))
+ // -> ~A & smin(C0, C1) == 0
+ // A != C0 & A != C1
+ // IF smax(C0, C1) == -1 AND IsPow2(smax(C0, C1) - smin(C0, C1))
+ // -> ~A & smin(C0, C1) != 0
+
const APInt &MaxC = APIntOps::smax(APRhs, APLhs);
const APInt &MinC = APIntOps::smin(APRhs, APLhs);
APInt Dif = MaxC - MinC;
if (!Dif.isZero() && Dif.isPowerOf2()) {
- SDValue AddOp = DAG.getNode(ISD::ADD, DL, OpVT, LHS0,
- DAG.getConstant(-MinC, DL, OpVT));
- SDValue AndOp = DAG.getNode(ISD::AND, DL, OpVT, AddOp,
- DAG.getConstant(~Dif, DL, OpVT));
- return DAG.getNode(ISD::SETCC, DL, VT, AndOp,
- DAG.getConstant(0, DL, OpVT), LHS.getOperand(2));
+ if (MaxC.isAllOnes() &&
+ (TargetPreference & AndOrSETCCFoldKind::NotAnd)) {
+ SDValue NotOp = DAG.getNOT(DL, LHS0, OpVT);
+ SDValue AndOp = DAG.getNode(ISD::AND, DL, OpVT, NotOp,
+ DAG.getConstant(MinC, DL, OpVT));
+ return DAG.getNode(ISD::SETCC, DL, VT, AndOp,
+ DAG.getConstant(0, DL, OpVT), LHS.getOperand(2));
+ } else if (TargetPreference & AndOrSETCCFoldKind::AddAnd) {
+
+ SDValue AddOp = DAG.getNode(ISD::ADD, DL, OpVT, LHS0,
+ DAG.getConstant(-MinC, DL, OpVT));
+ SDValue AndOp = DAG.getNode(ISD::AND, DL, OpVT, AddOp,
+ DAG.getConstant(~Dif, DL, OpVT));
+ return DAG.getNode(ISD::SETCC, DL, VT, AndOp,
+ DAG.getConstant(0, DL, OpVT), LHS.getOperand(2));
+ }
}
}
}
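The pre-existing AddAnd form documented in the comment block above can be
spot-checked the same way as the NotAnd sketch earlier (illustrative constants
only: C0 = 3, C1 = 11, so smin = 3 and the difference 8 is a power of 2):

#include <cassert>
#include <cstdint>

// (A == 3) | (A == 11)  <=>  (((A - 3) & ~8) == 0), shown at 8 bits.
static bool orig(uint8_t A) { return A == 3 || A == 11; }
static bool fold(uint8_t A) { return uint8_t((A - 3) & ~8) == 0; }

int main() {
  for (unsigned A = 0; A <= 0xFF; ++A)
    assert(orig(uint8_t(A)) == fold(uint8_t(A)));
}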
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 49b79e1dc67a0..53a276e7b8d07 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -57260,9 +57260,18 @@ X86TargetLowering::isDesirableToCombineLogicOpOfSETCC(
EVT OpVT = SETCC0->getOperand(0).getValueType();
if (!VT.isInteger())
return AndOrSETCCFoldKind::None;
+
if (VT.isVector())
- return isOperationLegal(ISD::ABS, OpVT) ? AndOrSETCCFoldKind::ABS
- : AndOrSETCCFoldKind::None;
+ return AndOrSETCCFoldKind(AndOrSETCCFoldKind::NotAnd |
+ (isOperationLegal(ISD::ABS, OpVT)
+ ? AndOrSETCCFoldKind::ABS
+ : AndOrSETCCFoldKind::None));
+
+ // Don't use `NotAnd`: even though `not` generally has smaller code size than
+ // `add`, `add` can lower to LEA, which can save moves / spills. In any case
+ // where `NotAnd` applies, `AddAnd` does as well.
+ // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`;
+ // if we change that to `andn Y, X` it may be worth preferring `NotAnd` here.
return AndOrSETCCFoldKind::AddAnd;
}
diff --git a/llvm/test/CodeGen/X86/icmp-pow2-diff.ll b/llvm/test/CodeGen/X86/icmp-pow2-diff.ll
index 5c7c3e9cff5b1..d5ad852537be3 100644
--- a/llvm/test/CodeGen/X86/icmp-pow2-diff.ll
+++ b/llvm/test/CodeGen/X86/icmp-pow2-diff.ll
@@ -39,28 +39,24 @@ define <4 x i1> @andnot_eq_v4i32_todo_no_splat(<4 x i32> %x) nounwind {
define <4 x i1> @andnot_eq_v4i32(<4 x i32> %x) nounwind {
; AVX512-LABEL: andnot_eq_v4i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
-; AVX512-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %k1
-; AVX512-NEXT: korw %k1, %k0, %k1
-; AVX512-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z}
+; AVX512-NEXT: vpandnd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX2-LABEL: andnot_eq_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4294967287,4294967287,4294967287,4294967287]
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967287,4294967287,4294967287,4294967287]
+; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; SSE-LABEL: andnot_eq_v4i32:
; SSE: # %bb.0:
-; SSE-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: pcmpeqd %xmm1, %xmm0
; SSE-NEXT: retq
%cmp1 = icmp eq <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
%cmp2 = icmp eq <4 x i32> %x, <i32 -9, i32 -9, i32 -9, i32 -9>
@@ -115,41 +111,32 @@ define <2 x i1> @andnot_eq_v2i64_fail_max_not_n1(<2 x i64> %x) nounwind {
define <2 x i1> @andnot_eq_v2i64(<2 x i64> %x) nounwind {
; AVX512-LABEL: andnot_eq_v2i64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k0
-; AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpcmpeqq %xmm1, %xmm0, %k1
-; AVX512-NEXT: korw %k1, %k0, %k1
-; AVX512-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; AVX512-NEXT: vpandnq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX2-LABEL: andnot_eq_v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; SSE41-LABEL: andnot_eq_v2i64:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551611,18446744073709551611]
-; SSE41-NEXT: pcmpeqq %xmm0, %xmm1
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT: pcmpeqq %xmm2, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pcmpeqq %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE2-LABEL: andnot_eq_v2i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551611,18446744073709551611]
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2]
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
%cmp1 = icmp eq <2 x i64> %x, <i64 -5, i64 -5>
%cmp2 = icmp eq <2 x i64> %x, <i64 -1, i64 -1>
@@ -195,30 +182,28 @@ define <8 x i1> @andnot_ne_v8i16_todo_no_splat(<8 x i16> %x) nounwind {
define <8 x i1> @andnot_ne_v8i16(<8 x i16> %x) nounwind {
; AVX512-LABEL: andnot_ne_v8i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpternlogq $18, %xmm2, %xmm1, %xmm0
+; AVX512-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX2-LABEL: andnot_ne_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpandn %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; SSE-LABEL: andnot_ne_v8i16:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [49151,49151,49151,49151,49151,49151,49151,49151]
-; SSE-NEXT: pcmpeqw %xmm0, %xmm1
-; SSE-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: pcmpeqw %xmm2, %xmm0
-; SSE-NEXT: pxor %xmm2, %xmm0
-; SSE-NEXT: pandn %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: retq
%cmp1 = icmp ne <8 x i16> %x, <i16 -16385, i16 -16385, i16 -16385, i16 -16385, i16 -16385, i16 -16385, i16 -16385, i16 -16385>
%cmp2 = icmp ne <8 x i16> %x, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
@@ -264,30 +249,28 @@ define <16 x i1> @andnot_ne_v16i8_fail_max_not_n1(<16 x i8> %x) nounwind {
define <16 x i1> @andnot_ne_v16i8(<16 x i8> %x) nounwind {
; AVX512-LABEL: andnot_ne_v16i8:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm2
-; AVX512-NEXT: vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vpternlogq $18, %xmm1, %xmm2, %xmm0
+; AVX512-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX2-LABEL: andnot_ne_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpandn %xmm0, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; SSE-LABEL: andnot_ne_v16i8:
; SSE: # %bb.0:
-; SSE-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: pcmpeqb %xmm2, %xmm1
-; SSE-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT: pxor %xmm2, %xmm0
-; SSE-NEXT: pandn %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: pxor %xmm2, %xmm2
+; SSE-NEXT: pcmpeqb %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: retq
%cmp1 = icmp ne <16 x i8> %x, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
%cmp2 = icmp ne <16 x i8> %x, <i8 -33, i8 -33, i8 -33, i8 -33, i8 -33, i8 -33, i8 -33, i8 -33, i8 -33, i8 -33, i8 -33, i8 -33, i8 -33, i8 -33, i8 -33, i8 -33>