[llvm] bf82070 - [SDAG] try to avoid multiply for X*Y==0
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 6 06:07:39 PST 2023
Author: Sanjay Patel
Date: 2023-01-06T09:06:11-05:00
New Revision: bf82070ea465969e9ae86a31dfcbf94c2a7b4c4c
URL: https://github.com/llvm/llvm-project/commit/bf82070ea465969e9ae86a31dfcbf94c2a7b4c4c
DIFF: https://github.com/llvm/llvm-project/commit/bf82070ea465969e9ae86a31dfcbf94c2a7b4c4c.diff
LOG: [SDAG] try to avoid multiply for X*Y==0
Forking this off from D140850 -
https://alive2.llvm.org/ce/z/TgBeK_
https://alive2.llvm.org/ce/z/STVD7d
We could almost justify doing this in IR, but the need to honor
"minsize" means we only try it in codegen -- the transform is not
reversible, so once the multiply is gone we cannot recover it.
In all other cases, avoiding the multiply should be a win because a
mul is more expensive than simple, parallelizable compares. AArch64
even has a trick (ccmp) that keeps the instruction count unchanged
for some types.
Differential Revision: https://reviews.llvm.org/D141086
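For illustration, the fold being added is:
  (X * Y) == 0 --> (X == 0) || (Y == 0)
  (X * Y) != 0 --> (X != 0) && (Y != 0)
and it is only valid when the multiply cannot wrap. A minimal
standalone C++ sketch (not part of the patch) of the failure mode
without the no-wrap flags:

  #include <cassert>
  #include <cstdint>

  int main() {
    // 16 * 16 = 256, which wraps to 0 in 8 bits, so the product
    // compares equal to 0 even though neither factor is 0.
    uint8_t x = 16, y = 16;
    uint8_t m = static_cast<uint8_t>(x * y);    // wraps: m == 0
    bool mulForm = (m == 0);                    // true
    bool decomposedForm = (x == 0) || (y == 0); // false
    assert(mulForm && !decomposedForm);         // the forms disagree
    return 0;
  }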
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
llvm/test/CodeGen/AArch64/mul-cmp.ll
llvm/test/CodeGen/X86/mul-cmp.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index a8bcfb1932f8d..fe5a0f382b673 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -4207,6 +4207,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
SelectionDAG &DAG = DCI.DAG;
const DataLayout &Layout = DAG.getDataLayout();
EVT OpVT = N0.getValueType();
+ AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
// Constant fold or commute setcc.
if (SDValue Fold = DAG.FoldSetCC(VT, N0, N1, Cond, dl))
@@ -4251,6 +4252,23 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
if (SDValue V = simplifySetCCWithCTPOP(*this, VT, N0, C1, Cond, dl, DAG))
return V;
+ // For equality to 0 of a no-wrap multiply, decompose and test each op:
+ // X * Y == 0 --> (X == 0) || (Y == 0)
+ // X * Y != 0 --> (X != 0) && (Y != 0)
+ // TODO: This bails out if minsize is set, but if the target doesn't have a
+ // single instruction multiply for this type, it would likely be
+ // smaller to decompose.
+ if (C1.isZero() && (Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
+ N0.getOpcode() == ISD::MUL && N0.hasOneUse() &&
+ (N0->getFlags().hasNoUnsignedWrap() ||
+ N0->getFlags().hasNoSignedWrap()) &&
+ !Attr.hasFnAttr(Attribute::MinSize)) {
+ SDValue IsXZero = DAG.getSetCC(dl, VT, N0.getOperand(0), N1, Cond);
+ SDValue IsYZero = DAG.getSetCC(dl, VT, N0.getOperand(1), N1, Cond);
+ unsigned LogicOp = Cond == ISD::SETEQ ? ISD::OR : ISD::AND;
+ return DAG.getNode(LogicOp, dl, VT, IsXZero, IsYZero);
+ }
+
// If the LHS is '(srl (ctlz x), 5)', the RHS is 0/1, and this is an
// equality comparison, then we're just comparing whether X itself is
// zero.
@@ -5040,8 +5058,6 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
// Fold remainder of division by a constant.
if ((N0.getOpcode() == ISD::UREM || N0.getOpcode() == ISD::SREM) &&
N0.hasOneUse() && (Cond == ISD::SETEQ || Cond == ISD::SETNE)) {
- AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
-
// When division is cheap or optimizing for minimum size,
// fall through to DIVREM creation by skipping this fold.
if (!isIntDivCheap(VT, Attr) && !Attr.hasFnAttr(Attribute::MinSize)) {
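A quick standalone check, not part of the patch, of the LogicOp
pairing above -- SETEQ decomposes with OR and SETNE with AND because
the two forms are De Morgan complements of each other:

  #include <cassert>

  int main() {
    for (int x = 0; x <= 1; ++x)
      for (int y = 0; y <= 1; ++y) {
        bool eqForm = (x == 0) || (y == 0); // X * Y == 0 form
        bool neForm = (x != 0) && (y != 0); // X * Y != 0 form
        assert(eqForm == !neForm);          // complements everywhere
      }
    return 0;
  }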
diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
index f02d3f1eb7507..8766a0f53d084 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll
@@ -191,13 +191,13 @@ define void @typei1_orig(i64 %a, ptr %p, ptr %q) {
; CHECK-NEXT: cmp x0, #0
; CHECK-NEXT: ldr q0, [x2]
; CHECK-NEXT: cset w8, gt
-; CHECK-NEXT: neg v0.8h, v0.8h
-; CHECK-NEXT: dup v1.8h, w8
-; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: movi v1.2d, #0000000000000000
+; CHECK-NEXT: movi v2.2d, #0000000000000000
; CHECK-NEXT: cmtst v0.8h, v0.8h, v0.8h
+; CHECK-NEXT: dup v1.8h, w8
+; CHECK-NEXT: cmeq v1.8h, v1.8h, #0
+; CHECK-NEXT: bic v0.16b, v0.16b, v1.16b
; CHECK-NEXT: xtn v0.8b, v0.8h
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: mov v0.d[1], v2.d[0]
; CHECK-NEXT: str q0, [x1]
; CHECK-NEXT: ret
%tmp = xor <16 x i1> zeroinitializer, <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
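The typei1_orig change above is the same decomposition at work: the
cmtst tests the loaded vector for nonzero lanes, the cmeq tests the
splatted bool for zero, and bic combines them as the
"(X != 0) && (Y != 0)" form, with the second compare inverted.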
diff --git a/llvm/test/CodeGen/AArch64/mul-cmp.ll b/llvm/test/CodeGen/AArch64/mul-cmp.ll
index 0f0727b7b3c69..80d84a5ffd650 100644
--- a/llvm/test/CodeGen/AArch64/mul-cmp.ll
+++ b/llvm/test/CodeGen/AArch64/mul-cmp.ll
@@ -1,18 +1,26 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s
+; With no-wrap:
+; (X * Y) == 0 --> (X == 0) || (Y == 0)
+; (X * Y) != 0 --> (X != 0) && (Y != 0)
+
define i1 @mul_nsw_eq0_i8(i8 %x, i8 %y) {
; CHECK-LABEL: mul_nsw_eq0_i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: mul w8, w0, w1
-; CHECK-NEXT: tst w8, #0xff
-; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: tst w1, #0xff
+; CHECK-NEXT: cset w8, eq
+; CHECK-NEXT: tst w0, #0xff
+; CHECK-NEXT: cset w9, eq
+; CHECK-NEXT: orr w0, w9, w8
; CHECK-NEXT: ret
%m = mul nsw i8 %x, %y
%r = icmp eq i8 %m, 0
ret i1 %r
}
+; negative test - not valid if mul can overflow
+
define i1 @mul_eq0_i8(i8 %x, i8 %y) {
; CHECK-LABEL: mul_eq0_i8:
; CHECK: // %bb.0:
@@ -25,6 +33,8 @@ define i1 @mul_eq0_i8(i8 %x, i8 %y) {
ret i1 %r
}
+; negative test - don't try with minsize
+
define i1 @mul_nsw_eq0_i8_size(i8 %x, i8 %y) minsize {
; CHECK-LABEL: mul_nsw_eq0_i8_size:
; CHECK: // %bb.0:
@@ -40,9 +50,11 @@ define i1 @mul_nsw_eq0_i8_size(i8 %x, i8 %y) minsize {
define i1 @mul_nsw_ne0_i16(i16 %x, i16 %y) {
; CHECK-LABEL: mul_nsw_ne0_i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: mul w8, w0, w1
-; CHECK-NEXT: tst w8, #0xffff
-; CHECK-NEXT: cset w0, ne
+; CHECK-NEXT: tst w1, #0xffff
+; CHECK-NEXT: cset w8, ne
+; CHECK-NEXT: tst w0, #0xffff
+; CHECK-NEXT: cset w9, ne
+; CHECK-NEXT: and w0, w9, w8
; CHECK-NEXT: ret
%m = mul nsw i16 %x, %y
%r = icmp ne i16 %m, 0
@@ -52,8 +64,8 @@ define i1 @mul_nsw_ne0_i16(i16 %x, i16 %y) {
define i1 @mul_nuw_eq0_i32(i32 %x, i32 %y) {
; CHECK-LABEL: mul_nuw_eq0_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: mul w8, w0, w1
-; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: ccmp w1, #0, #4, ne
; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%m = mul nuw i32 %x, %y
@@ -64,8 +76,8 @@ define i1 @mul_nuw_eq0_i32(i32 %x, i32 %y) {
define i1 @mul_nsw_nuw_ne0_i64(i64 %x, i64 %y) {
; CHECK-LABEL: mul_nsw_nuw_ne0_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: mul x8, x0, x1
-; CHECK-NEXT: cmp x8, #0
+; CHECK-NEXT: cmp x0, #0
+; CHECK-NEXT: ccmp x1, #0, #4, ne
; CHECK-NEXT: cset w0, ne
; CHECK-NEXT: ret
%m = mul nsw nuw i64 %x, %y
@@ -76,8 +88,9 @@ define i1 @mul_nsw_nuw_ne0_i64(i64 %x, i64 %y) {
define <16 x i1> @mul_nuw_eq0_v16i8(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: mul_nuw_eq0_v16i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: mul v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmeq v1.16b, v1.16b, #0
; CHECK-NEXT: cmeq v0.16b, v0.16b, #0
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%m = mul nuw <16 x i8> %x, %y
%r = icmp eq <16 x i8> %m, zeroinitializer
@@ -87,8 +100,9 @@ define <16 x i1> @mul_nuw_eq0_v16i8(<16 x i8> %x, <16 x i8> %y) {
define <4 x i1> @mul_nsw_ne0_v4i32(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: mul_nsw_ne0_v4i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
; CHECK-NEXT: cmtst v0.4s, v0.4s, v0.4s
+; CHECK-NEXT: cmeq v1.4s, v1.4s, #0
+; CHECK-NEXT: bic v0.16b, v0.16b, v1.16b
; CHECK-NEXT: xtn v0.4h, v0.4s
; CHECK-NEXT: ret
%m = mul nsw <4 x i32> %x, %y
@@ -96,6 +110,8 @@ define <4 x i1> @mul_nsw_ne0_v4i32(<4 x i32> %x, <4 x i32> %y) {
ret <4 x i1> %r
}
+; negative test - don't try with minsize
+
define <4 x i1> @mul_nsw_ne0_v4i32_size(<4 x i32> %x, <4 x i32> %y) minsize {
; CHECK-LABEL: mul_nsw_ne0_v4i32_size:
; CHECK: // %bb.0:
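The ccmp trick mentioned in the commit message shows up in
mul_nuw_eq0_i32 and mul_nsw_nuw_ne0_i64 above: the conditional
compare folds the second zero-test into the flag chain, so the
decomposed form is still three instructions (cmp + ccmp + cset),
matching the old mul + cmp + cset sequence.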
diff --git a/llvm/test/CodeGen/X86/mul-cmp.ll b/llvm/test/CodeGen/X86/mul-cmp.ll
index d6c409af8a48e..4fffb42bdc672 100644
--- a/llvm/test/CodeGen/X86/mul-cmp.ll
+++ b/llvm/test/CodeGen/X86/mul-cmp.ll
@@ -2,20 +2,26 @@
; RUN: llc < %s -mtriple=x86_64-- -mattr=sse | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=avx | FileCheck %s --check-prefixes=CHECK,AVX
+; With no-wrap:
+; (X * Y) == 0 --> (X == 0) || (Y == 0)
+; (X * Y) != 0 --> (X != 0) && (Y != 0)
+
define i1 @mul_nsw_eq0_i8(i8 %x, i8 %y) {
; CHECK-LABEL: mul_nsw_eq0_i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: # kill: def $al killed $al killed $eax
-; CHECK-NEXT: mulb %sil
-; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: testb %sil, %sil
+; CHECK-NEXT: sete %cl
+; CHECK-NEXT: testb %dil, %dil
; CHECK-NEXT: sete %al
+; CHECK-NEXT: orb %cl, %al
; CHECK-NEXT: retq
%m = mul nsw i8 %x, %y
%r = icmp eq i8 %m, 0
ret i1 %r
}
+; negative test - not valid if mul can overflow
+
define i1 @mul_eq0_i8(i8 %x, i8 %y) {
; CHECK-LABEL: mul_eq0_i8:
; CHECK: # %bb.0:
@@ -30,6 +36,8 @@ define i1 @mul_eq0_i8(i8 %x, i8 %y) {
ret i1 %r
}
+; negative test - don't try with minsize
+
define i1 @mul_nsw_eq0_i8_size(i8 %x, i8 %y) minsize {
; CHECK-LABEL: mul_nsw_eq0_i8_size:
; CHECK: # %bb.0:
@@ -47,9 +55,11 @@ define i1 @mul_nsw_eq0_i8_size(i8 %x, i8 %y) minsize {
define i1 @mul_nsw_ne0_i16(i16 %x, i16 %y) {
; CHECK-LABEL: mul_nsw_ne0_i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: imull %esi, %edi
+; CHECK-NEXT: testw %si, %si
+; CHECK-NEXT: setne %cl
; CHECK-NEXT: testw %di, %di
; CHECK-NEXT: setne %al
+; CHECK-NEXT: andb %cl, %al
; CHECK-NEXT: retq
%m = mul nsw i16 %x, %y
%r = icmp ne i16 %m, 0
@@ -59,9 +69,11 @@ define i1 @mul_nsw_ne0_i16(i16 %x, i16 %y) {
define i1 @mul_nuw_eq0_i32(i32 %x, i32 %y) {
; CHECK-LABEL: mul_nuw_eq0_i32:
; CHECK: # %bb.0:
-; CHECK-NEXT: imull %esi, %edi
+; CHECK-NEXT: testl %esi, %esi
+; CHECK-NEXT: sete %cl
; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: sete %al
+; CHECK-NEXT: orb %cl, %al
; CHECK-NEXT: retq
%m = mul nuw i32 %x, %y
%r = icmp eq i32 %m, 0
@@ -71,9 +83,11 @@ define i1 @mul_nuw_eq0_i32(i32 %x, i32 %y) {
define i1 @mul_nsw_nuw_ne0_i64(i64 %x, i64 %y) {
; CHECK-LABEL: mul_nsw_nuw_ne0_i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: imulq %rsi, %rdi
+; CHECK-NEXT: testq %rsi, %rsi
+; CHECK-NEXT: setne %cl
; CHECK-NEXT: testq %rdi, %rdi
; CHECK-NEXT: setne %al
+; CHECK-NEXT: andb %cl, %al
; CHECK-NEXT: retq
%m = mul nsw nuw i64 %x, %y
%r = icmp ne i64 %m, 0
@@ -83,36 +97,18 @@ define i1 @mul_nsw_nuw_ne0_i64(i64 %x, i64 %y) {
define <16 x i1> @mul_nuw_eq0_v16i8(<16 x i8> %x, <16 x i8> %y) {
; SSE-LABEL: mul_nuw_eq0_v16i8:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE-NEXT: pmullw %xmm2, %xmm3
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; SSE-NEXT: pand %xmm2, %xmm3
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE-NEXT: pmullw %xmm1, %xmm0
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: packuswb %xmm3, %xmm0
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: pcmpeqb %xmm1, %xmm0
+; SSE-NEXT: pxor %xmm2, %xmm2
+; SSE-NEXT: pcmpeqb %xmm2, %xmm1
+; SSE-NEXT: pcmpeqb %xmm2, %xmm0
+; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_nuw_eq0_v16i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX-NEXT: vpmullw %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%m = mul nuw <16 x i8> %x, %y
%r = icmp eq <16 x i8> %m, zeroinitializer
@@ -122,32 +118,31 @@ define <16 x i1> @mul_nuw_eq0_v16i8(<16 x i8> %x, <16 x i8> %y) {
define <4 x i1> @mul_nsw_ne0_v4i32(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: mul_nsw_ne0_v4i32:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm2, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE-NEXT: pxor %xmm1, %xmm0
+; SSE-NEXT: pxor %xmm2, %xmm2
+; SSE-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE-NEXT: pcmpeqd %xmm2, %xmm1
+; SSE-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE-NEXT: pxor %xmm1, %xmm2
+; SSE-NEXT: pandn %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_nsw_ne0_v4i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%m = mul nsw <4 x i32> %x, %y
%r = icmp ne <4 x i32> %m, zeroinitializer
ret <4 x i1> %r
}
+; negative test - don't try with minsize
+; TODO: SSE would be much smaller if decomposed.
+
define <4 x i1> @mul_nsw_ne0_v4i32_size(<4 x i32> %x, <4 x i32> %y) minsize {
; SSE-LABEL: mul_nsw_ne0_v4i32_size:
; SSE: # %bb.0:
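Note the size of the v16i8 win on x86: SSE has no byte-element
multiply, so the old sequence had to unpack to 16-bit lanes, run
pmullw twice, mask, and repack before comparing; the decomposed form
is just two pcmpeqb plus a por.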