[llvm] 5b82543 - [DAGCombiner] Optimize 1-bit smulo to AND+SETNE.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Sat Mar 13 09:48:22 PST 2021
Author: Craig Topper
Date: 2021-03-13T09:39:36-08:00
New Revision: 5b825433d7854f2e79966606ddfb381806cc499c
URL: https://github.com/llvm/llvm-project/commit/5b825433d7854f2e79966606ddfb381806cc499c
DIFF: https://github.com/llvm/llvm-project/commit/5b825433d7854f2e79966606ddfb381806cc499c.diff
LOG: [DAGCombiner] Optimize 1-bit smulo to AND+SETNE.
A 1-bit smulo overflows if both inputs are -1, since the result
should be +1, which can't be represented in a signed 1-bit value.
We can detect this with an AND and a setcc. The multiply result
can also use the same AND.
Reviewed By: RKSimon
Differential Revision: https://reviews.llvm.org/D97634
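For anyone who wants to sanity-check the reasoning, here is a small standalone
C++ sketch (not DAGCombiner code; the variable names are made up for
illustration) that exhaustively verifies the rewrite for 1-bit signed operands:
the wrapped multiply result equals A AND B, and overflow happens exactly when
that AND is nonzero.

    #include <cassert>
    #include <cstdio>

    int main() {
      // Exhaustively check both 1-bit operands. The low bit holds the value;
      // bit pattern 1 is the signed value -1.
      for (unsigned a = 0; a <= 1; ++a) {
        for (unsigned b = 0; b <= 1; ++b) {
          int sa = a ? -1 : 0;                    // sign-extend 1-bit operand
          int sb = b ? -1 : 0;
          int prod = sa * sb;                     // mathematical product: 0 or +1
          bool ovf = prod < -1 || prod > 0;       // i1 can only hold -1 and 0
          unsigned wrapped = (unsigned)prod & 1u; // truncate product back to 1 bit

          unsigned and_res = a & b;               // rewritten multiply result
          bool and_ovf = and_res != 0;            // rewritten overflow flag (SETNE 0)

          assert(wrapped == and_res && ovf == and_ovf);
          printf("a=%u b=%u -> result=%u overflow=%d\n", a, b, and_res, and_ovf);
        }
      }
      return 0;
    }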
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/test/CodeGen/X86/vec_smulo.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 273b7ced3977..120c7f244c6a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -4661,6 +4661,14 @@ SDValue DAGCombiner::visitMULO(SDNode *N) {
N->getVTList(), N0, N0);
if (IsSigned) {
+ // A 1 bit SMULO overflows if both inputs are 1.
+ if (VT.getScalarSizeInBits() == 1) {
+ SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, N1);
+ return CombineTo(N, And,
+ DAG.getSetCC(DL, CarryVT, And,
+ DAG.getConstant(0, DL, VT), ISD::SETNE));
+ }
+
// Multiplying n * m significant bits yields a result of n + m significant
// bits. If the total number of significant bits does not exceed the
// result bit width (minus 1), there is no overflow.
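(As a concrete instance of the pre-existing significant-bits check above: two
i32 operands each known to fit in 5 significant bits give a product needing at
most 5 + 5 = 10 significant bits, and since 10 <= 31 the multiply cannot
overflow.)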
diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
index f9e56f7c7f5c..4fa3367521a5 100644
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -3561,304 +3561,58 @@ define <4 x i32> @smulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun
define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind {
; SSE2-LABEL: smulo_v4i1:
; SSE2: # %bb.0:
-; SSE2-NEXT: pslld $31, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pslld $31, %xmm0
+; SSE2-NEXT: movmskps %xmm0, %eax
; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pslld $31, %xmm1
-; SSE2-NEXT: movmskps %xmm1, %eax
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: movb %al, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: smulo_v4i1:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: pslld $31, %xmm1
-; SSSE3-NEXT: psrad $31, %xmm1
+; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: pslld $31, %xmm0
+; SSSE3-NEXT: movmskps %xmm0, %eax
; SSSE3-NEXT: psrad $31, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pmuludq %xmm1, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pmuludq %xmm2, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pslld $31, %xmm1
-; SSSE3-NEXT: movmskps %xmm1, %eax
-; SSSE3-NEXT: psrad $31, %xmm1
-; SSSE3-NEXT: pcmpeqd %xmm0, %xmm1
-; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm0
; SSSE3-NEXT: movb %al, (%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: smulo_v4i1:
; SSE41: # %bb.0:
-; SSE41-NEXT: pslld $31, %xmm1
-; SSE41-NEXT: psrad $31, %xmm1
+; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: pslld $31, %xmm0
+; SSE41-NEXT: movmskps %xmm0, %eax
; SSE41-NEXT: psrad $31, %xmm0
-; SSE41-NEXT: pmulld %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pslld $31, %xmm1
-; SSE41-NEXT: movmskps %xmm1, %eax
-; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE41-NEXT: pxor %xmm0, %xmm1
; SSE41-NEXT: movb %al, (%rdi)
-; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: smulo_v4i1:
; AVX: # %bb.0:
-; AVX-NEXT: vpslld $31, %xmm1, %xmm1
-; AVX-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX-NEXT: vmovmskps %xmm0, %eax
; AVX-NEXT: vpsrad $31, %xmm0, %xmm0
-; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpslld $31, %xmm0, %xmm1
-; AVX-NEXT: vpsrad $31, %xmm1, %xmm2
-; AVX-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vmovmskps %xmm1, %eax
; AVX-NEXT: movb %al, (%rdi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: smulo_v4i1:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rbx
+; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512F-NEXT: vptestmd %xmm0, %xmm0, %k0
-; AVX512F-NEXT: kshiftrw $3, %k0, %k1
-; AVX512F-NEXT: kmovw %k1, %r8d
-; AVX512F-NEXT: andb $1, %r8b
-; AVX512F-NEXT: negb %r8b
-; AVX512F-NEXT: vpslld $31, %xmm1, %xmm0
-; AVX512F-NEXT: vptestmd %xmm0, %xmm0, %k1
-; AVX512F-NEXT: kshiftrw $3, %k1, %k2
-; AVX512F-NEXT: kmovw %k2, %r9d
-; AVX512F-NEXT: andb $1, %r9b
-; AVX512F-NEXT: negb %r9b
-; AVX512F-NEXT: kshiftrw $2, %k0, %k2
-; AVX512F-NEXT: kmovw %k2, %r10d
-; AVX512F-NEXT: andb $1, %r10b
-; AVX512F-NEXT: negb %r10b
-; AVX512F-NEXT: kshiftrw $2, %k1, %k2
-; AVX512F-NEXT: kmovw %k2, %ebx
-; AVX512F-NEXT: andb $1, %bl
-; AVX512F-NEXT: negb %bl
-; AVX512F-NEXT: kshiftrw $1, %k0, %k2
-; AVX512F-NEXT: kmovw %k2, %ecx
-; AVX512F-NEXT: andb $1, %cl
-; AVX512F-NEXT: negb %cl
-; AVX512F-NEXT: kshiftrw $1, %k1, %k2
-; AVX512F-NEXT: kmovw %k2, %esi
-; AVX512F-NEXT: andb $1, %sil
-; AVX512F-NEXT: negb %sil
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: andb $1, %al
-; AVX512F-NEXT: negb %al
-; AVX512F-NEXT: kmovw %k1, %edx
-; AVX512F-NEXT: andb $1, %dl
-; AVX512F-NEXT: negb %dl
-; AVX512F-NEXT: # kill: def $al killed $al killed $eax
-; AVX512F-NEXT: mulb %dl
-; AVX512F-NEXT: movl %eax, %r11d
-; AVX512F-NEXT: andb $1, %al
-; AVX512F-NEXT: negb %al
-; AVX512F-NEXT: cmpb %r11b, %al
-; AVX512F-NEXT: setne %al
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: movw $-3, %ax
-; AVX512F-NEXT: kmovw %eax, %k0
-; AVX512F-NEXT: kandw %k0, %k1, %k1
-; AVX512F-NEXT: movl %ecx, %eax
-; AVX512F-NEXT: mulb %sil
-; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: andb $1, %al
-; AVX512F-NEXT: negb %al
-; AVX512F-NEXT: cmpb %cl, %al
-; AVX512F-NEXT: setne %al
-; AVX512F-NEXT: kmovw %eax, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $14, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k1, %k2
-; AVX512F-NEXT: movw $-5, %ax
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: kandw %k1, %k2, %k2
-; AVX512F-NEXT: movl %r10d, %eax
-; AVX512F-NEXT: mulb %bl
-; AVX512F-NEXT: movl %eax, %edx
-; AVX512F-NEXT: andb $1, %al
-; AVX512F-NEXT: negb %al
-; AVX512F-NEXT: cmpb %dl, %al
-; AVX512F-NEXT: setne %al
-; AVX512F-NEXT: kmovw %eax, %k3
-; AVX512F-NEXT: kshiftlw $2, %k3, %k3
-; AVX512F-NEXT: korw %k3, %k2, %k2
-; AVX512F-NEXT: kshiftlw $13, %k2, %k2
-; AVX512F-NEXT: kshiftrw $13, %k2, %k2
-; AVX512F-NEXT: movl %r8d, %eax
-; AVX512F-NEXT: mulb %r9b
-; AVX512F-NEXT: # kill: def $al killed $al def $eax
-; AVX512F-NEXT: movl %eax, %ebx
-; AVX512F-NEXT: andb $1, %bl
-; AVX512F-NEXT: negb %bl
-; AVX512F-NEXT: cmpb %al, %bl
-; AVX512F-NEXT: setne %sil
-; AVX512F-NEXT: kmovw %esi, %k3
-; AVX512F-NEXT: kshiftlw $3, %k3, %k3
-; AVX512F-NEXT: korw %k3, %k2, %k2
-; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vmovdqa32 %xmm0, %xmm0 {%k2} {z}
-; AVX512F-NEXT: andl $1, %r11d
-; AVX512F-NEXT: kmovw %r11d, %k2
-; AVX512F-NEXT: kandw %k0, %k2, %k0
-; AVX512F-NEXT: kmovw %ecx, %k2
-; AVX512F-NEXT: kshiftlw $15, %k2, %k2
-; AVX512F-NEXT: kshiftrw $14, %k2, %k2
-; AVX512F-NEXT: korw %k2, %k0, %k0
-; AVX512F-NEXT: kandw %k1, %k0, %k0
-; AVX512F-NEXT: kmovw %edx, %k1
-; AVX512F-NEXT: kshiftlw $15, %k1, %k1
-; AVX512F-NEXT: kshiftrw $13, %k1, %k1
-; AVX512F-NEXT: korw %k1, %k0, %k0
-; AVX512F-NEXT: movw $-9, %cx
-; AVX512F-NEXT: kmovw %ecx, %k1
-; AVX512F-NEXT: kandw %k1, %k0, %k0
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: kshiftlw $15, %k1, %k1
-; AVX512F-NEXT: kshiftrw $12, %k1, %k1
-; AVX512F-NEXT: korw %k1, %k0, %k0
+; AVX512F-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: movb %al, (%rdi)
-; AVX512F-NEXT: popq %rbx
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: smulo_v4i1:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: pushq %rbx
+; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512BW-NEXT: vptestmd %xmm0, %xmm0, %k0
-; AVX512BW-NEXT: kshiftrw $3, %k0, %k1
-; AVX512BW-NEXT: kmovd %k1, %r8d
-; AVX512BW-NEXT: andb $1, %r8b
-; AVX512BW-NEXT: negb %r8b
-; AVX512BW-NEXT: vpslld $31, %xmm1, %xmm0
-; AVX512BW-NEXT: vptestmd %xmm0, %xmm0, %k1
-; AVX512BW-NEXT: kshiftrw $3, %k1, %k2
-; AVX512BW-NEXT: kmovd %k2, %r9d
-; AVX512BW-NEXT: andb $1, %r9b
-; AVX512BW-NEXT: negb %r9b
-; AVX512BW-NEXT: kshiftrw $2, %k0, %k2
-; AVX512BW-NEXT: kmovd %k2, %r10d
-; AVX512BW-NEXT: andb $1, %r10b
-; AVX512BW-NEXT: negb %r10b
-; AVX512BW-NEXT: kshiftrw $2, %k1, %k2
-; AVX512BW-NEXT: kmovd %k2, %ebx
-; AVX512BW-NEXT: andb $1, %bl
-; AVX512BW-NEXT: negb %bl
-; AVX512BW-NEXT: kshiftrw $1, %k0, %k2
-; AVX512BW-NEXT: kmovd %k2, %ecx
-; AVX512BW-NEXT: andb $1, %cl
-; AVX512BW-NEXT: negb %cl
-; AVX512BW-NEXT: kshiftrw $1, %k1, %k2
-; AVX512BW-NEXT: kmovd %k2, %esi
-; AVX512BW-NEXT: andb $1, %sil
-; AVX512BW-NEXT: negb %sil
-; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: andb $1, %al
-; AVX512BW-NEXT: negb %al
-; AVX512BW-NEXT: kmovd %k1, %edx
-; AVX512BW-NEXT: andb $1, %dl
-; AVX512BW-NEXT: negb %dl
-; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
-; AVX512BW-NEXT: mulb %dl
-; AVX512BW-NEXT: movl %eax, %r11d
-; AVX512BW-NEXT: andb $1, %al
-; AVX512BW-NEXT: negb %al
-; AVX512BW-NEXT: cmpb %r11b, %al
-; AVX512BW-NEXT: setne %al
-; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: movw $-3, %ax
-; AVX512BW-NEXT: kmovd %eax, %k0
-; AVX512BW-NEXT: kandw %k0, %k1, %k1
-; AVX512BW-NEXT: movl %ecx, %eax
-; AVX512BW-NEXT: mulb %sil
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: andb $1, %al
-; AVX512BW-NEXT: negb %al
-; AVX512BW-NEXT: cmpb %cl, %al
-; AVX512BW-NEXT: setne %al
-; AVX512BW-NEXT: kmovd %eax, %k2
-; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $14, %k2, %k2
-; AVX512BW-NEXT: korw %k2, %k1, %k2
-; AVX512BW-NEXT: movw $-5, %ax
-; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: kandw %k1, %k2, %k2
-; AVX512BW-NEXT: movl %r10d, %eax
-; AVX512BW-NEXT: mulb %bl
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: andb $1, %al
-; AVX512BW-NEXT: negb %al
-; AVX512BW-NEXT: cmpb %dl, %al
-; AVX512BW-NEXT: setne %al
-; AVX512BW-NEXT: kmovd %eax, %k3
-; AVX512BW-NEXT: kshiftlw $2, %k3, %k3
-; AVX512BW-NEXT: korw %k3, %k2, %k2
-; AVX512BW-NEXT: kshiftlw $13, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $13, %k2, %k2
-; AVX512BW-NEXT: movl %r8d, %eax
-; AVX512BW-NEXT: mulb %r9b
-; AVX512BW-NEXT: # kill: def $al killed $al def $eax
-; AVX512BW-NEXT: movl %eax, %ebx
-; AVX512BW-NEXT: andb $1, %bl
-; AVX512BW-NEXT: negb %bl
-; AVX512BW-NEXT: cmpb %al, %bl
-; AVX512BW-NEXT: setne %sil
-; AVX512BW-NEXT: kmovd %esi, %k3
-; AVX512BW-NEXT: kshiftlw $3, %k3, %k3
-; AVX512BW-NEXT: korw %k3, %k2, %k2
-; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm0 {%k2} {z}
-; AVX512BW-NEXT: andl $1, %r11d
-; AVX512BW-NEXT: kmovw %r11d, %k2
-; AVX512BW-NEXT: kandw %k0, %k2, %k0
-; AVX512BW-NEXT: kmovd %ecx, %k2
-; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $14, %k2, %k2
-; AVX512BW-NEXT: korw %k2, %k0, %k0
-; AVX512BW-NEXT: kandw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovd %edx, %k1
-; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $13, %k1, %k1
-; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: movw $-9, %cx
-; AVX512BW-NEXT: kmovd %ecx, %k1
-; AVX512BW-NEXT: kandw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $12, %k1, %k1
-; AVX512BW-NEXT: korw %k1, %k0, %k0
+; AVX512BW-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movb %al, (%rdi)
-; AVX512BW-NEXT: popq %rbx
; AVX512BW-NEXT: retq
%t = call {<4 x i1>, <4 x i1>} @llvm.smul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
%val = extractvalue {<4 x i1>, <4 x i1>} %t, 0