[llvm] fd67992 - [DAGCombine] fold (urem x, (lshr pow2, y)) -> (and x, (add (lshr pow2, y), -1))
Philip Reames via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 13 08:35:18 PDT 2022
Author: Philip Reames
Date: 2022-07-13T08:34:38-07:00
New Revision: fd67992f9c4b811e8db7aa58d8ad53223b089c3f
URL: https://github.com/llvm/llvm-project/commit/fd67992f9c4b811e8db7aa58d8ad53223b089c3f
DIFF: https://github.com/llvm/llvm-project/commit/fd67992f9c4b811e8db7aa58d8ad53223b089c3f.diff
LOG: [DAGCombine] fold (urem x, (lshr pow2, y)) -> (and x, (add (lshr pow2, y), -1))
We have the same fold in InstCombine, though it is implemented there via the OrZero flag on isKnownToBeAPowerOfTwo. The reasoning here is that either (a) the result of the lshr is a power of two, or (b) the divisor is zero and the urem is a divide-by-zero triggering UB, which we can ignore.
Differential Revision: https://reviews.llvm.org/D129606
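
For illustration only (the commit performs this fold on SelectionDAG nodes during DAGCombine, not on IR, and the function names below are hypothetical), the equivalence looks like this when written as LLVM IR:

define i32 @urem_by_lshr_pow2(i32 %x, i32 %y) {
  %d = lshr i32 16, %y        ; divisor is a power of two, or zero
  %r = urem i32 %x, %d        ; urem by zero is UB, so that case can be ignored
  ret i32 %r
}

; ...which can be rewritten as a mask:

define i32 @urem_by_lshr_pow2_masked(i32 %x, i32 %y) {
  %d = lshr i32 16, %y
  %m = add i32 %d, -1         ; pow2 - 1 is a mask of the low bits
  %r = and i32 %x, %m
  ret i32 %r
}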
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/test/CodeGen/X86/combine-urem.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index ca3fa55706db4..f2c94a8707dc8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -4593,9 +4593,12 @@ SDValue DAGCombiner::visitREM(SDNode *N) {
AddToWorklist(Add.getNode());
return DAG.getNode(ISD::AND, DL, VT, N0, Add);
}
- if (N1.getOpcode() == ISD::SHL &&
+ // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1))
+ // fold (urem x, (lshr pow2, y)) -> (and x, (add (lshr pow2, y), -1))
+ // TODO: We should sink the following into isKnownToBeAPowerOfTwo
+ // using an OrZero parameter analogous to our handling in ValueTracking.
+ if ((N1.getOpcode() == ISD::SHL || N1.getOpcode() == ISD::SRL) &&
DAG.isKnownToBeAPowerOfTwo(N1.getOperand(0))) {
- // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1))
SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne);
AddToWorklist(Add.getNode());
diff --git a/llvm/test/CodeGen/X86/combine-urem.ll b/llvm/test/CodeGen/X86/combine-urem.ll
index 7be358e661fd7..434f262746303 100644
--- a/llvm/test/CodeGen/X86/combine-urem.ll
+++ b/llvm/test/CodeGen/X86/combine-urem.ll
@@ -361,41 +361,25 @@ define <4 x i32> @combine_vec_urem_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) {
define <4 x i32> @combine_vec_urem_by_lshr_pow2a(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_urem_by_lshr_pow2a:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
-; SSE-NEXT: movdqa {{.*#+}} xmm5 = [4,4,4,4]
-; SSE-NEXT: movdqa %xmm5, %xmm2
-; SSE-NEXT: psrld %xmm4, %xmm2
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7]
-; SSE-NEXT: movdqa %xmm5, %xmm4
-; SSE-NEXT: psrld %xmm3, %xmm4
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,1,1,4,5,6,7]
-; SSE-NEXT: movdqa %xmm5, %xmm6
-; SSE-NEXT: psrld %xmm3, %xmm6
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,3,3,3,4,5,6,7]
-; SSE-NEXT: psrld %xmm1, %xmm5
-; SSE-NEXT: pextrd $1, %xmm5, %ecx
-; SSE-NEXT: pextrd $1, %xmm0, %eax
-; SSE-NEXT: xorl %edx, %edx
-; SSE-NEXT: divl %ecx
-; SSE-NEXT: movl %edx, %ecx
-; SSE-NEXT: movd %xmm6, %esi
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: xorl %edx, %edx
-; SSE-NEXT: divl %esi
-; SSE-NEXT: movd %edx, %xmm1
-; SSE-NEXT: pinsrd $1, %ecx, %xmm1
-; SSE-NEXT: pextrd $2, %xmm4, %ecx
-; SSE-NEXT: pextrd $2, %xmm0, %eax
-; SSE-NEXT: xorl %edx, %edx
-; SSE-NEXT: divl %ecx
-; SSE-NEXT: pinsrd $2, %edx, %xmm1
-; SSE-NEXT: pextrd $3, %xmm2, %ecx
-; SSE-NEXT: pextrd $3, %xmm0, %eax
-; SSE-NEXT: xorl %edx, %edx
-; SSE-NEXT: divl %ecx
-; SSE-NEXT: pinsrd $3, %edx, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,4,4,4]
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: psrld %xmm2, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[2,3,3,3,4,5,6,7]
+; SSE-NEXT: movdqa %xmm3, %xmm6
+; SSE-NEXT: psrld %xmm5, %xmm6
+; SSE-NEXT: pblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: psrld %xmm1, %xmm4
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
+; SSE-NEXT: psrld %xmm1, %xmm3
+; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5],xmm6[6,7]
+; SSE-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE-NEXT: paddd %xmm3, %xmm1
+; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_vec_urem_by_lshr_pow2a:
@@ -403,61 +387,28 @@ define <4 x i32> @combine_vec_urem_by_lshr_pow2a(<4 x i32> %x, <4 x i32> %y) {
; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4,4,4,4]
; AVX1-NEXT: vpsrld %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
+; AVX1-NEXT: vpsrld %xmm4, %xmm3, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; AVX1-NEXT: vpsrld %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT: vpsrld %xmm5, %xmm3, %xmm5
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpsrld %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpextrd $1, %xmm1, %ecx
-; AVX1-NEXT: vpextrd $1, %xmm0, %eax
-; AVX1-NEXT: xorl %edx, %edx
-; AVX1-NEXT: divl %ecx
-; AVX1-NEXT: movl %edx, %ecx
-; AVX1-NEXT: vmovd %xmm5, %esi
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: xorl %edx, %edx
-; AVX1-NEXT: divl %esi
-; AVX1-NEXT: vmovd %edx, %xmm1
-; AVX1-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
-; AVX1-NEXT: vpextrd $2, %xmm4, %ecx
-; AVX1-NEXT: vpextrd $2, %xmm0, %eax
-; AVX1-NEXT: xorl %edx, %edx
-; AVX1-NEXT: divl %ecx
-; AVX1-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1
-; AVX1-NEXT: vpextrd $3, %xmm2, %ecx
-; AVX1-NEXT: vpextrd $3, %xmm0, %eax
-; AVX1-NEXT: xorl %edx, %edx
-; AVX1-NEXT: divl %ecx
-; AVX1-NEXT: vpinsrd $3, %edx, %xmm1, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vec_urem_by_lshr_pow2a:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4,4,4,4]
; AVX2-NEXT: vpsrlvd %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpextrd $1, %xmm1, %ecx
-; AVX2-NEXT: vpextrd $1, %xmm0, %eax
-; AVX2-NEXT: xorl %edx, %edx
-; AVX2-NEXT: divl %ecx
-; AVX2-NEXT: movl %edx, %ecx
-; AVX2-NEXT: vmovd %xmm1, %esi
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: xorl %edx, %edx
-; AVX2-NEXT: divl %esi
-; AVX2-NEXT: vmovd %edx, %xmm2
-; AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrd $2, %xmm1, %ecx
-; AVX2-NEXT: vpextrd $2, %xmm0, %eax
-; AVX2-NEXT: xorl %edx, %edx
-; AVX2-NEXT: divl %ecx
-; AVX2-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrd $3, %xmm1, %ecx
-; AVX2-NEXT: vpextrd $3, %xmm0, %eax
-; AVX2-NEXT: xorl %edx, %edx
-; AVX2-NEXT: divl %ecx
-; AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
%1 = lshr <4 x i32> <i32 4, i32 4, i32 4, i32 4>, %y
%2 = urem <4 x i32> %x, %1
@@ -467,41 +418,25 @@ define <4 x i32> @combine_vec_urem_by_lshr_pow2a(<4 x i32> %x, <4 x i32> %y) {
define <4 x i32> @combine_vec_urem_by_lshr_pow2b(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_urem_by_lshr_pow2b:
; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
-; SSE-NEXT: movdqa {{.*#+}} xmm5 = [1,4,8,16]
-; SSE-NEXT: movdqa %xmm5, %xmm2
-; SSE-NEXT: psrld %xmm4, %xmm2
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7]
-; SSE-NEXT: movdqa %xmm5, %xmm4
-; SSE-NEXT: psrld %xmm3, %xmm4
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,1,1,4,5,6,7]
-; SSE-NEXT: movdqa %xmm5, %xmm6
-; SSE-NEXT: psrld %xmm3, %xmm6
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,3,3,3,4,5,6,7]
-; SSE-NEXT: psrld %xmm1, %xmm5
-; SSE-NEXT: pextrd $1, %xmm5, %ecx
-; SSE-NEXT: pextrd $1, %xmm0, %eax
-; SSE-NEXT: xorl %edx, %edx
-; SSE-NEXT: divl %ecx
-; SSE-NEXT: movl %edx, %ecx
-; SSE-NEXT: movd %xmm6, %esi
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: xorl %edx, %edx
-; SSE-NEXT: divl %esi
-; SSE-NEXT: movd %edx, %xmm1
-; SSE-NEXT: pinsrd $1, %ecx, %xmm1
-; SSE-NEXT: pextrd $2, %xmm4, %ecx
-; SSE-NEXT: pextrd $2, %xmm0, %eax
-; SSE-NEXT: xorl %edx, %edx
-; SSE-NEXT: divl %ecx
-; SSE-NEXT: pinsrd $2, %edx, %xmm1
-; SSE-NEXT: pextrd $3, %xmm2, %ecx
-; SSE-NEXT: pextrd $3, %xmm0, %eax
-; SSE-NEXT: xorl %edx, %edx
-; SSE-NEXT: divl %ecx
-; SSE-NEXT: pinsrd $3, %edx, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1,4,8,16]
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: psrld %xmm2, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[2,3,3,3,4,5,6,7]
+; SSE-NEXT: movdqa %xmm3, %xmm6
+; SSE-NEXT: psrld %xmm5, %xmm6
+; SSE-NEXT: pblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: psrld %xmm1, %xmm4
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
+; SSE-NEXT: psrld %xmm1, %xmm3
+; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5],xmm6[6,7]
+; SSE-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE-NEXT: paddd %xmm3, %xmm1
+; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_vec_urem_by_lshr_pow2b:
@@ -509,61 +444,28 @@ define <4 x i32> @combine_vec_urem_by_lshr_pow2b(<4 x i32> %x, <4 x i32> %y) {
; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,4,8,16]
; AVX1-NEXT: vpsrld %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
+; AVX1-NEXT: vpsrld %xmm4, %xmm3, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; AVX1-NEXT: vpsrld %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT: vpsrld %xmm5, %xmm3, %xmm5
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpsrld %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpextrd $1, %xmm1, %ecx
-; AVX1-NEXT: vpextrd $1, %xmm0, %eax
-; AVX1-NEXT: xorl %edx, %edx
-; AVX1-NEXT: divl %ecx
-; AVX1-NEXT: movl %edx, %ecx
-; AVX1-NEXT: vmovd %xmm5, %esi
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: xorl %edx, %edx
-; AVX1-NEXT: divl %esi
-; AVX1-NEXT: vmovd %edx, %xmm1
-; AVX1-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
-; AVX1-NEXT: vpextrd $2, %xmm4, %ecx
-; AVX1-NEXT: vpextrd $2, %xmm0, %eax
-; AVX1-NEXT: xorl %edx, %edx
-; AVX1-NEXT: divl %ecx
-; AVX1-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1
-; AVX1-NEXT: vpextrd $3, %xmm2, %ecx
-; AVX1-NEXT: vpextrd $3, %xmm0, %eax
-; AVX1-NEXT: xorl %edx, %edx
-; AVX1-NEXT: divl %ecx
-; AVX1-NEXT: vpinsrd $3, %edx, %xmm1, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vec_urem_by_lshr_pow2b:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1,4,8,16]
; AVX2-NEXT: vpsrlvd %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpextrd $1, %xmm1, %ecx
-; AVX2-NEXT: vpextrd $1, %xmm0, %eax
-; AVX2-NEXT: xorl %edx, %edx
-; AVX2-NEXT: divl %ecx
-; AVX2-NEXT: movl %edx, %ecx
-; AVX2-NEXT: vmovd %xmm1, %esi
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: xorl %edx, %edx
-; AVX2-NEXT: divl %esi
-; AVX2-NEXT: vmovd %edx, %xmm2
-; AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrd $2, %xmm1, %ecx
-; AVX2-NEXT: vpextrd $2, %xmm0, %eax
-; AVX2-NEXT: xorl %edx, %edx
-; AVX2-NEXT: divl %ecx
-; AVX2-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrd $3, %xmm1, %ecx
-; AVX2-NEXT: vpextrd $3, %xmm0, %eax
-; AVX2-NEXT: xorl %edx, %edx
-; AVX2-NEXT: divl %ecx
-; AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
%1 = lshr <4 x i32> <i32 1, i32 4, i32 8, i32 16>, %y
%2 = urem <4 x i32> %x, %1
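
The new SSE/AVX1/AVX2 check lines above follow directly from the fold: shift the power-of-two splat right by %y, add -1 (the pcmpeqd/paddd pair), and mask with pand, with no scalar divl. A minimal standalone reproducer, assuming a local llc build (the file name, function name, and -mtriple/-mattr values here are illustrative, not the test file's actual RUN lines):

; Run with: llc < urem-lshr.ll -mtriple=x86_64-unknown-unknown -mattr=+avx2
define <4 x i32> @repro(<4 x i32> %x, <4 x i32> %y) {
  %d = lshr <4 x i32> <i32 4, i32 4, i32 4, i32 4>, %y
  %r = urem <4 x i32> %x, %d
  ret <4 x i32> %r
}
; Expected, per the AVX2 checks above: vpsrlvd + vpcmpeqd/vpaddd + vpand.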