[llvm] cc7d966 - Add coverage for missing (urem x, (lshr pow2, y)) -> (and x, (add (lshr pow2, y), -1)) transform
Philip Reames via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 12 16:05:12 PDT 2022
Author: Philip Reames
Date: 2022-07-12T16:04:57-07:00
New Revision: cc7d9665111614b431e485cb4ba90685c73654ed
URL: https://github.com/llvm/llvm-project/commit/cc7d9665111614b431e485cb4ba90685c73654ed
DIFF: https://github.com/llvm/llvm-project/commit/cc7d9665111614b431e485cb4ba90685c73654ed.diff
LOG: Add coverage for missing (urem x, (lshr pow2, y)) -> (and x, (add (lshr pow2, y), -1)) transform
Added:
Modified:
llvm/test/CodeGen/X86/combine-urem.ll
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/X86/combine-urem.ll b/llvm/test/CodeGen/X86/combine-urem.ll
index def64fe194cdf..7be358e661fd7 100644
--- a/llvm/test/CodeGen/X86/combine-urem.ll
+++ b/llvm/test/CodeGen/X86/combine-urem.ll
@@ -357,6 +357,219 @@ define <4 x i32> @combine_vec_urem_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) {
ret <4 x i32> %2
}
+; fold (urem x, (lshr pow2, y)) -> (and x, (add (lshr pow2, y), -1))
+define <4 x i32> @combine_vec_urem_by_lshr_pow2a(<4 x i32> %x, <4 x i32> %y) {
+; SSE-LABEL: combine_vec_urem_by_lshr_pow2a:
+; SSE: # %bb.0:
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
+; SSE-NEXT: movdqa {{.*#+}} xmm5 = [4,4,4,4]
+; SSE-NEXT: movdqa %xmm5, %xmm2
+; SSE-NEXT: psrld %xmm4, %xmm2
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7]
+; SSE-NEXT: movdqa %xmm5, %xmm4
+; SSE-NEXT: psrld %xmm3, %xmm4
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,1,1,4,5,6,7]
+; SSE-NEXT: movdqa %xmm5, %xmm6
+; SSE-NEXT: psrld %xmm3, %xmm6
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,3,3,3,4,5,6,7]
+; SSE-NEXT: psrld %xmm1, %xmm5
+; SSE-NEXT: pextrd $1, %xmm5, %ecx
+; SSE-NEXT: pextrd $1, %xmm0, %eax
+; SSE-NEXT: xorl %edx, %edx
+; SSE-NEXT: divl %ecx
+; SSE-NEXT: movl %edx, %ecx
+; SSE-NEXT: movd %xmm6, %esi
+; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: xorl %edx, %edx
+; SSE-NEXT: divl %esi
+; SSE-NEXT: movd %edx, %xmm1
+; SSE-NEXT: pinsrd $1, %ecx, %xmm1
+; SSE-NEXT: pextrd $2, %xmm4, %ecx
+; SSE-NEXT: pextrd $2, %xmm0, %eax
+; SSE-NEXT: xorl %edx, %edx
+; SSE-NEXT: divl %ecx
+; SSE-NEXT: pinsrd $2, %edx, %xmm1
+; SSE-NEXT: pextrd $3, %xmm2, %ecx
+; SSE-NEXT: pextrd $3, %xmm0, %eax
+; SSE-NEXT: xorl %edx, %edx
+; SSE-NEXT: divl %ecx
+; SSE-NEXT: pinsrd $3, %edx, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: combine_vec_urem_by_lshr_pow2a:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4,4,4,4]
+; AVX1-NEXT: vpsrld %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; AVX1-NEXT: vpsrld %xmm4, %xmm3, %xmm4
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsrld %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
+; AVX1-NEXT: vpsrld %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpextrd $1, %xmm1, %ecx
+; AVX1-NEXT: vpextrd $1, %xmm0, %eax
+; AVX1-NEXT: xorl %edx, %edx
+; AVX1-NEXT: divl %ecx
+; AVX1-NEXT: movl %edx, %ecx
+; AVX1-NEXT: vmovd %xmm5, %esi
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: xorl %edx, %edx
+; AVX1-NEXT: divl %esi
+; AVX1-NEXT: vmovd %edx, %xmm1
+; AVX1-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: vpextrd $2, %xmm4, %ecx
+; AVX1-NEXT: vpextrd $2, %xmm0, %eax
+; AVX1-NEXT: xorl %edx, %edx
+; AVX1-NEXT: divl %ecx
+; AVX1-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1
+; AVX1-NEXT: vpextrd $3, %xmm2, %ecx
+; AVX1-NEXT: vpextrd $3, %xmm0, %eax
+; AVX1-NEXT: xorl %edx, %edx
+; AVX1-NEXT: divl %ecx
+; AVX1-NEXT: vpinsrd $3, %edx, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_vec_urem_by_lshr_pow2a:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4,4,4,4]
+; AVX2-NEXT: vpsrlvd %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpextrd $1, %xmm1, %ecx
+; AVX2-NEXT: vpextrd $1, %xmm0, %eax
+; AVX2-NEXT: xorl %edx, %edx
+; AVX2-NEXT: divl %ecx
+; AVX2-NEXT: movl %edx, %ecx
+; AVX2-NEXT: vmovd %xmm1, %esi
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: xorl %edx, %edx
+; AVX2-NEXT: divl %esi
+; AVX2-NEXT: vmovd %edx, %xmm2
+; AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
+; AVX2-NEXT: vpextrd $2, %xmm1, %ecx
+; AVX2-NEXT: vpextrd $2, %xmm0, %eax
+; AVX2-NEXT: xorl %edx, %edx
+; AVX2-NEXT: divl %ecx
+; AVX2-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2
+; AVX2-NEXT: vpextrd $3, %xmm1, %ecx
+; AVX2-NEXT: vpextrd $3, %xmm0, %eax
+; AVX2-NEXT: xorl %edx, %edx
+; AVX2-NEXT: divl %ecx
+; AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm0
+; AVX2-NEXT: retq
+ %1 = lshr <4 x i32> <i32 4, i32 4, i32 4, i32 4>, %y
+ %2 = urem <4 x i32> %x, %1
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_vec_urem_by_lshr_pow2b(<4 x i32> %x, <4 x i32> %y) {
+; SSE-LABEL: combine_vec_urem_by_lshr_pow2b:
+; SSE: # %bb.0:
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
+; SSE-NEXT: movdqa {{.*#+}} xmm5 = [1,4,8,16]
+; SSE-NEXT: movdqa %xmm5, %xmm2
+; SSE-NEXT: psrld %xmm4, %xmm2
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7]
+; SSE-NEXT: movdqa %xmm5, %xmm4
+; SSE-NEXT: psrld %xmm3, %xmm4
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,1,1,4,5,6,7]
+; SSE-NEXT: movdqa %xmm5, %xmm6
+; SSE-NEXT: psrld %xmm3, %xmm6
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,3,3,3,4,5,6,7]
+; SSE-NEXT: psrld %xmm1, %xmm5
+; SSE-NEXT: pextrd $1, %xmm5, %ecx
+; SSE-NEXT: pextrd $1, %xmm0, %eax
+; SSE-NEXT: xorl %edx, %edx
+; SSE-NEXT: divl %ecx
+; SSE-NEXT: movl %edx, %ecx
+; SSE-NEXT: movd %xmm6, %esi
+; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: xorl %edx, %edx
+; SSE-NEXT: divl %esi
+; SSE-NEXT: movd %edx, %xmm1
+; SSE-NEXT: pinsrd $1, %ecx, %xmm1
+; SSE-NEXT: pextrd $2, %xmm4, %ecx
+; SSE-NEXT: pextrd $2, %xmm0, %eax
+; SSE-NEXT: xorl %edx, %edx
+; SSE-NEXT: divl %ecx
+; SSE-NEXT: pinsrd $2, %edx, %xmm1
+; SSE-NEXT: pextrd $3, %xmm2, %ecx
+; SSE-NEXT: pextrd $3, %xmm0, %eax
+; SSE-NEXT: xorl %edx, %edx
+; SSE-NEXT: divl %ecx
+; SSE-NEXT: pinsrd $3, %edx, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: combine_vec_urem_by_lshr_pow2b:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,4,8,16]
+; AVX1-NEXT: vpsrld %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; AVX1-NEXT: vpsrld %xmm4, %xmm3, %xmm4
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsrld %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
+; AVX1-NEXT: vpsrld %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpextrd $1, %xmm1, %ecx
+; AVX1-NEXT: vpextrd $1, %xmm0, %eax
+; AVX1-NEXT: xorl %edx, %edx
+; AVX1-NEXT: divl %ecx
+; AVX1-NEXT: movl %edx, %ecx
+; AVX1-NEXT: vmovd %xmm5, %esi
+; AVX1-NEXT: vmovd %xmm0, %eax
+; AVX1-NEXT: xorl %edx, %edx
+; AVX1-NEXT: divl %esi
+; AVX1-NEXT: vmovd %edx, %xmm1
+; AVX1-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
+; AVX1-NEXT: vpextrd $2, %xmm4, %ecx
+; AVX1-NEXT: vpextrd $2, %xmm0, %eax
+; AVX1-NEXT: xorl %edx, %edx
+; AVX1-NEXT: divl %ecx
+; AVX1-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1
+; AVX1-NEXT: vpextrd $3, %xmm2, %ecx
+; AVX1-NEXT: vpextrd $3, %xmm0, %eax
+; AVX1-NEXT: xorl %edx, %edx
+; AVX1-NEXT: divl %ecx
+; AVX1-NEXT: vpinsrd $3, %edx, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_vec_urem_by_lshr_pow2b:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1,4,8,16]
+; AVX2-NEXT: vpsrlvd %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpextrd $1, %xmm1, %ecx
+; AVX2-NEXT: vpextrd $1, %xmm0, %eax
+; AVX2-NEXT: xorl %edx, %edx
+; AVX2-NEXT: divl %ecx
+; AVX2-NEXT: movl %edx, %ecx
+; AVX2-NEXT: vmovd %xmm1, %esi
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: xorl %edx, %edx
+; AVX2-NEXT: divl %esi
+; AVX2-NEXT: vmovd %edx, %xmm2
+; AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
+; AVX2-NEXT: vpextrd $2, %xmm1, %ecx
+; AVX2-NEXT: vpextrd $2, %xmm0, %eax
+; AVX2-NEXT: xorl %edx, %edx
+; AVX2-NEXT: divl %ecx
+; AVX2-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2
+; AVX2-NEXT: vpextrd $3, %xmm1, %ecx
+; AVX2-NEXT: vpextrd $3, %xmm0, %eax
+; AVX2-NEXT: xorl %edx, %edx
+; AVX2-NEXT: divl %ecx
+; AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm0
+; AVX2-NEXT: retq
+ %1 = lshr <4 x i32> <i32 1, i32 4, i32 8, i32 16>, %y
+ %2 = urem <4 x i32> %x, %1
+ ret <4 x i32> %2
+}
+
; FIXME: PR55271 - urem(undef, 3) != undef
; Use PSLLI intrinsic to postpone the undef creation until after urem-by-constant expansion
define <4 x i32> @combine_vec_urem_undef_by_3(<4 x i32> %in) {
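Editorial note, not part of the commit: the scalarized pextrd/divl/pinsrd sequences in the checked-in assembly are the point of the new coverage, since the (urem x, (lshr pow2, y)) -> (and x, (add (lshr pow2, y), -1)) fold named in the title does not fire yet. The sketch below is a minimal standalone C illustration of the arithmetic identity the fold relies on; the helper names urem_div and urem_mask are invented for the example and do not correspond to anything in LLVM.

/* Standalone illustration only: when the divisor is a non-zero power of two,
   an unsigned remainder can be computed with a mask. This is the scalar form
   of the missing fold exercised by the tests above. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical helpers for the demonstration. */
static uint32_t urem_div(uint32_t x, uint32_t d)  { return x % d; }
static uint32_t urem_mask(uint32_t x, uint32_t d) { return x & (d - 1u); }

int main(void) {
  for (uint32_t y = 0; y < 3; ++y) {
    uint32_t d = 4u >> y;  /* (lshr 4, y) stays a power of two for y < 3 */
    for (uint32_t x = 0; x < 4096; ++x)
      assert(urem_div(x, d) == urem_mask(x, d));
  }
  printf("x %% (4 >> y) == x & ((4 >> y) - 1) for all tested x, y\n");
  return 0;
}

Once a DAG combine performs the equivalent rewrite on the vector tests, the per-lane division sequences above should collapse to a vector shift, an add of -1, and an and.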