[llvm] cdf84c7 - [X86] Add test cases for v32i8 rotate with min-legal-vector-width=256
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 2 13:08:10 PDT 2020
Author: Craig Topper
Date: 2020-07-02T13:07:50-07:00
New Revision: cdf84c7b6b7a87949921ae23084f37ce74500800
URL: https://github.com/llvm/llvm-project/commit/cdf84c7b6b7a87949921ae23084f37ce74500800
DIFF: https://github.com/llvm/llvm-project/commit/cdf84c7b6b7a87949921ae23084f37ce74500800.diff
LOG: [X86] Add test cases for v32i8 rotate with min-legal-vector-width=256
We currently don't mark ROTL as custom when avx512bw is enabled,
under the assumption that we'll be able to promote the shifts in the
rotate idiom to 16-bit elements. But if 512-bit registers aren't
enabled, the promoted type isn't available and we can't promote.
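
For reference, the "rotate idiom" here is the shl/lshr/or-by-complementary-amounts
pattern that DAGCombine recognizes as a rotate. A minimal sketch of the shape the
new tests exercise is below; the llc feature flags in the comment are illustrative
and may not match the test file's actual RUN lines.

; Minimal sketch of the v32i8 rotate idiom with 256-bit vectors preferred.
; Illustrative command (assumed flags, not necessarily the test's RUN line):
;   llc -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+prefer-256-bit rotate.ll -o -
define <32 x i8> @rotate_left_3_v32i8(<32 x i8> %a) nounwind "min-legal-vector-width"="256" {
  ; Rotate left by 3: shift left by 3, shift right by 8-3=5, then OR the halves.
  %shl = shl <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  %lshr = lshr <32 x i8> %a, <i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5>
  %or = or <32 x i8> %shl, %lshr
  ret <32 x i8> %or
}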
Added:
Modified:
llvm/test/CodeGen/X86/min-legal-vector-width.ll
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
index ca4af0c75eda..2f06e6eb2000 100644
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -1639,3 +1639,173 @@ define void @cmp_v8i64_zext(<8 x i64>* %xptr, <8 x i64>* %yptr, <8 x i64>* %zptr
store <8 x i64> %ext, <8 x i64>* %zptr
ret void
}
+
+define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind "min-legal-vector-width"="256" {
+; CHECK-LABEL: var_rotate_v16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; CHECK-NEXT: vpsubb %xmm1, %xmm2, %xmm2
+; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; CHECK-NEXT: vpsllvw %ymm1, %ymm0, %ymm1
+; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; CHECK-NEXT: vpsrlvw %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: vpor %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vpmovwb %ymm0, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %b8 = sub <16 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
+ %shl = shl <16 x i8> %a, %b
+ %lshr = lshr <16 x i8> %a, %b8
+ %or = or <16 x i8> %shl, %lshr
+ ret <16 x i8> %or
+}
+
+define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind "min-legal-vector-width"="256" {
+; CHECK-LABEL: var_rotate_v32i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsllw $4, %ymm0, %ymm2
+; CHECK-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; CHECK-NEXT: vpsllw $5, %ymm1, %ymm3
+; CHECK-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm2
+; CHECK-NEXT: vpsllw $2, %ymm2, %ymm4
+; CHECK-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
+; CHECK-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; CHECK-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; CHECK-NEXT: vpsubb %ymm1, %ymm4, %ymm1
+; CHECK-NEXT: vpaddb %ymm2, %ymm2, %ymm4
+; CHECK-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; CHECK-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
+; CHECK-NEXT: vpsllw $5, %ymm1, %ymm1
+; CHECK-NEXT: vpaddb %ymm1, %ymm1, %ymm3
+; CHECK-NEXT: vpsrlw $4, %ymm0, %ymm4
+; CHECK-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
+; CHECK-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
+; CHECK-NEXT: vpsrlw $2, %ymm0, %ymm1
+; CHECK-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; CHECK-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpsrlw $1, %ymm0, %ymm1
+; CHECK-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; CHECK-NEXT: vpaddb %ymm3, %ymm3, %ymm3
+; CHECK-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpor %ymm0, %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %b8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
+ %shl = shl <32 x i8> %a, %b
+ %lshr = lshr <32 x i8> %a, %b8
+ %or = or <32 x i8> %shl, %lshr
+ ret <32 x i8> %or
+}
+
+define <32 x i8> @splatvar_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind "min-legal-vector-width"="256" {
+; CHECK-LABEL: splatvar_rotate_v32i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; CHECK-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpsubb %xmm1, %xmm2, %xmm1
+; CHECK-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpsllw %xmm3, %ymm0, %ymm2
+; CHECK-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpsllw %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vpbroadcastb %xmm3, %ymm3
+; CHECK-NEXT: vpand %ymm3, %ymm2, %ymm2
+; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
+; CHECK-NEXT: vpsrlw $8, %xmm1, %xmm1
+; CHECK-NEXT: vpbroadcastb %xmm1, %ymm1
+; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpor %ymm0, %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
+ %splat8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat
+ %shl = shl <32 x i8> %a, %splat
+ %lshr = lshr <32 x i8> %a, %splat8
+ %or = or <32 x i8> %shl, %lshr
+ ret <32 x i8> %or
+}
+
+define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind "min-legal-vector-width"="256" {
+; CHECK-AVX512-LABEL: constant_rotate_v32i8:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: vpsllw $4, %ymm0, %ymm1
+; CHECK-AVX512-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; CHECK-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
+; CHECK-AVX512-NEXT: # ymm2 = mem[0,1,0,1]
+; CHECK-AVX512-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
+; CHECK-AVX512-NEXT: vpsllw $2, %ymm1, %ymm3
+; CHECK-AVX512-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
+; CHECK-AVX512-NEXT: vpaddb %ymm2, %ymm2, %ymm2
+; CHECK-AVX512-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; CHECK-AVX512-NEXT: vpaddb %ymm1, %ymm1, %ymm3
+; CHECK-AVX512-NEXT: vpaddb %ymm2, %ymm2, %ymm2
+; CHECK-AVX512-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; CHECK-AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
+; CHECK-AVX512-NEXT: vpsllvw {{.*}}(%rip), %ymm3, %ymm3
+; CHECK-AVX512-NEXT: vpsrlw $8, %ymm3, %ymm3
+; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
+; CHECK-AVX512-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
+; CHECK-AVX512-NEXT: vpsrlw $8, %ymm0, %ymm0
+; CHECK-AVX512-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
+; CHECK-AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
+; CHECK-AVX512-NEXT: retq
+;
+; CHECK-VBMI-LABEL: constant_rotate_v32i8:
+; CHECK-VBMI: # %bb.0:
+; CHECK-VBMI-NEXT: vpsllw $4, %ymm0, %ymm1
+; CHECK-VBMI-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; CHECK-VBMI-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
+; CHECK-VBMI-NEXT: # ymm2 = mem[0,1,0,1]
+; CHECK-VBMI-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
+; CHECK-VBMI-NEXT: vpsllw $2, %ymm1, %ymm3
+; CHECK-VBMI-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
+; CHECK-VBMI-NEXT: vpaddb %ymm2, %ymm2, %ymm2
+; CHECK-VBMI-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; CHECK-VBMI-NEXT: vpaddb %ymm1, %ymm1, %ymm3
+; CHECK-VBMI-NEXT: vpaddb %ymm2, %ymm2, %ymm2
+; CHECK-VBMI-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; CHECK-VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
+; CHECK-VBMI-NEXT: vpsllvw {{.*}}(%rip), %ymm3, %ymm3
+; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
+; CHECK-VBMI-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
+; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,33,35,37,39,41,43,45,47,17,19,21,23,25,27,29,31,49,51,53,55,57,59,61,63]
+; CHECK-VBMI-NEXT: vpermi2b %ymm3, %ymm0, %ymm2
+; CHECK-VBMI-NEXT: vpor %ymm2, %ymm1, %ymm0
+; CHECK-VBMI-NEXT: retq
+ %shl = shl <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>
+ %lshr = lshr <32 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
+ %or = or <32 x i8> %shl, %lshr
+ ret <32 x i8> %or
+}
+
+define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind "min-legal-vector-width"="256" {
+; CHECK-LABEL: splatconstant_rotate_v32i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsllw $4, %ymm0, %ymm1
+; CHECK-NEXT: vpsrlw $4, %ymm0, %ymm0
+; CHECK-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
+ %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
+ %or = or <32 x i8> %shl, %lshr
+ ret <32 x i8> %or
+}
+
+define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind "min-legal-vector-width"="256" {
+; CHECK-LABEL: splatconstant_rotate_mask_v32i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsllw $4, %ymm0, %ymm1
+; CHECK-NEXT: vpsrlw $4, %ymm0, %ymm0
+; CHECK-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; CHECK-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; CHECK-NEXT: vpor %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
+ %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
+ %rmask = and <32 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
+ %lmask = and <32 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
+ %or = or <32 x i8> %lmask, %rmask
+ ret <32 x i8> %or
+}