[llvm] fbfd78f - [X86] lowerShuffleAsRepeatedMaskAndLanePermute - allow v16i32 sub-lane permutes for v64i8 shuffles
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sun Apr 3 02:05:40 PDT 2022
Author: Simon Pilgrim
Date: 2022-04-03T10:05:10+01:00
New Revision: fbfd78f7aae520421adf4e1f7b35a48a3fb5a8be
URL: https://github.com/llvm/llvm-project/commit/fbfd78f7aae520421adf4e1f7b35a48a3fb5a8be
DIFF: https://github.com/llvm/llvm-project/commit/fbfd78f7aae520421adf4e1f7b35a48a3fb5a8be.diff
LOG: [X86] lowerShuffleAsRepeatedMaskAndLanePermute - allow v16i32 sub-lane permutes for v64i8 shuffles
Without VBMI, we are better off permuting v16i32 sub-lanes, even though it's a variable shuffle, if it allows us to then shuffle v64i8 in-lane repeated masks (PSHUFB etc.).
Fixes #54658
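To illustrate the accounting behind the new SubLaneScale = 4 case, here is a minimal standalone C++ sketch (not LLVM code; it just reproduces the arithmetic from the patched hunk in lowerShuffleAsRepeatedMaskAndLanePermute for a v64i8 shuffle on an AVX512BW target without VBMI):

    // Sub-lane accounting for v64i8 with SubLaneScale = 4 (AVX512BW, no VBMI).
    #include <cstdio>

    int main() {
      const int VectorSizeInBits = 512;              // v64i8
      const int NumElts = 64;
      const int NumLanes = VectorSizeInBits / 128;   // 4 x 128-bit lanes
      const int NumLaneElts = NumElts / NumLanes;    // 16 bytes per lane

      // SubLaneScale = 4 makes each sub-lane a 32-bit element, so the
      // cross-lane step becomes a v16i32 permute (VPERMD) while the
      // repeated per-lane byte mask is still handled in-lane (PSHUFB).
      const int SubLaneScale = 4;
      const int NumSubLanes = NumLanes * SubLaneScale;       // 16 sub-lanes
      const int NumSubLaneElts = NumLaneElts / SubLaneScale; // 4 bytes each

      std::printf("sub-lanes: %d, bytes per sub-lane: %d\n",
                  NumSubLanes, NumSubLaneElts);
      return 0;
    }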
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
llvm/test/CodeGen/X86/x86-interleaved-access.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8ad7d9cf49b4f..4ba32bd578539 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -17277,13 +17277,14 @@ static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
return SDValue();
// On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
- // (with PERMQ/PERMPD). On AVX512BW targets, permuting 64-bit sub-lanes, even
+ // (with PERMQ/PERMPD). On AVX512BW targets, permuting 32-bit sub-lanes, even
// with a variable shuffle, is worth it for 64xi8 vectors. Otherwise we can
// only permute whole 128-bit lanes.
int SubLaneScale = 1;
- if ((Subtarget.hasAVX2() && VT.is256BitVector()) ||
- (Subtarget.hasBWI() && VT == MVT::v64i8))
+ if (Subtarget.hasAVX2() && VT.is256BitVector())
SubLaneScale = 2;
+ if (Subtarget.hasBWI() && VT == MVT::v64i8)
+ SubLaneScale = 4;
int NumSubLanes = NumLanes * SubLaneScale;
int NumSubLaneElts = NumLaneElts / SubLaneScale;
@@ -17292,9 +17293,9 @@ static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
// determine the source sub-lane for each destination sub-lane.
int TopSrcSubLane = -1;
SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
- SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
- SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
- SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
+ SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
+ SubLaneScale,
+ SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
// Extract the sub-lane mask, check that it all comes from the same lane
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
index 518804fcae653..7bb5133f32469 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
@@ -976,84 +976,54 @@ define void @load_i8_stride4_vf32(<128 x i8>* %in.vec, <32 x i8>* %out.vec0, <32
;
; AVX512-LABEL: load_i8_stride4_vf32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqu64 64(%rdi), %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa 64(%rdi), %ymm1
-; AVX512-NEXT: vpmovdb %ymm1, %xmm1
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-NEXT: vmovdqu64 (%rdi), %zmm1
-; AVX512-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: vmovdqa 64(%rdi), %xmm9
-; AVX512-NEXT: vmovdqa 80(%rdi), %xmm10
-; AVX512-NEXT: vmovdqa 96(%rdi), %xmm13
-; AVX512-NEXT: vmovdqa 112(%rdi), %xmm14
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm1, %xmm14, %xmm6
-; AVX512-NEXT: vpshufb %xmm1, %xmm13, %xmm7
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
-; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm7, %xmm10, %xmm0
-; AVX512-NEXT: vpshufb %xmm7, %xmm9, %xmm2
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm6[6,7]
-; AVX512-NEXT: vmovdqa (%rdi), %xmm12
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm6
-; AVX512-NEXT: vmovdqa 32(%rdi), %xmm3
-; AVX512-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm2
-; AVX512-NEXT: vpshufb %xmm1, %xmm3, %xmm1
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512-NEXT: vpshufb %xmm7, %xmm6, %xmm2
-; AVX512-NEXT: vpshufb %xmm7, %xmm12, %xmm7
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm11[4,5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm2, %xmm14, %xmm7
-; AVX512-NEXT: vpshufb %xmm2, %xmm13, %xmm1
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm7, %xmm10, %xmm4
-; AVX512-NEXT: vpshufb %xmm7, %xmm9, %xmm5
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm4
-; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; AVX512-NEXT: vpshufb %xmm7, %xmm6, %xmm4
-; AVX512-NEXT: vpshufb %xmm7, %xmm12, %xmm5
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm2, %xmm14, %xmm4
-; AVX512-NEXT: vpshufb %xmm2, %xmm13, %xmm5
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm5, %xmm10, %xmm7
-; AVX512-NEXT: vpshufb %xmm5, %xmm9, %xmm1
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; AVX512-NEXT: vpshufb %xmm5, %xmm6, %xmm2
-; AVX512-NEXT: vpshufb %xmm5, %xmm12, %xmm3
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT: vmovdqa %ymm8, (%rsi)
-; AVX512-NEXT: vmovdqa %ymm11, (%rdx)
-; AVX512-NEXT: vmovdqa %ymm15, (%rcx)
+; AVX512-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512-NEXT: vmovdqa 64(%rdi), %ymm2
+; AVX512-NEXT: vmovdqa 96(%rdi), %ymm3
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm5
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %ymm6, %ymm1, %ymm7
+; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7]
+; AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm4
+; AVX512-NEXT: vpshufb %ymm6, %ymm0, %ymm6
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [0,4,8,12,1,5,9,13]
+; AVX512-NEXT: vpermd %zmm4, %zmm5, %zmm4
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %ymm6, %ymm3, %ymm7
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %ymm8, %ymm1, %ymm9
+; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2,3,4],ymm7[5],ymm9[6,7]
+; AVX512-NEXT: vpshufb %ymm6, %ymm2, %ymm6
+; AVX512-NEXT: vpshufb %ymm8, %ymm0, %ymm8
+; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
+; AVX512-NEXT: vpermd %zmm6, %zmm5, %zmm6
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %ymm7, %ymm3, %ymm8
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %ymm9, %ymm1, %ymm10
+; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2,3,4],ymm8[5],ymm10[6,7]
+; AVX512-NEXT: vpshufb %ymm7, %ymm2, %ymm7
+; AVX512-NEXT: vpshufb %ymm9, %ymm0, %ymm9
+; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2,3,4],ymm7[5],ymm9[6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7
+; AVX512-NEXT: vpermd %zmm7, %zmm5, %zmm7
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %ymm8, %ymm3, %ymm3
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %ymm9, %ymm1, %ymm1
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7]
+; AVX512-NEXT: vpshufb %ymm8, %ymm2, %ymm2
+; AVX512-NEXT: vpshufb %ymm9, %ymm0, %ymm0
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpermd %zmm0, %zmm5, %zmm0
+; AVX512-NEXT: vmovdqa %ymm4, (%rsi)
+; AVX512-NEXT: vmovdqa %ymm6, (%rdx)
+; AVX512-NEXT: vmovdqa %ymm7, (%rcx)
; AVX512-NEXT: vmovdqa %ymm0, (%r8)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
index 92fede6ec0abd..74528afed7b56 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
@@ -227,19 +227,9 @@ define <64 x i8> @shuffle_v64i8_01_03_02_05_07_06_09_11_10_13_15_14_17_19_18_21_
;
; AVX512BW-LABEL: shuffle_v64i8_01_03_02_05_07_06_09_11_10_13_15_14_17_19_18_21_23_22_25_27_26_29_31_30_33_35_34_37_39_38_41_43_42_45_47_46_49_51_50_53_55_54_57_59_58_61_63_62_01_03_02_05_01_03_02_05_01_03_02_05_01_03_02_05:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[10,13,15,14,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm0[0,1]
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,1,3,2,5,7,6,9,11,10,13,15,14,17,19,18,21,17,19,18,21,17,19,18,21,17,19,18,21]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,1,3,2,5,7,6,9,11]
-; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,u,u,1,3,2,5]
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,2,5,7,6,9,11,10,13,15,14,u,u,u,u,23,22,25,27,26,29,31,30,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,3,2,5,7,6,9,11,10,13,15,14,u,u,u,u,17,19,18,21,23,22,25,27,26,29,31,30,u,u,u,u,33,35,34,37,39,38,41,43,42,45,47,46,u,u,u,u,49,51,50,53,55,54,57,59,58,61,63,62,u,u,u,u]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,4,5,6,8,9,10,12,13,14,0,0,0,0]
+; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: shuffle_v64i8_01_03_02_05_07_06_09_11_10_13_15_14_17_19_18_21_23_22_25_27_26_29_31_30_33_35_34_37_39_38_41_43_42_45_47_46_49_51_50_53_55_54_57_59_58_61_63_62_01_03_02_05_01_03_02_05_01_03_02_05_01_03_02_05:
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index 0c5feb06243b4..64947cb8f41b6 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -762,79 +762,53 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(<128 x i8>* %ptr) {
;
; AVX512-LABEL: interleaved_load_vf32_i8_stride4:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1
-; AVX512-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: vmovdqa 64(%rdi), %xmm10
-; AVX512-NEXT: vmovdqa 80(%rdi), %xmm11
-; AVX512-NEXT: vmovdqa 96(%rdi), %xmm12
-; AVX512-NEXT: vmovdqa 112(%rdi), %xmm14
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm1, %xmm14, %xmm0
-; AVX512-NEXT: vpshufb %xmm1, %xmm12, %xmm4
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm4
-; AVX512-NEXT: vpshufb %xmm2, %xmm10, %xmm6
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-NEXT: vmovdqa (%rdi), %xmm13
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm6
-; AVX512-NEXT: vmovdqa 32(%rdi), %xmm7
-; AVX512-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm3
-; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm1
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; AVX512-NEXT: vpshufb %xmm2, %xmm6, %xmm3
-; AVX512-NEXT: vpshufb %xmm2, %xmm13, %xmm2
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm8[4,5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm1, %xmm14, %xmm2
-; AVX512-NEXT: vpshufb %xmm1, %xmm12, %xmm3
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm3, %xmm11, %xmm4
-; AVX512-NEXT: vpshufb %xmm3, %xmm10, %xmm5
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm4
-; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm1
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
-; AVX512-NEXT: vpshufb %xmm3, %xmm6, %xmm4
-; AVX512-NEXT: vpshufb %xmm3, %xmm13, %xmm3
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm2, %xmm14, %xmm3
-; AVX512-NEXT: vpshufb %xmm2, %xmm12, %xmm4
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm4, %xmm11, %xmm5
-; AVX512-NEXT: vpshufb %xmm4, %xmm10, %xmm1
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpshufb %xmm2, %xmm7, %xmm2
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; AVX512-NEXT: vpshufb %xmm4, %xmm6, %xmm2
-; AVX512-NEXT: vpshufb %xmm4, %xmm13, %xmm3
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT: vpcmpeqb %zmm8, %zmm9, %k0
-; AVX512-NEXT: vpcmpeqb %zmm0, %zmm15, %k1
+; AVX512-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512-NEXT: vmovdqa 32(%rdi), %ymm2
+; AVX512-NEXT: vmovdqa 64(%rdi), %ymm1
+; AVX512-NEXT: vmovdqa 96(%rdi), %ymm3
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm5
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %ymm6, %ymm2, %ymm7
+; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7]
+; AVX512-NEXT: vpshufb %ymm4, %ymm1, %ymm4
+; AVX512-NEXT: vpshufb %ymm6, %ymm0, %ymm6
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [0,4,8,12,1,5,9,13]
+; AVX512-NEXT: vpermd %zmm4, %zmm5, %zmm4
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %ymm6, %ymm3, %ymm7
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %ymm8, %ymm2, %ymm9
+; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2,3,4],ymm7[5],ymm9[6,7]
+; AVX512-NEXT: vpshufb %ymm6, %ymm1, %ymm6
+; AVX512-NEXT: vpshufb %ymm8, %ymm0, %ymm8
+; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
+; AVX512-NEXT: vpermd %zmm6, %zmm5, %zmm6
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %ymm7, %ymm3, %ymm8
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %ymm9, %ymm2, %ymm10
+; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2,3,4],ymm8[5],ymm10[6,7]
+; AVX512-NEXT: vpshufb %ymm7, %ymm1, %ymm7
+; AVX512-NEXT: vpshufb %ymm9, %ymm0, %ymm9
+; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0],ymm7[1],ymm9[2,3,4],ymm7[5],ymm9[6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7
+; AVX512-NEXT: vpermd %zmm7, %zmm5, %zmm7
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %ymm8, %ymm3, %ymm3
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %ymm9, %ymm2, %ymm2
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7]
+; AVX512-NEXT: vpshufb %ymm8, %ymm1, %ymm1
+; AVX512-NEXT: vpshufb %ymm9, %ymm0, %ymm0
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vpermd %zmm0, %zmm5, %zmm0
+; AVX512-NEXT: vpcmpeqb %zmm6, %zmm4, %k0
+; AVX512-NEXT: vpcmpeqb %zmm0, %zmm7, %k1
; AVX512-NEXT: kxnord %k1, %k0, %k0
; AVX512-NEXT: vpmovm2b %k0, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0