[llvm] acf6c94 - [X86] Teach lower512BitShuffle to try bitmask and bitblend before splitting v32i16/v64i8 on avx512f-only targets.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 2 15:36:26 PDT 2020
Author: Craig Topper
Date: 2020-07-02T15:35:48-07:00
New Revision: acf6c94a3881859988c4cb62172e5bc08ece7f9a
URL: https://github.com/llvm/llvm-project/commit/acf6c94a3881859988c4cb62172e5bc08ece7f9a
DIFF: https://github.com/llvm/llvm-project/commit/acf6c94a3881859988c4cb62172e5bc08ece7f9a.diff
LOG: [X86] Teach lower512BitShuffle to try bitmask and bitblend before splitting v32i16/v64i8 on avx512f-only targets.
We consider v32i16/v64i8 to be legal types on avx512f, but we
don't have most operations until avx512bw. But we can use
and/or/xor operations. So try those before splitting.
This is especially helpful since we turn some ands with constant
masks into shuffles in early DAG combines. So we should make sure
we recover those back to AND.
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/avx512-mask-op.ll
llvm/test/CodeGen/X86/combine-sdiv.ll
llvm/test/CodeGen/X86/pr45443.ll
llvm/test/CodeGen/X86/vector-fshl-512.ll
llvm/test/CodeGen/X86/vector-fshr-512.ll
llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f2ede3f1989d..914f06169577 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -17487,8 +17487,17 @@ static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget, DAG))
return Broadcast;
- if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
+ // Try using bit ops for masking and blending before falling back to
+ // splitting.
+ if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
+ return V;
+ if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
+ return V;
+
return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+ }
// Dispatch to each element type for lowering. If we don't have support for
// specific element type shuffles at 512 bits, immediately split them and
diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll
index 98b2f30b43ca..879ea9146be6 100644
--- a/llvm/test/CodeGen/X86/avx512-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll
@@ -1867,10 +1867,7 @@ define void @store_i8_i1(i8 %x, i1 *%y) {
define <32 x i16> @test_build_vec_v32i1(<32 x i16> %x) {
; KNL-LABEL: test_build_vec_v32i1:
; KNL: ## %bb.0:
-; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1
-; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
-; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; KNL-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test_build_vec_v32i1:
@@ -1885,10 +1882,7 @@ define <32 x i16> @test_build_vec_v32i1(<32 x i16> %x) {
;
; AVX512DQ-LABEL: test_build_vec_v32i1:
; AVX512DQ: ## %bb.0:
-; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1
-; AVX512DQ-NEXT: vextractf64x4 $1, %zmm0, %ymm0
-; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; AVX512DQ-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-NEXT: vandps {{.*}}(%rip), %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test_build_vec_v32i1:
@@ -1902,10 +1896,7 @@ define <32 x i16> @test_build_vec_v32i1(<32 x i16> %x) {
define <32 x i16> @test_build_vec_v32i1_optsize(<32 x i16> %x) optsize {
; KNL-LABEL: test_build_vec_v32i1_optsize:
; KNL: ## %bb.0:
-; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1
-; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
-; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; KNL-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test_build_vec_v32i1_optsize:
@@ -1924,10 +1915,7 @@ define <32 x i16> @test_build_vec_v32i1_optsize(<32 x i16> %x) optsize {
;
; AVX512DQ-LABEL: test_build_vec_v32i1_optsize:
; AVX512DQ: ## %bb.0:
-; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1
-; AVX512DQ-NEXT: vextractf64x4 $1, %zmm0, %ymm0
-; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; AVX512DQ-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-NEXT: vandps {{.*}}(%rip), %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test_build_vec_v32i1_optsize:
@@ -1943,10 +1931,7 @@ define <32 x i16> @test_build_vec_v32i1_optsize(<32 x i16> %x) optsize {
define <32 x i16> @test_build_vec_v32i1_pgso(<32 x i16> %x) !prof !14 {
; KNL-LABEL: test_build_vec_v32i1_pgso:
; KNL: ## %bb.0:
-; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1
-; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
-; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; KNL-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test_build_vec_v32i1_pgso:
@@ -1965,10 +1950,7 @@ define <32 x i16> @test_build_vec_v32i1_pgso(<32 x i16> %x) !prof !14 {
;
; AVX512DQ-LABEL: test_build_vec_v32i1_pgso:
; AVX512DQ: ## %bb.0:
-; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1
-; AVX512DQ-NEXT: vextractf64x4 $1, %zmm0, %ymm0
-; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; AVX512DQ-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-NEXT: vandps {{.*}}(%rip), %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test_build_vec_v32i1_pgso:
@@ -1984,10 +1966,7 @@ define <32 x i16> @test_build_vec_v32i1_pgso(<32 x i16> %x) !prof !14 {
define <64 x i8> @test_build_vec_v64i1(<64 x i8> %x) {
; KNL-LABEL: test_build_vec_v64i1:
; KNL: ## %bb.0:
-; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1
-; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
-; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; KNL-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test_build_vec_v64i1:
@@ -2002,10 +1981,7 @@ define <64 x i8> @test_build_vec_v64i1(<64 x i8> %x) {
;
; AVX512DQ-LABEL: test_build_vec_v64i1:
; AVX512DQ: ## %bb.0:
-; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1
-; AVX512DQ-NEXT: vextractf64x4 $1, %zmm0, %ymm0
-; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; AVX512DQ-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-NEXT: vandps {{.*}}(%rip), %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test_build_vec_v64i1:
diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
index 618b0a8d2606..a221e42ade42 100644
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -933,26 +933,27 @@ define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) {
;
; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpsraw $15, %ymm1, %ymm2
-; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,4,2,16,8,32,64,2,0,4,2,16,8,32,64,2]
-; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX512F-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm2
-; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,2,1,4,3,5,6,1,0,2,1,4,3,5,6,1]
-; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpsravd %zmm4, %zmm2, %zmm2
-; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
-; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
-; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm2
-; AVX512F-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm2
+; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm1
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,4,2,16,8,32,64,2,0,4,2,16,8,32,64,2]
+; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm1
+; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,2,1,4,3,5,6,1,0,2,1,4,3,5,6,1]
+; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT: vpsravd %zmm3, %zmm1, %zmm1
+; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512F-NEXT: vpsraw $15, %ymm4, %ymm5
+; AVX512F-NEXT: vpmulhuw %ymm2, %ymm5, %ymm2
+; AVX512F-NEXT: vpaddw %ymm2, %ymm4, %ymm2
; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
-; AVX512F-NEXT: vpsravd %zmm4, %zmm2, %zmm2
+; AVX512F-NEXT: vpsravd %zmm3, %zmm2, %zmm2
; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
-; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
+; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-NEXT: vpternlogq $216, %zmm2, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
diff --git a/llvm/test/CodeGen/X86/pr45443.ll b/llvm/test/CodeGen/X86/pr45443.ll
index 081451a2758c..1e40ab94e9ca 100644
--- a/llvm/test/CodeGen/X86/pr45443.ll
+++ b/llvm/test/CodeGen/X86/pr45443.ll
@@ -3,29 +3,10 @@
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64
define <16 x float> @PR45443() {
-; X86-LABEL: PR45443:
-; X86: # %bb.0: # %bb
-; X86-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080]
-; X86-NEXT: vfmadd231ps {{.*#+}} zmm0 = (zmm0 * mem) + zmm0
-; X86-NEXT: vpcmpltud {{\.LCPI.*}}{1to16}, %zmm1, %k1
-; X86-NEXT: vpbroadcastd {{.*#+}} ymm2 = [16777215,16777215,16777215,16777215,16777215,16777215,16777215,16777215]
-; X86-NEXT: vpand %ymm2, %ymm1, %ymm1
-; X86-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
-; X86-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1}
-; X86-NEXT: vbroadcastss {{\.LCPI.*}}, %zmm0 {%k1}
-; X86-NEXT: retl
-;
-; X64-LABEL: PR45443:
-; X64: # %bb.0: # %bb
-; X64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080,2181038080]
-; X64-NEXT: vfmadd231ps {{.*#+}} zmm0 = (zmm0 * mem) + zmm0
-; X64-NEXT: vpcmpltud {{.*}}(%rip){1to16}, %zmm1, %k1
-; X64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [16777215,16777215,16777215,16777215,16777215,16777215,16777215,16777215]
-; X64-NEXT: vpand %ymm2, %ymm1, %ymm1
-; X64-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1
-; X64-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1}
-; X64-NEXT: vbroadcastss {{.*}}(%rip), %zmm0 {%k1}
-; X64-NEXT: retq
+; CHECK-LABEL: PR45443:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: vfmadd231ps {{.*#+}} zmm0 = (zmm0 * mem) + zmm0
+; CHECK-NEXT: ret{{[l|q]}}
bb:
%tmp = tail call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> <i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040, i32 1090519040>, <4 x i32> <i32 1, i32 0, i32 undef, i32 undef>)
%tmp4 = tail call fast <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> <float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000, float 0x3FE6300000000000>, <16 x float> undef)
diff --git a/llvm/test/CodeGen/X86/vector-fshl-512.ll b/llvm/test/CodeGen/X86/vector-fshl-512.ll
index 20cfba41a1c4..83a74c657dca 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-512.ll
@@ -1075,17 +1075,16 @@ define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwin
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <u,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768>
; AVX512F-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
-; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm5
-; AVX512F-NEXT: vpor %ymm2, %ymm5, %ymm2
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm2[1,2,3,4,5,6,7]
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
; AVX512F-NEXT: vpmulhuw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpmullw %ymm3, %ymm0, %ymm3
-; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1
+; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT: vpternlogq $216, %zmm2, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v32i16:
@@ -1093,17 +1092,16 @@ define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwin
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = <u,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768>
; AVX512VL-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
-; AVX512VL-NEXT: vpmullw %ymm3, %ymm4, %ymm5
-; AVX512VL-NEXT: vpor %ymm2, %ymm5, %ymm2
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm2[1,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
; AVX512VL-NEXT: vpmulhuw %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpmullw %ymm3, %ymm0, %ymm3
-; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512VL-NEXT: vporq %zmm1, %zmm2, %zmm1
+; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512VL-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512VL-NEXT: vpternlogq $216, %zmm2, %zmm1, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v32i16:
@@ -1150,49 +1148,48 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX512F-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX512F-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm3
-; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm6
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6
-; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm8
-; AVX512F-NEXT: vpblendvb %ymm8, %ymm6, %ymm3, %ymm3
-; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm6
-; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm9
-; AVX512F-NEXT: vpblendvb %ymm9, %ymm6, %ymm3, %ymm3
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm6
-; AVX512F-NEXT: vpxor %xmm10, %xmm10, %xmm10
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm6[8],ymm10[8],ymm6[9],ymm10[9],ymm6[10],ymm10[10],ymm6[11],ymm10[11],ymm6[12],ymm10[12],ymm6[13],ymm10[13],ymm6[14],ymm10[14],ymm6[15],ymm10[15],ymm6[24],ymm10[24],ymm6[25],ymm10[25],ymm6[26],ymm10[26],ymm6[27],ymm10[27],ymm6[28],ymm10[28],ymm6[29],ymm10[29],ymm6[30],ymm10[30],ymm6[31],ymm10[31]
-; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
-; AVX512F-NEXT: # ymm12 = mem[0,1,0,1]
-; AVX512F-NEXT: vpmullw %ymm12, %ymm11, %ymm11
-; AVX512F-NEXT: vpsrlw $8, %ymm11, %ymm11
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm10[0],ymm6[1],ymm10[1],ymm6[2],ymm10[2],ymm6[3],ymm10[3],ymm6[4],ymm10[4],ymm6[5],ymm10[5],ymm6[6],ymm10[6],ymm6[7],ymm10[7],ymm6[16],ymm10[16],ymm6[17],ymm10[17],ymm6[18],ymm10[18],ymm6[19],ymm10[19],ymm6[20],ymm10[20],ymm6[21],ymm10[21],ymm6[22],ymm10[22],ymm6[23],ymm10[23]
-; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
-; AVX512F-NEXT: # ymm13 = mem[0,1,0,1]
-; AVX512F-NEXT: vpmullw %ymm6, %ymm13, %ymm6
-; AVX512F-NEXT: vpsrlw $8, %ymm6, %ymm6
-; AVX512F-NEXT: vpackuswb %ymm11, %ymm6, %ymm6
-; AVX512F-NEXT: vpor %ymm6, %ymm3, %ymm3
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm6 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
-; AVX512F-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm7
+; AVX512F-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm3
+; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm8
+; AVX512F-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3
; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm5, %ymm3, %ymm0, %ymm3
; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm4
-; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
-; AVX512F-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm4
-; AVX512F-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm10[8],ymm1[9],ymm10[9],ymm1[10],ymm10[10],ymm1[11],ymm10[11],ymm1[12],ymm10[12],ymm1[13],ymm10[13],ymm1[14],ymm10[14],ymm1[15],ymm10[15],ymm1[24],ymm10[24],ymm1[25],ymm10[25],ymm1[26],ymm10[26],ymm1[27],ymm10[27],ymm1[28],ymm10[28],ymm1[29],ymm10[29],ymm1[30],ymm10[30],ymm1[31],ymm10[31]
-; AVX512F-NEXT: vpmullw %ymm4, %ymm12, %ymm4
-; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[1],ymm10[1],ymm1[2],ymm10[2],ymm1[3],ymm10[3],ymm1[4],ymm10[4],ymm1[5],ymm10[5],ymm1[6],ymm10[6],ymm1[7],ymm10[7],ymm1[16],ymm10[16],ymm1[17],ymm10[17],ymm1[18],ymm10[18],ymm1[19],ymm10[19],ymm1[20],ymm10[20],ymm1[21],ymm10[21],ymm1[22],ymm10[22],ymm1[23],ymm10[23]
-; AVX512F-NEXT: vpmullw %ymm1, %ymm13, %ymm1
+; AVX512F-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15],ymm3[24],ymm4[24],ymm3[25],ymm4[25],ymm3[26],ymm4[26],ymm3[27],ymm4[27],ymm3[28],ymm4[28],ymm3[29],ymm4[29],ymm3[30],ymm4[30],ymm3[31],ymm4[31]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
+; AVX512F-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX512F-NEXT: vpmullw %ymm6, %ymm5, %ymm5
+; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
+; AVX512F-NEXT: # ymm7 = mem[0,1,0,1]
+; AVX512F-NEXT: vpmullw %ymm7, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15],ymm1[24],ymm4[24],ymm1[25],ymm4[25],ymm1[26],ymm4[26],ymm1[27],ymm4[27],ymm1[28],ymm4[28],ymm1[29],ymm4[29],ymm1[30],ymm4[30],ymm1[31],ymm4[31]
+; AVX512F-NEXT: vpmullw %ymm6, %ymm5, %ymm5
+; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[16],ymm4[16],ymm1[17],ymm4[17],ymm1[18],ymm4[18],ymm1[19],ymm4[19],ymm1[20],ymm4[20],ymm1[21],ymm4[21],ymm1[22],ymm4[22],ymm1[23],ymm4[23]
+; AVX512F-NEXT: vpmullw %ymm7, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512F-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm2 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
+; AVX512F-NEXT: vpternlogq $216, %zmm2, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v64i8:
@@ -1203,49 +1200,48 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX512VL-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX512VL-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm3
-; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm6
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6
-; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm8
-; AVX512VL-NEXT: vpblendvb %ymm8, %ymm6, %ymm3, %ymm3
-; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm6
-; AVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm9
-; AVX512VL-NEXT: vpblendvb %ymm9, %ymm6, %ymm3, %ymm3
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm6
-; AVX512VL-NEXT: vpxor %xmm10, %xmm10, %xmm10
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm6[8],ymm10[8],ymm6[9],ymm10[9],ymm6[10],ymm10[10],ymm6[11],ymm10[11],ymm6[12],ymm10[12],ymm6[13],ymm10[13],ymm6[14],ymm10[14],ymm6[15],ymm10[15],ymm6[24],ymm10[24],ymm6[25],ymm10[25],ymm6[26],ymm10[26],ymm6[27],ymm10[27],ymm6[28],ymm10[28],ymm6[29],ymm10[29],ymm6[30],ymm10[30],ymm6[31],ymm10[31]
-; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
-; AVX512VL-NEXT: # ymm12 = mem[0,1,0,1]
-; AVX512VL-NEXT: vpmullw %ymm12, %ymm11, %ymm11
-; AVX512VL-NEXT: vpsrlw $8, %ymm11, %ymm11
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm10[0],ymm6[1],ymm10[1],ymm6[2],ymm10[2],ymm6[3],ymm10[3],ymm6[4],ymm10[4],ymm6[5],ymm10[5],ymm6[6],ymm10[6],ymm6[7],ymm10[7],ymm6[16],ymm10[16],ymm6[17],ymm10[17],ymm6[18],ymm10[18],ymm6[19],ymm10[19],ymm6[20],ymm10[20],ymm6[21],ymm10[21],ymm6[22],ymm10[22],ymm6[23],ymm10[23]
-; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
-; AVX512VL-NEXT: # ymm13 = mem[0,1,0,1]
-; AVX512VL-NEXT: vpmullw %ymm6, %ymm13, %ymm6
-; AVX512VL-NEXT: vpsrlw $8, %ymm6, %ymm6
-; AVX512VL-NEXT: vpackuswb %ymm11, %ymm6, %ymm6
-; AVX512VL-NEXT: vpor %ymm6, %ymm3, %ymm3
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm6 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
-; AVX512VL-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm7
+; AVX512VL-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm8
+; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm3
; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm5, %ymm3, %ymm0, %ymm3
; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm4
-; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4
-; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3
; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm4
-; AVX512VL-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm3
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm10[8],ymm1[9],ymm10[9],ymm1[10],ymm10[10],ymm1[11],ymm10[11],ymm1[12],ymm10[12],ymm1[13],ymm10[13],ymm1[14],ymm10[14],ymm1[15],ymm10[15],ymm1[24],ymm10[24],ymm1[25],ymm10[25],ymm1[26],ymm10[26],ymm1[27],ymm10[27],ymm1[28],ymm10[28],ymm1[29],ymm10[29],ymm1[30],ymm10[30],ymm1[31],ymm10[31]
-; AVX512VL-NEXT: vpmullw %ymm4, %ymm12, %ymm4
-; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[1],ymm10[1],ymm1[2],ymm10[2],ymm1[3],ymm10[3],ymm1[4],ymm10[4],ymm1[5],ymm10[5],ymm1[6],ymm10[6],ymm1[7],ymm10[7],ymm1[16],ymm10[16],ymm1[17],ymm10[17],ymm1[18],ymm10[18],ymm1[19],ymm10[19],ymm1[20],ymm10[20],ymm1[21],ymm10[21],ymm1[22],ymm10[22],ymm1[23],ymm10[23]
-; AVX512VL-NEXT: vpmullw %ymm1, %ymm13, %ymm1
+; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15],ymm3[24],ymm4[24],ymm3[25],ymm4[25],ymm3[26],ymm4[26],ymm3[27],ymm4[27],ymm3[28],ymm4[28],ymm3[29],ymm4[29],ymm3[30],ymm4[30],ymm3[31],ymm4[31]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
+; AVX512VL-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX512VL-NEXT: vpmullw %ymm6, %ymm5, %ymm5
+; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
+; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
+; AVX512VL-NEXT: # ymm7 = mem[0,1,0,1]
+; AVX512VL-NEXT: vpmullw %ymm7, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512VL-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
+; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15],ymm1[24],ymm4[24],ymm1[25],ymm4[25],ymm1[26],ymm4[26],ymm1[27],ymm4[27],ymm1[28],ymm4[28],ymm1[29],ymm4[29],ymm1[30],ymm4[30],ymm1[31],ymm4[31]
+; AVX512VL-NEXT: vpmullw %ymm6, %ymm5, %ymm5
+; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
+; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[16],ymm4[16],ymm1[17],ymm4[17],ymm1[18],ymm4[18],ymm1[19],ymm4[19],ymm1[20],ymm4[20],ymm1[21],ymm4[21],ymm1[22],ymm4[22],ymm1[23],ymm4[23]
+; AVX512VL-NEXT: vpmullw %ymm7, %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512VL-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512VL-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vpackuswb %ymm5, %ymm1, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512VL-NEXT: vporq %zmm1, %zmm2, %zmm1
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm2 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
+; AVX512VL-NEXT: vpternlogq $216, %zmm2, %zmm1, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v64i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll
index fd59d40cefc8..aa7a0e63f1e7 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll
@@ -1062,36 +1062,34 @@ define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwin
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2>
-; AVX512F-NEXT: vpmulhuw %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5
-; AVX512F-NEXT: vpmullw %ymm3, %ymm5, %ymm5
-; AVX512F-NEXT: vpor %ymm4, %ymm5, %ymm4
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3,4,5,6,7]
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512F-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpmulhuw %ymm3, %ymm1, %ymm4
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm4
; AVX512F-NEXT: vpmullw %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7]
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm2, %zmm0, %zmm2
+; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512F-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT: vpternlogq $202, %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v32i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = <u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2>
-; AVX512VL-NEXT: vpmulhuw %ymm3, %ymm2, %ymm4
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm5
-; AVX512VL-NEXT: vpmullw %ymm3, %ymm5, %ymm5
-; AVX512VL-NEXT: vpor %ymm4, %ymm5, %ymm4
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512VL-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpmulhuw %ymm3, %ymm1, %ymm4
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512VL-NEXT: vpmullw %ymm3, %ymm4, %ymm4
; AVX512VL-NEXT: vpmullw %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm2, %zmm0, %zmm2
+; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512VL-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512VL-NEXT: vpternlogq $202, %zmm1, %zmm2, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v32i16:
@@ -1147,22 +1145,6 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm3
; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm8
; AVX512F-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT: vpxor %xmm9, %xmm9, %xmm9
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm3[8],ymm9[8],ymm3[9],ymm9[9],ymm3[10],ymm9[10],ymm3[11],ymm9[11],ymm3[12],ymm9[12],ymm3[13],ymm9[13],ymm3[14],ymm9[14],ymm3[15],ymm9[15],ymm3[24],ymm9[24],ymm3[25],ymm9[25],ymm3[26],ymm9[26],ymm3[27],ymm9[27],ymm3[28],ymm9[28],ymm3[29],ymm9[29],ymm3[30],ymm9[30],ymm3[31],ymm9[31]
-; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
-; AVX512F-NEXT: # ymm11 = mem[0,1,0,1]
-; AVX512F-NEXT: vpmullw %ymm11, %ymm10, %ymm10
-; AVX512F-NEXT: vpsrlw $8, %ymm10, %ymm10
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm3[0],ymm9[0],ymm3[1],ymm9[1],ymm3[2],ymm9[2],ymm3[3],ymm9[3],ymm3[4],ymm9[4],ymm3[5],ymm9[5],ymm3[6],ymm9[6],ymm3[7],ymm9[7],ymm3[16],ymm9[16],ymm3[17],ymm9[17],ymm3[18],ymm9[18],ymm3[19],ymm9[19],ymm3[20],ymm9[20],ymm3[21],ymm9[21],ymm3[22],ymm9[22],ymm3[23],ymm9[23]
-; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
-; AVX512F-NEXT: # ymm13 = mem[0,1,0,1]
-; AVX512F-NEXT: vpmullw %ymm13, %ymm12, %ymm12
-; AVX512F-NEXT: vpsrlw $8, %ymm12, %ymm12
-; AVX512F-NEXT: vpackuswb %ymm10, %ymm12, %ymm10
-; AVX512F-NEXT: vpor %ymm2, %ymm10, %ymm2
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm10 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
-; AVX512F-NEXT: vpblendvb %ymm10, %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3
; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm5, %ymm3, %ymm0, %ymm0
@@ -1171,16 +1153,31 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512F-NEXT: vpblendvb %ymm7, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm3
; AVX512F-NEXT: vpblendvb %ymm8, %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm9[8],ymm1[9],ymm9[9],ymm1[10],ymm9[10],ymm1[11],ymm9[11],ymm1[12],ymm9[12],ymm1[13],ymm9[13],ymm1[14],ymm9[14],ymm1[15],ymm9[15],ymm1[24],ymm9[24],ymm1[25],ymm9[25],ymm1[26],ymm9[26],ymm1[27],ymm9[27],ymm1[28],ymm9[28],ymm1[29],ymm9[29],ymm1[30],ymm9[30],ymm1[31],ymm9[31]
-; AVX512F-NEXT: vpmullw %ymm3, %ymm11, %ymm3
-; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm1[0],ymm9[0],ymm1[1],ymm9[1],ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[4],ymm9[4],ymm1[5],ymm9[5],ymm1[6],ymm9[6],ymm1[7],ymm9[7],ymm1[16],ymm9[16],ymm1[17],ymm9[17],ymm1[18],ymm9[18],ymm1[19],ymm9[19],ymm1[20],ymm9[20],ymm1[21],ymm9[21],ymm1[22],ymm9[22],ymm1[23],ymm9[23]
-; AVX512F-NEXT: vpmullw %ymm4, %ymm13, %ymm4
-; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
-; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
+; AVX512F-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512F-NEXT: vpmullw %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
+; AVX512F-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX512F-NEXT: vpmullw %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512F-NEXT: vpackuswb %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
+; AVX512F-NEXT: vpmullw %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
+; AVX512F-NEXT: vpmullw %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT: vpackuswb %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT: vporq %zmm2, %zmm0, %zmm2
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm0 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
+; AVX512F-NEXT: vpternlogq $202, %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v64i8:
@@ -1200,22 +1197,6 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3
; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm8
; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512VL-NEXT: vpxor %xmm9, %xmm9, %xmm9
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm3[8],ymm9[8],ymm3[9],ymm9[9],ymm3[10],ymm9[10],ymm3[11],ymm9[11],ymm3[12],ymm9[12],ymm3[13],ymm9[13],ymm3[14],ymm9[14],ymm3[15],ymm9[15],ymm3[24],ymm9[24],ymm3[25],ymm9[25],ymm3[26],ymm9[26],ymm3[27],ymm9[27],ymm3[28],ymm9[28],ymm3[29],ymm9[29],ymm3[30],ymm9[30],ymm3[31],ymm9[31]
-; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
-; AVX512VL-NEXT: # ymm11 = mem[0,1,0,1]
-; AVX512VL-NEXT: vpmullw %ymm11, %ymm10, %ymm10
-; AVX512VL-NEXT: vpsrlw $8, %ymm10, %ymm10
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm3[0],ymm9[0],ymm3[1],ymm9[1],ymm3[2],ymm9[2],ymm3[3],ymm9[3],ymm3[4],ymm9[4],ymm3[5],ymm9[5],ymm3[6],ymm9[6],ymm3[7],ymm9[7],ymm3[16],ymm9[16],ymm3[17],ymm9[17],ymm3[18],ymm9[18],ymm3[19],ymm9[19],ymm3[20],ymm9[20],ymm3[21],ymm9[21],ymm3[22],ymm9[22],ymm3[23],ymm9[23]
-; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
-; AVX512VL-NEXT: # ymm13 = mem[0,1,0,1]
-; AVX512VL-NEXT: vpmullw %ymm13, %ymm12, %ymm12
-; AVX512VL-NEXT: vpsrlw $8, %ymm12, %ymm12
-; AVX512VL-NEXT: vpackuswb %ymm10, %ymm12, %ymm10
-; AVX512VL-NEXT: vpor %ymm2, %ymm10, %ymm2
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm10 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
-; AVX512VL-NEXT: vpblendvb %ymm10, %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm3
; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm5, %ymm3, %ymm0, %ymm0
@@ -1224,16 +1205,31 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512VL-NEXT: vpblendvb %ymm7, %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm9[8],ymm1[9],ymm9[9],ymm1[10],ymm9[10],ymm1[11],ymm9[11],ymm1[12],ymm9[12],ymm1[13],ymm9[13],ymm1[14],ymm9[14],ymm1[15],ymm9[15],ymm1[24],ymm9[24],ymm1[25],ymm9[25],ymm1[26],ymm9[26],ymm1[27],ymm9[27],ymm1[28],ymm9[28],ymm1[29],ymm9[29],ymm1[30],ymm9[30],ymm1[31],ymm9[31]
-; AVX512VL-NEXT: vpmullw %ymm3, %ymm11, %ymm3
-; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm1[0],ymm9[0],ymm1[1],ymm9[1],ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[4],ymm9[4],ymm1[5],ymm9[5],ymm1[6],ymm9[6],ymm1[7],ymm9[7],ymm1[16],ymm9[16],ymm1[17],ymm9[17],ymm1[18],ymm9[18],ymm1[19],ymm9[19],ymm1[20],ymm9[20],ymm1[21],ymm9[21],ymm1[22],ymm9[22],ymm1[23],ymm9[23]
-; AVX512VL-NEXT: vpmullw %ymm4, %ymm13, %ymm4
-; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512VL-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
-; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
+; AVX512VL-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512VL-NEXT: vpmullw %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
+; AVX512VL-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX512VL-NEXT: vpmullw %ymm6, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512VL-NEXT: vpackuswb %ymm4, %ymm2, %ymm2
+; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
+; AVX512VL-NEXT: vpmullw %ymm5, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
+; AVX512VL-NEXT: vpmullw %ymm6, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512VL-NEXT: vpackuswb %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512VL-NEXT: vporq %zmm2, %zmm0, %zmm2
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm0 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
+; AVX512VL-NEXT: vpternlogq $202, %zmm1, %zmm2, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v64i8:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
index 59c476901782..b2d4dc76a10e 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
@@ -201,8 +201,8 @@ define <32 x i16> @shuffle_v32i16_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_1
define <32 x i16> @shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<32 x i16> %a) {
; KNL-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
; KNL: ## %bb.0:
-; KNL-NEXT: vmovaps {{.*#+}} xmm1 = [65535,0,0,0]
-; KNL-NEXT: vandps %ymm1, %ymm0, %ymm0
+; KNL-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; KNL-NEXT: vpandq %zmm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
index fb40e96ee8a6..3199cc0fa9b9 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
@@ -109,8 +109,8 @@ define <64 x i8> @shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_
define <64 x i8> @shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<64 x i8> %a) {
; AVX512F-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovaps {{.*#+}} xmm1 = [255,0,0,0]
-; AVX512F-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
+; AVX512F-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
@@ -122,7 +122,7 @@ define <64 x i8> @shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<64 x i8> %a) {
; AVX512DQ-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovaps {{.*#+}} xmm1 = [255,0,0,0]
-; AVX512DQ-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vandps %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
index 1891fb5a4683..112fd4beed99 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
@@ -313,11 +313,8 @@ define <64 x i8> @test_mm512_mask_blend_epi8(<64 x i8> %A, <64 x i8> %W){
; KNL-LABEL: test_mm512_mask_blend_epi8:
; KNL: # %bb.0: # %entry
; KNL-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
-; KNL-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm3
-; KNL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; KNL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
+; KNL-NEXT: vpternlogq $216, %zmm2, %zmm1, %zmm0
; KNL-NEXT: ret{{[l|q]}}
;
; SKX32-LABEL: test_mm512_mask_blend_epi8:
@@ -342,11 +339,8 @@ define <32 x i16> @test_mm512_mask_blend_epi16(<32 x i16> %A, <32 x i16> %W){
;
; KNL-LABEL: test_mm512_mask_blend_epi16:
; KNL: # %bb.0: # %entry
-; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; KNL-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7],ymm3[8],ymm2[9],ymm3[10],ymm2[11],ymm3[12],ymm2[13],ymm3[14],ymm2[15]
-; KNL-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
-; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; KNL-NEXT: vpbroadcastd {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; KNL-NEXT: vpternlogq $216, %zmm2, %zmm1, %zmm0
; KNL-NEXT: ret{{[l|q]}}
entry:
%0 = shufflevector <32 x i16> %A, <32 x i16> %W, <32 x i32> <i32 32, i32 1, i32 34, i32 3, i32 36, i32 5, i32 38, i32 7, i32 40, i32 9, i32 42, i32 11, i32 44, i32 13, i32 46, i32 15, i32 48, i32 17, i32 50, i32 19, i32 52, i32 21, i32 54, i32 23, i32 56, i32 25, i32 58, i32 27, i32 60, i32 29, i32 62, i32 31>
More information about the llvm-commits
mailing list