[llvm] r322090 - X86 Tests: Update more isel tests with FastVariableShuffle feature
Zvi Rackover via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 9 08:26:06 PST 2018
Author: zvi
Date: Tue Jan 9 08:26:06 2018
New Revision: 322090
URL: http://llvm.org/viewvc/llvm-project?rev=322090&view=rev
Log:
X86 Tests: Update more isel tests with FastVariableShuffle feature
Summary:
Added the FastVariableShuffle feature to cases that resembled processors
for which this fearure is on.
For AVX2 there are processors with and w/o this fearue enable.
For AVX512 only KNL does enable this feature so cases which only have
+avx512f were left without the FastVariableShuffle enabled.
Reviewers: RKSimon, craig.topper
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D41851
Modified:
llvm/trunk/test/CodeGen/X86/avx2-conversions.ll
llvm/trunk/test/CodeGen/X86/avx2-vector-shifts.ll
llvm/trunk/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
llvm/trunk/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll
llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
llvm/trunk/test/CodeGen/X86/avx512-trunc.ll
llvm/trunk/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
llvm/trunk/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
llvm/trunk/test/CodeGen/X86/broadcastm-lowering.ll
llvm/trunk/test/CodeGen/X86/combine-shl.ll
llvm/trunk/test/CodeGen/X86/combine-sra.ll
llvm/trunk/test/CodeGen/X86/combine-srl.ll
llvm/trunk/test/CodeGen/X86/insertelement-zero.ll
llvm/trunk/test/CodeGen/X86/oddshuffles.ll
llvm/trunk/test/CodeGen/X86/psubus.ll
llvm/trunk/test/CodeGen/X86/shuffle-of-splat-multiuses.ll
llvm/trunk/test/CodeGen/X86/shuffle-strided-with-offset-128.ll
llvm/trunk/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
llvm/trunk/test/CodeGen/X86/shuffle-strided-with-offset-512.ll
llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-128.ll
llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-256.ll
llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512.ll
llvm/trunk/test/CodeGen/X86/vector-half-conversions.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll
llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll
llvm/trunk/test/CodeGen/X86/vector-trunc.ll
llvm/trunk/test/CodeGen/X86/vector-zext.ll
Modified: llvm/trunk/test/CodeGen/X86/avx2-conversions.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx2-conversions.ll?rev=322090&r1=322089&r2=322090&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx2-conversions.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx2-conversions.ll Tue Jan 9 08:26:06 2018
@@ -1,23 +1,41 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X32,X32-SLOW
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=X32,X32-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=X64,X64-FAST
define <4 x i32> @trunc4(<4 x i64> %A) nounwind {
-; X32-LABEL: trunc4:
-; X32: # %bb.0:
-; X32-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; X32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; X32-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
-; X32-NEXT: vzeroupper
-; X32-NEXT: retl
+; X32-SLOW-LABEL: trunc4:
+; X32-SLOW: # %bb.0:
+; X32-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; X32-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; X32-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; X32-SLOW-NEXT: vzeroupper
+; X32-SLOW-NEXT: retl
;
-; X64-LABEL: trunc4:
-; X64: # %bb.0:
-; X64-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; X64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; X64-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
-; X64-NEXT: vzeroupper
-; X64-NEXT: retq
+; X32-FAST-LABEL: trunc4:
+; X32-FAST: # %bb.0:
+; X32-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
+; X32-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; X32-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; X32-FAST-NEXT: vzeroupper
+; X32-FAST-NEXT: retl
+;
+; X64-SLOW-LABEL: trunc4:
+; X64-SLOW: # %bb.0:
+; X64-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; X64-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; X64-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; X64-SLOW-NEXT: vzeroupper
+; X64-SLOW-NEXT: retq
+;
+; X64-FAST-LABEL: trunc4:
+; X64-FAST: # %bb.0:
+; X64-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
+; X64-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; X64-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; X64-FAST-NEXT: vzeroupper
+; X64-FAST-NEXT: retq
%B = trunc <4 x i64> %A to <4 x i32>
ret <4 x i32>%B
}
Modified: llvm/trunk/test/CodeGen/X86/avx2-vector-shifts.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx2-vector-shifts.ll?rev=322090&r1=322089&r2=322090&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx2-vector-shifts.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx2-vector-shifts.ll Tue Jan 9 08:26:06 2018
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X32,X32-SLOW
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=X32,X32-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=X64,X64-FAST
; AVX2 Logical Shift Left
@@ -372,25 +374,45 @@ entry:
}
define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind {
-; X32-LABEL: srl_trunc_and_v4i64:
-; X32: # %bb.0:
-; X32-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; X32-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; X32-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
-; X32-NEXT: vpand %xmm2, %xmm1, %xmm1
-; X32-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
-; X32-NEXT: vzeroupper
-; X32-NEXT: retl
-;
-; X64-LABEL: srl_trunc_and_v4i64:
-; X64: # %bb.0:
-; X64-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; X64-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; X64-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
-; X64-NEXT: vpand %xmm2, %xmm1, %xmm1
-; X64-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
-; X64-NEXT: vzeroupper
-; X64-NEXT: retq
+; X32-SLOW-LABEL: srl_trunc_and_v4i64:
+; X32-SLOW: # %bb.0:
+; X32-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; X32-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; X32-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
+; X32-SLOW-NEXT: vpand %xmm2, %xmm1, %xmm1
+; X32-SLOW-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; X32-SLOW-NEXT: vzeroupper
+; X32-SLOW-NEXT: retl
+;
+; X32-FAST-LABEL: srl_trunc_and_v4i64:
+; X32-FAST: # %bb.0:
+; X32-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; X32-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; X32-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
+; X32-FAST-NEXT: vpand %xmm2, %xmm1, %xmm1
+; X32-FAST-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; X32-FAST-NEXT: vzeroupper
+; X32-FAST-NEXT: retl
+;
+; X64-SLOW-LABEL: srl_trunc_and_v4i64:
+; X64-SLOW: # %bb.0:
+; X64-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; X64-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; X64-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
+; X64-SLOW-NEXT: vpand %xmm2, %xmm1, %xmm1
+; X64-SLOW-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; X64-SLOW-NEXT: vzeroupper
+; X64-SLOW-NEXT: retq
+;
+; X64-FAST-LABEL: srl_trunc_and_v4i64:
+; X64-FAST: # %bb.0:
+; X64-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; X64-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; X64-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
+; X64-FAST-NEXT: vpand %xmm2, %xmm1, %xmm1
+; X64-FAST-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; X64-FAST-NEXT: vzeroupper
+; X64-FAST-NEXT: retq
%and = and <4 x i64> %y, <i64 8, i64 8, i64 8, i64 8>
%trunc = trunc <4 x i64> %and to <4 x i32>
%sra = lshr <4 x i32> %x, %trunc
Modified: llvm/trunk/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-extract-subvector-load-store.ll?rev=322090&r1=322089&r2=322090&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-extract-subvector-load-store.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-extract-subvector-load-store.ll Tue Jan 9 08:26:06 2018
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -O2 | FileCheck %s --check-prefix=AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl -O2 | FileCheck %s --check-prefix=AVX512NOTDQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq,+fast-variable-shuffle -O2 | FileCheck %s --check-prefix=AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+fast-variable-shuffle -O2 | FileCheck %s --check-prefix=AVX512NOTDQ
define void @load_v8i1_broadcast_4_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
; AVX512-LABEL: load_v8i1_broadcast_4_v2i1:
@@ -331,8 +331,8 @@ define void @load_v32i1_broadcast_31_v8i
; AVX512-NEXT: kmovd (%rdi), %k0
; AVX512-NEXT: kshiftrd $24, %k0, %k0
; AVX512-NEXT: vpmovm2d %k0, %ymm2
-; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7]
-; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
+; AVX512-NEXT: vpermd %ymm2, %ymm3, %ymm2
; AVX512-NEXT: vpmovd2m %ymm2, %k1
; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512-NEXT: vmovaps %ymm1, (%rsi)
@@ -345,8 +345,8 @@ define void @load_v32i1_broadcast_31_v8i
; AVX512NOTDQ-NEXT: kshiftrd $24, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
-; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7]
-; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
+; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
+; AVX512NOTDQ-NEXT: vpermd %ymm2, %ymm3, %ymm2
; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
@@ -541,8 +541,8 @@ define void @load_v64i1_broadcast_63_v8i
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $56, %k0, %k0
; AVX512-NEXT: vpmovm2d %k0, %ymm2
-; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7]
-; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
+; AVX512-NEXT: vpermd %ymm2, %ymm3, %ymm2
; AVX512-NEXT: vpmovd2m %ymm2, %k1
; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512-NEXT: vmovaps %ymm1, (%rsi)
@@ -555,8 +555,8 @@ define void @load_v64i1_broadcast_63_v8i
; AVX512NOTDQ-NEXT: kshiftrq $56, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
-; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7]
-; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
+; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
+; AVX512NOTDQ-NEXT: vpermd %ymm2, %ymm3, %ymm2
; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
@@ -1134,8 +1134,8 @@ define void @load_v32i1_broadcast_31_v8i
; AVX512-NEXT: kmovd (%rdi), %k0
; AVX512-NEXT: kshiftrd $24, %k0, %k0
; AVX512-NEXT: vpmovm2d %k0, %ymm0
-; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7]
-; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
+; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpmovd2m %ymm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: vzeroupper
@@ -1147,8 +1147,8 @@ define void @load_v32i1_broadcast_31_v8i
; AVX512NOTDQ-NEXT: kshiftrd $24, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7]
-; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
+; AVX512NOTDQ-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
@@ -1369,8 +1369,8 @@ define void @load_v64i1_broadcast_63_v8i
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $56, %k0, %k0
; AVX512-NEXT: vpmovm2d %k0, %ymm0
-; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7]
-; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
+; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpmovd2m %ymm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: vzeroupper
@@ -1382,8 +1382,8 @@ define void @load_v64i1_broadcast_63_v8i
; AVX512NOTDQ-NEXT: kshiftrq $56, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7]
-; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
+; AVX512NOTDQ-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
Modified: llvm/trunk/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll?rev=322090&r1=322089&r2=322090&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll Tue Jan 9 08:26:06 2018
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512dq %s -o - | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512dq,+fast-variable-shuffle %s -o - | FileCheck %s
; FIXME: fixing PR34394 should fix the i32x2 memory cases resulting in a simple vbroadcasti32x2 instruction.
@@ -459,8 +459,8 @@ define <4 x i32> @test_masked_z_2xi32_to
define <8 x i32> @test_2xi32_to_8xi32_mem(<2 x i32>* %vp) {
; CHECK-LABEL: test_2xi32_to_8xi32_mem:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0
; CHECK-NEXT: retq
%vec = load <2 x i32>, <2 x i32>* %vp
@@ -470,8 +470,8 @@ define <8 x i32> @test_2xi32_to_8xi32_me
define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask0(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
-; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = xmm2[0,1,0,1,0,1,0,1]
@@ -486,8 +486,8 @@ define <8 x i32> @test_masked_2xi32_to_8
define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask0(<2 x i32>* %vp, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm1[0,1,0,1,0,1,0,1]
@@ -501,8 +501,8 @@ define <8 x i32> @test_masked_z_2xi32_to
define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask1(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
-; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = xmm2[0,1,0,1,0,1,0,1]
@@ -517,8 +517,8 @@ define <8 x i32> @test_masked_2xi32_to_8
define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask1(<2 x i32>* %vp, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm1[0,1,0,1,0,1,0,1]
@@ -532,8 +532,8 @@ define <8 x i32> @test_masked_z_2xi32_to
define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask2(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
-; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = xmm2[0,1,0,1,0,1,0,1]
@@ -548,8 +548,8 @@ define <8 x i32> @test_masked_2xi32_to_8
define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask2(<2 x i32>* %vp, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm1[0,1,0,1,0,1,0,1]
@@ -563,8 +563,8 @@ define <8 x i32> @test_masked_z_2xi32_to
define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask3(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
-; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = xmm2[0,1,0,1,0,1,0,1]
@@ -579,8 +579,8 @@ define <8 x i32> @test_masked_2xi32_to_8
define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask3(<2 x i32>* %vp, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm1[0,1,0,1,0,1,0,1]
Modified: llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll?rev=322090&r1=322089&r2=322090&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll Tue Jan 9 08:26:06 2018
@@ -1,16 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw %s -o - | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-shuffle %s -o - | FileCheck %s
; FIXME: All cases here should be fixed by PR34380
define <8 x i16> @test_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec) {
; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,6,4]
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[14,15,12,13,12,13,8,9,14,15,12,13,12,13,8,9]
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
-; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,10,11,0,1,2,3,12,13,0,1]
; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4],xmm0[5,6,7]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -20,11 +18,9 @@ define <8 x i16> @test_16xi16_to_8xi16_p
define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,7,6,6,4]
-; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[14,15,12,13,12,13,8,9,14,15,12,13,12,13,8,9]
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
-; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,10,11,0,1,2,3,12,13,0,1]
; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3,4],xmm0[5,6,7]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
@@ -40,11 +36,9 @@ define <8 x i16> @test_masked_16xi16_to_
define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,7,6,6,4]
-; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[14,15,12,13,12,13,8,9,14,15,12,13,12,13,8,9]
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
-; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,10,11,0,1,2,3,12,13,0,1]
; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3,4],xmm0[5,6,7]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
@@ -59,11 +53,10 @@ define <8 x i16> @test_masked_z_16xi16_t
define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
-; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13]
-; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,3]
-; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7]
+; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,12,13,14,15]
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
@@ -78,11 +71,10 @@ define <8 x i16> @test_masked_16xi16_to_
define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
-; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13]
-; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,3]
-; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7]
+; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,12,13,14,15]
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
@@ -96,11 +88,10 @@ define <8 x i16> @test_masked_z_16xi16_t
define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
-; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[12,13,6,7,12,13,4,5,0,1,2,3,12,13,2,3]
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
-; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
-; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4,5,6],xmm3[7]
+; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[8,9,10,11,4,5,6,7,14,15,2,3,12,13,14,15]
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,12,13,4,5,0,1,2,3,12,13,2,3]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3],xmm3[4,5,6],xmm0[7]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
@@ -115,11 +106,10 @@ define <8 x i16> @test_masked_16xi16_to_
define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
-; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,6,7,12,13,4,5,0,1,2,3,12,13,2,3]
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
-; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
-; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4,5,6],xmm2[7]
+; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[8,9,10,11,4,5,6,7,14,15,2,3,12,13,14,15]
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,12,13,4,5,0,1,2,3,12,13,2,3]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3],xmm2[4,5,6],xmm0[7]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
@@ -181,11 +171,10 @@ define <8 x i16> @test_16xi16_to_8xi16_p
; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm0
-; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,0]
-; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
-; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6],xmm0[7]
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,10,11,6,7,8,9,10,11,0,1,2,3]
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %vp
@@ -196,11 +185,10 @@ define <8 x i16> @test_masked_16xi16_to_
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm2
-; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
-; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2
-; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,0]
-; CHECK-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
-; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4],xmm2[5],xmm3[6],xmm2[7]
+; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3
+; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,10,11,10,11,6,7,8,9,10,11,0,1,2,3]
+; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6],xmm3[7]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqw %xmm3, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1}
@@ -217,11 +205,10 @@ define <8 x i16> @test_masked_z_16xi16_t
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm1
-; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
-; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,0]
-; CHECK-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
-; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6],xmm1[7]
+; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2
+; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,10,11,10,11,6,7,8,9,10,11,0,1,2,3]
+; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6],xmm2[7]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpeqw %xmm2, %xmm0, %k1
; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z}
@@ -2192,11 +2179,11 @@ define <4 x i64> @test_masked_8xi64_to_4
; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,2,1]
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,0,2,5]
+; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1
+; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 4, i32 6, i32 1>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2207,11 +2194,12 @@ define <4 x i64> @test_masked_8xi64_to_4
define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,2,1]
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,0,2,5]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1
+; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 4, i32 6, i32 1>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2222,11 +2210,11 @@ define <4 x i64> @test_masked_8xi64_to_4
; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,3]
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,7,2,7]
+; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1
+; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2237,11 +2225,12 @@ define <4 x i64> @test_masked_8xi64_to_4
define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,3]
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,7,2,7]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1
+; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2251,9 +2240,10 @@ define <4 x i64> @test_masked_z_8xi64_to
define <4 x i64> @test_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec) {
; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,0,3]
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [2,4,4,3]
+; CHECK-NEXT: vpermi2q %ymm0, %ymm2, %ymm1
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
ret <4 x i64> %res
@@ -2262,11 +2252,11 @@ define <4 x i64> @test_masked_8xi64_to_4
; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,0,3]
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,4,4,3]
+; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1
+; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2277,11 +2267,12 @@ define <4 x i64> @test_masked_8xi64_to_4
define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,0,3]
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,4,4,3]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1
+; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2292,11 +2283,11 @@ define <4 x i64> @test_masked_8xi64_to_4
; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask4:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,3,1]
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [7,3,3,1]
+; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1
+; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2307,11 +2298,12 @@ define <4 x i64> @test_masked_8xi64_to_4
define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,3,1]
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [7,3,3,1]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1
+; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2352,9 +2344,10 @@ define <4 x i64> @test_masked_z_8xi64_to
define <4 x i64> @test_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec) {
; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask6:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,2,1,3]
-; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,2,1,7]
+; CHECK-NEXT: vpermi2q %ymm0, %ymm2, %ymm1
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
ret <4 x i64> %res
@@ -2363,11 +2356,11 @@ define <4 x i64> @test_masked_8xi64_to_4
; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask6:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,2,1,3]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [3,2,1,7]
+; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1
+; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2378,12 +2371,12 @@ define <4 x i64> @test_masked_8xi64_to_4
define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask6:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,2,1,3]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,7]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1
+; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2537,10 +2530,11 @@ define <4 x i64> @test_masked_8xi64_to_4
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %ymm3, %ymm1, %k1
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[0,3,2,0]
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [4,3,2,4]
+; CHECK-NEXT: vpermi2q %ymm2, %ymm3, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 7, i32 6, i32 0>
@@ -2552,12 +2546,13 @@ define <4 x i64> @test_masked_8xi64_to_4
define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask1(<8 x i64>* %vp, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm0, %k1
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[0,3,2,0]
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,4]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1
+; CHECK-NEXT: vpermi2q %ymm2, %ymm3, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 7, i32 6, i32 0>
@@ -2571,10 +2566,11 @@ define <4 x i64> @test_masked_8xi64_to_4
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %ymm3, %ymm1, %k1
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,1,1,0]
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [3,5,5,1]
+; CHECK-NEXT: vpermi2q %ymm2, %ymm3, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 1, i32 1, i32 5>
@@ -2586,12 +2582,13 @@ define <4 x i64> @test_masked_8xi64_to_4
define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm0, %k1
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,1,1,0]
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,5,5,1]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1
+; CHECK-NEXT: vpermi2q %ymm2, %ymm3, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 1, i32 1, i32 5>
@@ -2603,10 +2600,10 @@ define <4 x i64> @test_masked_z_8xi64_to
define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp) {
; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,1,3]
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [7,0,0,2]
+; CHECK-NEXT: vpermi2q %ymm2, %ymm1, %ymm0
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
%res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2>
@@ -2617,10 +2614,11 @@ define <4 x i64> @test_masked_8xi64_to_4
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %ymm3, %ymm1, %k1
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,1,1,3]
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [7,0,0,2]
+; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2>
@@ -2632,12 +2630,13 @@ define <4 x i64> @test_masked_8xi64_to_4
define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm0, %k1
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,1,1,3]
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,0,0,2]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1
+; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2>
@@ -2687,10 +2686,11 @@ define <4 x i64> @test_masked_8xi64_to_4
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %ymm3, %ymm1, %k1
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[0,2,3,1]
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,7,1]
+; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 7, i32 1>
@@ -2702,12 +2702,13 @@ define <4 x i64> @test_masked_8xi64_to_4
define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(<8 x i64>* %vp, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm0, %k1
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[0,2,3,1]
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,7,1]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1
+; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 7, i32 1>
@@ -2769,10 +2770,11 @@ define <4 x i64> @test_masked_8xi64_to_4
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpcmpeqq %ymm3, %ymm1, %k1
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,2,0,1]
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [3,3,1,5]
+; CHECK-NEXT: vpermi2q %ymm2, %ymm3, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 7, i32 5, i32 1>
@@ -2784,12 +2786,13 @@ define <4 x i64> @test_masked_8xi64_to_4
define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(<8 x i64>* %vp, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
-; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpcmpeqq %ymm2, %ymm0, %k1
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,2,0,1]
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,1,5]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1
+; CHECK-NEXT: vpermi2q %ymm2, %ymm3, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%vec = load <8 x i64>, <8 x i64>* %vp
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 7, i32 5, i32 1>
@@ -3465,11 +3468,11 @@ define <4 x float> @test_masked_z_16xflo
define <4 x float> @test_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec) {
; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,1,3,3]
-; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
+; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm1
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
+; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
@@ -3478,11 +3481,11 @@ define <4 x float> @test_16xfloat_to_4xf
define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,3,3]
-; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2],xmm0[3]
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,6,4,6,6,7]
+; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm3
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
+; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3]
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
@@ -3497,11 +3500,11 @@ define <4 x float> @test_masked_16xfloat
define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,3,3]
-; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
+; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
@@ -4022,9 +4025,10 @@ define <2 x double> @test_masked_z_4xdou
define <4 x double> @test_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec) {
; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [3,7,3,7]
+; CHECK-NEXT: vpermi2pd %ymm0, %ymm2, %ymm1
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
; CHECK-NEXT: retq
%res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
ret <4 x double> %res
@@ -4032,12 +4036,12 @@ define <4 x double> @test_8xdouble_to_4x
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,3]
-; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [3,7,3,7]
+; CHECK-NEXT: vpermi2pd %ymm0, %ymm3, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
+; CHECK-NEXT: vblendmpd %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -4048,11 +4052,12 @@ define <4 x double> @test_masked_8xdoubl
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,3]
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [3,7,3,7]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
+; CHECK-NEXT: vpermi2pd %ymm0, %ymm3, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovapd %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -4191,12 +4196,12 @@ define <4 x double> @test_masked_z_8xdou
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,2]
-; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [2,6,2,2]
+; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
+; CHECK-NEXT: vblendmpd %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 6, i32 2, i32 2>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -4207,11 +4212,12 @@ define <4 x double> @test_masked_8xdoubl
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,2]
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [2,6,2,2]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
+; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovapd %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 6, i32 2, i32 2>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -4221,9 +4227,10 @@ define <4 x double> @test_masked_z_8xdou
define <4 x double> @test_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec) {
; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask6:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[0],ymm1[3],ymm0[2]
-; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1]
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [1,4,3,4]
+; CHECK-NEXT: vpermi2pd %ymm0, %ymm2, %ymm1
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
; CHECK-NEXT: retq
%res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
ret <4 x double> %res
@@ -4232,11 +4239,11 @@ define <4 x double> @test_masked_8xdoubl
; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6:
; CHECK: # %bb.0:
; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm3[1],ymm0[0],ymm3[3],ymm0[2]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,1]
-; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [1,4,3,4]
+; CHECK-NEXT: vpermi2pd %ymm0, %ymm3, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
+; CHECK-NEXT: vblendmpd %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -4247,11 +4254,12 @@ define <4 x double> @test_masked_8xdoubl
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[1],ymm0[0],ymm2[3],ymm0[2]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,1]
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [1,4,3,4]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
+; CHECK-NEXT: vpermi2pd %ymm0, %ymm3, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovapd %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -4262,11 +4270,11 @@ define <4 x double> @test_masked_8xdoubl
; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7:
; CHECK: # %bb.0:
; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1,2],ymm0[3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
-; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,1,0,2]
-; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [3,5,0,6]
+; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
+; CHECK-NEXT: vblendmpd %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 5, i32 0, i32 6>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -4277,11 +4285,12 @@ define <4 x double> @test_masked_8xdoubl
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask7:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,1,0,2]
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [3,5,0,6]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
+; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovapd %ymm2, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 5, i32 0, i32 6>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -4291,10 +4300,10 @@ define <4 x double> @test_masked_z_8xdou
define <2 x double> @test_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec) {
; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; CHECK-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [0,6,2,6]
+; CHECK-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1
+; CHECK-NEXT: vmovapd %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
@@ -4303,12 +4312,12 @@ define <2 x double> @test_8xdouble_to_2x
define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
-; CHECK-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [0,6,2,6]
+; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vcmpeqpd %xmm0, %xmm2, %k1
+; CHECK-NEXT: vblendmpd %xmm4, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
@@ -4320,12 +4329,12 @@ define <2 x double> @test_masked_8xdoubl
define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
-; CHECK-NEXT: vmovapd %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [0,6,2,6]
+; CHECK-NEXT: vpermi2pd %ymm2, %ymm0, %ymm3
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vcmpeqpd %xmm0, %xmm1, %k1
+; CHECK-NEXT: vmovapd %xmm3, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
@@ -4419,10 +4428,11 @@ define <4 x double> @test_masked_8xdoubl
; CHECK: # %bb.0:
; CHECK-NEXT: vmovapd (%rdi), %zmm2
; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
-; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm2[3,0,2,0]
+; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [3,4,2,4]
+; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovapd %ymm4, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x double>, <8 x double>* %vp
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 4, i32 2, i32 4>
@@ -4434,12 +4444,13 @@ define <4 x double> @test_masked_8xdoubl
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double>* %vp, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %zmm1
-; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
-; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm1[3,0,2,0]
+; CHECK-NEXT: vmovapd (%rdi), %zmm2
+; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [3,4,2,4]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm0, %k1
+; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
; CHECK-NEXT: retq
%vec = load <8 x double>, <8 x double>* %vp
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 4, i32 2, i32 4>
@@ -4453,10 +4464,11 @@ define <4 x double> @test_masked_8xdoubl
; CHECK: # %bb.0:
; CHECK-NEXT: vmovapd (%rdi), %zmm2
; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
-; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm2[1,2,3,0]
+; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [1,2,3,4]
+; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovapd %ymm4, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x double>, <8 x double>* %vp
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
@@ -4468,12 +4480,13 @@ define <4 x double> @test_masked_8xdoubl
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(<8 x double>* %vp, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %zmm1
-; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
-; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm1[1,2,3,0]
+; CHECK-NEXT: vmovapd (%rdi), %zmm2
+; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [1,2,3,4]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm0, %k1
+; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
; CHECK-NEXT: retq
%vec = load <8 x double>, <8 x double>* %vp
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
@@ -4485,10 +4498,10 @@ define <4 x double> @test_masked_z_8xdou
define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp) {
; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %zmm0
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,0]
-; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
+; CHECK-NEXT: vmovapd (%rdi), %zmm1
+; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vmovapd {{.*#+}} ymm0 = [4,2,1,0]
+; CHECK-NEXT: vpermi2pd %ymm2, %ymm1, %ymm0
; CHECK-NEXT: retq
%vec = load <8 x double>, <8 x double>* %vp
%res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
@@ -4499,11 +4512,11 @@ define <4 x double> @test_masked_8xdoubl
; CHECK: # %bb.0:
; CHECK-NEXT: vmovapd (%rdi), %zmm2
; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,0]
-; CHECK-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
-; CHECK-NEXT: vmovapd %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [4,2,1,0]
+; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovapd %ymm4, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x double>, <8 x double>* %vp
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
@@ -4515,13 +4528,13 @@ define <4 x double> @test_masked_8xdoubl
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %zmm1
-; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,0]
-; CHECK-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
-; CHECK-NEXT: vmovapd %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: vmovapd (%rdi), %zmm2
+; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [4,2,1,0]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm0, %k1
+; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
; CHECK-NEXT: retq
%vec = load <8 x double>, <8 x double>* %vp
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
@@ -4571,10 +4584,11 @@ define <4 x double> @test_masked_8xdoubl
; CHECK: # %bb.0:
; CHECK-NEXT: vmovapd (%rdi), %zmm2
; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
-; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm2[2,1,1,1]
+; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [6,1,1,1]
+; CHECK-NEXT: vpermi2pd %ymm2, %ymm3, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovapd %ymm4, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x double>, <8 x double>* %vp
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
@@ -4586,12 +4600,13 @@ define <4 x double> @test_masked_8xdoubl
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5(<8 x double>* %vp, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %zmm1
-; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
-; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm1[2,1,1,1]
+; CHECK-NEXT: vmovapd (%rdi), %zmm2
+; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [6,1,1,1]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm0, %k1
+; CHECK-NEXT: vpermi2pd %ymm2, %ymm3, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
; CHECK-NEXT: retq
%vec = load <8 x double>, <8 x double>* %vp
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
@@ -4605,9 +4620,9 @@ define <4 x double> @test_8xdouble_to_4x
; CHECK: # %bb.0:
; CHECK-NEXT: vmovapd (%rdi), %zmm0
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1]
-; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3]
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vmovapd {{.*#+}} ymm0 = [0,2,6,1]
+; CHECK-NEXT: vpermi2pd %ymm1, %ymm2, %ymm0
; CHECK-NEXT: retq
%vec = load <8 x double>, <8 x double>* %vp
%res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
@@ -4619,11 +4634,11 @@ define <4 x double> @test_masked_8xdoubl
; CHECK-NEXT: vmovapd (%rdi), %zmm2
; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm3
; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm2
-; CHECK-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,1]
-; CHECK-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
-; CHECK-NEXT: vmovapd %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [0,2,6,1]
+; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovapd %ymm4, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x double>, <8 x double>* %vp
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
@@ -4637,12 +4652,12 @@ define <4 x double> @test_masked_z_8xdou
; CHECK: # %bb.0:
; CHECK-NEXT: vmovapd (%rdi), %zmm1
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
-; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm1
-; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,1]
-; CHECK-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
-; CHECK-NEXT: vmovapd %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,6,1]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm0, %k1
+; CHECK-NEXT: vpermi2pd %ymm2, %ymm3, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
; CHECK-NEXT: retq
%vec = load <8 x double>, <8 x double>* %vp
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
@@ -4656,10 +4671,11 @@ define <4 x double> @test_masked_8xdoubl
; CHECK: # %bb.0:
; CHECK-NEXT: vmovapd (%rdi), %zmm2
; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
-; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm2[0,1,2,1]
+; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [0,5,2,5]
+; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovapd %ymm4, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x double>, <8 x double>* %vp
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 5>
@@ -4671,12 +4687,13 @@ define <4 x double> @test_masked_8xdoubl
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7(<8 x double>* %vp, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %zmm1
-; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
-; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm1[0,1,2,1]
+; CHECK-NEXT: vmovapd (%rdi), %zmm2
+; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [0,5,2,5]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm0, %k1
+; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
; CHECK-NEXT: retq
%vec = load <8 x double>, <8 x double>* %vp
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 5>
@@ -4688,10 +4705,10 @@ define <4 x double> @test_masked_z_8xdou
define <2 x double> @test_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp) {
; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %zmm0
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[3],ymm1[2]
-; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; CHECK-NEXT: vmovapd (%rdi), %zmm1
+; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vmovapd {{.*#+}} ymm0 = [1,6,3,6]
+; CHECK-NEXT: vpermi2pd %ymm2, %ymm1, %ymm0
; CHECK-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -4704,11 +4721,11 @@ define <2 x double> @test_masked_8xdoubl
; CHECK: # %bb.0:
; CHECK-NEXT: vmovapd (%rdi), %zmm2
; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
-; CHECK-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm3[0],ymm2[3],ymm3[2]
-; CHECK-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
-; CHECK-NEXT: vmovapd %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [1,6,3,6]
+; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovapd %xmm4, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <8 x double>, <8 x double>* %vp
@@ -4723,11 +4740,11 @@ define <2 x double> @test_masked_z_8xdou
; CHECK: # %bb.0:
; CHECK-NEXT: vmovapd (%rdi), %zmm1
; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
-; CHECK-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm2[0],ymm1[3],ymm2[2]
-; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
-; CHECK-NEXT: vmovapd %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [1,6,3,6]
+; CHECK-NEXT: vpermi2pd %ymm2, %ymm1, %ymm3
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vmovapd %xmm3, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <8 x double>, <8 x double>* %vp
Modified: llvm/trunk/test/CodeGen/X86/avx512-trunc.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-trunc.ll?rev=322090&r1=322089&r2=322090&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-trunc.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-trunc.ll Tue Jan 9 08:26:06 2018
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=KNL
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512vl,+avx512bw,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,SKX
attributes #0 = { nounwind }
Modified: llvm/trunk/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll?rev=322090&r1=322089&r2=322090&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll (original)
+++ llvm/trunk/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll Tue Jan 9 08:26:06 2018
@@ -2,8 +2,9 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2,AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX12,AVX2,AVX2-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX512
;
; 128-bit vectors
@@ -379,19 +380,31 @@ define <32 x i8> @ext_i32_32i8(i32 %a0)
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: ext_i32_32i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovd %edi, %xmm0
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: ext_i32_32i8:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vmovd %edi, %xmm0
+; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; AVX2-SLOW-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: ext_i32_32i8:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovd %edi, %xmm0
+; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7]
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; AVX2-FAST-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: ext_i32_32i8:
; AVX512: # %bb.0:
@@ -683,26 +696,43 @@ define <64 x i8> @ext_i64_64i8(i64 %a0)
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
-; AVX2-LABEL: ext_i64_64i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovq %rdi, %xmm0
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
-; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,5,5]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,7,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
-; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: ext_i64_64i8:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vmovq %rdi, %xmm0
+; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; AVX2-SLOW-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,5,5]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,7,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
+; AVX2-SLOW-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: ext_i64_64i8:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovq %rdi, %xmm0
+; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7]
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; AVX2-FAST-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15]
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
+; AVX2-FAST-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-FAST-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: ext_i64_64i8:
; AVX512: # %bb.0:
Modified: llvm/trunk/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll?rev=322090&r1=322089&r2=322090&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll (original)
+++ llvm/trunk/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll Tue Jan 9 08:26:06 2018
@@ -2,9 +2,10 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2,AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX12,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
;
; 128-bit vectors
@@ -472,21 +473,35 @@ define <32 x i8> @ext_i32_32i8(i32 %a0)
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: ext_i32_32i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovd %edi, %xmm0
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: ext_i32_32i8:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vmovd %edi, %xmm0
+; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; AVX2-SLOW-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpsrlw $7, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: ext_i32_32i8:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovd %edi, %xmm0
+; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7]
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; AVX2-FAST-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpsrlw $7, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: ext_i32_32i8:
; AVX512F: # %bb.0:
@@ -885,31 +900,53 @@ define <64 x i8> @ext_i64_64i8(i64 %a0)
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
-; AVX2-LABEL: ext_i64_64i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovq %rdi, %xmm0
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
-; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,5,5]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,7,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1
-; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpsrlw $7, %ymm1, %ymm1
-; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: ext_i64_64i8:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vmovq %rdi, %xmm0
+; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; AVX2-SLOW-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpsrlw $7, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-SLOW-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,5,5]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
+; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,7,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1
+; AVX2-SLOW-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vpsrlw $7, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: ext_i64_64i8:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovq %rdi, %xmm0
+; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7]
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; AVX2-FAST-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpsrlw $7, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-FAST-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15]
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1
+; AVX2-FAST-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-FAST-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
+; AVX2-FAST-NEXT: vpsrlw $7, %ymm1, %ymm1
+; AVX2-FAST-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: ext_i64_64i8:
; AVX512F: # %bb.0:
Modified: llvm/trunk/test/CodeGen/X86/broadcastm-lowering.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/broadcastm-lowering.ll?rev=322090&r1=322089&r2=322090&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/broadcastm-lowering.ll (original)
+++ llvm/trunk/test/CodeGen/X86/broadcastm-lowering.ll Tue Jan 9 08:26:06 2018
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CD
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,avx512cd,+avx512bw| FileCheck %s --check-prefix=ALL --check-prefix=AVX512VLCDBW
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl,avx512cd,+avx512bw| FileCheck %s --check-prefix=ALL --check-prefix=X86-AVX512VLCDBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX512CD
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,avx512cd,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX512VLCDBW
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl,avx512cd,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,X86-AVX512VLCDBW
define <2 x i64> @test_mm_epi64(<8 x i16> %a, <8 x i16> %b) {
; AVX512CD-LABEL: test_mm_epi64:
@@ -28,7 +28,7 @@ define <2 x i64> @test_mm_epi64(<8 x i16
; X86-AVX512VLCDBW-NEXT: kmovd %k0, %eax
; X86-AVX512VLCDBW-NEXT: movzbl %al, %eax
; X86-AVX512VLCDBW-NEXT: vmovd %eax, %xmm0
-; X86-AVX512VLCDBW-NEXT: vpbroadcastq %xmm0, %xmm0
+; X86-AVX512VLCDBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
; X86-AVX512VLCDBW-NEXT: retl
entry:
%0 = icmp eq <8 x i16> %a, %b
@@ -122,7 +122,7 @@ define <8 x i64> @test_mm512_epi64(<8 x
; X86-AVX512VLCDBW-NEXT: kmovd %k0, %eax
; X86-AVX512VLCDBW-NEXT: movzbl %al, %eax
; X86-AVX512VLCDBW-NEXT: vmovd %eax, %xmm0
-; X86-AVX512VLCDBW-NEXT: vpbroadcastq %xmm0, %xmm0
+; X86-AVX512VLCDBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
; X86-AVX512VLCDBW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX512VLCDBW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512VLCDBW-NEXT: retl
@@ -160,7 +160,7 @@ define <4 x i64> @test_mm256_epi64(<8 x
; X86-AVX512VLCDBW-NEXT: kmovd %k0, %eax
; X86-AVX512VLCDBW-NEXT: movzbl %al, %eax
; X86-AVX512VLCDBW-NEXT: vmovd %eax, %xmm0
-; X86-AVX512VLCDBW-NEXT: vpbroadcastq %xmm0, %xmm0
+; X86-AVX512VLCDBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
; X86-AVX512VLCDBW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX512VLCDBW-NEXT: retl
entry:
Modified: llvm/trunk/test/CodeGen/X86/combine-shl.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/combine-shl.ll?rev=322090&r1=322089&r2=322090&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/combine-shl.ll (original)
+++ llvm/trunk/test/CodeGen/X86/combine-shl.ll Tue Jan 9 08:26:06 2018
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX-FAST
; fold (shl 0, x) -> 0
define <4 x i32> @combine_vec_shl_zero(<4 x i32> %x) {
@@ -113,14 +114,23 @@ define <4 x i32> @combine_vec_shl_trunc_
; SSE-NEXT: pmulld %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: combine_vec_shl_trunc_and:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX-SLOW-LABEL: combine_vec_shl_trunc_and:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX-SLOW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX-SLOW-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT: vzeroupper
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: combine_vec_shl_trunc_and:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX-FAST-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX-FAST-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; AVX-FAST-NEXT: vzeroupper
+; AVX-FAST-NEXT: retq
%1 = and <4 x i64> %y, <i64 15, i64 255, i64 4095, i64 65535>
%2 = trunc <4 x i64> %1 to <4 x i32>
%3 = shl <4 x i32> %x, %2
Modified: llvm/trunk/test/CodeGen/X86/combine-sra.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/combine-sra.ll?rev=322090&r1=322089&r2=322090&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/combine-sra.ll (original)
+++ llvm/trunk/test/CodeGen/X86/combine-sra.ll Tue Jan 9 08:26:06 2018
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST
; fold (sra 0, x) -> 0
define <4 x i32> @combine_vec_ashr_zero(<4 x i32> %x) {
@@ -180,14 +181,23 @@ define <4 x i32> @combine_vec_ashr_trunc
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
; SSE-NEXT: retq
;
-; AVX-LABEL: combine_vec_ashr_trunc_and:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX-NEXT: vpsravd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_and:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vpsravd %xmm1, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: combine_vec_ashr_trunc_and:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-FAST-NEXT: vpsravd %xmm1, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
%1 = and <4 x i64> %y, <i64 15, i64 255, i64 4095, i64 65535>
%2 = trunc <4 x i64> %1 to <4 x i32>
%3 = ashr <4 x i32> %x, %2
@@ -213,14 +223,23 @@ define <4 x i32> @combine_vec_ashr_trunc
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: combine_vec_ashr_trunc_lshr:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_lshr:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: combine_vec_ashr_trunc_lshr:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
%1 = lshr <4 x i64> %x, <i64 32, i64 32, i64 32, i64 32>
%2 = trunc <4 x i64> %1 to <4 x i32>
%3 = ashr <4 x i32> %2, <i32 0, i32 1, i32 2, i32 3>
@@ -247,13 +266,21 @@ define <4 x i32> @combine_vec_ashr_trunc
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: combine_vec_ashr_trunc_ashr:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
-; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_ashr:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: combine_vec_ashr_trunc_ashr:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,5,7,5,7,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
%1 = ashr <4 x i64> %x, <i64 32, i64 32, i64 32, i64 32>
%2 = trunc <4 x i64> %1 to <4 x i32>
%3 = ashr <4 x i32> %2, <i32 0, i32 1, i32 2, i32 3>
Modified: llvm/trunk/test/CodeGen/X86/combine-srl.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/combine-srl.ll?rev=322090&r1=322089&r2=322090&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/combine-srl.ll (original)
+++ llvm/trunk/test/CodeGen/X86/combine-srl.ll Tue Jan 9 08:26:06 2018
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST
; fold (srl 0, x) -> 0
define <4 x i32> @combine_vec_lshr_zero(<4 x i32> %x) {
@@ -215,14 +216,23 @@ define <4 x i32> @combine_vec_lshr_trunc
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE-NEXT: retq
;
-; AVX-LABEL: combine_vec_lshr_trunc_lshr1:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
-; AVX-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX2-SLOW-LABEL: combine_vec_lshr_trunc_lshr1:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: combine_vec_lshr_trunc_lshr1:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
%1 = lshr <4 x i64> %x, <i64 32, i64 33, i64 34, i64 35>
%2 = trunc <4 x i64> %1 to <4 x i32>
%3 = lshr <4 x i32> %2, <i32 16, i32 17, i32 18, i32 19>
@@ -446,14 +456,23 @@ define <4 x i32> @combine_vec_lshr_trunc
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
; SSE-NEXT: retq
;
-; AVX-LABEL: combine_vec_lshr_trunc_and:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX2-SLOW-LABEL: combine_vec_lshr_trunc_and:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: combine_vec_lshr_trunc_and:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-FAST-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
%1 = and <4 x i64> %y, <i64 15, i64 255, i64 4095, i64 65535>
%2 = trunc <4 x i64> %1 to <4 x i32>
%3 = lshr <4 x i32> %x, %2
Modified: llvm/trunk/test/CodeGen/X86/insertelement-zero.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/insertelement-zero.ll?rev=322090&r1=322089&r2=322090&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/insertelement-zero.ll (original)
+++ llvm/trunk/test/CodeGen/X86/insertelement-zero.ll Tue Jan 9 08:26:06 2018
@@ -4,7 +4,8 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-FAST
define <2 x double> @insert_v2f64_z1(<2 x double> %a) {
; SSE2-LABEL: insert_v2f64_z1:
@@ -429,12 +430,24 @@ define <16 x i8> @insert_v16i8_z12345678
; SSE41-NEXT: pinsrb $15, %eax, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: insert_v16i8_z123456789ABCDEz:
-; AVX: # %bb.0:
-; AVX-NEXT: xorl %eax, %eax
-; AVX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: insert_v16i8_z123456789ABCDEz:
+; AVX1: # %bb.0:
+; AVX1-NEXT: xorl %eax, %eax
+; AVX1-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: insert_v16i8_z123456789ABCDEz:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: xorl %eax, %eax
+; AVX2-SLOW-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: insert_v16i8_z123456789ABCDEz:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT: retq
%1 = insertelement <16 x i8> %a, i8 0, i32 0
%2 = insertelement <16 x i8> %1, i8 0, i32 15
ret <16 x i8> %2
@@ -479,16 +492,25 @@ define <32 x i8> @insert_v32i8_z12345678
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:
-; AVX2: # %bb.0:
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: xorl %eax, %eax
+; AVX2-SLOW-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1
+; AVX2-SLOW-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: retq
%1 = insertelement <32 x i8> %a, i8 0, i32 0
%2 = insertelement <32 x i8> %1, i8 0, i32 15
%3 = insertelement <32 x i8> %2, i8 0, i32 30
Modified: llvm/trunk/test/CodeGen/X86/oddshuffles.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/oddshuffles.ll?rev=322090&r1=322089&r2=322090&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/oddshuffles.ll (original)
+++ llvm/trunk/test/CodeGen/X86/oddshuffles.ll Tue Jan 9 08:26:06 2018
@@ -2,7 +2,8 @@
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+xop | FileCheck %s --check-prefix=XOP
define void @v3i64(<2 x i64> %a, <2 x i64> %b, <3 x i64>* %p) nounwind {
@@ -154,16 +155,36 @@ define void @v5i16(<4 x i16> %a, <4 x i1
; SSE42-NEXT: movq %xmm2, (%rdi)
; SSE42-NEXT: retq
;
-; AVX-LABEL: v5i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX-NEXT: vpextrw $6, %xmm0, 8(%rdi)
-; AVX-NEXT: vmovq %xmm1, (%rdi)
-; AVX-NEXT: retq
+; AVX1-LABEL: v5i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX1-NEXT: vpextrw $6, %xmm0, 8(%rdi)
+; AVX1-NEXT: vmovq %xmm1, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: v5i16:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
+; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX2-SLOW-NEXT: vpextrw $6, %xmm0, 8(%rdi)
+; AVX2-SLOW-NEXT: vmovq %xmm1, (%rdi)
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: v5i16:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,8,9,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,4,5,12,13,14,15,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX2-FAST-NEXT: vpextrw $6, %xmm0, 8(%rdi)
+; AVX2-FAST-NEXT: vmovq %xmm1, (%rdi)
+; AVX2-FAST-NEXT: retq
;
; XOP-LABEL: v5i16:
; XOP: # %bb.0:
@@ -550,18 +571,30 @@ define void @v12i16(<8 x i16> %a, <8 x i
; AVX1-NEXT: vmovq %xmm2, 16(%rdi)
; AVX1-NEXT: retq
;
-; AVX2-LABEL: v12i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5,6,7]
-; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,2,3,10,11,10,11,4,5,12,13]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
-; AVX2-NEXT: vmovdqa %xmm0, (%rdi)
-; AVX2-NEXT: vmovq %xmm2, 16(%rdi)
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: v12i16:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5,6,7]
+; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,2,3,10,11,10,11,4,5,12,13]
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
+; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rdi)
+; AVX2-SLOW-NEXT: vmovq %xmm2, 16(%rdi)
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: v12i16:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %xmm2
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,8,9,8,9,2,3,10,11,10,11,4,5,12,13]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4],xmm2[5],xmm3[6,7]
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,14,15,6,7,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
+; AVX2-FAST-NEXT: vmovq %xmm0, 16(%rdi)
+; AVX2-FAST-NEXT: vmovdqa %xmm2, (%rdi)
+; AVX2-FAST-NEXT: retq
;
; XOP-LABEL: v12i16:
; XOP: # %bb.0:
@@ -637,20 +670,35 @@ define void @v12i32(<8 x i32> %a, <8 x i
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: v12i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[3,3,2,3,7,7,6,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
-; AVX2-NEXT: vmovaps {{.*#+}} ymm3 = <0,4,u,1,5,u,2,6>
-; AVX2-NEXT: vpermps %ymm0, %ymm3, %ymm0
-; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX2-NEXT: vmovaps %ymm0, (%rdi)
-; AVX2-NEXT: vmovaps %xmm2, 32(%rdi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: v12i32:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[3,3,2,3,7,7,6,7]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
+; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm3 = <0,4,u,1,5,u,2,6>
+; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm3, %ymm0
+; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rdi)
+; AVX2-SLOW-NEXT: vmovaps %xmm2, 32(%rdi)
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: v12i32:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <0,4,u,1,5,u,2,6>
+; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm3
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [3,3,7,7,7,7,6,7]
+; AVX2-FAST-NEXT: vpermps %ymm0, %ymm3, %ymm0
+; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
+; AVX2-FAST-NEXT: vmovaps %xmm0, 32(%rdi)
+; AVX2-FAST-NEXT: vmovaps %ymm2, (%rdi)
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; XOP-LABEL: v12i32:
; XOP: # %bb.0:
@@ -1400,34 +1448,63 @@ define void @interleave_24i32_out(<24 x
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: interleave_24i32_out:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovups (%rdi), %ymm0
-; AVX2-NEXT: vmovups 32(%rdi), %ymm1
-; AVX2-NEXT: vmovups 64(%rdi), %ymm2
-; AVX2-NEXT: vmovaps {{.*#+}} ymm3 = <u,u,u,u,u,u,2,5>
-; AVX2-NEXT: vpermps %ymm2, %ymm3, %ymm3
-; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
-; AVX2-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u>
-; AVX2-NEXT: vpermps %ymm4, %ymm5, %ymm4
-; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-NEXT: vmovaps {{.*#+}} ymm4 = <u,u,u,u,u,0,3,6>
-; AVX2-NEXT: vpermps %ymm2, %ymm4, %ymm4
-; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX2-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u>
-; AVX2-NEXT: vpermps %ymm5, %ymm6, %ymm5
-; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
-; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-NEXT: vmovups %ymm3, (%rsi)
-; AVX2-NEXT: vmovups %ymm4, (%rdx)
-; AVX2-NEXT: vmovups %ymm0, (%rcx)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: interleave_24i32_out:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vmovups (%rdi), %ymm0
+; AVX2-SLOW-NEXT: vmovups 32(%rdi), %ymm1
+; AVX2-SLOW-NEXT: vmovups 64(%rdi), %ymm2
+; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm3 = <u,u,u,u,u,u,2,5>
+; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm3
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u>
+; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm5, %ymm4
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm4 = <u,u,u,u,u,0,3,6>
+; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm4, %ymm4
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u>
+; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm6, %ymm5
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
+; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-SLOW-NEXT: vmovups %ymm3, (%rsi)
+; AVX2-SLOW-NEXT: vmovups %ymm4, (%rdx)
+; AVX2-SLOW-NEXT: vmovups %ymm0, (%rcx)
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: interleave_24i32_out:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovups (%rdi), %ymm0
+; AVX2-FAST-NEXT: vmovups 32(%rdi), %ymm1
+; AVX2-FAST-NEXT: vmovups 64(%rdi), %ymm2
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = <u,u,u,u,u,u,2,5>
+; AVX2-FAST-NEXT: vpermps %ymm2, %ymm3, %ymm3
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u>
+; AVX2-FAST-NEXT: vpermps %ymm4, %ymm5, %ymm4
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = <u,u,u,u,u,0,3,6>
+; AVX2-FAST-NEXT: vpermps %ymm2, %ymm4, %ymm4
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u>
+; AVX2-FAST-NEXT: vpermps %ymm5, %ymm6, %ymm5
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = [0,1,0,3,0,1,4,7]
+; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
+; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FAST-NEXT: vmovups %ymm3, (%rsi)
+; AVX2-FAST-NEXT: vmovups %ymm4, (%rdx)
+; AVX2-FAST-NEXT: vmovups %ymm0, (%rcx)
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; XOP-LABEL: interleave_24i32_out:
; XOP: # %bb.0:
@@ -1603,33 +1680,61 @@ define void @interleave_24i32_in(<24 x i
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: interleave_24i32_in:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovups (%rsi), %ymm0
-; AVX2-NEXT: vmovups (%rdx), %ymm1
-; AVX2-NEXT: vmovups (%rcx), %ymm2
-; AVX2-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[1,0,2,2]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,1]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1]
-; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
-; AVX2-NEXT: vbroadcastsd %xmm2, %ymm4
-; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3]
-; AVX2-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7]
-; AVX2-NEXT: vbroadcastsd 24(%rsi), %ymm5
-; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
-; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX2-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX2-NEXT: vmovups %ymm4, 64(%rdi)
-; AVX2-NEXT: vmovups %ymm3, (%rdi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: interleave_24i32_in:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vmovups (%rsi), %ymm0
+; AVX2-SLOW-NEXT: vmovups (%rdx), %ymm1
+; AVX2-SLOW-NEXT: vmovups (%rcx), %ymm2
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[1,0,2,2]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,1]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1]
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
+; AVX2-SLOW-NEXT: vbroadcastsd %xmm2, %ymm4
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3]
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3]
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7]
+; AVX2-SLOW-NEXT: vbroadcastsd 24(%rsi), %ymm5
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-SLOW-NEXT: vmovups %ymm0, 32(%rdi)
+; AVX2-SLOW-NEXT: vmovups %ymm4, 64(%rdi)
+; AVX2-SLOW-NEXT: vmovups %ymm3, (%rdi)
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: interleave_24i32_in:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovups (%rsi), %ymm0
+; AVX2-FAST-NEXT: vmovups (%rdx), %ymm1
+; AVX2-FAST-NEXT: vmovups (%rcx), %ymm2
+; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[1,0,2,2]
+; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,1]
+; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1]
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
+; AVX2-FAST-NEXT: vbroadcastsd %xmm2, %ymm4
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
+; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
+; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[1,1,2,2]
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3],ymm0[4],ymm4[5,6],ymm0[7]
+; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[0,0,3,3,4,4,7,7]
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7]
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = [5,6,5,6,5,6,7,7]
+; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1
+; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3]
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
+; AVX2-FAST-NEXT: vbroadcastsd 24(%rsi), %ymm2
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
+; AVX2-FAST-NEXT: vmovups %ymm1, 64(%rdi)
+; AVX2-FAST-NEXT: vmovups %ymm0, 32(%rdi)
+; AVX2-FAST-NEXT: vmovups %ymm3, (%rdi)
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; XOP-LABEL: interleave_24i32_in:
; XOP: # %bb.0:
Modified: llvm/trunk/test/CodeGen/X86/psubus.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/psubus.ll?rev=322090&r1=322089&r2=322090&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/psubus.ll (original)
+++ llvm/trunk/test/CodeGen/X86/psubus.ll Tue Jan 9 08:26:06 2018
@@ -2,9 +2,10 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512
define <8 x i16> @test1(<8 x i16> %x) nounwind {
; SSE-LABEL: test1:
@@ -1872,32 +1873,58 @@ define <8 x i16> @psubus_8i64_max(<8 x i
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: psubus_8i64_max:
-; AVX2: # %bb.0: # %vector.ph
-; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm5
-; AVX2-NEXT: vpor %ymm4, %ymm0, %ymm6
-; AVX2-NEXT: vpcmpgtq %ymm6, %ymm5, %ymm5
-; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm6
-; AVX2-NEXT: vpor %ymm4, %ymm3, %ymm4
-; AVX2-NEXT: vpcmpgtq %ymm4, %ymm6, %ymm4
-; AVX2-NEXT: vblendvpd %ymm4, %ymm1, %ymm3, %ymm3
-; AVX2-NEXT: vblendvpd %ymm5, %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpsubq %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: psubus_8i64_max:
+; AVX2-SLOW: # %bb.0: # %vector.ph
+; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-SLOW-NEXT: vpxor %ymm4, %ymm2, %ymm5
+; AVX2-SLOW-NEXT: vpor %ymm4, %ymm0, %ymm6
+; AVX2-SLOW-NEXT: vpcmpgtq %ymm6, %ymm5, %ymm5
+; AVX2-SLOW-NEXT: vpxor %ymm4, %ymm1, %ymm6
+; AVX2-SLOW-NEXT: vpor %ymm4, %ymm3, %ymm4
+; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm6, %ymm4
+; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm1, %ymm3, %ymm3
+; AVX2-SLOW-NEXT: vblendvpd %ymm5, %ymm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpsubq %ymm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpsubq %ymm1, %ymm3, %ymm1
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: psubus_8i64_max:
+; AVX2-FAST: # %bb.0: # %vector.ph
+; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-FAST-NEXT: vpxor %ymm4, %ymm2, %ymm5
+; AVX2-FAST-NEXT: vpor %ymm4, %ymm0, %ymm6
+; AVX2-FAST-NEXT: vpcmpgtq %ymm6, %ymm5, %ymm5
+; AVX2-FAST-NEXT: vpxor %ymm4, %ymm1, %ymm6
+; AVX2-FAST-NEXT: vpor %ymm4, %ymm3, %ymm4
+; AVX2-FAST-NEXT: vpcmpgtq %ymm4, %ymm6, %ymm4
+; AVX2-FAST-NEXT: vblendvpd %ymm4, %ymm1, %ymm3, %ymm3
+; AVX2-FAST-NEXT: vblendvpd %ymm5, %ymm2, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpsubq %ymm2, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpsubq %ymm1, %ymm3, %ymm1
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: psubus_8i64_max:
; AVX512: # %bb.0: # %vector.ph
Modified: llvm/trunk/test/CodeGen/X86/shuffle-of-splat-multiuses.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/shuffle-of-splat-multiuses.ll?rev=322090&r1=322089&r2=322090&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/shuffle-of-splat-multiuses.ll (original)
+++ llvm/trunk/test/CodeGen/X86/shuffle-of-splat-multiuses.ll Tue Jan 9 08:26:06 2018
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2,AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST
; PR32449
define <2 x double> @foo2(<2 x double> %v, <2 x double> *%p) nounwind {
@@ -27,12 +28,19 @@ define <4 x double> @foo4(<4 x double> %
}
define <8 x float> @foo8(<8 x float> %v, <8 x float> *%p) nounwind {
-; AVX2-LABEL: foo8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
-; AVX2-NEXT: vmovaps %ymm0, (%rdi)
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: foo8:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rdi)
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: foo8:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vbroadcastss {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5]
+; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vmovaps %ymm0, (%rdi)
+; AVX2-FAST-NEXT: retq
%res = shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
%res1 = shufflevector<8 x float> %res, <8 x float> undef, <8 x i32> <i32 2, i32 0, i32 undef, i32 undef, i32 5, i32 1, i32 3, i32 7>
store <8 x float> %res, <8 x float>* %p
Modified: llvm/trunk/test/CodeGen/X86/shuffle-strided-with-offset-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/shuffle-strided-with-offset-128.ll?rev=322090&r1=322089&r2=322090&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/shuffle-strided-with-offset-128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/shuffle-strided-with-offset-128.ll Tue Jan 9 08:26:06 2018
@@ -5,9 +5,9 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
define void @shuffle_v16i8_to_v8i8_1(<16 x i8>* %L, <8 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v8i8_1:
@@ -400,8 +400,8 @@ define void @shuffle_v8i16_to_v2i16_1(<8
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
@@ -460,8 +460,8 @@ define void @shuffle_v8i16_to_v2i16_2(<8
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
@@ -520,8 +520,8 @@ define void @shuffle_v8i16_to_v2i16_3(<8
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
Modified: llvm/trunk/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/shuffle-strided-with-offset-256.ll?rev=322090&r1=322089&r2=322090&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/shuffle-strided-with-offset-256.ll (original)
+++ llvm/trunk/test/CodeGen/X86/shuffle-strided-with-offset-256.ll Tue Jan 9 08:26:06 2018
@@ -3,9 +3,9 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
define void @shuffle_v32i8_to_v16i8_1(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v16i8_1:
@@ -405,10 +405,9 @@ define void @shuffle_v16i16_to_v4i16_1(<
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,3,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
@@ -418,10 +417,9 @@ define void @shuffle_v16i16_to_v4i16_1(<
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
@@ -431,10 +429,9 @@ define void @shuffle_v16i16_to_v4i16_1(<
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,3,3,4,5,6,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
@@ -510,10 +507,9 @@ define void @shuffle_v16i16_to_v4i16_2(<
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
@@ -589,10 +585,9 @@ define void @shuffle_v16i16_to_v4i16_3(<
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
@@ -602,10 +597,9 @@ define void @shuffle_v16i16_to_v4i16_3(<
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
@@ -615,10 +609,9 @@ define void @shuffle_v16i16_to_v4i16_3(<
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
@@ -748,10 +741,9 @@ define void @shuffle_v32i8_to_v4i8_2(<32
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,3,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
@@ -773,10 +765,9 @@ define void @shuffle_v32i8_to_v4i8_2(<32
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,3,3,4,5,6,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15]
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
@@ -1056,10 +1047,9 @@ define void @shuffle_v32i8_to_v4i8_6(<32
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
@@ -1081,10 +1071,9 @@ define void @shuffle_v32i8_to_v4i8_6(<32
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15]
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
Modified: llvm/trunk/test/CodeGen/X86/shuffle-strided-with-offset-512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/shuffle-strided-with-offset-512.ll?rev=322090&r1=322089&r2=322090&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/shuffle-strided-with-offset-512.ll (original)
+++ llvm/trunk/test/CodeGen/X86/shuffle-strided-with-offset-512.ll Tue Jan 9 08:26:06 2018
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
define void @shuffle_v64i8_to_v32i8_1(<64 x i8>* %L, <32 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v32i8_1:
@@ -23,9 +23,9 @@ define void @shuffle_v64i8_to_v32i8_1(<6
; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
+; AVX512VL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
+; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
@@ -47,9 +47,9 @@ define void @shuffle_v64i8_to_v32i8_1(<6
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
-; AVX512BWVL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512BWVL-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
+; AVX512BWVL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
+; AVX512BWVL-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
@@ -77,9 +77,9 @@ define void @shuffle_v32i16_to_v16i16_1(
; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
+; AVX512VL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
+; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
@@ -111,15 +111,45 @@ define void @shuffle_v32i16_to_v16i16_1(
}
define void @shuffle_v16i32_to_v8i32_1(<16 x i32>* %L, <8 x i32>* %S) nounwind {
-; AVX512-LABEL: shuffle_v16i32_to_v8i32_1:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovaps (%rdi), %zmm0
-; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512-NEXT: vmovaps %ymm0, (%rsi)
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v16i32_to_v8i32_1:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps (%rdi), %zmm0
+; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512F-NEXT: vmovaps %ymm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i32_to_v8i32_1:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa32 (%rdi), %zmm0
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15]
+; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
+; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i32_to_v8i32_1:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovaps (%rdi), %zmm0
+; AVX512BW-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512BW-NEXT: vmovaps %ymm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i32_to_v8i32_1:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa32 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15]
+; AVX512BWVL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
+; AVX512BWVL-NEXT: vmovdqa %ymm2, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %L
%strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
store <8 x i32> %strided.vec, <8 x i32>* %S
@@ -399,16 +429,14 @@ define void @shuffle_v32i16_to_v8i16_1(<
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
@@ -420,16 +448,14 @@ define void @shuffle_v32i16_to_v8i16_1(<
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
@@ -478,16 +504,14 @@ define void @shuffle_v32i16_to_v8i16_2(<
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,14,15,4,5,12,13,8,9,10,11,12,13,14,15]
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
@@ -499,16 +523,14 @@ define void @shuffle_v32i16_to_v8i16_2(<
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,14,15,4,5,12,13,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
@@ -557,16 +579,14 @@ define void @shuffle_v32i16_to_v8i16_3(<
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
@@ -578,16 +598,14 @@ define void @shuffle_v32i16_to_v8i16_3(<
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
Modified: llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-128.ll?rev=322090&r1=322089&r2=322090&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-128.ll Tue Jan 9 08:26:06 2018
@@ -5,9 +5,9 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; PR31551
; Pairs of shufflevector:trunc functions with functional equivalence.
@@ -473,8 +473,8 @@ define void @shuffle_v8i16_to_v2i16(<8 x
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
@@ -533,8 +533,8 @@ define void @trunc_v2i64_to_v2i16(<8 x i
;
; AVX512BW-LABEL: trunc_v2i64_to_v2i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
Modified: llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-256.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-256.ll?rev=322090&r1=322089&r2=322090&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-256.ll (original)
+++ llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-256.ll Tue Jan 9 08:26:06 2018
@@ -3,9 +3,9 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; PR31551
; Pairs of shufflevector:trunc functions with functional equivalence.
@@ -505,10 +505,9 @@ define void @shuffle_v16i16_to_v4i16(<16
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
Modified: llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512.ll?rev=322090&r1=322089&r2=322090&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512.ll (original)
+++ llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512.ll Tue Jan 9 08:26:06 2018
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; PR31551
; Pairs of shufflevector:trunc functions with functional equivalence.
@@ -27,9 +27,9 @@ define void @shuffle_v64i8_to_v32i8(<64
; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
+; AVX512VL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
+; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
@@ -51,9 +51,9 @@ define void @shuffle_v64i8_to_v32i8(<64
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
-; AVX512BWVL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512BWVL-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
+; AVX512BWVL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
+; AVX512BWVL-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
@@ -120,13 +120,14 @@ define void @shuffle_v32i16_to_v16i16(<3
;
; AVX512VL-LABEL: shuffle_v32i16_to_v16i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm0 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm1 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512VL-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
-; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512VL-NEXT: vmovaps %ymm0, (%rsi)
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31]
+; AVX512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14]
+; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
+; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
@@ -134,10 +135,9 @@ define void @shuffle_v32i16_to_v16i16(<3
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31]
+; AVX512BW-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX512BW-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512BW-NEXT: vmovaps %ymm0, (%rsi)
@@ -174,15 +174,45 @@ define void @trunc_v16i32_to_v16i16(<32
}
define void @shuffle_v16i32_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
-; AVX512-LABEL: shuffle_v16i32_to_v8i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovaps (%rdi), %zmm0
-; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
-; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512-NEXT: vmovaps %ymm0, (%rsi)
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v16i32_to_v8i32:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps (%rdi), %zmm0
+; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512F-NEXT: vmovaps %ymm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i32_to_v8i32:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa32 (%rdi), %zmm0
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14]
+; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
+; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i32_to_v8i32:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovaps (%rdi), %zmm0
+; AVX512BW-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512BW-NEXT: vmovaps %ymm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i32_to_v8i32:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa32 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14]
+; AVX512BWVL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
+; AVX512BWVL-NEXT: vmovdqa %ymm2, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %L
%strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
store <8 x i32> %strided.vec, <8 x i32>* %S
@@ -326,16 +356,14 @@ define void @shuffle_v32i16_to_v8i16(<32
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
@@ -347,16 +375,14 @@ define void @shuffle_v32i16_to_v8i16(<32
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
Modified: llvm/trunk/test/CodeGen/X86/vector-half-conversions.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-half-conversions.ll?rev=322090&r1=322089&r2=322090&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-half-conversions.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-half-conversions.ll Tue Jan 9 08:26:06 2018
@@ -1,8 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c,+fast-variable-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512VL
;
; Half to Float
@@ -1189,22 +1190,38 @@ define <2 x double> @cvt_2i16_to_2f64(<2
; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: cvt_2i16_to_2f64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: movswl %ax, %ecx
-; AVX2-NEXT: shrl $16, %eax
-; AVX2-NEXT: cwtl
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: vmovd %ecx, %xmm1
-; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: cvt_2i16_to_2f64:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vmovd %xmm0, %eax
+; AVX2-SLOW-NEXT: movswl %ax, %ecx
+; AVX2-SLOW-NEXT: shrl $16, %eax
+; AVX2-SLOW-NEXT: cwtl
+; AVX2-SLOW-NEXT: vmovd %eax, %xmm0
+; AVX2-SLOW-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vmovd %ecx, %xmm1
+; AVX2-SLOW-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: cvt_2i16_to_2f64:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT: vmovd %xmm0, %eax
+; AVX2-FAST-NEXT: movswl %ax, %ecx
+; AVX2-FAST-NEXT: shrl $16, %eax
+; AVX2-FAST-NEXT: cwtl
+; AVX2-FAST-NEXT: vmovd %eax, %xmm0
+; AVX2-FAST-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX2-FAST-NEXT: vmovd %ecx, %xmm1
+; AVX2-FAST-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX2-FAST-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX2-FAST-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: cvt_2i16_to_2f64:
; AVX512F: # %bb.0:
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll?rev=322090&r1=322089&r2=322090&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll Tue Jan 9 08:26:06 2018
@@ -4,8 +4,9 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2OR512VL --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2OR512VL,AVX512VL
define <4 x i32> @shuffle_v4i32_0001(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: shuffle_v4i32_0001:
@@ -1238,18 +1239,28 @@ define <4 x i32> @shuffle_v4i32_z4zz(<4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
; SSE41-NEXT: retq
;
-; AVX1OR2-LABEL: shuffle_v4i32_z4zz:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
-; AVX1OR2-NEXT: retq
+; AVX1-LABEL: shuffle_v4i32_z4zz:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: shuffle_v4i32_z4zz:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v4i32_z4zz:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i32_z4zz:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0>
ret <4 x i32> %shuffle
@@ -1284,18 +1295,28 @@ define <4 x i32> @shuffle_v4i32_zz4z(<4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
; SSE41-NEXT: retq
;
-; AVX1OR2-LABEL: shuffle_v4i32_zz4z:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
-; AVX1OR2-NEXT: retq
+; AVX1-LABEL: shuffle_v4i32_zz4z:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: shuffle_v4i32_zz4z:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v4i32_zz4z:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
+; AVX2-FAST-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i32_zz4z:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,1]
+; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0>
ret <4 x i32> %shuffle
@@ -1351,12 +1372,22 @@ define <4 x i32> @shuffle_v4i32_z6zz(<4
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v4i32_z6zz:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; AVX2OR512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX2OR512VL-NEXT: retq
+; AVX2-SLOW-LABEL: shuffle_v4i32_z6zz:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: shuffle_v4i32_z6zz:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-FAST-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v4i32_z6zz:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
ret <4 x i32> %shuffle
}
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll?rev=322090&r1=322089&r2=322090&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll Tue Jan 9 08:26:06 2018
@@ -3,7 +3,8 @@
; RUN: llc < %s -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-SLOW
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-FAST
;
; Verify that the DAG combiner correctly folds bitwise operations across
; shuffles, nested shuffles with undef, pairs of nested shuffles, and other
@@ -2536,12 +2537,19 @@ define <8 x i32> @combine_unneeded_subve
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: combine_unneeded_subvector1:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: combine_unneeded_subvector1:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: combine_unneeded_subvector1:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: retq
%b = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
%c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
ret <8 x i32> %c
@@ -2873,12 +2881,19 @@ define <8 x float> @PR22412(<8 x float>
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[3,2],ymm0[5,4],ymm1[7,6]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: PR22412:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,1]
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: PR22412:
+; AVX2-SLOW: # %bb.0: # %entry
+; AVX2-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,1]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: PR22412:
+; AVX2-FAST: # %bb.0: # %entry
+; AVX2-FAST-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX2-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [1,0,7,6,5,4,3,2]
+; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: retq
entry:
%s1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%s2 = shufflevector <8 x float> %s1, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2>
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll?rev=322090&r1=322089&r2=322090&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll Tue Jan 9 08:26:06 2018
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512VL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw -mattr=+avx512vl -mattr=+avx512dq| FileCheck %s --check-prefix=VL_BW_DQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefix=VL_BW_DQ
define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) {
; AVX512F-LABEL: shuf2i1_1_0:
@@ -309,10 +309,10 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %
; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4,5,6,7]
-; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,10,3,0,1,2,3]
+; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
+; AVX512VL-NEXT: vpslld $31, %ymm2, %ymm0
; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT: kmovw %k0, %eax
; AVX512VL-NEXT: # kill: def %al killed %al killed %eax
@@ -324,10 +324,10 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %
; VL_BW_DQ-NEXT: kmovd %edi, %k0
; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; VL_BW_DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; VL_BW_DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; VL_BW_DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4,5,6,7]
-; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
+; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,10,3,0,1,2,3]
+; VL_BW_DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
+; VL_BW_DQ-NEXT: vpmovd2m %ymm2, %k0
; VL_BW_DQ-NEXT: kmovd %k0, %eax
; VL_BW_DQ-NEXT: # kill: def %al killed %al killed %eax
; VL_BW_DQ-NEXT: vzeroupper
Modified: llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll?rev=322090&r1=322089&r2=322090&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll Tue Jan 9 08:26:06 2018
@@ -1,10 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512DQ
;
; add
@@ -28,14 +29,23 @@ define <4 x i32> @trunc_add_v4i64_v4i32(
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_add_v4i64_v4i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_add_v4i64_v4i32:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_add_v4i64_v4i32:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_add_v4i64_v4i32:
; AVX512: # %bb.0:
@@ -90,20 +100,34 @@ define <8 x i16> @trunc_add_v8i64_v8i16(
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_add_v8i64_v8i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_add_v8i64_v8i16:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpaddq %ymm3, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_add_v8i64_v8i16:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpaddq %ymm3, %ymm1, %ymm1
+; AVX2-FAST-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_add_v8i64_v8i16:
; AVX512: # %bb.0:
@@ -228,33 +252,58 @@ define <16 x i8> @trunc_add_v16i64_v16i8
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_add_v16i64_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddq %ymm5, %ymm1, %ymm1
-; AVX2-NEXT: vpaddq %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpaddq %ymm7, %ymm3, %ymm3
-; AVX2-NEXT: vpaddq %ymm6, %ymm2, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_add_v16i64_v16i8:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpaddq %ymm5, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vpaddq %ymm4, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpaddq %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT: vpaddq %ymm6, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_add_v16i64_v16i8:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpaddq %ymm5, %ymm1, %ymm1
+; AVX2-FAST-NEXT: vpaddq %ymm4, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpaddq %ymm7, %ymm3, %ymm3
+; AVX2-FAST-NEXT: vpaddq %ymm6, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
+; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_add_v16i64_v16i8:
; AVX512: # %bb.0:
@@ -467,13 +516,21 @@ define <4 x i32> @trunc_add_const_v4i64_
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_add_const_v4i64_v4i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_add_const_v4i64_v4i32:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_add_const_v4i64_v4i32:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v4i64_v4i32:
; AVX512: # %bb.0:
@@ -520,18 +577,30 @@ define <8 x i16> @trunc_add_const_v8i64_
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_add_const_v8i64_v8i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_add_const_v8i64_v8i16:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_add_const_v8i64_v8i16:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v8i64_v8i16:
; AVX512: # %bb.0:
@@ -634,30 +703,52 @@ define <16 x i8> @trunc_add_const_v16i64
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_add_const_v16i64_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_add_const_v16i64_v16i8:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-SLOW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_add_const_v16i64_v16i8:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
+; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-FAST-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v16i64_v16i8:
; AVX512: # %bb.0:
@@ -811,14 +902,23 @@ define <4 x i32> @trunc_sub_v4i64_v4i32(
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_sub_v4i64_v4i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_sub_v4i64_v4i32:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpsubq %ymm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_sub_v4i64_v4i32:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpsubq %ymm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_sub_v4i64_v4i32:
; AVX512: # %bb.0:
@@ -873,20 +973,34 @@ define <8 x i16> @trunc_sub_v8i64_v8i16(
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_sub_v8i64_v8i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_sub_v8i64_v8i16:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpsubq %ymm3, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vpsubq %ymm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_sub_v8i64_v8i16:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpsubq %ymm3, %ymm1, %ymm1
+; AVX2-FAST-NEXT: vpsubq %ymm2, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_sub_v8i64_v8i16:
; AVX512: # %bb.0:
@@ -1011,33 +1125,58 @@ define <16 x i8> @trunc_sub_v16i64_v16i8
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_sub_v16i64_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsubq %ymm5, %ymm1, %ymm1
-; AVX2-NEXT: vpsubq %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpsubq %ymm7, %ymm3, %ymm3
-; AVX2-NEXT: vpsubq %ymm6, %ymm2, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_sub_v16i64_v16i8:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpsubq %ymm5, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vpsubq %ymm4, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpsubq %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT: vpsubq %ymm6, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_sub_v16i64_v16i8:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpsubq %ymm5, %ymm1, %ymm1
+; AVX2-FAST-NEXT: vpsubq %ymm4, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpsubq %ymm7, %ymm3, %ymm3
+; AVX2-FAST-NEXT: vpsubq %ymm6, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
+; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_sub_v16i64_v16i8:
; AVX512: # %bb.0:
@@ -1209,14 +1348,23 @@ define <4 x i32> @trunc_sub_const_v4i64_
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_sub_const_v4i64_v4i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_sub_const_v4i64_v4i32:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_sub_const_v4i64_v4i32:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_sub_const_v4i64_v4i32:
; AVX512: # %bb.0:
@@ -1276,20 +1424,34 @@ define <8 x i16> @trunc_sub_const_v8i64_
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_sub_const_v8i64_v8i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_sub_const_v8i64_v8i16:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_sub_const_v8i64_v8i16:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_sub_const_v8i64_v8i16:
; AVX512: # %bb.0:
@@ -1415,33 +1577,58 @@ define <16 x i8> @trunc_sub_const_v16i64
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_sub_const_v16i64_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm3, %ymm3
-; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm2, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_sub_const_v16i64_v16i8:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm3, %ymm3
+; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_sub_const_v16i64_v16i8:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1
+; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm3, %ymm3
+; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm2, %ymm2
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
+; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_sub_const_v16i64_v16i8:
; AVX512: # %bb.0:
@@ -1623,15 +1810,24 @@ define <4 x i32> @trunc_mul_v4i64_v4i32(
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_mul_v4i64_v4i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_mul_v4i64_v4i32:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_mul_v4i64_v4i32:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc_mul_v4i64_v4i32:
; AVX512F: # %bb.0:
@@ -1720,26 +1916,44 @@ define <8 x i16> @trunc_mul_v8i64_v8i16(
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_mul_v8i64_v8i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_mul_v8i64_v8i16:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpmullw %xmm2, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_mul_v8i64_v8i16:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
+; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT: vpmullw %xmm2, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc_mul_v8i64_v8i16:
; AVX512F: # %bb.0:
@@ -2028,41 +2242,70 @@ define <16 x i8> @trunc_mul_v16i64_v16i8
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_mul_v16i64_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-NEXT: vpmulld %xmm7, %xmm3, %xmm3
-; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vpmulld %xmm6, %xmm2, %xmm2
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vpmulld %xmm5, %xmm1, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpmulld %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_mul_v16i64_v16i8:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-SLOW-NEXT: vpmulld %xmm7, %xmm3, %xmm3
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vpmulld %xmm6, %xmm2, %xmm2
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vpmulld %xmm5, %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpmulld %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_mul_v16i64_v16i8:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm7, %ymm8, %ymm7
+; AVX2-FAST-NEXT: vpermd %ymm3, %ymm8, %ymm3
+; AVX2-FAST-NEXT: vpmulld %xmm7, %xmm3, %xmm3
+; AVX2-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm6
+; AVX2-FAST-NEXT: vpermd %ymm2, %ymm8, %ymm2
+; AVX2-FAST-NEXT: vpmulld %xmm6, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm5
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm8, %ymm1
+; AVX2-FAST-NEXT: vpmulld %xmm5, %xmm1, %xmm1
+; AVX2-FAST-NEXT: vpermd %ymm4, %ymm8, %ymm4
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm0
+; AVX2-FAST-NEXT: vpmulld %xmm4, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc_mul_v16i64_v16i8:
; AVX512F: # %bb.0:
@@ -2340,13 +2583,21 @@ define <4 x i32> @trunc_mul_const_v4i64_
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_mul_const_v4i64_v4i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_mul_const_v4i64_v4i32:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_mul_const_v4i64_v4i32:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_mul_const_v4i64_v4i32:
; AVX512: # %bb.0:
@@ -2393,18 +2644,30 @@ define <8 x i16> @trunc_mul_const_v8i64_
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_mul_const_v8i64_v8i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_mul_const_v8i64_v8i16:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_mul_const_v8i64_v8i16:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_mul_const_v8i64_v8i16:
; AVX512: # %bb.0:
@@ -2613,33 +2876,58 @@ define <16 x i8> @trunc_mul_const_v16i64
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_mul_const_v16i64_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_mul_const_v16i64_v16i8:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_mul_const_v16i64_v16i8:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
+; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2
+; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
+; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
+; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
+; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_mul_const_v16i64_v16i8:
; AVX512: # %bb.0:
@@ -2829,14 +3117,23 @@ define <4 x i32> @trunc_and_v4i64_v4i32(
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_and_v4i64_v4i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_and_v4i64_v4i32:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_and_v4i64_v4i32:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_and_v4i64_v4i32:
; AVX512: # %bb.0:
@@ -2887,20 +3184,34 @@ define <8 x i16> @trunc_and_v8i64_v8i16(
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_and_v8i64_v8i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_and_v8i64_v8i16:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_and_v8i64_v8i16:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX2-FAST-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_and_v8i64_v8i16:
; AVX512: # %bb.0:
@@ -3015,33 +3326,58 @@ define <16 x i8> @trunc_and_v16i64_v16i8
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_and_v16i64_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpand %ymm5, %ymm1, %ymm1
-; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpand %ymm7, %ymm3, %ymm3
-; AVX2-NEXT: vpand %ymm6, %ymm2, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_and_v16i64_v16i8:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpand %ymm5, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vpand %ymm4, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpand %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_and_v16i64_v16i8:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpand %ymm5, %ymm1, %ymm1
+; AVX2-FAST-NEXT: vpand %ymm4, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpand %ymm7, %ymm3, %ymm3
+; AVX2-FAST-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
+; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_and_v16i64_v16i8:
; AVX512: # %bb.0:
@@ -3199,13 +3535,21 @@ define <4 x i32> @trunc_and_const_v4i64_
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_and_const_v4i64_v4i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_and_const_v4i64_v4i32:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_and_const_v4i64_v4i32:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_and_const_v4i64_v4i32:
; AVX512: # %bb.0:
@@ -3252,18 +3596,30 @@ define <8 x i16> @trunc_and_const_v8i64_
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_and_const_v8i64_v8i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_and_const_v8i64_v8i16:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_and_const_v8i64_v8i16:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_and_const_v8i64_v8i16:
; AVX512: # %bb.0:
@@ -3366,30 +3722,52 @@ define <16 x i8> @trunc_and_const_v16i64
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_and_const_v16i64_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_and_const_v16i64_v16i8:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_and_const_v16i64_v16i8:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
+; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_and_const_v16i64_v16i8:
; AVX512: # %bb.0:
@@ -3541,14 +3919,23 @@ define <4 x i32> @trunc_xor_v4i64_v4i32(
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_xor_v4i64_v4i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vxorps %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_xor_v4i64_v4i32:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_xor_v4i64_v4i32:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_xor_v4i64_v4i32:
; AVX512: # %bb.0:
@@ -3599,20 +3986,34 @@ define <8 x i16> @trunc_xor_v8i64_v8i16(
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_xor_v8i64_v8i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_xor_v8i64_v8i16:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_xor_v8i64_v8i16:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpxor %ymm3, %ymm1, %ymm1
+; AVX2-FAST-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_xor_v8i64_v8i16:
; AVX512: # %bb.0:
@@ -3727,33 +4128,58 @@ define <16 x i8> @trunc_xor_v16i64_v16i8
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_xor_v16i64_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %ymm5, %ymm1, %ymm1
-; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %ymm7, %ymm3, %ymm3
-; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_xor_v16i64_v16i8:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpxor %ymm5, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vpxor %ymm4, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpxor %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT: vpxor %ymm6, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_xor_v16i64_v16i8:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpxor %ymm5, %ymm1, %ymm1
+; AVX2-FAST-NEXT: vpxor %ymm4, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpxor %ymm7, %ymm3, %ymm3
+; AVX2-FAST-NEXT: vpxor %ymm6, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
+; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_xor_v16i64_v16i8:
; AVX512: # %bb.0:
@@ -3911,13 +4337,21 @@ define <4 x i32> @trunc_xor_const_v4i64_
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_xor_const_v4i64_v4i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_xor_const_v4i64_v4i32:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_xor_const_v4i64_v4i32:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_xor_const_v4i64_v4i32:
; AVX512: # %bb.0:
@@ -3964,18 +4398,30 @@ define <8 x i16> @trunc_xor_const_v8i64_
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_xor_const_v8i64_v8i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_xor_const_v8i64_v8i16:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_xor_const_v8i64_v8i16:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_xor_const_v8i64_v8i16:
; AVX512: # %bb.0:
@@ -4078,30 +4524,52 @@ define <16 x i8> @trunc_xor_const_v16i64
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_xor_const_v16i64_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_xor_const_v16i64_v16i8:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-SLOW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_xor_const_v16i64_v16i8:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
+; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-FAST-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_xor_const_v16i64_v16i8:
; AVX512: # %bb.0:
@@ -4253,14 +4721,23 @@ define <4 x i32> @trunc_or_v4i64_v4i32(<
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_or_v4i64_v4i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_or_v4i64_v4i32:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vorps %ymm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_or_v4i64_v4i32:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vorps %ymm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_or_v4i64_v4i32:
; AVX512: # %bb.0:
@@ -4311,20 +4788,34 @@ define <8 x i16> @trunc_or_v8i64_v8i16(<
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_or_v8i64_v8i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_or_v8i64_v8i16:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpor %ymm3, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_or_v8i64_v8i16:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpor %ymm3, %ymm1, %ymm1
+; AVX2-FAST-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_or_v8i64_v8i16:
; AVX512: # %bb.0:
@@ -4439,33 +4930,58 @@ define <16 x i8> @trunc_or_v16i64_v16i8(
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_or_v16i64_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpor %ymm5, %ymm1, %ymm1
-; AVX2-NEXT: vpor %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpor %ymm7, %ymm3, %ymm3
-; AVX2-NEXT: vpor %ymm6, %ymm2, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_or_v16i64_v16i8:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpor %ymm5, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vpor %ymm4, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpor %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT: vpor %ymm6, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_or_v16i64_v16i8:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpor %ymm5, %ymm1, %ymm1
+; AVX2-FAST-NEXT: vpor %ymm4, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpor %ymm7, %ymm3, %ymm3
+; AVX2-FAST-NEXT: vpor %ymm6, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
+; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_or_v16i64_v16i8:
; AVX512: # %bb.0:
@@ -4623,13 +5139,21 @@ define <4 x i32> @trunc_or_const_v4i64_v
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_or_const_v4i64_v4i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_or_const_v4i64_v4i32:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_or_const_v4i64_v4i32:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v4i64_v4i32:
; AVX512: # %bb.0:
@@ -4676,18 +5200,30 @@ define <8 x i16> @trunc_or_const_v8i64_v
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_or_const_v8i64_v8i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_or_const_v8i64_v8i16:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_or_const_v8i64_v8i16:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v8i64_v8i16:
; AVX512: # %bb.0:
@@ -4790,30 +5326,52 @@ define <16 x i8> @trunc_or_const_v16i64_
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_or_const_v16i64_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_or_const_v16i64_v16i8:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-SLOW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_or_const_v16i64_v16i8:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
+; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-FAST-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v16i64_v16i8:
; AVX512: # %bb.0:
Modified: llvm/trunk/test/CodeGen/X86/vector-trunc.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-trunc.ll?rev=322090&r1=322089&r2=322090&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-trunc.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-trunc.ll Tue Jan 9 08:26:06 2018
@@ -3,11 +3,12 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
define <8 x i32> @trunc8i64_8i32(<8 x i64> %a) {
; SSE-LABEL: trunc8i64_8i32:
@@ -26,14 +27,22 @@ define <8 x i32> @trunc8i64_8i32(<8 x i6
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc8i64_8i32:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc8i64_8i32:
+; AVX2-SLOW: # %bb.0: # %entry
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc8i64_8i32:
+; AVX2-FAST: # %bb.0: # %entry
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc8i64_8i32:
; AVX512: # %bb.0: # %entry
@@ -103,14 +112,22 @@ define <8 x i32> @trunc8i64_8i32_ashr(<8
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc8i64_8i32_ashr:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,3,2,3,5,7,6,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc8i64_8i32_ashr:
+; AVX2-SLOW: # %bb.0: # %entry
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,3,2,3,5,7,6,7]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc8i64_8i32_ashr:
+; AVX2-FAST: # %bb.0: # %entry
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,5,7,5,7,6,7]
+; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc8i64_8i32_ashr:
; AVX512: # %bb.0: # %entry
@@ -148,16 +165,26 @@ define <8 x i32> @trunc8i64_8i32_lshr(<8
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc8i64_8i32_lshr:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc8i64_8i32_lshr:
+; AVX2-SLOW: # %bb.0: # %entry
+; AVX2-SLOW-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc8i64_8i32_lshr:
+; AVX2-FAST: # %bb.0: # %entry
+; AVX2-FAST-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX2-FAST-NEXT: vpsrlq $32, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc8i64_8i32_lshr:
; AVX512: # %bb.0: # %entry
@@ -228,18 +255,30 @@ define <8 x i16> @trunc8i64_8i16(<8 x i6
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc8i64_8i16:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc8i64_8i16:
+; AVX2-SLOW: # %bb.0: # %entry
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc8i64_8i16:
+; AVX2-FAST: # %bb.0: # %entry
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc8i64_8i16:
; AVX512: # %bb.0: # %entry
@@ -283,19 +322,32 @@ define void @trunc8i64_8i8(<8 x i64> %a)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc8i64_8i8:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vmovq %xmm0, (%rax)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc8i64_8i8:
+; AVX2-SLOW: # %bb.0: # %entry
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-SLOW-NEXT: vmovq %xmm0, (%rax)
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc8i64_8i8:
+; AVX2-FAST: # %bb.0: # %entry
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-FAST-NEXT: vmovq %xmm0, (%rax)
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512-LABEL: trunc8i64_8i8:
; AVX512: # %bb.0: # %entry
@@ -1368,14 +1420,22 @@ define <8 x i32> @trunc2x4i64_8i32(<4 x
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc2x4i64_8i32:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc2x4i64_8i32:
+; AVX2-SLOW: # %bb.0: # %entry
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc2x4i64_8i32:
+; AVX2-FAST: # %bb.0: # %entry
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc2x4i64_8i32:
; AVX512F: # %bb.0: # %entry
@@ -1474,18 +1534,30 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc2x4i64_8i16:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc2x4i64_8i16:
+; AVX2-SLOW: # %bb.0: # %entry
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc2x4i64_8i16:
+; AVX2-FAST: # %bb.0: # %entry
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc2x4i64_8i16:
; AVX512F: # %bb.0: # %entry
@@ -1882,14 +1954,22 @@ define <8 x i16> @PR32160(<8 x i32> %x)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX2-LABEL: PR32160:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
-; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: PR32160:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: PR32160:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: PR32160:
; AVX512F: # %bb.0:
@@ -1903,8 +1983,7 @@ define <8 x i16> @PR32160(<8 x i32> %x)
; AVX512VL-LABEL: PR32160:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
@@ -1912,16 +1991,14 @@ define <8 x i16> @PR32160(<8 x i32> %x)
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: PR32160:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
-; AVX512BWVL-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%shuf = trunc <8 x i32> %x to <8 x i16>
Modified: llvm/trunk/test/CodeGen/X86/vector-zext.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-zext.ll?rev=322090&r1=322089&r2=322090&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-zext.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-zext.ll Tue Jan 9 08:26:06 2018
@@ -6,7 +6,7 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
define <8 x i16> @zext_16i8_to_8i16(<16 x i8> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: zext_16i8_to_8i16:
@@ -1929,11 +1929,16 @@ define <4 x i32> @shuf_zext_8i16_to_4i32
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3],zero,zero,xmm0[4,5],zero,zero,xmm0[6,7],zero,zero,xmm0[8,9],zero,zero
; AVX2-FAST-NEXT: retq
;
-; AVX512-LABEL: shuf_zext_8i16_to_4i32_offset1:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuf_zext_8i16_to_4i32_offset1:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: shuf_zext_8i16_to_4i32_offset1:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3],zero,zero,xmm0[4,5],zero,zero,xmm0[6,7],zero,zero,xmm0[8,9],zero,zero
+; AVX512BW-NEXT: retq
entry:
%B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 8, i32 2, i32 8, i32 3, i32 8, i32 4, i32 8>
%Z = bitcast <8 x i16> %B to <4 x i32>
More information about the llvm-commits
mailing list