[llvm] [X86] getFauxShuffleMask - add support for vXi64/vXf64 concat_vectors decoding (PR #127630)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 18 05:48:10 PST 2025
https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/127630
Similar to insert_subvector, limit this to 64-bit element (vXi64/vXf64) cases to make the most of cross-lane shuffles (for now).
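For reference, the new CONCAT_VECTORS case decodes each operand I of width NumSubElts into mask indices (I * NumElts) + M, so every Ops entry gets its own NumElts-wide index space. Below is a minimal standalone sketch of just that mask construction (plain C++, names mirroring the patch; the v4i64 driver values are illustrative only, not part of the actual change):

    #include <cstdio>
    #include <vector>

    // Sketch of the CONCAT_VECTORS faux-shuffle decode, outside of
    // SelectionDAG. For v4i64 = concat_vectors(v2i64 A, v2i64 B):
    //   NumElts = 4 (result), NumSubElts = 2 (per operand), NumOps = 2.
    // Operand I contributes indices (I * NumElts) + M, giving:
    //   Ops  = [A, B]
    //   Mask = [0, 1, 4, 5]
    static void decodeConcatMask(unsigned NumElts, unsigned NumSubElts,
                                 unsigned NumOps, std::vector<int> &Mask) {
      for (unsigned I = 0; I != NumOps; ++I)
        for (unsigned M = 0; M != NumSubElts; ++M)
          Mask.push_back(static_cast<int>(I * NumElts + M));
    }

    int main() {
      std::vector<int> Mask;
      decodeConcatMask(/*NumElts=*/4, /*NumSubElts=*/2, /*NumOps=*/2, Mask);
      for (int M : Mask)
        std::printf("%d ", M); // prints: 0 1 4 5
    }

With the concat expressed as a faux shuffle, shuffle combining can then fold it with surrounding shuffles, which is what produces the vshufi64x2/vpermq improvements in the test diffs below.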
From 3ac84c6ef7e3dfd8351e50c40ad0b6a6908fd034 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Tue, 18 Feb 2025 13:46:59 +0000
Subject: [PATCH] [X86] getFauxShuffleMask - add support for vXi64/vXf64
concat_vectors decoding
Similar to insert_subvector, limit this to 64-bit element (vXi64/vXf64) cases to make the most of cross-lane shuffles (for now).
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 13 ++
.../vector-interleaved-store-i8-stride-7.ll | 16 +--
.../vector-interleaved-store-i8-stride-8.ll | 134 +++++++++---------
.../test/CodeGen/X86/vector-shuffle-256-v4.ll | 61 ++++++--
.../CodeGen/X86/zero_extend_vector_inreg.ll | 28 ++--
5 files changed, 150 insertions(+), 102 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 386d56dcda9de..cbd09e9aa1459 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -6113,6 +6113,19 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
Ops.push_back(N1);
return true;
}
+ case ISD::CONCAT_VECTORS: {
+ // Limit this to vXi64 vector cases to make the most of cross lane shuffles.
+ unsigned NumSubElts = N.getOperand(0).getValueType().getVectorNumElements();
+ if (NumBitsPerElt == 64) {
+ for (unsigned I = 0, E = N.getNumOperands(); I != E; ++I) {
+ for (unsigned M = 0; M != NumSubElts; ++M)
+ Mask.push_back((I * NumElts) + M);
+ Ops.push_back(N.getOperand(I));
+ }
+ return true;
+ }
+ return false;
+ }
case ISD::INSERT_SUBVECTOR: {
SDValue Src = N.getOperand(0);
SDValue Sub = N.getOperand(1);
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
index be83db26aa7ed..89ed0040a71c2 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
@@ -1215,10 +1215,10 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zmm0[1],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zmm0[36],zero,zero,zero,zero,zero,zero,zmm0[37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm1
-; AVX512BW-NEXT: vpshufb {{.*#+}} zmm2 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5]
-; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vporq %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,0,1,2,3,0,1]
+; AVX512BW-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zero,zmm2[20,28],zero,zero,zero,zero,zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zero,zmm2[38,46],zero,zero,zero,zmm2[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vporq %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT: movabsq $63546854584629360, %rcx # imm = 0xE1C3870E1C3870
; AVX512BW-NEXT: kmovq %rcx, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
@@ -1294,10 +1294,10 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zmm0[1],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zmm0[36],zero,zero,zero,zero,zero,zero,zmm0[37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm1
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm2 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-BW-NEXT: vporq %zmm2, %zmm1, %zmm1
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,0,1,2,3,0,1]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zero,zmm2[20,28],zero,zero,zero,zero,zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zero,zmm2[38,46],zero,zero,zero,zmm2[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-BW-NEXT: vporq %zmm1, %zmm2, %zmm1
; AVX512DQ-BW-NEXT: movabsq $63546854584629360, %rcx # imm = 0xE1C3870E1C3870
; AVX512DQ-BW-NEXT: kmovq %rcx, %k1
; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
index ba51c65ccab13..dc163bbe3477b 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
@@ -1161,23 +1161,23 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm0[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
-; AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5]
-; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm1[2,3,0,1,2,3,0,1]
+; AVX512BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
; AVX512BW-NEXT: movl $287445282, %ecx # imm = 0x11221122
; AVX512BW-NEXT: kmovd %ecx, %k1
-; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1}
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,4,5,6,7]
-; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[2,3,0,1,2,3,0,1]
-; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
+; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm0 {%k1}
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7]
+; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,0,1,2,3,0,1]
+; AVX512BW-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
; AVX512BW-NEXT: movl $1149781128, %ecx # imm = 0x44884488
; AVX512BW-NEXT: kmovd %ecx, %k1
-; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k1}
; AVX512BW-NEXT: movw $-21846, %cx # imm = 0xAAAA
; AVX512BW-NEXT: kmovd %ecx, %k1
-; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax)
+; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -1231,23 +1231,23 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm0[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
+; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm1[2,3,0,1,2,3,0,1]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: movl $287445282, %ecx # imm = 0x11221122
; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
-; AVX512DQ-BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1}
-; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,4,5,6,7]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
-; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[2,3,0,1,2,3,0,1]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
+; AVX512DQ-BW-NEXT: vmovdqu16 %zmm3, %zmm0 {%k1}
+; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
+; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,0,1,2,3,0,1]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
; AVX512DQ-BW-NEXT: movl $1149781128, %ecx # imm = 0x44884488
; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
-; AVX512DQ-BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
+; AVX512DQ-BW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k1}
; AVX512DQ-BW-NEXT: movw $-21846, %cx # imm = 0xAAAA
; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
-; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1}
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax)
+; AVX512DQ-BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
@@ -2126,41 +2126,40 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3
; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,2,0,2,12,14,12,14]
-; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm3
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm7 = zmm2[0,2,0,2,4,6,4,6]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
+; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [0,2,0,2,0,2,0,2]
+; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpermq %zmm5, %zmm6, %zmm5
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
+; AVX512BW-FCP-NEXT: vpermq %zmm3, %zmm6, %zmm7
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
; AVX512BW-FCP-NEXT: movl $-2004318072, %ecx # imm = 0x88888888
; AVX512BW-FCP-NEXT: kmovd %ecx, %k1
-; AVX512BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm7 {%k1}
-; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm1
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm0[0,2,0,2,4,6,4,6]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm5 {%k1}
+; AVX512BW-FCP-NEXT: vpermq %zmm4, %zmm6, %zmm4
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpermq %zmm1, %zmm6, %zmm6
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm6 = zmm6[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
; AVX512BW-FCP-NEXT: movl $572662306, %ecx # imm = 0x22222222
; AVX512BW-FCP-NEXT: kmovd %ecx, %k2
-; AVX512BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm3 {%k2}
+; AVX512BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm4 {%k2}
; AVX512BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA
; AVX512BW-FCP-NEXT: kmovd %ecx, %k3
-; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm3 {%k3}
-; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [5,7,5,7,5,7,5,7]
-; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermq %zmm5, %zmm1, %zmm5
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm2 = zmm2[1,3,1,3,5,7,5,7]
+; AVX512BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm4 {%k3}
+; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,3,1,3,1,3,1,3]
+; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpermq %zmm3, %zmm5, %zmm3
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
+; AVX512BW-FCP-NEXT: vpermq %zmm2, %zmm5, %zmm2
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
-; AVX512BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm2 {%k1}
-; AVX512BW-FCP-NEXT: vpermq %zmm4, %zmm1, %zmm1
+; AVX512BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm2 {%k1}
+; AVX512BW-FCP-NEXT: vpermq %zmm1, %zmm5, %zmm1
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7]
+; AVX512BW-FCP-NEXT: vpermq %zmm0, %zmm5, %zmm0
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2}
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k3}
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -2231,41 +2230,40 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,2,0,2,12,14,12,14]
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm7 = zmm2[0,2,0,2,4,6,4,6]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [0,2,0,2,0,2,0,2]
+; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpermq %zmm5, %zmm6, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpermq %zmm3, %zmm6, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
; AVX512DQ-BW-FCP-NEXT: movl $-2004318072, %ecx # imm = 0x88888888
; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm7 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm0[0,2,0,2,4,6,4,6]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm5 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpermq %zmm4, %zmm6, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpermq %zmm1, %zmm6, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm6 = zmm6[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: movl $572662306, %ecx # imm = 0x22222222
; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm3 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm4 {%k2}
; AVX512DQ-BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA
; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k3
-; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm3 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [5,7,5,7,5,7,5,7]
-; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermq %zmm5, %zmm1, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm2 = zmm2[1,3,1,3,5,7,5,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm4 {%k3}
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,3,1,3,1,3,1,3]
+; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpermq %zmm3, %zmm5, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63]
+; AVX512DQ-BW-FCP-NEXT: vpermq %zmm2, %zmm5, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u]
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm2 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpermq %zmm4, %zmm1, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm2 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpermq %zmm1, %zmm5, %zmm1
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7]
+; AVX512DQ-BW-FCP-NEXT: vpermq %zmm0, %zmm5, %zmm0
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k3}
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%in.vec0 = load <16 x i8>, ptr %in.vecptr0, align 64
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
index da65fecba773b..d6208aca3b2b7 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -1654,12 +1654,27 @@ define <4 x i64> @shuffle_v4i64_0044_v2i64(<2 x i64> %a, <2 x i64> %b) {
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v4i64_0044_v2i64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
-; AVX512VL-NEXT: retq
+; AVX512VL-SLOW-LABEL: shuffle_v4i64_0044_v2i64:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_0044_v2i64:
+; AVX512VL-FAST-ALL: # %bb.0:
+; AVX512VL-FAST-ALL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
+; AVX512VL-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,4]
+; AVX512VL-FAST-ALL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-ALL-NEXT: retq
+;
+; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4i64_0044_v2i64:
+; AVX512VL-FAST-PERLANE: # %bb.0:
+; AVX512VL-FAST-PERLANE-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; AVX512VL-FAST-PERLANE-NEXT: retq
%1 = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 0, i32 0>
%2 = shufflevector <2 x i64> %b, <2 x i64> poison, <2 x i32> <i32 0, i32 0>
%3 = shufflevector <2 x i64> %1, <2 x i64> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -1667,12 +1682,34 @@ define <4 x i64> @shuffle_v4i64_0044_v2i64(<2 x i64> %a, <2 x i64> %b) {
}
define <4 x i64> @shuffle_v4i64_1032_v2i64(<2 x i64> %a, <2 x i64> %b) {
-; ALL-LABEL: shuffle_v4i64_1032_v2i64:
-; ALL: # %bb.0:
-; ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
-; ALL-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v4i64_1032_v2i64:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v4i64_1032_v2i64:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_1032_v2i64:
+; AVX512VL-FAST-ALL: # %bb.0:
+; AVX512VL-FAST-ALL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
+; AVX512VL-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,0,5,4]
+; AVX512VL-FAST-ALL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-ALL-NEXT: retq
+;
+; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4i64_1032_v2i64:
+; AVX512VL-FAST-PERLANE: # %bb.0:
+; AVX512VL-FAST-PERLANE-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX512VL-FAST-PERLANE-NEXT: retq
%1 = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> <i32 1, i32 0>
%2 = shufflevector <2 x i64> %b, <2 x i64> poison, <2 x i32> <i32 1, i32 0>
%3 = shufflevector <2 x i64> %1, <2 x i64> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
index f7c29cba30bd5..6b1d118ca97ad 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
@@ -6173,13 +6173,13 @@ define void @vec512_v64i8_to_v4i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.b
; AVX512F-FAST: # %bb.0:
; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u]
-; AVX512F-FAST-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7]
-; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512F-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512F-FAST-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512F-FAST-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u]
+; AVX512F-FAST-NEXT: vpmovsxbq {{.*#+}} zmm2 = [8,9,9,0,0,1,1,3]
+; AVX512F-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512F-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm2, %zmm0
; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0
@@ -6209,13 +6209,13 @@ define void @vec512_v64i8_to_v4i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.b
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u]
-; AVX512BW-FAST-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7]
-; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512BW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FAST-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u]
+; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} zmm2 = [8,9,9,0,0,1,1,3]
+; AVX512BW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FAST-NEXT: vpandq %zmm0, %zmm2, %zmm0
; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-FAST-NEXT: vzeroupper