[llvm-branch-commits] [llvm] release/21.x: [X86] lowerShuffleAsLanePermuteAndPermute - ensure we've simplified the demanded shuffle mask elts before testing for a matching shuffle (#153554) (PR #154299)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Tue Aug 19 02:32:12 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-x86
Author: None (llvmbot)
Changes:
Backport c96d0da62b7ae83b613fc2b9c134cc42402ae2c4
Requested by: @RKSimon
---
Patch is 30.67 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154299.diff
2 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+6-6)
- (modified) llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll (+263)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c7839baf7de8e..7e87808b6ccc4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -15403,18 +15403,18 @@ static SDValue lowerShuffleAsLanePermuteAndPermute(
return SDValue();
}
- // Avoid returning the same shuffle operation. For example,
- // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
- // undef:v16i16
- if (CrossLaneMask == Mask || InLaneMask == Mask)
- return SDValue();
-
// Simplify CrossLaneMask based on the actual demanded elements.
if (V1.hasOneUse())
for (int i = 0; i != NumElts; ++i)
if (!DemandedCrossLane[i])
CrossLaneMask[i] = SM_SentinelUndef;
+ // Avoid returning the same shuffle operation. For example,
+ // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
+ // undef:v16i16
+ if (CrossLaneMask == Mask || InLaneMask == Mask)
+ return SDValue();
+
SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
InLaneMask);
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
index 87c135ddcec95..ef20cf2a09bba 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
@@ -1724,6 +1724,269 @@ define void @PR54562_mem(ptr %src, ptr %dst) {
ret void
}
+define <512 x i8> @PR153457(<512 x i8> %a0, <512 x i8> %a1) nounwind {
+; AVX512F-LABEL: PR153457:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: andq $-64, %rsp
+; AVX512F-NEXT: subq $64, %rsp
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm7
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; AVX512F-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm7
+; AVX512F-NEXT: vpbroadcastd %xmm0, %ymm9
+; AVX512F-NEXT: vpblendvb %ymm8, %ymm7, %ymm9, %ymm8
+; AVX512F-NEXT: vextracti32x4 $2, %zmm5, %xmm7
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,2,4,6,8,10,12,13,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vextracti32x4 $2, %zmm4, %xmm9
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,128]
+; AVX512F-NEXT: vpshufb %xmm10, %xmm9, %xmm9
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[5]
+; AVX512F-NEXT: vpor %xmm11, %xmm9, %xmm9
+; AVX512F-NEXT: vpshufb %xmm10, %xmm1, %xmm10
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1]
+; AVX512F-NEXT: vpor %xmm11, %xmm10, %xmm10
+; AVX512F-NEXT: vpslld $24, %xmm0, %xmm11
+; AVX512F-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpblendvb %ymm12, %ymm3, %ymm11, %ymm3
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,8,10,12,5,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vmovdqa 16(%rbp), %xmm11
+; AVX512F-NEXT: vpsrld $16, %xmm11, %xmm12
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm12[0]
+; AVX512F-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2
+; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm11[0],zero,xmm11[1],zero
+; AVX512F-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7]
+; AVX512F-NEXT: vpsrld $24, %xmm11, %xmm8
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm8, %zmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm8
+; AVX512F-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm8[0,1,2,3],zmm1[4,5,6,7]
+; AVX512F-NEXT: vinserti128 $1, %xmm11, %ymm9, %ymm8
+; AVX512F-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,21,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4
+; AVX512F-NEXT: vpsrlq $48, %xmm11, %xmm8
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm8[0]
+; AVX512F-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5
+; AVX512F-NEXT: vpermq {{.*#+}} ymm7 = ymm0[0,1,2,0]
+; AVX512F-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,24,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpbroadcastb 16(%rbp), %ymm8
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7]
+; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm0[4,5,6,7]
+; AVX512F-NEXT: vpsrlq $56, %xmm11, %xmm7
+; AVX512F-NEXT: vmovdqa %ymm7, 416(%rdi)
+; AVX512F-NEXT: vmovdqa %ymm6, 384(%rdi)
+; AVX512F-NEXT: vmovdqa64 %zmm0, (%rdi)
+; AVX512F-NEXT: vmovdqa64 %zmm5, 320(%rdi)
+; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%rdi)
+; AVX512F-NEXT: vmovdqa64 %zmm2, 128(%rdi)
+; AVX512F-NEXT: vmovdqa64 %zmm4, 256(%rdi)
+; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%rdi)
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: PR153457:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: pushq %rbp
+; AVX512BW-NEXT: movq %rsp, %rbp
+; AVX512BW-NEXT: andq $-64, %rsp
+; AVX512BW-NEXT: subq $64, %rsp
+; AVX512BW-NEXT: movq %rdi, %rax
+; AVX512BW-NEXT: vmovdqa64 16(%rbp), %zmm7
+; AVX512BW-NEXT: vpbroadcastq %xmm0, %ymm8
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; AVX512BW-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm2, %ymm8
+; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm10
+; AVX512BW-NEXT: vpblendvb %ymm9, %ymm8, %ymm10, %ymm8
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,8,10,12,5,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2
+; AVX512BW-NEXT: vextracti32x4 $2, %zmm5, %xmm8
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3],xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,2,4,6,8,10,12,13,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vinserti32x4 $2, %xmm8, %zmm5, %zmm5
+; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,128,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,128,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63]
+; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vextracti32x4 $2, %zmm4, %xmm9
+; AVX512BW-NEXT: vpshufb %xmm8, %xmm9, %xmm9
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[5]
+; AVX512BW-NEXT: vpor %xmm10, %xmm9, %xmm9
+; AVX512BW-NEXT: vinserti32x4 $2, %xmm9, %zmm4, %zmm4
+; AVX512BW-NEXT: vpslld $24, %xmm0, %xmm9
+; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm3, %ymm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpblendvb %ymm10, %ymm3, %ymm9, %ymm3
+; AVX512BW-NEXT: vpshufb %zmm8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpshufb {{.*#+}} zmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vporq %zmm8, %zmm1, %zmm1
+; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm1, %ymm8
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm8[0,1,2,3],zmm1[4,5,6,7]
+; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63]
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20,21,22,23,24,25,26,27,28,29,30,31]
+; AVX512BW-NEXT: vpermi2w %zmm7, %zmm2, %zmm8
+; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm7[0],zero,xmm7[1],zero
+; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
+; AVX512BW-NEXT: vpsrld $24, %xmm7, %xmm3
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm3
+; AVX512BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,39,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vinserti32x4 $3, %xmm7, %zmm4, %zmm4
+; AVX512BW-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,53,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[0,1,2,0]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,24,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpbroadcastb 16(%rbp), %ymm9
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7]
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm6[0,1,2,3],zmm0[4,5,6,7]
+; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [16,17,18,19,35,0,0,0,8,9,10,11,12,13,14,15,16,17,18,19,35,0,0,0,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vpermi2w %zmm7, %zmm5, %zmm6
+; AVX512BW-NEXT: vmovdqa64 %zmm6, 320(%rdi)
+; AVX512BW-NEXT: vmovdqa64 %zmm3, 384(%rdi)
+; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdi)
+; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rdi)
+; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rdi)
+; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rdi)
+; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rdi)
+; AVX512BW-NEXT: movq %rbp, %rsp
+; AVX512BW-NEXT: popq %rbp
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: PR153457:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: pushq %rbp
+; AVX512DQ-NEXT: movq %rsp, %rbp
+; AVX512DQ-NEXT: andq $-64, %rsp
+; AVX512DQ-NEXT: subq $64, %rsp
+; AVX512DQ-NEXT: movq %rdi, %rax
+; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm7
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; AVX512DQ-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm7
+; AVX512DQ-NEXT: vpbroadcastd %xmm0, %ymm9
+; AVX512DQ-NEXT: vpblendvb %ymm8, %ymm7, %ymm9, %ymm8
+; AVX512DQ-NEXT: vextracti32x4 $2, %zmm5, %xmm7
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,2,4,6,8,10,12,13,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vextracti32x4 $2, %zmm4, %xmm9
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,128]
+; AVX512DQ-NEXT: vpshufb %xmm10, %xmm9, %xmm9
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[5]
+; AVX512DQ-NEXT: vpor %xmm11, %xmm9, %xmm9
+; AVX512DQ-NEXT: vpshufb %xmm10, %xmm1, %xmm10
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1]
+; AVX512DQ-NEXT: vpor %xmm11, %xmm10, %xmm10
+; AVX512DQ-NEXT: vpslld $24, %xmm0, %xmm11
+; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm3
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpblendvb %ymm12, %ymm3, %ymm11, %ymm3
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,8,10,12,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovdqa 16(%rbp), %xmm11
+; AVX512DQ-NEXT: vpsrld $16, %xmm11, %xmm12
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm12[0]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2
+; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm11[0],zero,xmm11[1],zero
+; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7]
+; AVX512DQ-NEXT: vpsrld $24, %xmm11, %xmm8
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm8, %zmm3
+; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm8
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm8[0,1,2,3],zmm1[4,5,6,7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm9, %ymm8
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,21,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4
+; AVX512DQ-NEXT: vpsrlq $48, %xmm11, %xmm8
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm8[0]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm0[0,1,2,0]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,24,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpbroadcastb 16(%rbp), %ymm8
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7]
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm0[4,5,6,7]
+; AVX512DQ-NEXT: vpsrlq $56, %xmm11, %xmm7
+; AVX512DQ-NEXT: vmovdqa %ymm7, 416(%rdi)
+; AVX512DQ-NEXT: vmovdqa %ymm6, 384(%rdi)
+; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdi)
+; AVX512DQ-NEXT: vmovdqa64 %zmm5, 320(%rdi)
+; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rdi)
+; AVX512DQ-NEXT: vmovdqa64 %zmm2, 128(%rdi)
+; AVX512DQ-NEXT: vmovdqa64 %zmm4, 256(%rdi)
+; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rdi)
+; AVX512DQ-NEXT: movq %rbp, %rsp
+; AVX512DQ-NEXT: popq %rbp
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+;
+; AVX512VBMI-LABEL: PR153457:
+; AVX512VBMI: # %bb.0:
+; AVX512VBMI-NEXT: pushq %rbp
+; AVX512VBMI-NEXT: movq %rsp, %rbp
+; AVX512VBMI-NEXT: andq $-64, %rsp
+; AVX512VBMI-NEXT: subq $64, %rsp
+; AVX512VBMI-NEXT: movq %rdi, %rax
+; AVX512VBMI-NEXT: vmovdqa64 16(%rbp), %zmm7
+; AVX512VBMI-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [32,33,34,35,36,37,38,70,0,0,0,0,0,0,0,0,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,70,0,0,0,0,0,0,0,0,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+; AVX512VBMI-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
+; AVX512VBMI-NEXT: vpermi2b %zmm0, %zmm5, %zmm8
+; AVX512VBMI-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,69,0,0,0,0,0,0,0,0,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,69,0,0,0,0,0,0,0,0,24,25,26,27,28,29,30,31]
+; AVX512VBMI-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512VBMI-NEXT: vpermi2b %zmm0, %zmm4, %zmm5
+; AVX512VBMI-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,68,0,0,0,0,0,0,0,0,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,68,0,0,0,0,0,0,0,0]
+; AVX512VBMI-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512VBMI-NEXT: vpermi2b %zmm0, %zmm3, %zmm4
+; AVX512VBMI-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,1,2,3,4,5,6,66,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,67,0,1,2,3,4,5,6,66,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,67]
+; AVX512VBMI-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512VBMI-NEXT: vpermi2b %zmm0, %zmm2, %zmm3
+; AVX512VBMI-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63]
+; AVX512VBMI-NEXT: vporq %zmm2, %zmm1, %zmm1
+; AVX512VBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,71]
+; AVX512VBMI-NEXT: vpermi2b %zmm0, %zmm6, %zmm2
+; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,0,64,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,58,59,60,61,62,63]
+; AVX512VBMI-NEXT: vpermi2b %zmm7, %zmm0, %zmm6
+; AVX512VBMI-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,2,3,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20,21,22,23,24,25,26,27,28,29,30,31]
+; AVX512VBMI-NEXT: vpermi2w %zmm7, %zmm3, %zmm0
+; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm3 = [67,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,68,u,u,u,u,u,u,u]
+; AVX512VBMI-NEXT: vpermi2b %zmm7, %zmm4, %zmm3
+; AVX512VBMI-NEXT: vinserti32x4 $3, %xmm7, %zmm5, %zmm4
+; AVX512VBMI-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,53,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512VBMI-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [16,17,18,19,35,0,0,0,8,9,10,11,12,13,14,15,16,17,18,19,35,0,0,0,8,9,10,11,12,13,14,15]
+; AVX512VBMI-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512VBMI-NEXT: vpermi2w %zmm7, %zmm8, %zmm5
+; AVX512VBMI-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2
+; AVX512VBMI-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,39,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,65,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63]
+; AVX512VBMI-NEXT: vpermi2b %zmm7, %zmm1, %zmm8
+; AVX512VBMI-NEXT: vmovdqa64 %zmm5, 320(%rdi)
+; AVX512VBMI-NEXT: vmovdqa64 %zmm4, 256(%rdi)
+; AVX512VBMI-NEXT: vmovdqa64 %zmm3, 192(%rdi)
+; AVX512VBMI-NEXT: vmovdqa64 %zmm0, 128(%rdi)
+; AVX512VBMI-NEXT: vmovdqa64 %zmm8, 64(%rdi)
+; AVX512VBMI-NEXT: vmovdqa64 %zmm6, (%rdi)
+; AVX512VBMI-NEXT: vmovdqa64 %zmm2, 384(%rdi)
+; AVX512VBMI-NEXT: movq %rbp, %rsp
+; AVX512VBMI-NEXT: popq %rbp
+; AVX512VBMI-NEXT: vzeroupper
+; AVX512VBMI-NEXT: retq
+ %shuffle1 = shufflevector <512 x i8> %a0, <512 x i8> zeroinitializer, <512 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 pois...
[truncated]
``````````
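For readers skimming the patch, the functional change is the reordering in lowerShuffleAsLanePermuteAndPermute shown in the first hunk: the "same shuffle" bail-out now runs only after the cross-lane mask has been simplified against the demanded elements. Below is a minimal standalone sketch of that ordering, not LLVM code: it uses plain std::vector in place of LLVM's mask types, kUndef stands in for SM_SentinelUndef, the InLaneMask half of the check is omitted, and all names are hypothetical.

```cpp
// Sketch of the ordering the patch enforces: simplify the cross-lane mask
// first, then reject masks that still match the original shuffle.
#include <optional>
#include <vector>

constexpr int kUndef = -1; // stand-in for SM_SentinelUndef

std::optional<std::vector<int>> buildCrossLaneMask(
    std::vector<int> CrossLaneMask,            // candidate cross-lane mask
    const std::vector<int> &Mask,              // original shuffle mask
    const std::vector<bool> &DemandedCrossLane) {
  // Simplify first: elements the in-lane shuffle never reads become undef.
  for (int i = 0, e = (int)CrossLaneMask.size(); i != e; ++i)
    if (!DemandedCrossLane[i])
      CrossLaneMask[i] = kUndef;

  // Only now test for a no-op. With the old order (check before
  // simplification), a mask that simplified back to the original slipped
  // through and the lowering re-created the very shuffle it started from.
  if (CrossLaneMask == Mask)
    return std::nullopt;

  return CrossLaneMask;
}
```

The PR153457 test added in the second hunk exercises exactly that case on <512 x i8> shuffles across the AVX512F/BW/DQ/VBMI configurations.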
https://github.com/llvm/llvm-project/pull/154299