[llvm] [X86] combineINSERT_SUBVECTOR - peek through bitcasts to find a concatenation of subvector shuffles (PR #131331)

via llvm-commits <llvm-commits at lists.llvm.org>
Fri Mar 14 06:18:58 PDT 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

<details>
<summary>Changes</summary>

Extend the existing concat(shuffle(),shuffle(),...) shuffle combining to handle mixed bitcasts such as concat(shuffle(),bitcast(shuffle()),...).

---

Patch is 1.30 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/131331.diff


8 Files Affected:

- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+1-1) 
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll (+1630-1686) 
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll (+995-1053) 
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll (+592-630) 
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll (+1364-1464) 
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll (+2750-2770) 
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll (+232-504) 
- (modified) llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll (+2-4) 


``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ac39c8fc0cba3..f02609a62425b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -58818,7 +58818,7 @@ static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
 
     // Attempt to recursively combine to a shuffle.
     if (all_of(SubVectorOps, [](SDValue SubOp) {
-          return isTargetShuffle(SubOp.getOpcode());
+          return isTargetShuffle(peekThroughBitcasts(SubOp).getOpcode());
         })) {
       SDValue Op(N, 0);
       if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
index fe567b90b3bcd..0df63422b5d84 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
@@ -3342,629 +3342,615 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512-LABEL: store_i16_stride5_vf32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm9
-; AVX512-NEXT:    vmovdqa (%rcx), %xmm12
-; AVX512-NEXT:    vmovdqa 32(%rcx), %xmm10
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm13 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7]
-; AVX512-NEXT:    vpshufb %xmm13, %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqa64 (%rdx), %ymm17
+; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm10
+; AVX512-NEXT:    vmovdqa (%rcx), %xmm9
+; AVX512-NEXT:    vmovdqa 32(%rcx), %xmm11
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7]
+; AVX512-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm23
+; AVX512-NEXT:    vmovdqa64 (%rdx), %ymm18
 ; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm5
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm17[3,2,3,3,7,6,7,7]
-; AVX512-NEXT:    vmovdqa (%rcx), %ymm1
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4],ymm3[5,6,7,8],ymm2[9],ymm3[10],ymm2[11,12],ymm3[13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2]
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[0,1,0,1]
-; AVX512-NEXT:    vmovdqa (%rsi), %xmm14
-; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm8
-; AVX512-NEXT:    vmovdqa (%rdi), %xmm15
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm18[3,2,3,3,7,6,7,7]
+; AVX512-NEXT:    vmovdqa (%rcx), %ymm7
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3,4],ymm2[5,6,7,8],ymm1[9],ymm2[10],ymm1[11,12],ymm2[13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,3,2]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,0,1]
+; AVX512-NEXT:    vmovdqa (%rsi), %xmm3
+; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm1
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm8
 ; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
 ; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6]
-; AVX512-NEXT:    vmovdqa64 (%rdi), %ymm20
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm20[2,3,2,3,6,7,6,7]
+; AVX512-NEXT:    vmovdqa (%rdi), %ymm4
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm13 = ymm4[2,3,2,3,6,7,6,7]
 ; AVX512-NEXT:    vmovdqa (%rsi), %ymm6
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm7 = ymm6[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,2,6,7,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7,8],ymm3[9],ymm7[10],ymm3[11],ymm7[12,13],ymm3[14],ymm7[15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,3,2,2]
-; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm2[0,1,0,1]
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535]
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm16 & (zmm3 ^ zmm0))
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm14 = ymm6[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm14 = ymm14[2,3,2,2,6,7,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7,8],ymm13[9],ymm14[10],ymm13[11],ymm14[12,13],ymm13[14],ymm14[15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm13[2,3,2,2]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],zmm2[0,1,0,1]
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535]
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm13 = zmm0 ^ (zmm17 & (zmm13 ^ zmm0))
 ; AVX512-NEXT:    vpbroadcastq 24(%r8), %ymm0
 ; AVX512-NEXT:    vpbroadcastq 32(%r8), %ymm2
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm19
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm19 = zmm19 ^ (mem & (zmm19 ^ zmm3))
-; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm3
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm19 = zmm19 ^ (mem & (zmm19 ^ zmm13))
+; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm14
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u]
-; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm0
+; AVX512-NEXT:    vpshufb %ymm2, %ymm14, %ymm0
+; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm21
+; AVX512-NEXT:    vmovdqa64 32(%rdi), %ymm16
+; AVX512-NEXT:    vpermq {{.*#+}} ymm13 = ymm16[1,1,2,2]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm13 = ymm13[0],ymm0[1],ymm13[2,3],ymm0[4],ymm13[5],ymm0[6],ymm13[7,8],ymm0[9],ymm13[10,11],ymm0[12],ymm13[13],ymm0[14],ymm13[15]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13]
+; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm1
+; AVX512-NEXT:    vpbroadcastq 40(%rdi), %xmm15
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm15[1],xmm1[2,3],xmm15[4],xmm1[5],xmm15[6],xmm1[7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm13, %zmm1, %zmm13
+; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm15
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u]
+; AVX512-NEXT:    vpshufb %ymm2, %ymm15, %ymm1
 ; AVX512-NEXT:    vmovdqa64 %ymm2, %ymm22
-; AVX512-NEXT:    vmovdqa64 32(%rdi), %ymm18
-; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm18[1,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2,3],ymm0[4],ymm7[5],ymm0[6],ymm7[7,8],ymm0[9],ymm7[10,11],ymm0[12],ymm7[13],ymm0[14],ymm7[15]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm7 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13]
-; AVX512-NEXT:    vpshufb %xmm7, %xmm8, %xmm8
-; AVX512-NEXT:    vpbroadcastq 40(%rdi), %xmm11
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm8 = xmm8[0],xmm11[1],xmm8[2,3],xmm11[4],xmm8[5],xmm11[6],xmm8[7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm8, %zmm11
-; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm0
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm4 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u]
-; AVX512-NEXT:    vpshufb %ymm4, %ymm0, %ymm2
-; AVX512-NEXT:    vmovdqa64 %ymm4, %ymm23
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm8 = ymm5[3,0,3,0,7,4,7,4]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm8[0],ymm2[1],ymm8[2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7,8],ymm2[9],ymm8[10],ymm2[11],ymm8[12,13],ymm2[14],ymm8[15]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm8 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9]
-; AVX512-NEXT:    vpshufb %xmm8, %xmm10, %xmm10
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[1,2,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3],xmm10[4,5],xmm9[6],xmm10[7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,0,0]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm9, %zmm2
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm11))
-; AVX512-NEXT:    vmovdqa (%r8), %ymm10
-; AVX512-NEXT:    vmovdqa 32(%r8), %ymm9
-; AVX512-NEXT:    vmovdqa {{.*#+}} ymm11 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128]
-; AVX512-NEXT:    vpshufb %ymm11, %ymm9, %ymm4
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,1,1,1]
-; AVX512-NEXT:    vpandnq %ymm9, %ymm21, %ymm9
-; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm9, %zmm9
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 | (zmm2 & zmm21)
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm5[3,0,3,0,7,4,7,4]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm12 = ymm12[0],ymm1[1],ymm12[2],ymm1[3],ymm12[4,5],ymm1[6],ymm12[7,8],ymm1[9],ymm12[10],ymm1[11],ymm12[12,13],ymm1[14],ymm12[15]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9]
+; AVX512-NEXT:    vpshufb %xmm1, %xmm11, %xmm11
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[1,2,2,2]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5],xmm10[6],xmm11[7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,1,0,0]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm12, %zmm10, %zmm12
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm13))
+; AVX512-NEXT:    vmovdqa (%r8), %ymm11
+; AVX512-NEXT:    vmovdqa 32(%r8), %ymm10
+; AVX512-NEXT:    vmovdqa {{.*#+}} ymm13 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128]
+; AVX512-NEXT:    vpshufb %ymm13, %ymm10, %ymm2
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,1,1,1]
+; AVX512-NEXT:    vpandnq %ymm10, %ymm20, %ymm10
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm10, %zmm10
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm10 = zmm10 | (zmm12 & zmm20)
+; AVX512-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX512-NEXT:    vpbroadcastq 8(%rdi), %xmm2
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; AVX512-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,0,1],zmm0[0,1,0,1]
 ; AVX512-NEXT:    vmovdqa (%rdx), %xmm2
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3]
-; AVX512-NEXT:    vpshufb %xmm13, %xmm4, %xmm4
-; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1]
-; AVX512-NEXT:    vpshufb %xmm8, %xmm12, %xmm8
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3]
+; AVX512-NEXT:    vmovdqa64 %xmm23, %xmm8
+; AVX512-NEXT:    vpshufb %xmm8, %xmm3, %xmm3
+; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1]
+; AVX512-NEXT:    vpshufb %xmm1, %xmm9, %xmm1
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,2,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm2 = xmm8[0],xmm2[1],xmm8[2],xmm2[3],xmm8[4,5],xmm2[6],xmm8[7]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm4, %zmm2
-; AVX512-NEXT:    vpshufb %xmm7, %xmm14, %xmm4
-; AVX512-NEXT:    vpbroadcastq 8(%rdi), %xmm7
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm7[1],xmm4[2,3],xmm7[4],xmm4[5],xmm7[6],xmm4[7]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[0,2,1,3]
-; AVX512-NEXT:    vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6]
-; AVX512-NEXT:    vinserti32x4 $2, %xmm4, %zmm7, %zmm4
-; AVX512-NEXT:    vpermq {{.*#+}} zmm4 = zmm4[0,1,0,1,4,5,4,5]
-; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm7 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535]
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm2 ^ (zmm7 & (zmm4 ^ zmm2))
-; AVX512-NEXT:    vpbroadcastq (%r8), %ymm2
-; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm10[0,1,1,1]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm8, %zmm2, %zmm2
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm4))
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm18[0,1,2,1,4,5,6,5]
-; AVX512-NEXT:    vprolq $16, %ymm3, %ymm8
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm8[0,1],ymm4[2],ymm8[3],ymm4[4],ymm8[5,6],ymm4[7],ymm8[8,9],ymm4[10],ymm8[11],ymm4[12],ymm8[13,14],ymm4[15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm8 = ymm18[2,3,2,3,6,7,6,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,2,6,7,6,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4,5],ymm8[6],ymm3[7,8],ymm8[9],ymm3[10],ymm8[11],ymm3[12,13],ymm8[14],ymm3[15]
+; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535]
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm2 & (zmm1 ^ zmm0))
+; AVX512-NEXT:    vpbroadcastq (%r8), %ymm0
+; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm11[0,1,1,1]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm1 = ymm16[0,1,2,1,4,5,6,5]
+; AVX512-NEXT:    vprolq $16, %ymm14, %ymm3
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3],ymm1[4],ymm3[5,6],ymm1[7],ymm3[8,9],ymm1[10],ymm3[11],ymm1[12],ymm3[13,14],ymm1[15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm16[2,3,2,3,6,7,6,7]
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm8 = ymm14[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[2,3,2,2,6,7,6,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm3 = ymm8[0],ymm3[1],ymm8[2],ymm3[3],ymm8[4,5],ymm3[6],ymm8[7,8],ymm3[9],ymm8[10],ymm3[11],ymm8[12,13],ymm3[14],ymm8[15]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[2,3,2,2]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm4
+; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
 ; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25]
 ; AVX512-NEXT:    # ymm3 = mem[0,1,0,1]
-; AVX512-NEXT:    vpshufb %ymm3, %ymm0, %ymm8
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm12 = ymm5[1,1,1,2,5,5,5,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0],ymm12[1],ymm8[2,3],ymm12[4],ymm8[5],ymm12[6],ymm8[7,8],ymm12[9],ymm8[10,11],ymm12[12],ymm8[13],ymm12[14],ymm8[15]
+; AVX512-NEXT:    vpshufb %ymm3, %ymm15, %ymm8
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm5[1,1,1,2,5,5,5,6]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
 ; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[3,2,3,3,7,6,7,7]
-; AVX512-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2],ymm5[3,4],ymm0[5,6,7,8],ymm5[9],ymm0[10],ymm5[11,12],ymm0[13,14,15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm8, %zmm0
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm4 ^ (zmm7 & (zmm0 ^ zmm4))
-; AVX512-NEXT:    vpbroadcastq 48(%r8), %ymm4
-; AVX512-NEXT:    vpbroadcastq 56(%r8), %ymm5
-; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm4, %zmm4
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm0))
-; AVX512-NEXT:    vmovdqa64 %ymm22, %ymm0
-; AVX512-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
-; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm20[1,1,2,2]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3],ymm0[4],ymm5[5],ymm0[6],ymm5[7,8],ymm0[9],ymm5[10,11],ymm0[12],ymm5[13],ymm0[14],ymm5[15]
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm5 = ymm20[0,1,2,1,4,5,6,5]
-; AVX512-NEXT:    vprolq $16, %ymm6, %ymm6
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3],ymm5[4],ymm6[5,6],ymm5[7],ymm6[8,9],ymm5[10],ymm6[11],ymm5[12],ymm6[13,14],ymm5[15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm0
-; AVX512-NEXT:    vmovdqa64 %ymm23, %ymm5
-; AVX512-NEXT:    vpshufb %ymm5, %ymm1, %ymm5
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm6 = ymm17[3,0,3,0,7,4,7,4]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15]
-; AVX512-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
-; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm17[1,1,1,2,5,5,5,6]
-; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15]
-; AVX512-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm5, %zmm1
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm16 & (zmm1 ^ zmm0))
-; AVX512-NEXT:    vpshufb %ymm11, %ymm10, %ymm0
-; AVX512-NEXT:    vpbroadcastq 16(%r8), %ymm3
-; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
-; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm1 & mem)
-; AVX512-NEXT:    vmovdqa64 %zmm0, 64(%r9)
-; AVX512-NEXT:    vmovdqa64 %zmm4, 256(%r9)
-; AVX512-NEXT:    vmovdqa64 %zmm2, (%r9)
-; AVX512-NEXT:    vmovdqa64 %zmm9, 192(%r9)
+; AVX512-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm15[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3,4],ymm9[5,6,7,8],ymm5[9],ymm9[10],ymm5[11,12],ymm9[13,14,15]
+; AVX512-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,3,2]
+; AVX512-NEXT:    vinserti64x4 $1, %ymm5, %zmm8, %zmm5
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm1 ^ (zmm2 & (zmm5 ^ zmm1))
+; AVX512-NEXT:    vpbroadcastq 48(%r8), %ymm1
+; AVX512-NEXT:    vpbroadcastq 56(%r8), %ymm2
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm5))
+; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm4[0,1,2,1,4,5,6,5]
+; AVX512-NEXT:    vprolq $16, %ymm6, %ymm5
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3],ymm2[4],ymm5[5,6],ymm2[7],ymm5[8,9],ymm2[10],ymm5[11],ymm2[12],ymm5[13,14],ymm2[15]
+; AVX512-NEXT:    vmovdqa64 %ymm21, %ymm5
+; AVX512-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
+; AVX512-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[1,1,2,2]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,2,3],zmm2[2,3,2,3]
+; AVX512-NEXT:    vpshufb %ymm3, %ymm7,...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/131331


More information about the llvm-commits mailing list