[llvm] [X86] combineX86ShuffleChainWithExtract - ensure subvector widening is at index 0 (PR #143009)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jun 5 10:21:19 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
Changes:
When peeking through insert_subvector(undef, sub, c) widening patterns, we didn't ensure c == 0.
Fixes #142995
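For context, here is a minimal standalone sketch (illustrative only, not LLVM code; lane counts and names are made up) of why the insertion index matters: insert_subvector(undef, sub, 0) is a pure widening whose low lanes are exactly `sub`, so the shuffle combine may safely look through it, whereas a non-zero index moves `sub` into upper lanes and peeking through it would make the combined shuffle mask reference the wrong elements.

```cpp
// Standalone model (not LLVM code) of insert_subvector(undef, Sub, Idx),
// inserting a 4-lane subvector into an 8-lane vector of undef lanes.
#include <array>
#include <cstdio>

constexpr int Undef = -1;

std::array<int, 8> insertSubvector(const std::array<int, 4> &Sub, unsigned Idx) {
  std::array<int, 8> Wide;
  Wide.fill(Undef);
  for (unsigned I = 0; I != 4; ++I)
    Wide[Idx + I] = Sub[I]; // subvector lanes land at offset Idx
  return Wide;
}

int main() {
  std::array<int, 4> Sub = {10, 11, 12, 13};

  // Index 0: lane i of the wide vector equals lane i of Sub -> safe to peek through.
  auto WideAt0 = insertSubvector(Sub, 0);
  // Index 4: lane i of the wide vector is NOT lane i of Sub -> peeking through
  // would remap shuffle mask entries onto the wrong (here undef) elements.
  auto WideAt4 = insertSubvector(Sub, 4);

  std::printf("wide[1] with index 0: %d (matches Sub[1] = %d)\n", WideAt0[1], Sub[1]);
  std::printf("wide[1] with index 4: %d (undef; Sub[1] is now at lane 5)\n", WideAt4[1]);
}
```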
---
Patch is 60.38 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/143009.diff
3 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+2-1)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll (+202-200)
- (modified) llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vl.ll (+92)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 760119bc62604..bbb9c797202e6 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -40663,7 +40663,8 @@ static SDValue combineX86ShuffleChainWithExtract(
continue;
}
if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
- Input.getOperand(0).isUndef()) {
+ Input.getOperand(0).isUndef() &&
+ isNullConstant(Input.getOperand(2))) {
Input = peekThroughBitcasts(Input.getOperand(1));
continue;
}
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
index 86c932a5bb1f9..c132c5ea2ef49 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
@@ -13736,11 +13736,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15]
; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm16
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31]
-; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm8
+; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm7
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15]
-; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm4
+; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm6
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31]
-; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm5
+; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm4
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm10
; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm3
; AVX512BW-FCP-NEXT: movw $-28382, %ax # imm = 0x9122
@@ -13748,58 +13748,58 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm1 {%k1}
; AVX512BW-FCP-NEXT: kmovq %k1, %k2
; AVX512BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm1, %xmm1
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm1, %xmm1
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: movw $992, %ax # imm = 0x3E0
; AVX512BW-FCP-NEXT: kmovd %eax, %k1
-; AVX512BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm1 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm1 {%k1}
; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm11
; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm9
; AVX512BW-FCP-NEXT: movw $8772, %ax # imm = 0x2244
; AVX512BW-FCP-NEXT: kmovd %eax, %k1
-; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm5 {%k1}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm4 {%k1}
; AVX512BW-FCP-NEXT: kmovq %k1, %k3
; AVX512BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,xmm6[3,10],zero,zero,zero,xmm6[6,13,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,5,12],zero,zero,xmm5[1,8,15],zero,zero,xmm5[u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,1,2,4,6]
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,2,4,6]
; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm17
-; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm6, %ymm6
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
+; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm5, %ymm5
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
; AVX512BW-FCP-NEXT: vmovdqa64 240(%rdi), %xmm19
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm19[5,12,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm19[5,12,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vmovdqa64 224(%rdi), %xmm20
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm20[0,7,14],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6
-; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm5
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm20[0,7,14],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm8, %xmm5
+; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm4, %zmm4
; AVX512BW-FCP-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000
; AVX512BW-FCP-NEXT: kmovq %rax, %k5
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm1 {%k5}
-; AVX512BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm6
-; AVX512BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm5
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm1 {%k5}
+; AVX512BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm5
+; AVX512BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm4
; AVX512BW-FCP-NEXT: movw $9288, %ax # imm = 0x2448
; AVX512BW-FCP-NEXT: kmovd %eax, %k6
-; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm7 {%k6}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm7[u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u]
-; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm7, %xmm21
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm8 {%k6}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm8[u,u,u,u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u]
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u]
+; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm8, %xmm21
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: movw $3968, %ax # imm = 0xF80
; AVX512BW-FCP-NEXT: kmovd %eax, %k7
-; AVX512BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm21 {%k7}
-; AVX512BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm7
-; AVX512BW-FCP-NEXT: vmovdqa 384(%rdi), %ymm4
+; AVX512BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm21 {%k7}
+; AVX512BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm8
+; AVX512BW-FCP-NEXT: vmovdqa 384(%rdi), %ymm6
; AVX512BW-FCP-NEXT: movw $4644, %ax # imm = 0x1224
; AVX512BW-FCP-NEXT: kmovd %eax, %k4
-; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm4, %ymm18 {%k4}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm6, %ymm18 {%k4}
; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm22
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm18[4,11],zero,zero
@@ -13811,10 +13811,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[1,8,15],zero,zero,xmm18[4,11],zero,zero,xmm18[u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vporq %xmm23, %xmm18, %xmm18
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: movl $511, %r10d # imm = 0x1FF
; AVX512BW-FCP-NEXT: kmovd %r10d, %k1
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm8 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm7 {%k1}
; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm18 {%k6}
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,6,13],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[u,u,u,u]
; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm18
@@ -13829,7 +13829,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm20[1,8,15],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm18, %xmm15
; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm14, %zmm14
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm8 {%k5}
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm7 {%k5}
; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm14 {%k3}
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
@@ -13886,7 +13886,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000
; AVX512BW-FCP-NEXT: kmovq %rax, %k2
; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm1 {%k2}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm14 {%k4}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm14 {%k4}
; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u]
@@ -13894,7 +13894,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7}
; AVX512BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm4, %ymm15 {%k1}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm6, %ymm15 {%k1}
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm15[5,12],zero,zero
; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[0,7,14],zero,zero,xmm15[3,10]
@@ -13902,15 +13902,15 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3}
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm8 {%k2}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm14 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm7 {%k2}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm14 {%k1}
; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u]
; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm25[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm4, %ymm15 {%k6}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm6, %ymm15 {%k6}
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero
; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[1,8,15],zero,zero,xmm15[4,11]
@@ -13919,14 +13919,14 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3}
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14
; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm12 {%k2}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm14 {%k6}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm14 {%k6}
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,6,13],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[u,u]
; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[4,11],zero,zero,xmm14[0,7,14,u,u]
; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm7, %ymm15 {%k4}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm8, %ymm15 {%k4}
; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[5,12]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u,4,11],zero,zero,xmm15[0,7,14],zero,zero
@@ -13940,14 +13940,14 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31]
; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm16
; AVX512BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm14 {%k2}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm14 {%k2}
; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm14, %xmm19
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u]
; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm14, %xmm14
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vmovdqu16 %ymm16, %ymm14 {%k7}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm16 {%k4}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm16 {%k4}
; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm19
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,1,8,15],zero,zero,xmm16[4,11],zero,zero,xmm16[u,u,u]
@@ -13957,14 +13957,14 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: kmovd %eax, %k1
; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k1}
; AVX512BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm7, %ymm16 {%k1}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm8, %ymm16 {%k1}
; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm19
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u],zero,zero,xmm19[3,10],zero,zero,zero,xmm19[6,13]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,5,12],zero,zero,xmm16[1,8,15],zero,zero
; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16
; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16
; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm14 {%k3}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm7, %ymm16 {%k6}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm8, %ymm16 {%k6}
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm16[u,u,u,u,u,u,u,6,13],zero,zero,xmm16[2,9],zero,zero,zero
; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm16
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[4,11],zero,zero,xmm16[0,7,14]
@@ -14002,83 +14002,84 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[5,12,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u]
; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm9, %xmm9
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm9, %ymm0, %ymm20
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm18[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15]
-; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm17, %xmm17
-; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm9 {%k3}
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm17 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31]
-; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm17, %zmm17
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31]
-; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm18, %zmm18
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7]
-; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm20, %zmm20
+; AVX512BW-FCP-NEXT: vporq %xmm9, %xmm17, %xmm9
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm9, %ymm20 {%k3}
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31]
+; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm9, %zmm17
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31]
+; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm9, %zmm18
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7]
+; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm9, %zmm9
; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm2
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[4,11],zero,zero,xmm19[0,7,14],zero,zero,xmm19[u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vporq %xmm2, %xmm19, %xmm2
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [1,2,4,6,0,0,0,0]
-; AVX512BW-FCP-NEXT: vmovdqa64 224(%rdi), %ymm20
-; AVX512BW-FCP-NEXT: vpermd %ymm20, %ymm19, %ymm19
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm19, %zmm10, %zmm10
-; AVX512BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm2 {%k5}
-; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm10
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[5,12],zero,zero,xmm16[1,8,15],zero,zero,xmm16[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vporq %xmm10, %xmm16, %xmm10
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; A...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/143009
More information about the llvm-commits mailing list