[llvm] [DAG] visitEXTRACT_SUBVECTOR - don't return early on failure of EXTRACT_SUBVECTOR(INSERT_SUBVECTOR()) -> BITCAST fold (PR #133695)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 31 03:35:53 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
<details>
<summary>Changes</summary>
Always allow later folds to try to match as well.
---
Patch is 68.56 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/133695.diff
5 Files Affected:
- (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+17-19)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll (+4-8)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll (+98-102)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll (+8-16)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll (+186-194)
``````````diff
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 4487b9d510cc7..dc5c5f38e3bd8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -25532,26 +25532,24 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
// Handle only simple case where vector being inserted and vector
// being extracted are of same size.
EVT SmallVT = V.getOperand(1).getValueType();
- if (!NVT.bitsEq(SmallVT))
- return SDValue();
-
- // Combine:
- // (extract_subvec (insert_subvec V1, V2, InsIdx), ExtIdx)
- // Into:
- // indices are equal or bit offsets are equal => V1
- // otherwise => (extract_subvec V1, ExtIdx)
- uint64_t InsIdx = V.getConstantOperandVal(2);
- if (InsIdx * SmallVT.getScalarSizeInBits() ==
- ExtIdx * NVT.getScalarSizeInBits()) {
- if (LegalOperations && !TLI.isOperationLegal(ISD::BITCAST, NVT))
- return SDValue();
-
- return DAG.getBitcast(NVT, V.getOperand(1));
+ if (NVT.bitsEq(SmallVT)) {
+ // Combine:
+ // (extract_subvec (insert_subvec V1, V2, InsIdx), ExtIdx)
+ // Into:
+ // indices are equal or bit offsets are equal => V1
+ // otherwise => (extract_subvec V1, ExtIdx)
+ uint64_t InsIdx = V.getConstantOperandVal(2);
+ if (InsIdx * SmallVT.getScalarSizeInBits() ==
+ ExtIdx * NVT.getScalarSizeInBits()) {
+ if (!LegalOperations || TLI.isOperationLegal(ISD::BITCAST, NVT))
+ return DAG.getBitcast(NVT, V.getOperand(1));
+ } else {
+ return DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, DL, NVT,
+ DAG.getBitcast(N->getOperand(0).getValueType(), V.getOperand(0)),
+ N->getOperand(1));
+ }
}
- return DAG.getNode(
- ISD::EXTRACT_SUBVECTOR, DL, NVT,
- DAG.getBitcast(N->getOperand(0).getValueType(), V.getOperand(0)),
- N->getOperand(1));
}
if (SDValue NarrowBOp = narrowExtractedVectorBinOp(N, DAG, LegalOperations))
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
index 0df63422b5d84..e4fa594f3dd72 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
@@ -449,9 +449,8 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7]
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
-; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1
; AVX512-NEXT: vmovq %xmm0, 32(%r9)
-; AVX512-NEXT: vmovdqa %ymm1, (%r9)
+; AVX512-NEXT: vmovdqa %ymm2, (%r9)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
@@ -476,9 +475,8 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,8,9,10,11,12,13,14,15]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1
; AVX512-FCP-NEXT: vmovq %xmm0, 32(%r9)
-; AVX512-FCP-NEXT: vmovdqa %ymm1, (%r9)
+; AVX512-FCP-NEXT: vmovdqa %ymm2, (%r9)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -504,9 +502,8 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1
; AVX512DQ-NEXT: vmovq %xmm0, 32(%r9)
-; AVX512DQ-NEXT: vmovdqa %ymm1, (%r9)
+; AVX512DQ-NEXT: vmovdqa %ymm2, (%r9)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -531,9 +528,8 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,8,9,10,11,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1
; AVX512DQ-FCP-NEXT: vmovq %xmm0, 32(%r9)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%r9)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
index bc08f57e5faac..e4e013446f7a5 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
@@ -1380,29 +1380,28 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11
; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = (zmm11 & mem) | zmm10
; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm9))
-; AVX512-NEXT: vpsrlq $48, %xmm4, %xmm4
-; AVX512-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1]
+; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u],zero,zero,zero,zero,ymm2[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[20,21,28,29,u,u,u,u]
+; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm7[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm7[u,u,u,u]
+; AVX512-NEXT: vpor %ymm2, %ymm7, %ymm2
+; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm8[1,3,3,1]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,ymm7[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm7[u,u,u,u,u,u,u,u,28,29,20,21]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512-NEXT: vpternlogd {{.*#+}} ymm8 = (mem & ~ymm8) | ymm7
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm2 ^ (mem & (ymm8 ^ ymm2))
+; AVX512-NEXT: vpsrlq $48, %xmm4, %xmm2
+; AVX512-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3,4],xmm0[5,6,7]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7]
; AVX512-NEXT: vpsrld $16, %xmm6, %xmm1
; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX512-NEXT: vpbroadcastd 12(%r10), %xmm3
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4],xmm1[5,6],xmm3[7]
+; AVX512-NEXT: vpbroadcastd 12(%r10), %xmm2
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6],xmm2[7]
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7]
-; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u],zero,zero,zero,zero,ymm1[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[20,21,28,29,u,u,u,u]
-; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm7[1,3,1,3]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm2[u,u,u,u]
-; AVX512-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm8[1,3,3,1]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,8,9],zero,zero,ymm2[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm2[u,u,u,u,u,u,u,u,28,29,20,21]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512-NEXT: vpternlogd {{.*#+}} ymm3 = (mem & ~ymm3) | ymm2
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (mem & (ymm3 ^ ymm1))
-; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1
; AVX512-NEXT: vmovdqa %xmm0, 96(%rax)
-; AVX512-NEXT: vmovdqa %ymm1, 64(%rax)
+; AVX512-NEXT: vmovdqa %ymm8, 64(%rax)
; AVX512-NEXT: vmovdqa64 %zmm11, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -1420,6 +1419,37 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6
; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm7
; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm8
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm7[0,2,0,2]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm9[0,1,8,9,u,u,u,u,u,u],zero,zero,zero,zero,ymm9[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm6[0,2,1,3]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,ymm10[u,u,u,u,u,u,6,7,14,15],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,16,17,24,25],zero,zero,zero,zero
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm6[0,2,2,0]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,8,9],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,2,3,18,19],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,28,29,20,21]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,5,0,0,5,2,6,0]
+; AVX512-FCP-NEXT: vpermd %ymm7, %ymm11, %ymm11
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,4,5,u,u,u,u,u,u],zero,zero,zero,zero,ymm11[2,3,18,19,u,u,u,u,u,u],zero,zero,zero,zero,ymm11[20,21,24,25]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10
+; AVX512-FCP-NEXT: vporq %zmm9, %zmm10, %zmm9
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,0,2]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm10[u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm10[u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,4,5,12,13],zero,zero,ymm10[u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm10[u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10
+; AVX512-FCP-NEXT: vpbroadcastd (%r10), %ymm11
+; AVX512-FCP-NEXT: vpbroadcastd 4(%r10), %ymm12
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = (zmm11 & mem) | zmm10
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm9))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u],zero,zero,zero,zero,ymm7[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm7[20,21,28,29,u,u,u,u]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm6[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm6[u,u,u,u]
+; AVX512-FCP-NEXT: vpor %ymm7, %ymm6, %ymm6
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm8[1,3,3,1]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,ymm7[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm7[u,u,u,u,u,u,u,u,28,29,20,21]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT: vpternlogd {{.*#+}} ymm8 = (mem & ~ymm8) | ymm7
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm6 ^ (mem & (ymm8 ^ ymm6))
; AVX512-FCP-NEXT: vpsrlq $48, %xmm3, %xmm3
; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
@@ -1430,41 +1460,9 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vpbroadcastd 12(%r10), %xmm2
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6],xmm2[7]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm7[1,3,1,3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u],zero,zero,zero,zero,ymm1[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[20,21,28,29,u,u,u,u]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm6[1,3,1,3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm2[u,u,u,u]
-; AVX512-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm8[1,3,3,1]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,8,9],zero,zero,ymm2[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm2[u,u,u,u,u,u,u,u,28,29,20,21]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogd {{.*#+}} ymm3 = (mem & ~ymm3) | ymm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (mem & (ymm3 ^ ymm1))
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm7[0,2,0,2]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,1,8,9,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,2,1,3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[u,u,u,u,u,u,6,7,14,15],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,16,17,24,25],zero,zero,zero,zero
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,2,2,0]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,8,9],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,2,3,18,19],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,28,29,20,21]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,5,0,0,5,2,6,0]
-; AVX512-FCP-NEXT: vpermd %ymm7, %ymm4, %ymm4
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,4,5,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[2,3,18,19,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[20,21,24,25]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
-; AVX512-FCP-NEXT: vporq %zmm2, %zmm3, %zmm2
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm8[0,2,0,2]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm3[u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm3[u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,4,5,12,13],zero,zero,ymm3[u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm3[u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
-; AVX512-FCP-NEXT: vpbroadcastd (%r10), %ymm4
-; AVX512-FCP-NEXT: vpbroadcastd 4(%r10), %ymm5
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (zmm4 & mem) | zmm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm2))
; AVX512-FCP-NEXT: vmovdqa %xmm0, 96(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rax)
-; AVX512-FCP-NEXT: vmovdqa %ymm1, 64(%rax)
+; AVX512-FCP-NEXT: vmovdqa %ymm8, 64(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -1505,29 +1503,28 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = (zmm11 & mem) | zmm10
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm9))
-; AVX512DQ-NEXT: vpsrlq $48, %xmm4, %xmm4
-; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u],zero,zero,zero,zero,ymm2[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[20,21,28,29,u,u,u,u]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm7[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm7[u,u,u,u]
+; AVX512DQ-NEXT: vpor %ymm2, %ymm7, %ymm2
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm8[1,3,3,1]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,ymm7[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm7[u,u,u,u,u,u,u,u,28,29,20,21]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm8 = (mem & ~ymm8) | ymm7
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm2 ^ (mem & (ymm8 ^ ymm2))
+; AVX512DQ-NEXT: vpsrlq $48, %xmm4, %xmm2
+; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3,4],xmm0[5,6,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7]
; AVX512DQ-NEXT: vpsrld $16, %xmm6, %xmm1
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX512DQ-NEXT: vpbroadcastd 12(%r10), %xmm3
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4],xmm1[5,6],xmm3[7]
+; AVX512DQ-NEXT: vpbroadcastd 12(%r10), %xmm2
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6],xmm2[7]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u],zero,zero,zero,zero,ymm1[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[20,21,28,29,u,u,u,u]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm7[1,3,1,3]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm2[u,u,u,u]
-; AVX512DQ-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm8[1,3,3,1]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,8,9],zero,zero,ymm2[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm2[u,u,u,u,u,u,u,u,28,29,20,21]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm3 = (mem & ~ymm3) | ymm2
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (mem & (ymm3 ^ ymm1))
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1
; AVX512DQ-NEXT: vmovdqa %xmm0, 96(%rax)
-; AVX512DQ-NEXT: vmovdqa %ymm1, 64(%rax)
+; AVX512DQ-NEXT: vmovdqa %ymm8, 64(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
@@ -1545,6 +1542,37 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm7
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm8
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm7[0,2,0,2]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm9[0,1,8,9,u,u,u,u,u,u],zero,zero,zero,zero,ymm9[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm6[0,2,1,3]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,ymm10[u,u,u,u,u,u,6,7,14,15],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,16,17,24,25],zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm6[0,2,2,0]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,8,9],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,2,3,18,19],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,28,29,20,21]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,5,0,0,5,2,6,0]
+; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm11, %ymm11
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,4,5,u,u,u,u,u,u],zero,zero,zero,zero,ymm11[2,3,18,19,u,u,u,u,u,u],zero,zero,zero,zero,ymm11[20,21,24,25]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10
+; AVX512DQ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/133695
More information about the llvm-commits
mailing list