[llvm] [X86] splitVector - split concat_vectors(a,b,c,d) -> concat_vectors(a,b) + concat_vectors(c,d) (PR #133753)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 31 10:12:57 PDT 2025
- Previous message: [llvm] [X86] splitVector - split concat_vectors(a,b,c,d) -> concat_vectors(a,b) + concat_vectors(c,d) (PR #133753)
- Next message: [llvm] [X86] splitVector - split concat_vectors(a,b,c,d) -> concat_vectors(a,b) + concat_vectors(c,d) (PR #133753)
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
<details>
<summary>Changes</summary>
Similar to what we already do for build_vectors during subvector extraction, when splitting concat_vectors nodes, attempt to create a pair of half size concat_vectors nodes to see if these can fold.
---
Patch is 252.58 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/133753.diff
13 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+12)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll (+20-20)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll (+30-34)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll (+4-4)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll (+82-82)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll (+44-46)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll (+84-84)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll (+176-176)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll (+16-20)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll (+8-8)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll (+16-18)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll (+356-352)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll (+240-254)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 76de7e888d985..66b94159c5571 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -4326,6 +4326,18 @@ static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
"Can't split odd sized vector");
+ if (Op.getOpcode() == ISD::CONCAT_VECTORS) {
+ assert((Op.getNumOperands() % 2) == 0 &&
+ "Can't split odd sized vector concat");
+ unsigned HalfOps = Op.getNumOperands() / 2;
+ EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
+ SmallVector<SDValue, 2> LoOps(Op->op_begin(), Op->op_begin() + HalfOps);
+ SmallVector<SDValue, 2> HiOps(Op->op_begin() + HalfOps, Op->op_end());
+ SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, LoOps);
+ SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, HiOps);
+ return std::make_pair(Lo, Hi);
+ }
+
// If this is a splat value (with no-undefs) then use the lower subvector,
// which should be a free extraction.
SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
index e4ddf5bc3a8af..d1d7cb0a34332 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
@@ -2410,19 +2410,19 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vmovdqa 96(%rdi), %xmm3
; AVX512-NEXT: vmovdqa 112(%rdi), %xmm4
; AVX512-NEXT: vmovdqa 128(%rdi), %xmm5
-; AVX512-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
; AVX512-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
-; AVX512-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
-; AVX512-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
-; AVX512-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
-; AVX512-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
; AVX512-NEXT: # ymm6 = mem[0,1,0,1]
; AVX512-NEXT: vpshufb %ymm6, %ymm0, %ymm0
+; AVX512-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
; AVX512-NEXT: vpshufb %ymm6, %ymm3, %ymm3
+; AVX512-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
; AVX512-NEXT: vpshufb %ymm6, %ymm1, %ymm1
+; AVX512-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
; AVX512-NEXT: vpshufb %ymm6, %ymm4, %ymm4
+; AVX512-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
; AVX512-NEXT: vpshufb %ymm6, %ymm2, %ymm2
+; AVX512-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
; AVX512-NEXT: vpshufb %ymm6, %ymm5, %ymm5
; AVX512-NEXT: vpalignr {{.*#+}} ymm6 = ymm5[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
; AVX512-NEXT: vpalignr {{.*#+}} ymm7 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
@@ -2457,19 +2457,19 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm3
; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm4
; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %xmm5
-; AVX512-FCP-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
; AVX512-FCP-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
-; AVX512-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
-; AVX512-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
-; AVX512-FCP-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
-; AVX512-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
; AVX512-FCP-NEXT: # ymm6 = mem[0,1,0,1]
; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3
+; AVX512-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4
+; AVX512-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5
; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm5[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm7 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
@@ -2504,19 +2504,19 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm3
; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm4
; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm5
-; AVX512DQ-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
; AVX512DQ-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
-; AVX512DQ-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
-; AVX512DQ-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
-; AVX512DQ-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
; AVX512DQ-NEXT: # ymm6 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpshufb %ymm6, %ymm0, %ymm0
+; AVX512DQ-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
; AVX512DQ-NEXT: vpshufb %ymm6, %ymm3, %ymm3
+; AVX512DQ-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
; AVX512DQ-NEXT: vpshufb %ymm6, %ymm1, %ymm1
+; AVX512DQ-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
; AVX512DQ-NEXT: vpshufb %ymm6, %ymm4, %ymm4
+; AVX512DQ-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
; AVX512DQ-NEXT: vpshufb %ymm6, %ymm2, %ymm2
+; AVX512DQ-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
; AVX512DQ-NEXT: vpshufb %ymm6, %ymm5, %ymm5
; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm6 = ymm5[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm7 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
@@ -2551,19 +2551,19 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm3
; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm4
; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %xmm5
-; AVX512DQ-FCP-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
; AVX512DQ-FCP-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
-; AVX512DQ-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,0,1]
; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm5[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm7 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
index 741f4b80a5ecb..6d1ba933b9082 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
@@ -552,23 +552,21 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-LABEL: store_i16_stride3_vf8:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
-; AVX512-FCP-NEXT: vinserti32x4 $2, (%rdx), %zmm0, %zmm0
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
-; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,3,3]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,0,0,0,1,1,0,2]
-; AVX512-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,4,1,5,1,5,2,6]
-; AVX512-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5],zero,zero,ymm0[2,3,6,7],zero,zero,ymm0[8,9,12,13],zero,zero,ymm0[18,19,22,23],zero,zero,ymm0[24,25,28,29],zero,zero,ymm0[26,27]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm2 & ~mem)
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vmovdqa %xmm1, 32(%rcx)
-; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1
+; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,8,1,9,1,9,2,10]
+; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm3
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,4,5],zero,zero,ymm3[2,3,6,7],zero,zero,ymm3[8,9,12,13],zero,zero,ymm3[18,19,22,23],zero,zero,ymm3[24,25,28,29],zero,zero,ymm3[26,27]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2]
+; AVX512-FCP-NEXT: vpermd %ymm2, %ymm4, %ymm4
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (ymm4 & ~mem) | ymm3
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm1
+; AVX512-FCP-NEXT: vmovdqa %xmm0, 32(%rcx)
+; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rcx)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -599,23 +597,21 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-LABEL: store_i16_stride3_vf8:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, (%rdx), %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
-; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,3,3]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,0,0,0,1,1,0,2]
-; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,4,1,5,1,5,2,6]
-; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5],zero,zero,ymm0[2,3,6,7],zero,zero,ymm0[8,9,12,13],zero,zero,ymm0[18,19,22,23],zero,zero,ymm0[24,25,28,29],zero,zero,ymm0[26,27]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm2 & ~mem)
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 32(%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,8,1,9,1,9,2,10]
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,4,5],zero,zero,ymm3[2,3,6,7],zero,zero,ymm3[8,9,12,13],zero,zero,ymm3[18,19,22,23],zero,zero,ymm3[24,25,28,29],zero,zero,ymm3[26,27]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2]
+; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (ymm4 & ~mem) | ymm3
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 32(%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rcx)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll
index 2f6452467a420..fc4377a08d560 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll
@@ -513,11 +513,11 @@ define void @store_i16_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa (%rdx), %xmm1
-; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[1,3,1,3]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15]
; AVX512-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,1,3]
; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm5 = [151519488,185205506,218891524,252577542]
; AVX512-NEXT: vpshufb %ymm5, %ymm4, %ymm4
@@ -536,11 +536,11 @@ define void @store_i16_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1
-; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[1,3,1,3]
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15]
; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,1,3]
; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm5 = [151519488,185205506,218891524,252577542]
; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4
@@ -559,11 +559,11 @@ define void @store_i16_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1
-; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[1,3,1,3]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15]
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,1,3]
; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm5 = [151519488,185205506,218891524,252577542]
; AVX512DQ-NEXT: vpshufb %ymm5, %ymm4, %ymm4
@@ -582,11 +582,11 @@ define void @store_i16_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1
-; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[1,3,1,3]
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15]
; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,1,3]
; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm5 = [151519488,185205506,218891524,252577542]
; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
index e4fa594f3dd72..322d606538c54 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
@@ -906,28 +906,28 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vmovdqa (%rdx), %xmm0
; AVX512-NEXT: vmovdqa (%rcx), %xmm1
; AVX512-NEXT: vmovdqa (%r8), %xmm4
-; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm5
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6
-; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,6,7,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,26,27,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[1,1,2,3,5,5,6,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,5,5,4,7,8,9,10,11,13,13,12,15]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6],ymm7[7],ymm8[8,9],ymm7[10,11],ymm8[12,13,14],ymm7[15]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535]
-; AVX512-NEXT: vpandn %ymm7, %ymm8, %ymm7
-; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm6[0,1,8,9,u,u],zero,zero,zero,zero,ymm6[2,3,18,19,u,u],zero,zero,zero,zero,ymm6[28,29,20,21,u,u],zero,zero
-; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm7 = ymm5[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,3,2,4,6,7,6]
-; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm5[2,3,0,1]
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5
+; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,6,7,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,26,27,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm5[1,1,2,3,5,5,6,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,5,5,4,7,8,9,10,11,13,13,12,15]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6],ymm6[7],ymm7[8,9],ymm6[10,11],ymm7[12,13,14],ymm6[15]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535]
+; AVX512-NEXT: vpandn %ymm6, %ymm7, %ymm6
+; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,0]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[0,1,8,9,u,u],zero,zero,zero,zero,ymm5[2,3,18,19,u,u],zero,zero,zero,zero,ymm5[28,29,20,21,u,u],zero,zero
+; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm8 = ymm6[0,1...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/133753
- Previous message: [llvm] [X86] splitVector - split concat_vectors(a,b,c,d) -> concat_vectors(a,b) + concat_vectors(c,d) (PR #133753)
- Next message: [llvm] [X86] splitVector - split concat_vectors(a,b,c,d) -> concat_vectors(a,b) + concat_vectors(c,d) (PR #133753)
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]
More information about the llvm-commits
mailing list