[llvm] [X86] Extend shuf128(concat(x,y),concat(z,w)) -> shuf128(widen(y),widen(w)) folds to peek through bitcasts (PR #129896)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 5 08:12:34 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
<details>
<summary>Changes</summary>
Peek through bitcasts when looking for freely accessible upper subvectors in the concatenated shuffle operands.
---
Patch is 214.24 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/129896.diff
5 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+8-6)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll (+40-76)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll (+68-76)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll (+648-650)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll (+24-32)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 40bac0c20035b..4a16f286fffaf 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -42365,21 +42365,23 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
// If we're permuting the upper 256-bits subvectors of a concatenation, then
// see if we can peek through and access the subvector directly.
if (VT.is512BitVector()) {
- // 512-bit mask uses 4 x i2 indices - if the msb is always set then only the
- // upper subvector is used.
+ // 512-bit mask uses 4 x i2 indices - if the msb is always set then only
+ // the upper subvector is used.
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
uint64_t Mask = N->getConstantOperandVal(2);
SmallVector<SDValue> LHSOps, RHSOps;
SDValue NewLHS, NewRHS;
- if ((Mask & 0x0A) == 0x0A &&
- collectConcatOps(LHS.getNode(), LHSOps, DAG) && LHSOps.size() == 2) {
+ if (collectConcatOps(peekThroughBitcasts(LHS).getNode(), LHSOps, DAG) &&
+ LHSOps.size() == 2 && (Mask & 0x0A) == 0x0A) {
NewLHS = widenSubVector(LHSOps[1], false, Subtarget, DAG, DL, 512);
+ NewLHS = DAG.getBitcast(VT, NewLHS);
Mask &= ~0x0A;
}
- if ((Mask & 0xA0) == 0xA0 &&
- collectConcatOps(RHS.getNode(), RHSOps, DAG) && RHSOps.size() == 2) {
+ if (collectConcatOps(peekThroughBitcasts(RHS).getNode(), RHSOps, DAG) &&
+ RHSOps.size() == 2 && (Mask & 0xA0) == 0xA0) {
NewRHS = widenSubVector(RHSOps[1], false, Subtarget, DAG, DL, 512);
+ NewRHS = DAG.getBitcast(VT, NewRHS);
Mask &= ~0xA0;
}
if (NewLHS || NewRHS)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
index 4a2e7d55d3e88..2b6ff9ba2789b 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
@@ -1666,7 +1666,6 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
; AVX512-NEXT: vpshufb %xmm2, %xmm6, %xmm6
; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4
-; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
; AVX512-NEXT: vpshufb %xmm7, %xmm6, %xmm6
@@ -1674,7 +1673,7 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7]
; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7]
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[0,1,2,3]
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10]
; AVX512-NEXT: vpermd (%rdx), %zmm4, %zmm5
; AVX512-NEXT: vpternlogd {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm3))
@@ -1693,8 +1692,7 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2]
; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm10[2],xmm8[3,4],xmm10[5],xmm8[6,7]
; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7
-; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm7[4,5,6,7]
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm7[0,1,2,3]
; AVX512-NEXT: vmovdqa (%rdx), %ymm7
; AVX512-NEXT: vmovdqa 32(%rdx), %ymm8
; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
@@ -1708,14 +1706,13 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-NEXT: vmovdqa (%rsi), %ymm6
; AVX512-NEXT: vpshufb %ymm9, %ymm6, %ymm6
; AVX512-NEXT: vpor %ymm3, %ymm6, %ymm3
-; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3
; AVX512-NEXT: vprold $16, %xmm0, %xmm6
; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[1,1,2,2]
; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2],xmm8[3,4],xmm6[5],xmm8[6,7]
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[4,5,6,7]
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3]
; AVX512-NEXT: vpshufb %ymm10, %ymm7, %ymm1
; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,1,0,2]
; AVX512-NEXT: vpermd %ymm7, %ymm2, %ymm2
@@ -1744,7 +1741,6 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm6
; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm6
@@ -1752,7 +1748,7 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7]
; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7]
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[0,1,2,3]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10]
; AVX512-FCP-NEXT: vpermd (%rdx), %zmm4, %zmm5
; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm3))
@@ -1771,8 +1767,7 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm10[2],xmm8[3,4],xmm10[5],xmm8[6,7]
; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm7[4,5,6,7]
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm7[0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm7
; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm8
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
@@ -1786,14 +1781,13 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm6
; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm6, %ymm6
; AVX512-FCP-NEXT: vpor %ymm3, %ymm6, %ymm3
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3
; AVX512-FCP-NEXT: vprold $16, %xmm0, %xmm6
; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[1,1,2,2]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2],xmm8[3,4],xmm6[5],xmm8[6,7]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[4,5,6,7]
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3]
; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm1
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,1,0,2]
; AVX512-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm2
@@ -1822,7 +1816,6 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
; AVX512DQ-NEXT: vpshufb %xmm2, %xmm6, %xmm6
; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
; AVX512DQ-NEXT: vpshufb %xmm7, %xmm6, %xmm6
@@ -1830,7 +1823,7 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7]
; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3
-; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7]
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[0,1,2,3]
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10]
; AVX512DQ-NEXT: vpermd (%rdx), %zmm4, %zmm5
; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm3))
@@ -1849,8 +1842,7 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm10[2],xmm8[3,4],xmm10[5],xmm8[6,7]
; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7
-; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm7[4,5,6,7]
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm7[0,1,2,3]
; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm7
; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm8
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
@@ -1864,14 +1856,13 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm6
; AVX512DQ-NEXT: vpshufb %ymm9, %ymm6, %ymm6
; AVX512DQ-NEXT: vpor %ymm3, %ymm6, %ymm3
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3
; AVX512DQ-NEXT: vprold $16, %xmm0, %xmm6
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[1,1,2,2]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2],xmm8[3,4],xmm6[5],xmm8[6,7]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512DQ-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0
-; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[4,5,6,7]
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3]
; AVX512DQ-NEXT: vpshufb %ymm10, %ymm7, %ymm1
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,1,0,2]
; AVX512DQ-NEXT: vpermd %ymm7, %ymm2, %ymm2
@@ -1900,7 +1891,6 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm6
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm6
@@ -1908,7 +1898,7 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7]
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[0,1,2,3]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10]
; AVX512DQ-FCP-NEXT: vpermd (%rdx), %zmm4, %zmm5
; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm3))
@@ -1927,8 +1917,7 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm10[2],xmm8[3,4],xmm10[5],xmm8[6,7]
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm7[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm7[0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm7
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm8
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
@@ -1942,14 +1931,13 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm6
; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm6, %ymm6
; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm6, %ymm3
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3
; AVX512DQ-FCP-NEXT: vprold $16, %xmm0, %xmm6
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[1,1,2,2]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2],xmm8[3,4],xmm6[5],xmm8[6,7]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3]
; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm1
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,1,0,2]
; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm2
@@ -3091,8 +3079,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-NEXT: vmovdqa (%rsi), %ymm1
; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
; AVX512-NEXT: vpshufb %ymm7, %ymm1, %ymm1
-; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm3
+; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm3
; AVX512-NEXT: vmovdqa (%rsi), %xmm5
; AVX512-NEXT: vmovdqa64 16(%rsi), %xmm20
; AVX512-NEXT: vmovdqa64 32(%rsi), %xmm24
@@ -3106,7 +3093,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
; AVX512-NEXT: vpshufb %xmm1, %xmm9, %xmm9
; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm8[0,1,2,3],zmm3[4,5,6,7]
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm8[0,1,2,3],zmm3[0,1,2,3]
; AVX512-NEXT: vmovdqa (%rdx), %ymm3
; AVX512-NEXT: vmovdqa 32(%rdx), %ymm8
; AVX512-NEXT: vmovdqa 64(%rdx), %ymm14
@@ -3126,7 +3113,6 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
; AVX512-NEXT: vpshufb %xmm1, %xmm10, %xmm10
; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10
-; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10
; AVX512-NEXT: vmovdqa 80(%rdi), %xmm12
; AVX512-NEXT: vmovdqa 80(%rsi), %xmm13
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
@@ -3136,7 +3122,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,2]
; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7]
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm12, %ymm0
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm10[4,5,6,7]
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm10[0,1,2,3]
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10]
; AVX512-NEXT: vpermd 64(%rdx), %zmm18, %zmm10
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
@@ -3156,8 +3142,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm12[2],xmm5[3,4],xmm12[5],xmm5[6,7]
; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5
-; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[4,5,6,7]
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[0,1,2,3]
; AVX512-NEXT: vmovdqa 96(%rdx), %ymm5
; AVX512-NEXT: vpermd %ymm5, %ymm18, %ymm7
; AVX512-NEXT: vpandnq %ymm7, %ymm22, %ymm7
@@ -3170,7 +3155,6 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-NEXT: vmovdqa 64(%rsi), %ymm7
; AVX512-NEXT: vpshufb %ymm13, %ymm7, %ymm7
; AVX512-NEXT: vpor %ymm0, %ymm7, %ymm0
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512-NEXT: vmovdqa 64(%rsi), %xmm7
; AVX512-NEXT: vprold $16, %xmm7, %xmm12
; AVX512-NEXT: vmovdqa 64(%rdi), %xmm5
@@ -3179,7 +3163,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm5
; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm5[0,1,2,3],zmm0[4,5,6,7]
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm5[0,1,2,3],zmm0[0,1,2,3]
; AVX512-NEXT: vpshufb %ymm9, %ymm14, %ymm5
; AVX512-NEXT: vpermd %ymm14, %ymm16, %ymm6
; AVX512-NEXT: vpandn %ymm6, %ymm15, %ymm6
@@ -3198,8 +3182,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2]
; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7]
; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm6, %ymm6
-; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm6[4,5,6,7]
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm6[0,1,2,3]
; AVX512-NEXT: vpermd %ymm8, %ymm18, %ymm6
; AVX512-NEXT: vpandnq %ymm6, %ymm22, %ymm6
; AVX512-NEXT: vpshufb %ymm9, %ymm8, %ymm7
@@ -3212,7 +3195,6 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
; AVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm2
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512-NEXT: vmovdqa64 %xmm20, %xmm4
; AVX512-NEXT: vmovdqa64 %xmm21, %xmm1
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
@@ -3221,7 +3203,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm21[1,1,2,2]
; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7]
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/129896
More information about the llvm-commits
mailing list