[llvm] [X86] Reuse X86ISD::SUBV_BROADCAST_LOAD for subvector loads across chains (PR #142381)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 2 05:59:39 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
Changes:
Improve the handling of folding a (small) vector load that is also loaded as an X86ISD::SUBV_BROADCAST_LOAD node, so that we just (freely) extract the bottom subvector. Similar to #139575, we should check that the SUBV_BROADCAST_LOAD has uses of the loaded value, not that its output chain is unused, to ensure it is actually used. We must also call makeEquivalentMemoryOrdering so that the output chains are correctly merged, handling any aliasing with later loads/stores.
This PR is a little messy as it contains two other interdependent changes needed to avoid regressions: now that we properly merge the subvector loads, we can drop the one-use limits on the "vperm2x128(load(p),undef) -> broadcast128(p+offset)" and "insert_subvector(load256(p),load128(p),0) -> broadcast128(p)" folds. The core combineLoad change is sketched below.
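For context, the heart of the combineLoad change in X86ISelLowering.cpp can be condensed as follows. This is only an illustrative sketch paraphrasing the hunk in the diff below, not a drop-in replacement; RegVT, MemVT, Chain, Ptr, dl, N, User, UserLd, DAG and DCI are assumed to be in scope exactly as in the surrounding function, and the final CombineTo step follows the existing pattern of that function rather than being spelled out in the hunk.

```cpp
// Illustrative sketch only - a condensed paraphrase of the combineLoad hunk
// below. RegVT, MemVT, Chain, Ptr, dl, N, User, UserLd, DAG and DCI are
// assumed to be in scope as in X86ISelLowering.cpp.
if (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
    UserLd->getChain() == Chain && UserLd->getBasePtr() == Ptr &&
    UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits() &&
    User->hasAnyUseOfValue(0) && // the broadcast value itself must be used
    User->getValueSizeInBits(0).getFixedValue() > RegVT.getFixedSizeInBits()) {
  // Merge the output chains of the two loads so that later (possibly
  // aliasing) loads/stores stay correctly ordered against both of them.
  DAG.makeEquivalentMemoryOrdering(SDValue(N, 1), SDValue(User, 1));
  // The small load is just the low subvector of the subvector broadcast.
  SDValue Extract =
      extractSubVector(SDValue(User, 0), 0, DAG, dl, RegVT.getSizeInBits());
  Extract = DAG.getBitcast(RegVT, Extract);
  // Replace the original load's value and chain results (the concrete
  // replacement call matches the existing code in combineLoad).
  return DCI.CombineTo(N, Extract, SDValue(User, 1));
}
```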
---
Patch is 629.63 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142381.diff
13 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+4-5)
- (modified) llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/oddshuffles.ll (+46-48)
- (modified) llvm/test/CodeGen/X86/vector-interleave.ll (+4-4)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll (+290-307)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll (+663-698)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll (+1047-1140)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll (+86-87)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-2.ll (+67-67)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll (+540-556)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll (+856-890)
- (modified) llvm/test/CodeGen/X86/x86-interleaved-access.ll (+4-4)
- (modified) llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll (+34-37)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 71c699a318eb1..69db9c13ce753 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -42672,7 +42672,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, LHS);
bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, LHS);
if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() &&
- X86::mayFoldLoad(LHS, Subtarget)) {
+ X86::mayFoldLoad(LHS, Subtarget, /*AssumeSingleUse=*/true)) {
MVT MemVT = VT.getHalfNumVectorElementsVT();
unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL, VT, MemVT,
@@ -53143,9 +53143,10 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
UserLd->getChain() == Chain && UserLd->getBasePtr() == Ptr &&
UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits() &&
- !User->hasAnyUseOfValue(1) &&
+ User->hasAnyUseOfValue(0) &&
User->getValueSizeInBits(0).getFixedValue() >
RegVT.getFixedSizeInBits()) {
+ DAG.makeEquivalentMemoryOrdering(SDValue(N, 1), SDValue(User, 1));
SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
RegVT.getSizeInBits());
Extract = DAG.getBitcast(RegVT, Extract);
@@ -59441,10 +59442,8 @@ static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
// If we're splatting the lower half subvector of a full vector load into the
// upper half, just splat the subvector directly, potentially creating a
// subvector broadcast.
- // TODO: Drop hasOneUse checks.
if ((int)IdxVal == (VecNumElts / 2) &&
- Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits()) &&
- (Vec.hasOneUse() || SubVec.hasOneUse())) {
+ Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
auto *VecLd = dyn_cast<LoadSDNode>(Vec);
auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
if (VecLd && SubLd &&
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
index f2e4da0ac5400..7fcca526e460c 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -3367,7 +3367,7 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in.
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[1,3],ymm1[4,4],ymm0[5,7]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,1,0,1]
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index 8fd8e0e8120c1..edc8404993996 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -1683,33 +1683,32 @@ define void @interleave_24i32_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
;
; AVX1-LABEL: interleave_24i32_in:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovupd (%rcx), %ymm0
-; AVX1-NEXT: vmovups (%rdx), %xmm1
-; AVX1-NEXT: vmovups 16(%rdx), %xmm2
-; AVX1-NEXT: vmovups (%rsi), %xmm3
-; AVX1-NEXT: vmovups 16(%rsi), %xmm4
-; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,3],xmm2[3,3]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
-; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1],xmm4[0,2]
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,2,3]
-; AVX1-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0,0,3,3]
-; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm3[1],xmm1[1]
-; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm1[1,1],xmm4[0,2]
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[2,1]
+; AVX1-NEXT: vmovups (%rdx), %xmm0
+; AVX1-NEXT: vmovups 16(%rdx), %xmm1
+; AVX1-NEXT: vmovups (%rsi), %xmm2
+; AVX1-NEXT: vmovups 16(%rsi), %xmm3
+; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm3[3,3],xmm1[3,3]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[0,2]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
-; AVX1-NEXT: vbroadcastsd (%rcx), %ymm3
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm3 = mem[0,0,3,3,4,4,7,7]
-; AVX1-NEXT: vpermilpd {{.*#+}} ymm4 = mem[1,0,2,2]
-; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
-; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7]
-; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX1-NEXT: vmovups %ymm1, (%rdi)
-; AVX1-NEXT: vmovups %ymm2, 64(%rdi)
+; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm3 = mem[0,1,0,1]
+; AVX1-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0,0,3,3]
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm2[1],xmm0[1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[1,1],xmm3[0,2]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastsd (%rcx), %ymm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm2 = mem[0,0,3,3,4,4,7,7]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm3 = mem[1,0,2,2]
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm3 = mem[1,1,2,2]
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7]
+; AVX1-NEXT: vmovups %ymm2, 32(%rdi)
+; AVX1-NEXT: vmovups %ymm0, (%rdi)
+; AVX1-NEXT: vmovups %ymm1, 64(%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -1804,30 +1803,29 @@ define void @interleave_24i32_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; XOP-NEXT: vmovups (%rsi), %ymm0
; XOP-NEXT: vmovups (%rdx), %ymm1
; XOP-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm0[u,3],ymm1[3],ymm0[u,4],ymm1[4],ymm0[u,5]
-; XOP-NEXT: vmovups (%rcx), %ymm1
-; XOP-NEXT: vmovups (%rdx), %xmm2
-; XOP-NEXT: vmovups 16(%rdx), %xmm3
-; XOP-NEXT: vmovups (%rsi), %xmm4
-; XOP-NEXT: vmovups 16(%rsi), %xmm5
-; XOP-NEXT: vshufps {{.*#+}} xmm6 = xmm5[3,3],xmm3[3,3]
-; XOP-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1]
-; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1],xmm5[0,2]
-; XOP-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3
-; XOP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3,2,3]
-; XOP-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0,0,3,3]
-; XOP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7]
-; XOP-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm4[1],xmm2[1]
-; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm2[1,1],xmm5[0,2]
-; XOP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0]
-; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,1]
+; XOP-NEXT: vmovups (%rdx), %xmm1
+; XOP-NEXT: vmovups 16(%rdx), %xmm2
+; XOP-NEXT: vmovups (%rsi), %xmm3
+; XOP-NEXT: vmovups 16(%rsi), %xmm4
+; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,3],xmm2[3,3]
+; XOP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
+; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1],xmm4[0,2]
; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
-; XOP-NEXT: vbroadcastsd (%rcx), %ymm4
-; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7]
-; XOP-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1,1,2,2]
-; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
+; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm4 = mem[0,1,0,1]
+; XOP-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0,0,3,3]
+; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7]
+; XOP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm3[1],xmm1[1]
+; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm1[1,1],xmm4[0,2]
+; XOP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; XOP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[2,1]
+; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; XOP-NEXT: vbroadcastsd (%rcx), %ymm3
+; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7]
+; XOP-NEXT: vpermilpd {{.*#+}} ymm3 = mem[1,1,2,2]
+; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7]
; XOP-NEXT: vmovups %ymm0, 32(%rdi)
-; XOP-NEXT: vmovups %ymm2, (%rdi)
-; XOP-NEXT: vmovups %ymm3, 64(%rdi)
+; XOP-NEXT: vmovups %ymm1, (%rdi)
+; XOP-NEXT: vmovups %ymm2, 64(%rdi)
; XOP-NEXT: vzeroupper
; XOP-NEXT: retq
%s1 = load <8 x i32>, ptr %q1, align 4
diff --git a/llvm/test/CodeGen/X86/vector-interleave.ll b/llvm/test/CodeGen/X86/vector-interleave.ll
index 206f7ed43fd6d..aec3140c73fb2 100644
--- a/llvm/test/CodeGen/X86/vector-interleave.ll
+++ b/llvm/test/CodeGen/X86/vector-interleave.ll
@@ -576,12 +576,12 @@ define void @splat2_i64(ptr %s, ptr %d) {
;
; AVX1-LABEL: splat2_i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vperm2f128 $51, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3,2,3]
+; AVX1-NEXT: vbroadcastf128 (%rdi), %ymm0 # ymm0 = mem[0,1,0,1]
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0,0,3,3]
-; AVX1-NEXT: vbroadcastf128 (%rdi), %ymm1 # ymm1 = mem[0,1,0,1]
+; AVX1-NEXT: vbroadcastf128 16(%rdi), %ymm1 # ymm1 = mem[0,1,0,1]
; AVX1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,3]
-; AVX1-NEXT: vmovupd %ymm0, 32(%rsi)
-; AVX1-NEXT: vmovupd %ymm1, (%rsi)
+; AVX1-NEXT: vmovupd %ymm1, 32(%rsi)
+; AVX1-NEXT: vmovupd %ymm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
index ac0522a9e68f4..39230b67d380f 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
@@ -458,33 +458,32 @@ define void @store_i32_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
;
; AVX-LABEL: store_i32_stride3_vf8:
; AVX: # %bb.0:
-; AVX-NEXT: vmovapd (%rdx), %ymm0
-; AVX-NEXT: vmovaps (%rsi), %xmm1
-; AVX-NEXT: vmovaps 16(%rsi), %xmm2
-; AVX-NEXT: vmovaps (%rdi), %xmm3
-; AVX-NEXT: vmovaps 16(%rdi), %xmm4
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm3[1],xmm1[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm1[1,1],xmm5[0,2]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[2,1]
-; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
-; AVX-NEXT: vbroadcastsd (%rdx), %ymm3
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7]
-; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm4[3,3],xmm2[3,3]
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1],xmm4[0,2]
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
-; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0,0,3,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
-; AVX-NEXT: vpermilps {{.*#+}} ymm3 = mem[0,0,3,3,4,4,7,7]
-; AVX-NEXT: vpermilpd {{.*#+}} ymm4 = mem[1,0,2,2]
-; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
-; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7]
-; AVX-NEXT: vmovaps %ymm0, 32(%rcx)
-; AVX-NEXT: vmovaps %ymm2, 64(%rcx)
-; AVX-NEXT: vmovaps %ymm1, (%rcx)
+; AVX-NEXT: vmovaps (%rsi), %xmm0
+; AVX-NEXT: vmovaps 16(%rsi), %xmm1
+; AVX-NEXT: vmovaps (%rdi), %xmm2
+; AVX-NEXT: vmovaps 16(%rdi), %xmm3
+; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm2[1],xmm0[1]
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm0[1,1],xmm4[0,2]
+; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,1]
+; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX-NEXT: vbroadcastsd (%rdx), %ymm2
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm3[3,3],xmm1[3,3]
+; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[0,2]
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0,0,3,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
+; AVX-NEXT: vpermilps {{.*#+}} ymm2 = mem[0,0,3,3,4,4,7,7]
+; AVX-NEXT: vpermilpd {{.*#+}} ymm3 = mem[1,0,2,2]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
+; AVX-NEXT: vpermilpd {{.*#+}} ymm3 = mem[1,1,2,2]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7]
+; AVX-NEXT: vmovaps %ymm2, 32(%rcx)
+; AVX-NEXT: vmovaps %ymm1, 64(%rcx)
+; AVX-NEXT: vmovaps %ymm0, (%rcx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@@ -780,60 +779,58 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
;
; AVX-LABEL: store_i32_stride3_vf16:
; AVX: # %bb.0:
-; AVX-NEXT: vmovapd (%rdx), %ymm1
-; AVX-NEXT: vmovapd 32(%rdx), %ymm0
-; AVX-NEXT: vmovaps (%rsi), %xmm2
-; AVX-NEXT: vmovaps 16(%rsi), %xmm3
-; AVX-NEXT: vmovaps 32(%rsi), %xmm4
-; AVX-NEXT: vmovaps 48(%rsi), %xmm5
-; AVX-NEXT: vmovaps (%rdi), %xmm6
-; AVX-NEXT: vmovaps 16(%rdi), %xmm7
-; AVX-NEXT: vmovaps 32(%rdi), %xmm8
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm6[1],xmm2[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm2[1,1],xmm9[0,2]
+; AVX-NEXT: vmovaps (%rsi), %xmm0
+; AVX-NEXT: vmovaps 16(%rsi), %xmm1
+; AVX-NEXT: vmovaps 32(%rsi), %xmm2
+; AVX-NEXT: vmovaps 48(%rsi), %xmm3
+; AVX-NEXT: vmovaps (%rdi), %xmm4
+; AVX-NEXT: vmovaps 16(%rdi), %xmm5
+; AVX-NEXT: vmovaps 32(%rdi), %xmm6
+; AVX-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm4[1],xmm0[1]
+; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm0[1,1],xmm7[0,2]
+; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[2,1]
+; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0
+; AVX-NEXT: vbroadcastsd (%rdx), %ymm4
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7]
+; AVX-NEXT: vmovaps 48(%rdi), %xmm4
+; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm4[3,3],xmm3[3,3]
+; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
+; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1],xmm4[0,2]
+; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm4 = mem[0,1,0,1]
+; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0,0,3,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
+; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm6[1],xmm2[1]
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm2[1,1],xmm4[0,2]
; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm6[0]
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm6[2,1]
-; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm2
-; AVX-NEXT: vbroadcastsd (%rdx), %ymm6
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7]
-; AVX-NEXT: vmovaps 48(%rdi), %xmm6
-; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm6[3,3],xmm5[3,3]
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,1],xmm6[0,2]
-; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm5, %ymm5
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3,2,3]
-; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0,0,3,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7]
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm8[1],xmm4[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm4[1,1],xmm6[0,2]
-; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm8[0]
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0],xmm8[2,1]
-; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
-; AVX-NEXT: vbroadcastsd 32(%rdx), %ymm6
-; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7]
-; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm7[3,3],xmm3[3,3]
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm7[1],xmm3[1]
-; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1],xmm7[0,2]
-; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm1[2,3,2,3]
-; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0,0,3,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5,6],ymm6[7]
-; AVX-NEXT: vpermilps {{.*#+}} ymm6 = mem[0,0,3,3,4,4,7,7]
-; AVX-NEXT: vpermilpd {{.*#+}} ymm7 = mem[1,0,2,2]
-; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7]
-; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1,1,2,2]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3],ymm6[4,5],ymm1[6],ymm6[7]
-; AVX-NEXT: vpermilps {{.*#+}} ymm6 = mem[0,0,3,3,4,4,7,7]
-; AVX-NEXT: vpermilpd {{.*#+}} ymm7 = mem[1,0,2,2]
-; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7]
-; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1,2],ymm0[3],ymm6[4,5],ymm0[6],ymm6[7]
-; AVX-NEXT: vmovaps %ymm0, 128(%rcx)
-; AVX-NEXT: vmovaps %ymm1, 32(%rcx)
-; AVX-NEXT: vmovaps %ymm3, 64(%rcx)
-; AVX-NEXT: vmovaps %ymm4, 96(%rcx)
-; AVX-NEXT: vmovaps %ymm5, 160(%rcx)
-; AVX-NEXT: vmovaps %ymm2, (%rcx)
+; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX-NEXT: vbroadcastsd 32(%rdx), %ymm4
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7]
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm5[3,3],xmm1[3,3]
+; AVX-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm5[0,2]
+; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm4 = mem[0,1,0,1]
+; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0,0,3,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5,6],ymm4[7]
+; AVX-NEXT: vpermilps {{.*#+}} ymm4 = mem[0,0,3,3,4,4,7,7]
+; AVX-NEXT: vpermilpd {{.*#+}} ymm5 = mem[1,0,2,2]
+; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
+; AVX-NEXT: vpermilpd {{.*#+}} ymm5 = mem[1,1,2,2]
+; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
+; AVX-NEXT: vpermilps {{.*#+}} ymm5 = mem[0,0,3,3,4,4,7,7]
+; AVX-NEXT: vpermilpd {{.*#+}} ymm6 = mem[1,0,2,2]
+; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7]
+; AVX-NEXT: vpermilpd {{.*#+}} ymm6 = mem[1,1,2,2]
+; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7]
+; AVX-NEXT: vmovaps %ymm5, 128(%rcx)
+; AVX-NEXT: vmovaps %ymm4, 32(%rcx)
+; AVX-NEXT: vmovaps %ymm1, 64(%rcx)
+; AVX-NEXT: vmovaps %ymm2, 96(%rcx)
+; AVX-NEXT: vmovaps %ymm3, 160(%rcx)
+; AVX-NEXT: vmovaps %ymm0, (%rcx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@@ -1378,114 +1375,110 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
;
; AVX-LABEL: store_i32_stride3_vf32:
; AVX: # %bb.0:
-; AVX-NEXT: vmovapd (%rdx), %ymm4
-; AVX-NEXT: vmovapd 32(%rdx), %ymm2
-; AVX-NEXT: vmovapd 64(%rdx), %ymm3
-; AVX-NEXT: vmovapd 96(%rdx), %ymm0
-; AVX-NEXT: vmovaps (%rsi), %xmm1
-; AVX-NEXT: vmovaps 16(%rsi), %xmm7
-; AVX...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/142381