[llvm] [X86] Fold BLEND(PERMUTE(X), PERMUTE(Y)) -> PERMUTE(BLEND(X, Y)) (PR #90219)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Apr 26 07:47:53 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
Changes:
If we don't demand the same element index from both single-source shuffles (permutes), then attempt to blend the two sources together first and then perform a merged permute.
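For illustration, here is a rough standalone sketch of the mask arithmetic behind the fold. This is not the patch itself (the real change lives in SimplifyDemandedVectorEltsForTargetNode below and additionally handles undef lanes, bitcasts, one-use checks and the caller's demanded-elts mask); the helper name and types are purely illustrative:

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <optional>
#include <utility>
#include <vector>

// Given the BLENDI immediate (bit I set => lane I comes from the second
// permute) and the two single-source permute masks, compute the immediate of
// the merged blend and the mask of the merged permute, or return std::nullopt
// if both permutes demand the same source index (the fold is unsafe then).
std::optional<std::pair<uint64_t, std::vector<int>>>
foldBlendOfPermutes(uint64_t BlendImm, const std::vector<int> &Mask0,
                    const std::vector<int> &Mask1) {
  const std::size_t NumElts = Mask0.size();
  assert(Mask1.size() == NumElts && "permute masks must have equal size");

  uint64_t DemandedX = 0, DemandedY = 0, NewBlendImm = 0;
  std::vector<int> NewPermuteMask(NumElts, -1);

  for (std::size_t I = 0; I != NumElts; ++I) {
    if (BlendImm & (1ULL << I)) {
      // Lane I of the result reads Y[Mask1[I]]: the new blend must keep that
      // element of Y, and the merged permute reads it back from that index.
      DemandedY |= 1ULL << Mask1[I];
      NewBlendImm |= 1ULL << Mask1[I];
      NewPermuteMask[I] = Mask1[I];
    } else {
      // Lane I of the result reads X[Mask0[I]].
      DemandedX |= 1ULL << Mask0[I];
      NewPermuteMask[I] = Mask0[I];
    }
  }

  // If X and Y are demanded at the same index, a single BLEND(X,Y) cannot
  // provide both elements, so bail out.
  if (DemandedX & DemandedY)
    return std::nullopt;

  // Result: PERMUTE(BLEND(X, Y, NewBlendImm), NewPermuteMask).
  return std::make_pair(NewBlendImm, std::move(NewPermuteMask));
}
```

Feeding in the first AVX2 pattern from the oddshuffles.ll diff below (BlendImm=0xC0, Mask0=[0,3,6,1,4,7,u,u] with the unused u lanes passed as anything, Mask1=[2,5,2,5,2,5,2,5]) gives NewBlendImm=0x24 and a merged permute mask of [0,3,6,1,4,7,2,5], matching the updated checks.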
We don't attempt this for vXi16 blends as these are much more likely to involve byte/word vector shuffles that will result in the creation of a lot more variable-mask shuffles (PSHUFB etc.).
This fold might be worth it for VSELECT with constant masks on AVX512 targets, but I haven't investigated this yet.
The PR34592 -O0 regression is an unfortunate failure to clean up with a later SimplifyDemandedElts pass, as -O3 does - I'm not sure how worried we should be, tbh.
---
Patch is 1.66 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/90219.diff
13 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+55)
- (modified) llvm/test/CodeGen/X86/horizontal-sum.ll (+8-12)
- (modified) llvm/test/CodeGen/X86/oddshuffles.ll (+28-48)
- (modified) llvm/test/CodeGen/X86/pr34592.ll (+24-22)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll (+849-998)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll (+1791-1881)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll (+3003-3293)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll (+3331-3460)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll (+940-1628)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll (+2-3)
- (modified) llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll (+8-12)
- (modified) llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll (+2-4)
- (modified) llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll (+4-7)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f0cec6224e84e4..6080e950d169d5 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -41753,6 +41753,61 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
KnownUndef = SrcUndef.zextOrTrunc(NumElts);
break;
}
+ case X86ISD::BLENDI: {
+ SDValue N0 = Op.getOperand(0);
+ SDValue N1 = Op.getOperand(1);
+ if (VT.getScalarSizeInBits() < 32 || !N0.hasOneUse() || !N1.hasOneUse())
+ break;
+
+ // Attempt to fold BLEND(PERMUTE(X),PERMUTE(Y)) -> PERMUTE(BLEND(X,Y))
+ // iff we don't demand the same element index for both X and Y.
+ SDValue BC0 = peekThroughOneUseBitcasts(N0);
+ SDValue BC1 = peekThroughOneUseBitcasts(N1);
+ SmallVector<SDValue, 2> Ops, Ops0, Ops1;
+ SmallVector<int, 32> Mask, Mask0, Mask1, ScaledMask0, ScaledMask1;
+ if (!getTargetShuffleMask(Op, false, Ops, Mask) ||
+ !getTargetShuffleMask(BC0, false, Ops0, Mask0) ||
+ !getTargetShuffleMask(BC1, false, Ops1, Mask1) ||
+ !scaleShuffleElements(Mask0, NumElts, ScaledMask0) ||
+ !scaleShuffleElements(Mask1, NumElts, ScaledMask1))
+ break;
+
+ // Determine the demanded elts from both permutes, confirm that we only use
+ // a single operand and that we don't demand the same index from both.
+ APInt Demanded0, DemandedLHS0, DemandedRHS0;
+ APInt Demanded1, DemandedLHS1, DemandedRHS1;
+ if (getShuffleDemandedElts(NumElts, Mask, DemandedElts, Demanded0,
+ Demanded1, /*AllowUndefElts=*/true) &&
+ getShuffleDemandedElts(NumElts, ScaledMask0, Demanded0, DemandedLHS0,
+ DemandedRHS0, /*AllowUndefElts=*/true) &&
+ getShuffleDemandedElts(NumElts, ScaledMask1, Demanded1, DemandedLHS1,
+ DemandedRHS1, /*AllowUndefElts=*/true) &&
+ DemandedRHS0.isZero() && DemandedRHS1.isZero() &&
+ !DemandedLHS0.intersects(DemandedLHS1)) {
+ // Use the permute demanded elts masks as the new blend mask.
+ uint64_t NewBlendMask = DemandedLHS1.getZExtValue();
+
+ // Create the new permute mask as a blend of the 2 original permute masks.
+ SmallVector<int, 32> NewPermuteMask(NumElts, SM_SentinelUndef);
+ for (int I = 0; I != NumElts; ++I) {
+ if (Demanded0[I])
+ NewPermuteMask[I] = ScaledMask0[I];
+ else if (Demanded1[I])
+ NewPermuteMask[I] = ScaledMask1[I];
+ }
+ assert(isUndefOrInRange(NewPermuteMask, 0, NumElts) && "Bad permute");
+
+ SDLoc DL(Op);
+ SDValue NewBlend = TLO.DAG.getNode(
+ X86ISD::BLENDI, DL, VT, TLO.DAG.getBitcast(VT, Ops0[0]),
+ TLO.DAG.getBitcast(VT, Ops1[0]),
+ TLO.DAG.getTargetConstant(NewBlendMask, DL, MVT::i8));
+ return TLO.CombineTo(Op, TLO.DAG.getVectorShuffle(VT, DL, NewBlend,
+ TLO.DAG.getUNDEF(VT),
+ NewPermuteMask));
+ }
+ break;
+ }
case X86ISD::BLENDV: {
APInt SelUndef, SelZero;
if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll
index a026757a0264d6..5fe1e2996ee9b0 100644
--- a/llvm/test/CodeGen/X86/horizontal-sum.ll
+++ b/llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -679,9 +679,8 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm4
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; AVX1-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; AVX1-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
@@ -704,9 +703,8 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm4
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; AVX1-FAST-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; AVX1-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX1-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm1
; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
@@ -727,9 +725,8 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm4
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
@@ -752,9 +749,8 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm4
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX2-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm1
; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index 01056a8b2c24a9..44f965d1c6e84e 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -1450,19 +1450,17 @@ define void @interleave_24i32_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,3,2,3]
; SSE42-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm1[2,3]
; SSE42-NEXT: insertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm0[1]
-; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm3[2,2,2,2]
-; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,0,3,3]
-; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm10[6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,0,3,3]
-; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm0[2,2,2,2]
-; SSE42-NEXT: pblendw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,5],xmm10[6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm3[4,5],xmm6[6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,0,3,2]
+; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm0[4,5],xmm8[6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,0,3,2]
; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm2[2,3],xmm7[4,5,6,7]
; SSE42-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm3[0,3]
; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm9[0,1],xmm1[2,3],xmm9[4,5,6,7]
; SSE42-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[0,3]
; SSE42-NEXT: movups %xmm5, 16(%rsi)
; SSE42-NEXT: movups %xmm4, (%rsi)
-; SSE42-NEXT: movdqu %xmm10, 16(%rdx)
+; SSE42-NEXT: movdqu %xmm8, 16(%rdx)
; SSE42-NEXT: movdqu %xmm6, (%rdx)
; SSE42-NEXT: movups %xmm9, 16(%rcx)
; SSE42-NEXT: movups %xmm7, (%rcx)
@@ -1504,19 +1502,14 @@ define void @interleave_24i32_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; AVX2-SLOW-NEXT: vmovups (%rdi), %ymm0
; AVX2-SLOW-NEXT: vmovups 32(%rdi), %ymm1
; AVX2-SLOW-NEXT: vmovups 64(%rdi), %ymm2
-; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm3 = [2,5,2,5,2,5,2,5]
-; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm3
-; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm5 = [0,3,6,1,4,7,u,u]
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
+; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm4 = [0,3,6,1,4,7,2,5]
+; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm4, %ymm3
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7]
+; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm5 = [1,4,7,2,5,0,3,6]
; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm5, %ymm4
-; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6]
-; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm4, %ymm4
-; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm6 = [1,4,7,2,5,u,u,u]
-; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm6, %ymm5
-; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u]
; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm0
@@ -1534,26 +1527,18 @@ define void @interleave_24i32_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; AVX2-FAST-ALL-NEXT: vmovups (%rdi), %ymm0
; AVX2-FAST-ALL-NEXT: vmovups 32(%rdi), %ymm1
; AVX2-FAST-ALL-NEXT: vmovups 64(%rdi), %ymm2
-; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm3 = [2,5,2,5,2,5,2,5]
-; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm3, %ymm3
-; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm5 = [0,3,6,1,4,7,u,u]
+; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
+; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm4 = [0,3,6,1,4,7,2,5]
+; AVX2-FAST-ALL-NEXT: vpermps %ymm3, %ymm4, %ymm3
+; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7]
+; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm5 = [1,4,7,2,5,0,3,6]
; AVX2-FAST-ALL-NEXT: vpermps %ymm4, %ymm5, %ymm4
-; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6]
-; AVX2-FAST-ALL-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm4, %ymm4
-; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm6 = [1,4,7,2,5,u,u,u]
-; AVX2-FAST-ALL-NEXT: vpermps %ymm5, %ymm6, %ymm5
-; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
-; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,1,4,7,0,1,4,7]
-; AVX2-FAST-ALL-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm5, %ymm2
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u]
+; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7]
+; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,1,4,7]
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FAST-ALL-NEXT: vmovups %ymm3, (%rsi)
; AVX2-FAST-ALL-NEXT: vmovups %ymm4, (%rdx)
; AVX2-FAST-ALL-NEXT: vmovups %ymm0, (%rcx)
@@ -1565,19 +1550,14 @@ define void @interleave_24i32_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; AVX2-FAST-PERLANE-NEXT: vmovups (%rdi), %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovups 32(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovups 64(%rdi), %ymm2
-; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm3 = [2,5,2,5,2,5,2,5]
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm3, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
-; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm5 = [0,3,6,1,4,7,u,u]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm4 = [0,3,6,1,4,7,2,5]
+; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm4, %ymm3
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7]
+; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm5 = [1,4,7,2,5,0,3,6]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm5, %ymm4
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6]
-; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm4, %ymm4
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm6 = [1,4,7,2,5,u,u,u]
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm6, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm0
diff --git a/llvm/test/CodeGen/X86/pr34592.ll b/llvm/test/CodeGen/X86/pr34592.ll
index 23de746ecb3539..bce060c4c6f56a 100644
--- a/llvm/test/CodeGen/X86/pr34592.ll
+++ b/llvm/test/CodeGen/X86/pr34592.ll
@@ -8,38 +8,40 @@ define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <1
; CHECK-O0-NEXT: pushq %rbp
; CHECK-O0-NEXT: movq %rsp, %rbp
; CHECK-O0-NEXT: andq $-32, %rsp
-; CHECK-O0-NEXT: subq $32, %rsp
+; CHECK-O0-NEXT: subq $64, %rsp
; CHECK-O0-NEXT: vmovaps %ymm4, %ymm10
; CHECK-O0-NEXT: vmovaps %ymm3, %ymm9
+; CHECK-O0-NEXT: vmovaps %ymm2, (%rsp) # 32-byte Spill
; CHECK-O0-NEXT: vmovaps %ymm1, %ymm8
+; CHECK-O0-NEXT: vmovaps %ymm0, %ymm3
+; CHECK-O0-NEXT: vmovaps (%rsp), %ymm0 # 32-byte Reload
; CHECK-O0-NEXT: vmovaps 240(%rbp), %ymm4
-; CHECK-O0-NEXT: vmovaps 208(%rbp), %ymm3
-; CHECK-O0-NEXT: vmovaps 176(%rbp), %ymm1
-; CHECK-O0-NEXT: vmovaps 144(%rbp), %ymm1
+; CHECK-O0-NEXT: vmovaps 208(%rbp), %ymm1
+; CHECK-O0-NEXT: vmovaps 176(%rbp), %ymm2
+; CHECK-O0-NEXT: vmovaps 144(%rbp), %ymm2
; CHECK-O0-NEXT: vmovaps 112(%rbp), %ymm11
; CHECK-O0-NEXT: vmovaps 80(%rbp), %ymm11
; CHECK-O0-NEXT: vmovaps 48(%rbp), %ymm11
; CHECK-O0-NEXT: vmovaps 16(%rbp), %ymm11
-; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm2[6,7]
-; CHECK-O0-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
-; CHECK-O0-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
+; CHECK-O0-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
+; CHECK-O0-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; CHECK-O0-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,2,1]
-; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
-; CHECK-O0-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm7[2,3],ymm6[0,1]
-; CHECK-O0-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
+; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5],ymm0[6,7]
+; CHECK-O0-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm6[0,1]
+; CHECK-O0-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7]
+; CHECK-O0-NEXT: vmovaps %xmm1, %xmm3
+; CHECK-O0-NEXT: vmovaps %xmm7, %xmm1
+; CHECK-O0-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3]
+; CHECK-O0-NEXT: # implicit-def: $ymm1
+; CHECK-O0-NEXT: vmovaps %xmm3, %xmm1
+; CHECK-O0-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,0,1,3]
+; CHECK-O0-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,18,19,20,21,22,23]
+; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm3[2,3,4,5],ymm1[6,7]
; CHECK-O0-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm5[0],ymm7[2],ymm5[2]
-; CHECK-O0-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
-; CHECK-O0-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,1,1,1]
-; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3,4,5],ymm1[6,7]
-; CHECK-O0-NEXT: vmovaps %xmm3, %xmm4
-; CHECK-O0-NEXT: vmovaps %xmm7, %xmm3
-; CHECK-O0-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1],xmm4[2,3]
-; CHECK-O0-NEXT: # implicit-def: $ymm3
-; CHECK-O0-NEXT: vmovaps %xmm4, %xmm3
-; CHECK-O0-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,0,1,3]
-; CHECK-O0-NEXT: vpslldq {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,18,19,20,21,22,23]
-; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5],ymm3[6,7]
+; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6,7]
+; CHECK-O0-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,1,3]
; CHECK-O0-NEXT: movq %rbp, %rsp
; CHECK-O0-NEXT: popq %rbp
; CHECK-O0-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
index a0ea6ddeca7dfd..afdeebc45ed0ab 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
@@ -519,19 +519,14 @@ define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vmovaps (%rdi), %ymm0
; AVX2-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-NEXT: vmovaps 64(%rdi), %ymm2
-; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [2,5,2,5,2,5,2,5]
-; AVX2-NEXT: vpermps %ymm2, %ymm3, %ymm3
-; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
-; AVX2-NEXT: vmovaps {{.*#+}} ymm5 = [0,3,6,1,4,7,u,u]
+; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
+; AVX2-NEXT: vmovaps {{.*#+}} ymm4 = [0,3,6,1,4,7,2,5]
+; AVX2-NEXT: vpermps %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7]
+; AVX2-NEXT: vmovaps {{.*#+}} ymm5 = [1,4,7,2,5,0,3,6]
; AVX2-NEXT: vpermps %ymm4, %ymm5, %ymm4
-; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6]
-; AVX2-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX2-NEXT: vpermps %ymm2, %ymm4, %ymm4
-; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX2-NEXT: vmovaps {{.*#+}} ymm6 = [1,4,7,2,5,u,...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/90219
More information about the llvm-commits mailing list