[llvm] f9bf647 - [X86] combineX86ShufflesRecursively - peek through free extract_subvector nodes
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sun Feb 5 12:19:41 PST 2023
Author: Simon Pilgrim
Date: 2023-02-05T20:16:32Z
New Revision: f9bf64798c1aee3d9f1ce754b2d625026e29d197
URL: https://github.com/llvm/llvm-project/commit/f9bf64798c1aee3d9f1ce754b2d625026e29d197
DIFF: https://github.com/llvm/llvm-project/commit/f9bf64798c1aee3d9f1ce754b2d625026e29d197.diff
LOG: [X86] combineX86ShufflesRecursively - peek through free extract_subvector nodes
If we're shuffling an op that is extracted from the lowest subvector of a larger vector, then we should try to peek through as much as possible without exceeding the root size.
Another step towards removing the widenSubVector call from combineX86ShufflesRecursively (Issue #45319).
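Concretely, the guard added below only peeks while the extract reads from subvector 0 and the wider source's size divides the root size: with a 512-bit shuffle root, extracts from offset 0 of a 256-bit or 512-bit source are peeked through (512 % 256 == 0, 512 % 512 == 0), but a 1024-bit source is left alone (512 % 1024 != 0), since peeking would widen the shuffle inputs beyond the root.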
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7c9356254880a..658612953fe57 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -40299,6 +40299,13 @@ static SDValue combineX86ShufflesRecursively(
     }
   }
 
+  // Peek through any free extract_subvector nodes back to root size.
+  for (SDValue &Op : Ops)
+    while (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+           (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
+           isNullConstant(Op.getOperand(1)))
+      Op = Op.getOperand(0);
+
   // Remove unused/repeated shuffle source ops.
   resolveTargetShuffleInputsAndMask(Ops, Mask);
 
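As a standalone illustration of the loop above, here is a minimal sketch using a hypothetical ToyNode type in place of SDValue (the ToyNode struct and the peekThroughExtracts name are invented for this example; only the guard's logic mirrors the committed code):

#include <cstdio>

struct ToyNode {
  unsigned SizeInBits;     // bit width of this value
  unsigned ExtractIndex;   // subvector start index; 0 == lowest subvector
  const ToyNode *Source;   // non-null => this node is an extract_subvector
};

// Mirrors the committed while-loop: keep hopping to the wider source while
// the extract reads from index 0 (cf. isNullConstant on the index operand)
// and the source width divides the root width, so it never exceeds the root.
const ToyNode *peekThroughExtracts(const ToyNode *Op,
                                   unsigned RootSizeInBits) {
  while (Op->Source && Op->ExtractIndex == 0 &&
         (RootSizeInBits % Op->Source->SizeInBits) == 0)
    Op = Op->Source;
  return Op;
}

int main() {
  const ToyNode V512{512, 0, nullptr}; // widest source
  const ToyNode V256{256, 0, &V512};   // extract_subvector v256 <- v512, idx 0
  const ToyNode V128{128, 0, &V256};   // extract_subvector v128 <- v256, idx 0
  // With a 512-bit root both hops are free: prints "128 -> 512".
  std::printf("%u -> %u\n", V128.SizeInBits,
              peekThroughExtracts(&V128, 512)->SizeInBits);
  // With a 256-bit root the 512-bit source is too wide: prints "128 -> 256".
  std::printf("%u -> %u\n", V128.SizeInBits,
              peekThroughExtracts(&V128, 256)->SizeInBits);
  return 0;
}

Once an operand has been peeked back to a root-sized source, the resolveTargetShuffleInputsAndMask call that immediately follows can merge it with any other operand that resolves to the same wide vector, which is what enables the test improvement below.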
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
index a45c1922e8f4d..e0de26495e055 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
@@ -58,25 +58,24 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
 ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm5[0],xmm4[0]
-; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm6
-; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero
-; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0
+; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
+; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm1[3,0],ymm0[1,0],ymm1[7,4],ymm0[5,4]
-; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2],ymm0[2,1],ymm7[4,6],ymm0[6,5]
-; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[3,3]
-; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm7[3]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm1[3,0],ymm0[1,0],ymm1[7,4],ymm0[5,4]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2],ymm0[2,1],ymm5[4,6],ymm0[6,5]
+; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm2[12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10,11]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,2],xmm4[3,3]
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm5[3]
 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,3],ymm1[4,6],ymm0[4,7]
 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm1
 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[u,u,0,2,u,u,u,5]
-; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2],ymm1[2,3],ymm6[4,6],ymm1[6,7]
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[2,3],ymm2[4,6],ymm1[6,7]
 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6],ymm1[7]
 ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax)
-; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm0
+; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm0
 ; AVX1-ONLY-NEXT: vmovlps %xmm0, 48(%rax)
-; AVX1-ONLY-NEXT: vmovaps %xmm2, 32(%rax)
+; AVX1-ONLY-NEXT: vmovaps %xmm6, 32(%rax)
 ; AVX1-ONLY-NEXT: vzeroupper
 ; AVX1-ONLY-NEXT: retq
 ;
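As the updated checks show, peeking through the extract lets the combiner fold away the vinsertf128 that widened %xmm4 into %ymm6, so the AVX1-only path comes out one instruction shorter and reuses %ymm2 and %xmm4 directly.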