[llvm] 03482bc - [X86] collectConcatOps - add ability to collect from vector 'widening' patterns
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 27 07:39:10 PDT 2022
Author: Simon Pilgrim
Date: 2022-04-27T15:38:58+01:00
New Revision: 03482bccad33274c74b385fc139bbb25be156543
URL: https://github.com/llvm/llvm-project/commit/03482bccad33274c74b385fc139bbb25be156543
DIFF: https://github.com/llvm/llvm-project/commit/03482bccad33274c74b385fc139bbb25be156543.diff
LOG: [X86] collectConcatOps - add ability to collect from vector 'widening' patterns
Recognise insert_subvector(undef, x, lo/hi) patterns where we double the width of a vector, synthesising the missing UNDEF subvector on the fly - which is why collectConcatOps now takes a SelectionDAG reference to create it.
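For illustration: the new cases treat a vector that was 'widened' via insert_subvector(undef, x, lo/hi) as a two-element concatenation whose missing half is an explicit UNDEF subvector. Below is a minimal standalone C++ sketch of that idea - MiniNode and collectHalves are hypothetical stand-ins for SelectionDAG machinery, not LLVM API:

// Standalone sketch (hypothetical types, not LLVM's API) of the widening
// patterns: insert_subvector(undef, x, lo/hi) is treated as a 2-way concat,
// with the missing half materialised as an explicit UNDEF subvector.
#include <iostream>
#include <vector>

enum class Opcode { Undef, Value, InsertSubvector };

struct MiniNode {
  Opcode Op;
  unsigned NumElts;                       // vector width in elements
  std::vector<const MiniNode *> Operands; // {Src, Sub} for InsertSubvector
  unsigned InsertIdx = 0;                 // element index of the insertion
};

// Fill Ops with the two half-width subvectors of N, synthesising an UNDEF
// half on the fly for the widening patterns.
static bool collectHalves(const MiniNode &N, const MiniNode &UndefHalf,
                          std::vector<const MiniNode *> &Ops) {
  if (N.Op != Opcode::InsertSubvector)
    return false;
  const MiniNode *Src = N.Operands[0];
  const MiniNode *Sub = N.Operands[1];
  if (N.NumElts != Sub->NumElts * 2 || Src->Op != Opcode::Undef)
    return false;
  if (N.InsertIdx == 0) {             // insert_subvector(undef, x, lo)
    Ops = {Sub, &UndefHalf};          //   -> { x, undef }
    return true;
  }
  if (N.InsertIdx == N.NumElts / 2) { // insert_subvector(undef, x, hi)
    Ops = {&UndefHalf, Sub};          //   -> { undef, x }
    return true;
  }
  return false;
}

int main() {
  MiniNode Undef4{Opcode::Undef, 4, {}, 0};
  MiniNode Undef8{Opcode::Undef, 8, {}, 0};
  MiniNode X{Opcode::Value, 4, {}, 0};
  // v8 = insert_subvector(undef:v8, x:v4, 4): x widened into the high half.
  MiniNode Widened{Opcode::InsertSubvector, 8, {&Undef8, &X}, 4};

  std::vector<const MiniNode *> Ops;
  if (collectHalves(Widened, Undef4, Ops))
    std::cout << "halves: " << Ops.size()
              << ", lo undef: " << (Ops[0]->Op == Opcode::Undef) << "\n";
  // Prints: halves: 2, lo undef: 1
  return 0;
}

This is also why collectConcatOps now takes a SelectionDAG reference in the patch below: the real implementation materialises the missing half with DAG.getUNDEF(SubVT).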
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/oddshuffles.ll
llvm/test/CodeGen/X86/oddsubvector.ll
llvm/test/CodeGen/X86/pr11334.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8cc6a28fadacc..539d2f25a0cf2 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -6322,7 +6322,8 @@ static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
 // Helper function to collect subvector ops that are concatenated together,
 // either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series.
 // The subvectors in Ops are guaranteed to be the same type.
-static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
+static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops,
+                             SelectionDAG &DAG) {
   assert(Ops.empty() && "Expected an empty ops vector");
 
   if (N->getOpcode() == ISD::CONCAT_VECTORS) {
@@ -6338,21 +6339,34 @@
     EVT SubVT = Sub.getValueType();
 
     // TODO - Handle more general insert_subvector chains.
-    if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
-        Idx == (VT.getVectorNumElements() / 2)) {
-      // insert_subvector(insert_subvector(undef, x, lo), y, hi)
-      if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
-          Src.getOperand(1).getValueType() == SubVT &&
-          isNullConstant(Src.getOperand(2))) {
-        Ops.push_back(Src.getOperand(1));
+    if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
+      // insert_subvector(undef, x, lo)
+      if (Idx == 0 && Src.isUndef()) {
         Ops.push_back(Sub);
+        Ops.push_back(DAG.getUNDEF(SubVT));
         return true;
       }
-      // insert_subvector(x, extract_subvector(x, lo), hi)
-      if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
-          Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
-        Ops.append(2, Sub);
-        return true;
+      if (Idx == (VT.getVectorNumElements() / 2)) {
+        // insert_subvector(insert_subvector(undef, x, lo), y, hi)
+        if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
+            Src.getOperand(1).getValueType() == SubVT &&
+            isNullConstant(Src.getOperand(2))) {
+          Ops.push_back(Src.getOperand(1));
+          Ops.push_back(Sub);
+          return true;
+        }
+        // insert_subvector(x, extract_subvector(x, lo), hi)
+        if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+            Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
+          Ops.append(2, Sub);
+          return true;
+        }
+        // insert_subvector(undef, x, hi)
+        if (Src.isUndef()) {
+          Ops.push_back(DAG.getUNDEF(SubVT));
+          Ops.push_back(Sub);
+          return true;
+        }
       }
     }
   }
@@ -6811,7 +6825,7 @@ static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
     }
   }
   SmallVector<SDValue, 2> CatOps;
-  if (collectConcatOps(V.getNode(), CatOps)) {
+  if (collectConcatOps(V.getNode(), CatOps, DAG)) {
     for (SDValue &CatOp : CatOps) {
       SDValue NotCat = IsNOT(CatOp, DAG);
       if (!NotCat) return SDValue();
@@ -25278,7 +25292,8 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
       ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
        !Subtarget.hasBWI())) {
     SmallVector<SDValue, 4> CatOps;
-    if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps))
+    if (StoredVal.hasOneUse() &&
+        collectConcatOps(StoredVal.getNode(), CatOps, DAG))
       return splitVectorStore(St, DAG);
     return SDValue();
   }
@@ -39744,7 +39759,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
        return SDValue();
      SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
      SmallVector<SDValue> SubOps;
-     if (collectConcatOps(Src.getNode(), SubOps) && SubOps.size() == 2)
+     if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
        return SubOps[Idx & 1];
      unsigned NumElts = Src.getValueType().getVectorNumElements();
      if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
@@ -43724,8 +43739,8 @@ static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
   SDValue FVal = N->getOperand(2);
   SmallVector<SDValue, 4> CatOpsT, CatOpsF;
   if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
-      !collectConcatOps(TVal.getNode(), CatOpsT) ||
-      !collectConcatOps(FVal.getNode(), CatOpsF))
+      !collectConcatOps(TVal.getNode(), CatOpsT, DAG) ||
+      !collectConcatOps(FVal.getNode(), CatOpsF, DAG))
     return SDValue();
 
   auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
@@ -45048,7 +45063,7 @@ static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
   // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
   if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
     SmallVector<SDValue> Ops;
-    if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops) &&
+    if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
         Ops.size() == 2) {
       SDLoc DL(EFLAGS);
       EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
@@ -49683,7 +49698,8 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
     // PACK should still be worth it for 128-bit vectors if the sources were
     // originally concatenated from subvectors.
    SmallVector<SDValue> ConcatOps;
-    if (VT.getSizeInBits() > 128 || !collectConcatOps(In.getNode(), ConcatOps))
+    if (VT.getSizeInBits() > 128 ||
+        !collectConcatOps(In.getNode(), ConcatOps, DAG))
       return SDValue();
   }
 
@@ -53607,7 +53623,7 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
 
   // Match concat_vector style patterns.
   SmallVector<SDValue, 2> SubVectorOps;
-  if (collectConcatOps(N, SubVectorOps)) {
+  if (collectConcatOps(N, SubVectorOps, DAG)) {
     if (SDValue Fold =
             combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
       return Fold;
@@ -53669,7 +53685,7 @@ static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
   SDValue Sel = peekThroughBitcasts(Ext->getOperand(0));
   SmallVector<SDValue, 4> CatOps;
   if (Sel.getOpcode() != ISD::VSELECT ||
-      !collectConcatOps(Sel.getOperand(0).getNode(), CatOps))
+      !collectConcatOps(Sel.getOperand(0).getNode(), CatOps, DAG))
     return SDValue();
 
   // Note: We assume simple value types because this should only be called with
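The test updates below show the payoff: once a widened value is exposed as { x, undef }, callers can split it, and a later fold can drop the all-undef half entirely - e.g. a 256-bit store of a half-undef vector becoming a single 128-bit store, as in the pr11334.ll diff. A follow-on sketch in the same hypothetical MiniNode model (append it to the sketch above; it illustrates the idea, not LLVM's actual store-splitting code):

// Follow-on sketch reusing MiniNode/collectHalves from above (hypothetical
// model, not LLVM's API). If one collected half is UNDEF, a store of the
// widened value can legitimately be narrowed to the defined half alone,
// since bytes covered by undef elements need not be written.
struct StorePlan {
  const MiniNode *Half; // the half actually written
  unsigned ByteOffset;  // where it lands relative to the wide store
};

static bool planNarrowStore(const MiniNode &Val, const MiniNode &UndefHalf,
                            unsigned HalfBytes, StorePlan &Plan) {
  std::vector<const MiniNode *> Halves;
  if (!collectHalves(Val, UndefHalf, Halves))
    return false;
  if (Halves[1]->Op == Opcode::Undef) {
    Plan = {Halves[0], 0};         // keep only the low half
    return true;
  }
  if (Halves[0]->Op == Opcode::Undef) {
    Plan = {Halves[1], HalfBytes}; // keep only the high half
    return true;
  }
  return false;                    // no narrowing opportunity found
}

With the Widened node from the earlier main(), planNarrowStore selects the high half at offset HalfBytes; in the same spirit, the AVX output in pr11334.ll below drops a vinsertf128 and stores xmm0 instead of ymm0.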
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index 34f6d9ffb6799..be99de22eeb30 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -2430,14 +2430,11 @@ define void @D107009(<64 x i32>* %input, <64 x i32>* %output) {
 ; AVX1-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[3,3,3,3,7,7,7,7]
 ; AVX1-NEXT: vpermilpd {{.*#+}} ymm5 = ymm1[0,0,3,2]
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vmovups %ymm0, (%rsi)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rsi)
+; AVX1-NEXT: vmovdqa %xmm7, 112(%rsi)
+; AVX1-NEXT: vmovdqa %xmm6, 48(%rsi)
 ; AVX1-NEXT: vmovups %ymm1, 128(%rsi)
-; AVX1-NEXT: vmovups %ymm7, 96(%rsi)
-; AVX1-NEXT: vmovups %ymm6, 32(%rsi)
 ; AVX1-NEXT: vmovupd %ymm5, 192(%rsi)
 ; AVX1-NEXT: vmovups %ymm4, 224(%rsi)
 ; AVX1-NEXT: vmovups %ymm3, 160(%rsi)
@@ -2461,11 +2458,9 @@ define void @D107009(<64 x i32>* %input, <64 x i32>* %output) {
 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
 ; AVX2-NEXT: vpbroadcastd %xmm0, %ymm4
 ; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[3,3,3,3,7,7,7,7]
 ; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[2,3,2,3,6,7,6,7]
@@ -2475,9 +2470,9 @@ define void @D107009(<64 x i32>* %input, <64 x i32>* %output) {
 ; AVX2-NEXT: vmovdqu %ymm6, 192(%rsi)
 ; AVX2-NEXT: vmovdqu %ymm5, 224(%rsi)
 ; AVX2-NEXT: vmovdqu %ymm4, (%rsi)
-; AVX2-NEXT: vmovdqu %ymm3, 64(%rsi)
-; AVX2-NEXT: vmovdqu %ymm2, 32(%rsi)
-; AVX2-NEXT: vmovdqu %ymm1, 96(%rsi)
+; AVX2-NEXT: vmovdqa %xmm3, 48(%rsi)
+; AVX2-NEXT: vmovdqa %xmm2, 112(%rsi)
+; AVX2-NEXT: vmovdqu %ymm1, 64(%rsi)
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -2507,14 +2502,11 @@ define void @D107009(<64 x i32>* %input, <64 x i32>* %output) {
 ; XOP-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[3,3,3,3,7,7,7,7]
 ; XOP-NEXT: vpermilpd {{.*#+}} ymm5 = ymm1[0,0,3,2]
 ; XOP-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1]
-; XOP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
 ; XOP-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3]
-; XOP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
-; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; XOP-NEXT: vmovups %ymm0, (%rsi)
+; XOP-NEXT: vmovdqa %xmm0, 16(%rsi)
+; XOP-NEXT: vmovdqa %xmm7, 112(%rsi)
+; XOP-NEXT: vmovdqa %xmm6, 48(%rsi)
 ; XOP-NEXT: vmovups %ymm1, 128(%rsi)
-; XOP-NEXT: vmovups %ymm7, 96(%rsi)
-; XOP-NEXT: vmovups %ymm6, 32(%rsi)
 ; XOP-NEXT: vmovupd %ymm5, 192(%rsi)
 ; XOP-NEXT: vmovups %ymm4, 224(%rsi)
 ; XOP-NEXT: vmovups %ymm3, 160(%rsi)
diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll b/llvm/test/CodeGen/X86/oddsubvector.ll
index fb4c23e7de1a4..cda2ab5965509 100644
--- a/llvm/test/CodeGen/X86/oddsubvector.ll
+++ b/llvm/test/CodeGen/X86/oddsubvector.ll
@@ -91,11 +91,11 @@ define void @PR40815(%struct.Mat4* nocapture readonly dereferenceable(64), %stru
 ;
 ; AVX512-LABEL: PR40815:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vmovaps 16(%rdi), %xmm0
-; AVX512-NEXT: vmovaps 48(%rdi), %xmm1
-; AVX512-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0
-; AVX512-NEXT: vinsertf128 $1, 32(%rdi), %ymm1, %ymm1
-; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT: vmovaps 48(%rdi), %xmm0
+; AVX512-NEXT: vmovups 16(%rdi), %ymm1
+; AVX512-NEXT: vinsertf128 $1, (%rdi), %ymm1, %ymm1
+; AVX512-NEXT: vinsertf128 $1, 32(%rdi), %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512-NEXT: vmovups %zmm0, (%rsi)
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/pr11334.ll b/llvm/test/CodeGen/X86/pr11334.ll
index 666f72ba06cb7..da3e8a6ad1232 100644
--- a/llvm/test/CodeGen/X86/pr11334.ll
+++ b/llvm/test/CodeGen/X86/pr11334.ll
@@ -94,9 +94,7 @@ define void @test_vector_creation() nounwind {
 ; AVX: # %bb.0:
 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT: vmovaps %ymm0, (%rax)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovaps %xmm0, (%rax)
 ; AVX-NEXT: retq
   %1 = insertelement <4 x double> undef, double 0.000000e+00, i32 2
   %2 = load double, double addrspace(1)* null