[llvm] 4aaf301 - [DAG] Fold vector (aext (load x)) -> (zext (truncate (zextload x)))

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Wed Aug 5 03:22:46 PDT 2020


Author: Simon Pilgrim
Date: 2020-08-05T11:22:23+01:00
New Revision: 4aaf301fb8f4f666d935fe78ef5d74592eee5fc1

URL: https://github.com/llvm/llvm-project/commit/4aaf301fb8f4f666d935fe78ef5d74592eee5fc1
DIFF: https://github.com/llvm/llvm-project/commit/4aaf301fb8f4f666d935fe78ef5d74592eee5fc1.diff

LOG: [DAG] Fold vector (aext (load x)) -> (zext (truncate (zextload x)))

We currently don't fold any_extend vector loads at all, as no target has a vector any-extending load instruction.

Instead, I've added support for folding to a zextload; SimplifyDemandedBits does a good job of adjusting the zext(truncate()) stages as required later on.
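
For illustration, here is a minimal IR sketch (hypothetical function name, modelled on the test_zext_v8i8_to_v8i16 diff below) of how a vector any_extend load arises: the shl makes the extended bits of each element don't-care, so SelectionDAG relaxes the zext to an any_extend, which this patch now folds to a zextload.

  define void @aext_vec_load_sketch(<8 x i8>* %src, <8 x i16>* %dst) {
    ; Each lane's zext'd high bits are shifted out below, so only the
    ; loaded byte is demanded and the DAG sees (aext (load)).
    %v = load <8 x i8>, <8 x i8>* %src, align 1
    %z = zext <8 x i8> %v to <8 x i16>
    %s = shl <8 x i16> %z, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
    store <8 x i16> %s, <8 x i16>* %dst, align 16
    ret void
  }

With this patch, something like llc -mtriple=x86_64-- -mattr=+avx should select a vpmovzxbw load plus vpsllw for this, instead of a vmovq / vpxor / vpunpcklbw sequence.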

We still need the custom scalar extload handling rather than the tryToFoldExtOfLoad helper, as the helper uses different legality tests; we can probably tweak it later to remove most of the code duplication.
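
For reference, the retained scalar path handles the analogous pattern, sketched below (hypothetical function name), where the extended bits are again unused and the load folds to a plain any-extending extload:

  define i32 @aext_scalar_load_sketch(i16* %p) {
    ; The shl discards the zext'd top half, so the DAG again sees
    ; (aext (load)) and can fold it to an extload.
    %v = load i16, i16* %p, align 2
    %z = zext i16 %v to i32
    %s = shl i32 %z, 16
    ret i32 %s
  }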

Fixes the regression I mentioned in rG99a971cadff7.

Differential Revision: https://reviews.llvm.org/D85129

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/test/CodeGen/X86/avx512-any_extend_load.ll
    llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
    llvm/test/CodeGen/X86/vector-mul.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b22d978d3736..b2077f47d4a3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -10606,22 +10606,26 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
 
   // fold (aext (load x)) -> (aext (truncate (extload x)))
   // None of the supported targets knows how to perform load and any_ext
-  // on vectors in one instruction.  We only perform this transformation on
-  // scalars.
-  if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() &&
-      ISD::isUNINDEXEDLoad(N0.getNode()) &&
-      TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) {
+  // on vectors in one instruction, so attempt to fold to zext instead.
+  if (VT.isVector()) {
+    // Try to simplify (zext (load x)).
+    if (SDValue foldedExt =
+            tryToFoldExtOfLoad(DAG, *this, TLI, VT, LegalOperations, N, N0,
+                               ISD::ZEXTLOAD, ISD::ZERO_EXTEND))
+      return foldedExt;
+  } else if (ISD::isNON_EXTLoad(N0.getNode()) &&
+             ISD::isUNINDEXEDLoad(N0.getNode()) &&
+             TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) {
     bool DoXform = true;
-    SmallVector<SDNode*, 4> SetCCs;
+    SmallVector<SDNode *, 4> SetCCs;
     if (!N0.hasOneUse())
-      DoXform = ExtendUsesToFormExtLoad(VT, N, N0, ISD::ANY_EXTEND, SetCCs,
-                                        TLI);
+      DoXform =
+          ExtendUsesToFormExtLoad(VT, N, N0, ISD::ANY_EXTEND, SetCCs, TLI);
     if (DoXform) {
       LoadSDNode *LN0 = cast<LoadSDNode>(N0);
       SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
-                                       LN0->getChain(),
-                                       LN0->getBasePtr(), N0.getValueType(),
-                                       LN0->getMemOperand());
+                                       LN0->getChain(), LN0->getBasePtr(),
+                                       N0.getValueType(), LN0->getMemOperand());
       ExtendSetCCUses(SetCCs, N0, ExtLoad, ISD::ANY_EXTEND);
       // If the load value is used only by N, replace it via CombineTo N.
       bool NoReplaceTrunc = N0.hasOneUse();
@@ -10630,8 +10634,8 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
         DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
         recursivelyDeleteUnusedNodes(LN0);
       } else {
-        SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
-                                    N0.getValueType(), ExtLoad);
+        SDValue Trunc =
+            DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad);
         CombineTo(LN0, Trunc, ExtLoad.getValue(1));
       }
       return SDValue(N, 0); // Return N so it doesn't get rechecked!

diff --git a/llvm/test/CodeGen/X86/avx512-any_extend_load.ll b/llvm/test/CodeGen/X86/avx512-any_extend_load.ll
index 99ebd4baffcc..db5f8c244eee 100644
--- a/llvm/test/CodeGen/X86/avx512-any_extend_load.ll
+++ b/llvm/test/CodeGen/X86/avx512-any_extend_load.ll
@@ -4,25 +4,13 @@
 
 
 define void @any_extend_load_v8i64(<8 x i8> * %ptr) {
-; KNL-LABEL: any_extend_load_v8i64:
-; KNL:       # %bb.0:
-; KNL-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; KNL-NEXT:    vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; KNL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; KNL-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; KNL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; KNL-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; KNL-NEXT:    vpmovqb %zmm0, (%rdi)
-; KNL-NEXT:    vzeroupper
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: any_extend_load_v8i64:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
-; SKX-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; SKX-NEXT:    vpmovqb %zmm0, (%rdi)
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
+; ALL-LABEL: any_extend_load_v8i64:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
+; ALL-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; ALL-NEXT:    vpmovqb %zmm0, (%rdi)
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    retq
   %wide.load = load <8 x i8>, <8 x i8>* %ptr, align 1
   %1 = zext <8 x i8> %wide.load to <8 x i64>
   %2 = add nuw nsw <8 x i64> %1, <i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4>

diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index 050269c2ba2f..c445a522bf2f 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -4368,9 +4368,8 @@ define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1(<8 x doub
 define void @test_zext_v8i8_to_v8i16(<8 x i8>* %arg, <8 x i16>* %arg1) {
 ; CHECK-LABEL: test_zext_v8i8_to_v8i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovq (%rdi), %xmm0 # xmm0 = mem[0],zero
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; CHECK-NEXT:    vpmovzxbw (%rdi), %xmm0 # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; CHECK-NEXT:    vpsllw $8, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovdqa %xmm0, (%rsi)
 ; CHECK-NEXT:    retq
   %tmp = getelementptr <8 x i8>, <8 x i8>* %arg, i32 0

diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll
index d93ac61592f2..805ff9f69ed5 100644
--- a/llvm/test/CodeGen/X86/vector-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-mul.ll
@@ -1904,8 +1904,7 @@ define <2 x i64> @mul_v2i64_zext_cross_bb(<2 x i32>* %in, <2 x i32>* %y) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; X86-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X86-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; X86-NEXT:    pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
 ; X86-NEXT:    pmuludq %xmm1, %xmm0
 ; X86-NEXT:    retl
 ;
