[llvm] 87e4ad4 - [X86] Remove isel pattern for MMX_X86movdq2q+simple_load. Replace with DAG combine to loadmmx.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Fri May 29 10:20:21 PDT 2020


Author: Craig Topper
Date: 2020-05-29T10:20:03-07:00
New Revision: 87e4ad4d5ce1a231fae257faaada8badcfc22d43

URL: https://github.com/llvm/llvm-project/commit/87e4ad4d5ce1a231fae257faaada8badcfc22d43
DIFF: https://github.com/llvm/llvm-project/commit/87e4ad4d5ce1a231fae257faaada8badcfc22d43.diff

LOG: [X86] Remove isel pattern for MMX_X86movdq2q+simple_load. Replace with DAG combine to loadmmx.

Only 64 bits will be loaded, not the whole 128 bits. We can
just combine it to a plain MMX load. This has the side effect of
enabling isel load folding for it.

This is part of my desire to get rid of isel patterns that shrink loads.
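
At the SelectionDAG level, the combine rewrites roughly the following
(a sketch, not the exact -debug dump format; memory-operand details are
elided):

    t1: v2i64,ch = load<(load 16 from %p)> t0, %p
    t2: x86mmx = X86ISD::MOVDQ2Q t1
  -->
    t2: x86mmx,ch = load<(load 8 from %p)> t0, %p

Because the result is now an ordinary x86mmx load node, isel's generic
load folding can merge it into the memory form of an MMX instruction,
which is what the pshufw with a memory operand in the updated test below
demonstrates.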

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/lib/Target/X86/X86InstrMMX.td
    llvm/test/CodeGen/X86/mmx-fold-load.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8ec958338c02..a1121600346f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -47736,6 +47736,27 @@ static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
   return DAG.getBitcast(VT, Cvt);
 }
 
+static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
+  SDValue Src = N->getOperand(0);
+
+  // Turn MOVDQ2Q+simple_load into an mmx load.
+  if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
+    LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
+
+    if (LN->isSimple()) {
+      SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
+                                  LN->getBasePtr(),
+                                  LN->getPointerInfo(),
+                                  LN->getOriginalAlign(),
+                                  LN->getMemOperand()->getFlags());
+      DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
+      return NewLd;
+    }
+  }
+
+  return SDValue();
+}
+
 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -47898,6 +47919,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::FP_EXTEND:      return combineFP_EXTEND(N, DAG, Subtarget);
   case ISD::FP_ROUND:       return combineFP_ROUND(N, DAG, Subtarget);
   case X86ISD::VBROADCAST_LOAD: return combineVBROADCAST_LOAD(N, DAG, DCI);
+  case X86ISD::MOVDQ2Q:     return combineMOVDQ2Q(N, DAG);
   }
 
   return SDValue();

diff --git a/llvm/lib/Target/X86/X86InstrMMX.td b/llvm/lib/Target/X86/X86InstrMMX.td
index 2880be6cb8f3..83eddaa05f4a 100644
--- a/llvm/lib/Target/X86/X86InstrMMX.td
+++ b/llvm/lib/Target/X86/X86InstrMMX.td
@@ -568,9 +568,6 @@ def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1,
 def : Pat<(x86mmx (MMX_X86movdq2q VR128:$src)),
           (x86mmx (MMX_MOVDQ2Qrr VR128:$src))>;
 
-def : Pat<(x86mmx (MMX_X86movdq2q (v2i64 (simple_load addr:$src)))),
-          (x86mmx (MMX_MOVQ64rm addr:$src))>;
-
 def : Pat<(v2i64 (X86vzmovl (scalar_to_vector
                              (i64 (bitconvert (x86mmx VR64:$src)))))),
           (MMX_MOVQ2DQrr VR64:$src)>;

diff --git a/llvm/test/CodeGen/X86/mmx-fold-load.ll b/llvm/test/CodeGen/X86/mmx-fold-load.ll
index 5ad2d50c1d13..5de16c0ca082 100644
--- a/llvm/test/CodeGen/X86/mmx-fold-load.ll
+++ b/llvm/test/CodeGen/X86/mmx-fold-load.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -disable-peephole -mtriple=i686-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X64
 
 define i64 @t0(<1 x i64>* %a, i32* %b) nounwind {
 ; X86-LABEL: t0:
@@ -616,3 +616,30 @@ entry:
 
 declare void @llvm.lifetime.start(i64, i8* nocapture)
 declare void @llvm.lifetime.end(i64, i8* nocapture)
+
+; Make sure we shrink this vector load and fold it.
+define x86_mmx @vec_load(<4 x float>* %x) {
+; X86-LABEL: vec_load:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    pshufw $68, (%eax), %mm0 # mm0 = mem[0,1,0,1]
+; X86-NEXT:    paddsb %mm0, %mm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: vec_load:
+; X64:       # %bb.0:
+; X64-NEXT:    pshufw $68, (%rdi), %mm0 # mm0 = mem[0,1,0,1]
+; X64-NEXT:    paddsb %mm0, %mm0
+; X64-NEXT:    movq2dq %mm0, %xmm0
+; X64-NEXT:    retq
+  %z = load <4 x float>, <4 x float>* %x
+  %y = extractelement <4 x float> %z, i32 0
+  %a = insertelement <2 x float> undef, float %y, i32 0
+  %b = insertelement <2 x float> %a, float %y, i32 1
+  %c = bitcast <2 x float> %b to x86_mmx
+  %d = tail call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %c, x86_mmx %c)
+  ret x86_mmx %d
+}
+
+declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx)
+
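
To reproduce the folded output locally, the new test can be run with the
same options as its RUN lines (assuming a locally built llc on PATH):

    llc -disable-peephole -mtriple=x86_64-unknown -mattr=+mmx,+sse2 \
        llvm/test/CodeGen/X86/mmx-fold-load.ll -o -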