[llvm] r240326 - [X86] Teach load folding to accept scalar _Int users of MOVSS/MOVSD.

Ahmed Bougacha ahmed.bougacha at gmail.com
Mon Jun 22 13:51:51 PDT 2015


Author: ab
Date: Mon Jun 22 15:51:51 2015
New Revision: 240326

URL: http://llvm.org/viewvc/llvm-project?rev=240326&view=rev
Log:
[X86] Teach load folding to accept scalar _Int users of MOVSS/MOVSD.

The _Int instructions are special in that they operate on the full
VR128 register class instead of FR32.  The load folding code then looks at
the MOVSS and at its user, sees a size mismatch, and bails out.

What we really know is that the rm_Int instructions don't load the
higher lanes, so folding is fine.

This happens for straightforward intrinsic code, e.g.:

    _mm_add_ss(a, _mm_load_ss(p));
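
For reference, a minimal compilable sketch of that pattern (the function and
parameter names below are illustrative, not part of this commit); built for
x86-64 with optimizations, the scalar load should now fold into the add's
memory operand, as the test added below checks:

    #include <xmmintrin.h>

    // Illustrative sketch: with this change, the scalar load should become
    // part of the add, e.g. "addss (%rdi), %xmm0" (SSE) or
    // "vaddss (%rdi), %xmm0, %xmm0" (AVX), rather than a separate movss
    // followed by a register-register addss.
    __m128 add_lane0(__m128 a, const float *p) {
      return _mm_add_ss(a, _mm_load_ss(p));
    }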

Fixes PR23349.

Differential Revision: http://reviews.llvm.org/D10554

Added:
    llvm/trunk/test/CodeGen/X86/fold-load-binops.ll
Modified:
    llvm/trunk/lib/Target/X86/X86InstrInfo.cpp

Modified: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrInfo.cpp?rev=240326&r1=240325&r2=240326&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp Mon Jun 22 15:51:51 2015
@@ -5295,21 +5295,57 @@ MachineInstr *X86InstrInfo::foldMemoryOp
                                Size, Alignment, /*AllowCommute=*/true);
 }
 
-static bool isPartialRegisterLoad(const MachineInstr &LoadMI,
-                                  const MachineFunction &MF) {
+/// Check if \p LoadMI is a partial register load that we can't fold into \p MI
+/// because the latter uses contents that wouldn't be defined in the folded
+/// version.  For instance, this transformation isn't legal:
+///   movss (%rdi), %xmm0
+///   addps %xmm0, %xmm0
+/// ->
+///   addps (%rdi), %xmm0
+///
+/// But this one is:
+///   movss (%rdi), %xmm0
+///   addss %xmm0, %xmm0
+/// ->
+///   addss (%rdi), %xmm0
+///
+static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
+                                             const MachineInstr &UserMI,
+                                             const MachineFunction &MF) {
   unsigned Opc = LoadMI.getOpcode();
+  unsigned UserOpc = UserMI.getOpcode();
   unsigned RegSize =
       MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg())->getSize();
 
-  if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm) && RegSize > 4)
+  if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm) && RegSize > 4) {
     // These instructions only load 32 bits, we can't fold them if the
-    // destination register is wider than 32 bits (4 bytes).
-    return true;
+    // destination register is wider than 32 bits (4 bytes), and its user
+    // instruction isn't scalar (SS).
+    switch (UserOpc) {
+    case X86::ADDSSrr_Int: case X86::VADDSSrr_Int:
+    case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int:
+    case X86::MULSSrr_Int: case X86::VMULSSrr_Int:
+    case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int:
+      return false;
+    default:
+      return true;
+    }
+  }
 
-  if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm) && RegSize > 8)
+  if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm) && RegSize > 8) {
     // These instructions only load 64 bits, we can't fold them if the
-    // destination register is wider than 64 bits (8 bytes).
-    return true;
+    // destination register is wider than 64 bits (8 bytes), and its user
+    // instruction isn't scalar (SD).
+    switch (UserOpc) {
+    case X86::ADDSDrr_Int: case X86::VADDSDrr_Int:
+    case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int:
+    case X86::MULSDrr_Int: case X86::VMULSDrr_Int:
+    case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int:
+      return false;
+    default:
+      return true;
+    }
+  }
 
   return false;
 }
@@ -5321,7 +5357,7 @@ MachineInstr *X86InstrInfo::foldMemoryOp
   unsigned NumOps = LoadMI->getDesc().getNumOperands();
   int FrameIndex;
   if (isLoadFromStackSlot(LoadMI, FrameIndex)) {
-    if (isPartialRegisterLoad(*LoadMI, MF))
+    if (isNonFoldablePartialRegisterLoad(*LoadMI, *MI, MF))
       return nullptr;
     return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex);
   }
@@ -5434,7 +5470,7 @@ MachineInstr *X86InstrInfo::foldMemoryOp
     break;
   }
   default: {
-    if (isPartialRegisterLoad(*LoadMI, MF))
+    if (isNonFoldablePartialRegisterLoad(*LoadMI, *MI, MF))
       return nullptr;
 
     // Folding a normal load. Just copy the load's address operands.

Added: llvm/trunk/test/CodeGen/X86/fold-load-binops.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/fold-load-binops.ll?rev=240326&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/fold-load-binops.ll (added)
+++ llvm/trunk/test/CodeGen/X86/fold-load-binops.ll Mon Jun 22 15:51:51 2015
@@ -0,0 +1,142 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse2 < %s | FileCheck %s --check-prefix=SSE
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=AVX
+
+; Verify that we're folding the load into the math instruction.
+; This pattern is generated out of the simplest intrinsics usage:
+;  _mm_add_ss(a, _mm_load_ss(b));
+
+define <4 x float> @addss(<4 x float> %va, float* %pb) {
+; SSE-LABEL: addss:
+; SSE:       # BB#0:
+; SSE-NEXT:    addss (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: addss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddss (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+    %a = extractelement <4 x float> %va, i32 0
+    %b = load float, float* %pb
+    %r = fadd float %a, %b
+    %vr = insertelement <4 x float> %va, float %r, i32 0
+    ret <4 x float> %vr
+}
+
+define <2 x double> @addsd(<2 x double> %va, double* %pb) {
+; SSE-LABEL: addsd:
+; SSE:       # BB#0:
+; SSE-NEXT:    addsd (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: addsd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vaddsd (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+    %a = extractelement <2 x double> %va, i32 0
+    %b = load double, double* %pb
+    %r = fadd double %a, %b
+    %vr = insertelement <2 x double> %va, double %r, i32 0
+    ret <2 x double> %vr
+}
+
+define <4 x float> @subss(<4 x float> %va, float* %pb) {
+; SSE-LABEL: subss:
+; SSE:       # BB#0:
+; SSE-NEXT:    subss (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: subss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vsubss (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+    %a = extractelement <4 x float> %va, i32 0
+    %b = load float, float* %pb
+    %r = fsub float %a, %b
+    %vr = insertelement <4 x float> %va, float %r, i32 0
+    ret <4 x float> %vr
+}
+
+define <2 x double> @subsd(<2 x double> %va, double* %pb) {
+; SSE-LABEL: subsd:
+; SSE:       # BB#0:
+; SSE-NEXT:    subsd (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: subsd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vsubsd (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+    %a = extractelement <2 x double> %va, i32 0
+    %b = load double, double* %pb
+    %r = fsub double %a, %b
+    %vr = insertelement <2 x double> %va, double %r, i32 0
+    ret <2 x double> %vr
+}
+
+define <4 x float> @mulss(<4 x float> %va, float* %pb) {
+; SSE-LABEL: mulss:
+; SSE:       # BB#0:
+; SSE-NEXT:    mulss (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: mulss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmulss (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+    %a = extractelement <4 x float> %va, i32 0
+    %b = load float, float* %pb
+    %r = fmul float %a, %b
+    %vr = insertelement <4 x float> %va, float %r, i32 0
+    ret <4 x float> %vr
+}
+
+define <2 x double> @mulsd(<2 x double> %va, double* %pb) {
+; SSE-LABEL: mulsd:
+; SSE:       # BB#0:
+; SSE-NEXT:    mulsd (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: mulsd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmulsd (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+    %a = extractelement <2 x double> %va, i32 0
+    %b = load double, double* %pb
+    %r = fmul double %a, %b
+    %vr = insertelement <2 x double> %va, double %r, i32 0
+    ret <2 x double> %vr
+}
+
+define <4 x float> @divss(<4 x float> %va, float* %pb) {
+; SSE-LABEL: divss:
+; SSE:       # BB#0:
+; SSE-NEXT:    divss (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: divss:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivss (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+    %a = extractelement <4 x float> %va, i32 0
+    %b = load float, float* %pb
+    %r = fdiv float %a, %b
+    %vr = insertelement <4 x float> %va, float %r, i32 0
+    ret <4 x float> %vr
+}
+
+define <2 x double> @divsd(<2 x double> %va, double* %pb) {
+; SSE-LABEL: divsd:
+; SSE:       # BB#0:
+; SSE-NEXT:    divsd (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: divsd:
+; AVX:       # BB#0:
+; AVX-NEXT:    vdivsd (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
+    %a = extractelement <2 x double> %va, i32 0
+    %b = load double, double* %pb
+    %r = fdiv double %a, %b
+    %vr = insertelement <2 x double> %va, double %r, i32 0
+    ret <2 x double> %vr
+}
