[llvm] r217758 - [x86] Teach the x86 DAG combiner to form MOVSLDUP and MOVSHDUP

Mon Sep 15 04:15:24 PDT 2014

Author: chandlerc
Date: Mon Sep 15 06:15:23 2014
New Revision: 217758

URL: http://llvm.org/viewvc/llvm-project?rev=217758&view=rev
Log:
[x86] Teach the x86 DAG combiner to form MOVSLDUP and MOVSHDUP
instructions when it finds an appropriate pattern.

These are lovely instructions, and its a shame to not use them. =] They
are fast, and can hand loads folded into their operands, etc.

I've also plumbed the comment shuffle decoding through the various
layers so that the test cases are printed nicely.

Modified:
    llvm/trunk/lib/Target/X86/InstPrinter/X86InstComments.cpp
    llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.cpp
    llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.h
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll

Modified: llvm/trunk/lib/Target/X86/InstPrinter/X86InstComments.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/InstPrinter/X86InstComments.cpp?rev=217758&r1=217757&r2=217758&view=diff
==============================================================================

--- llvm/trunk/lib/Target/X86/InstPrinter/X86InstComments.cpp (original)
+++ llvm/trunk/lib/Target/X86/InstPrinter/X86InstComments.cpp Mon Sep 15 06:15:23 2014
@@ -128,6 +128,42 @@ bool llvm::EmitAnyX86InstComments(const
     DecodeMOVHLPSMask(2, ShuffleMask);
     break;
 
+  case X86::MOVSLDUPrr:
+  case X86::VMOVSLDUPrr:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::MOVSLDUPrm:
+  case X86::VMOVSLDUPrm:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeMOVSLDUPMask(MVT::v4f32, ShuffleMask);
+    break;
+
+  case X86::VMOVSHDUPYrr:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::VMOVSHDUPYrm:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeMOVSHDUPMask(MVT::v8f32, ShuffleMask);
+    break;
+
+  case X86::VMOVSLDUPYrr:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::VMOVSLDUPYrm:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeMOVSLDUPMask(MVT::v8f32, ShuffleMask);
+    break;
+
+  case X86::MOVSHDUPrr:
+  case X86::VMOVSHDUPrr:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::MOVSHDUPrm:
+  case X86::VMOVSHDUPrm:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeMOVSHDUPMask(MVT::v4f32, ShuffleMask);
+    break;
+
   case X86::PALIGNR128rr:
   case X86::VPALIGNR128rr:
     Src1Name = getRegName(MI->getOperand(2).getReg());

Modified: llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.cpp?rev=217758&r1=217757&r2=217758&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.cpp (original)
+++ llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.cpp Mon Sep 15 06:15:23 2014
@@ -63,6 +63,22 @@ void DecodeMOVLHPSMask(unsigned NElts, S
     ShuffleMask.push_back(NElts+i);
 }
 
+void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+  for (int i = 0, e = NumElts / 2; i < e; ++i) {
+    ShuffleMask.push_back(2 * i);
+    ShuffleMask.push_back(2 * i);
+  }
+}
+
+void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+  for (int i = 0, e = NumElts / 2; i < e; ++i) {
+    ShuffleMask.push_back(2 * i + 1);
+    ShuffleMask.push_back(2 * i + 1);
+  }
+}
+
 void DecodePALIGNRMask(MVT VT, unsigned Imm,
                        SmallVectorImpl<int> &ShuffleMask) {
   unsigned NumElts = VT.getVectorNumElements();

Modified: llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.h?rev=217758&r1=217757&r2=217758&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.h (original)
+++ llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.h Mon Sep 15 06:15:23 2014
@@ -38,6 +38,10 @@ void DecodeMOVHLPSMask(unsigned NElts, S
 // <0,2> or <0,1,4,5>
 void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
 
+void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
 void DecodePALIGNRMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
 
 void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=217758&r1=217757&r2=217758&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Mon Sep 15 06:15:23 2014
@@ -5407,12 +5407,16 @@ static bool getTargetShuffleMask(SDNode
     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
     if (Mask.empty()) return false;
     break;
+  case X86ISD::MOVSLDUP:
+    DecodeMOVSLDUPMask(VT, Mask);
+    break;
+  case X86ISD::MOVSHDUP:
+    DecodeMOVSHDUPMask(VT, Mask);
+    break;
   case X86ISD::MOVDDUP:
   case X86ISD::MOVLHPD:
   case X86ISD::MOVLPD:
   case X86ISD::MOVLPS:
-  case X86ISD::MOVSHDUP:
-  case X86ISD::MOVSLDUP:
     // Not yet implemented
     return false;
   default: llvm_unreachable("unknown target shuffle node");
@@ -19364,38 +19368,53 @@ static bool combineX86ShuffleChain(SDVal
   // Note that even with AVX we prefer the PSHUFD form of shuffle for integer
   // vectors because it can have a load folded into it that UNPCK cannot. This
   // doesn't preclude something switching to the shorter encoding post-RA.
-  if (FloatDomain && (Mask.equals(0, 0) || Mask.equals(1, 1))) {
-    bool Lo = Mask.equals(0, 0);
-    unsigned Shuffle;
-    MVT ShuffleVT;
-    // Check if we have SSE3 which will let us use MOVDDUP. That instruction
-    // is no slower than UNPCKLPD but has the option to fold the input operand
-    // into even an unaligned memory load.
-    if (Lo && Subtarget->hasSSE3()) {
-      Shuffle = X86ISD::MOVDDUP;
-      ShuffleVT = MVT::v2f64;
-    } else {
-      // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller
-      // than the UNPCK variants.
-      Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
-      ShuffleVT = MVT::v4f32;
-    }
-    if (Depth == 1 && Root->getOpcode() == Shuffle)
-      return false; // Nothing to do!
-    Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
-    DCI.AddToWorklist(Op.getNode());
-    if (Shuffle == X86ISD::MOVDDUP)
+  if (FloatDomain) {
+    if (Mask.equals(0, 0) || Mask.equals(1, 1)) {
+      bool Lo = Mask.equals(0, 0);
+      unsigned Shuffle;
+      MVT ShuffleVT;
+      // Check if we have SSE3 which will let us use MOVDDUP. That instruction
+      // is no slower than UNPCKLPD but has the option to fold the input operand
+      // into even an unaligned memory load.
+      if (Lo && Subtarget->hasSSE3()) {
+        Shuffle = X86ISD::MOVDDUP;
+        ShuffleVT = MVT::v2f64;
+      } else {
+        // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller
+        // than the UNPCK variants.
+        Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
+        ShuffleVT = MVT::v4f32;
+      }
+      if (Depth == 1 && Root->getOpcode() == Shuffle)
+        return false; // Nothing to do!
+      Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
+      DCI.AddToWorklist(Op.getNode());
+      if (Shuffle == X86ISD::MOVDDUP)
+        Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
+      else
+        Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
+      DCI.AddToWorklist(Op.getNode());
+      DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
+                    /*AddTo*/ true);
+      return true;
+    }
+    if (Subtarget->hasSSE3() &&
+        (Mask.equals(0, 0, 2, 2) || Mask.equals(1, 1, 3, 3))) {
+      bool Lo = Mask.equals(0, 0, 2, 2);
+      unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP;
+      MVT ShuffleVT = MVT::v4f32;
+      if (Depth == 1 && Root->getOpcode() == Shuffle)
+        return false; // Nothing to do!
+      Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
+      DCI.AddToWorklist(Op.getNode());
       Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
-    else
-      Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
-    DCI.AddToWorklist(Op.getNode());
-    DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
-                  /*AddTo*/ true);
-    return true;
+      DCI.AddToWorklist(Op.getNode());
+      DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
+                    /*AddTo*/ true);
+      return true;
+    }
   }
 
-  // FIXME: We should match UNPCKLPS and UNPCKHPS here.
-
   // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
   // variants as none of these have single-instruction variants that are
   // superior to the UNPCK formulation.

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll?rev=217758&r1=217757&r2=217758&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll Mon Sep 15 06:15:23 2014
@@ -119,6 +119,36 @@ define <4 x float> @shuffle_v4f32_3210(<
   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   ret <4 x float> %shuffle
 }
+define <4 x float> @shuffle_v4f32_0022(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: @shuffle_v4f32_0022
+; SSE2:         shufps {{.*}} # xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v4f32_0022
+; SSE41:         movsldup {{.*}} # xmm0 = xmm0[0,0,2,2]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: @shuffle_v4f32_0022
+; AVX1:         vmovsldup {{.*}} # xmm0 = xmm0[0,0,2,2]
+; AVX1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  ret <4 x float> %shuffle
+}
+define <4 x float> @shuffle_v4f32_1133(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: @shuffle_v4f32_1133
+; SSE2:         shufps {{.*}} # xmm0 = xmm0[1,1,3,3]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: @shuffle_v4f32_1133
+; SSE41:         movshdup {{.*}} # xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: @shuffle_v4f32_1133
+; AVX1:         vmovshdup {{.*}} # xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT:    retq
+  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  ret <4 x float> %shuffle
+}
 
 define <4 x i32> @shuffle_v4i32_0124(<4 x i32> %a, <4 x i32> %b) {
 ; SSE2-LABEL: @shuffle_v4i32_0124