[llvm-commits] [llvm] r137810 - in /llvm/trunk: lib/Target/X86/X86ISelLowering.cpp lib/Target/X86/X86ISelLowering.h lib/Target/X86/X86InstrFragmentsSIMD.td lib/Target/X86/X86InstrSSE.td test/CodeGen/X86/avx-vbroadcast.ll

Bruno Cardoso Lopes bruno.cardoso at gmail.com
Tue Aug 16 19:29:19 PDT 2011


Author: bruno
Date: Tue Aug 16 21:29:19 2011
New Revision: 137810

URL: http://llvm.org/viewvc/llvm-project?rev=137810&view=rev
Log:
Introduce matching patterns for vbroadcast AVX instruction. The idea is to
match splats in the form (splat (scalar_to_vector (load ...))) whenever
the load can be folded. All the logic and instruction emission is
working but because of PR8156, there are no ways to match loads, cause
they can never be folded for splats. Thus, the tests are XFAILed, but
I've tested and exercised all the logic using a relaxed version for
checking the foldable loads, as if the bug was already fixed. This
should work out of the box once PR8156 gets fixed since MayFoldLoad will
work as expected.

Added:
    llvm/trunk/test/CodeGen/X86/avx-vbroadcast.ll
Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/lib/Target/X86/X86ISelLowering.h
    llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td
    llvm/trunk/lib/Target/X86/X86InstrSSE.td

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=137810&r1=137809&r2=137810&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Tue Aug 16 21:29:19 2011
@@ -6151,6 +6151,48 @@
   return 0;
 }
 
+/// isVectorBroadcast - Check if the node chain is suitable to be xformed to
+/// a vbroadcast node. The nodes are suitable whenever we can fold a load coming
+/// from a 32 or 64 bit scalar. Update Op to the desired load to be folded.
+static bool isVectorBroadcast(SDValue &Op) {
+  EVT VT = Op.getValueType();
+  bool Is256 = VT.getSizeInBits() == 256;
+
+  assert((VT.getSizeInBits() == 128 || Is256) &&
+         "Unsupported type for vbroadcast node");
+
+  SDValue V = Op;
+  if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
+    V = V.getOperand(0);
+
+  if (Is256 && !(V.hasOneUse() &&
+                 V.getOpcode() == ISD::INSERT_SUBVECTOR &&
+                 V.getOperand(0).getOpcode() == ISD::UNDEF))
+    return false;
+
+  if (Is256)
+    V = V.getOperand(1);
+  if (V.hasOneUse() && V.getOpcode() != ISD::SCALAR_TO_VECTOR)
+    return false;
+
+  // Check the source scalar_to_vector type. 256-bit broadcasts are
+  // supported for 32/64-bit sizes, while 128-bit ones are only supported
+  // for 32-bit scalars.
+  unsigned ScalarSize = V.getOperand(0).getValueType().getSizeInBits();
+  if (ScalarSize != 32 && ScalarSize != 64)
+    return false;
+  if (!Is256 && ScalarSize == 64)
+    return false;
+
+  V = V.getOperand(0);
+  if (!MayFoldLoad(V))
+    return false;
+
+  // Return the load node
+  Op = V;
+  return true;
+}
+
 static
 SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
                                const TargetLowering &TLI,
@@ -6174,6 +6216,10 @@
     if (NumElem <= 4 && CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI))
       return Op;
 
+    // Use vbroadcast whenever the splat comes from a foldable load
+    if (Subtarget->hasAVX() && isVectorBroadcast(V1))
+      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, V1);
+
     // Handle splats by matching through known shuffle masks
     if (VT.is128BitVector() && NumElem <= 4)
       return SDValue();
@@ -10189,6 +10235,7 @@
   case X86ISD::PUNPCKHWD:          return "X86ISD::PUNPCKHWD";
   case X86ISD::PUNPCKHDQ:          return "X86ISD::PUNPCKHDQ";
   case X86ISD::PUNPCKHQDQ:         return "X86ISD::PUNPCKHQDQ";
+  case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
   case X86ISD::VPERMILPS:          return "X86ISD::VPERMILPS";
   case X86ISD::VPERMILPSY:         return "X86ISD::VPERMILPSY";
   case X86ISD::VPERMILPD:          return "X86ISD::VPERMILPD";

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.h?rev=137810&r1=137809&r2=137810&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.h (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.h Tue Aug 16 21:29:19 2011
@@ -276,6 +276,7 @@
       VPERMILPD,
       VPERMILPDY,
       VPERM2F128,
+      VBROADCAST,
 
       // VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack,
       // according to %al. An operator is needed so that this can be expanded

Modified: llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td?rev=137810&r1=137809&r2=137810&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td Tue Aug 16 21:29:19 2011
@@ -109,6 +109,8 @@
 def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
                                  SDTCisSameAs<0,2>, SDTCisInt<3>]>;
 
+def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
+
 def X86PAlign : SDNode<"X86ISD::PALIGN", SDTShuff3OpI>;
 
 def X86PShufd  : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>;
@@ -160,6 +162,8 @@
 
 def X86VPerm2f128 : SDNode<"X86ISD::VPERM2F128", SDTShuff3OpI>;
 
+def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
+
 //===----------------------------------------------------------------------===//
 // SSE Complex Patterns
 //===----------------------------------------------------------------------===//

Modified: llvm/trunk/lib/Target/X86/X86InstrSSE.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrSSE.td?rev=137810&r1=137809&r2=137810&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrSSE.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrSSE.td Tue Aug 16 21:29:19 2011
@@ -5474,6 +5474,20 @@
 def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src),
           (VBROADCASTF128 addr:$src)>;
 
+def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
+          (VBROADCASTSSY addr:$src)>;
+def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
+          (VBROADCASTSD addr:$src)>;
+def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))),
+          (VBROADCASTSSY addr:$src)>;
+def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))),
+          (VBROADCASTSD addr:$src)>;
+
+def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))),
+          (VBROADCASTSS addr:$src)>;
+def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
+          (VBROADCASTSS addr:$src)>;
+
 //===----------------------------------------------------------------------===//
 // VINSERTF128 - Insert packed floating-point values
 //

Added: llvm/trunk/test/CodeGen/X86/avx-vbroadcast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx-vbroadcast.ll?rev=137810&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx-vbroadcast.ll (added)
+++ llvm/trunk/test/CodeGen/X86/avx-vbroadcast.ll Tue Aug 16 21:29:19 2011
@@ -0,0 +1,84 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+; XFAIL: *
+
+; xfail this file for now because of PR8156, when it gets solved merge this with avx-splat.ll
+
+; CHECK: vbroadcastsd (%
+define <4 x i64> @A(i64* %ptr) nounwind uwtable readnone ssp {
+entry:
+  %q = load i64* %ptr, align 8
+  %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
+  %vecinit2.i = insertelement <4 x i64> %vecinit.i, i64 %q, i32 1
+  %vecinit4.i = insertelement <4 x i64> %vecinit2.i, i64 %q, i32 2
+  %vecinit6.i = insertelement <4 x i64> %vecinit4.i, i64 %q, i32 3
+  ret <4 x i64> %vecinit6.i
+}
+
+; CHECK: vbroadcastss (%
+define <8 x i32> @B(i32* %ptr) nounwind uwtable readnone ssp {
+entry:
+  %q = load i32* %ptr, align 4
+  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+  %vecinit2.i = insertelement <8 x i32> %vecinit.i, i32 %q, i32 1
+  %vecinit4.i = insertelement <8 x i32> %vecinit2.i, i32 %q, i32 2
+  %vecinit6.i = insertelement <8 x i32> %vecinit4.i, i32 %q, i32 3
+  ret <8 x i32> %vecinit6.i
+}
+
+; CHECK: vbroadcastsd (%
+define <4 x double> @C(double* %ptr) nounwind uwtable readnone ssp {
+entry:
+  %q = load double* %ptr, align 8
+  %vecinit.i = insertelement <4 x double> undef, double %q, i32 0
+  %vecinit2.i = insertelement <4 x double> %vecinit.i, double %q, i32 1
+  %vecinit4.i = insertelement <4 x double> %vecinit2.i, double %q, i32 2
+  %vecinit6.i = insertelement <4 x double> %vecinit4.i, double %q, i32 3
+  ret <4 x double> %vecinit6.i
+}
+
+; CHECK: vbroadcastss (%
+define <8 x float> @D(float* %ptr) nounwind uwtable readnone ssp {
+entry:
+  %q = load float* %ptr, align 4
+  %vecinit.i = insertelement <8 x float> undef, float %q, i32 0
+  %vecinit2.i = insertelement <8 x float> %vecinit.i, float %q, i32 1
+  %vecinit4.i = insertelement <8 x float> %vecinit2.i, float %q, i32 2
+  %vecinit6.i = insertelement <8 x float> %vecinit4.i, float %q, i32 3
+  ret <8 x float> %vecinit6.i
+}
+
+;;;; 128-bit versions
+
+; CHECK: vbroadcastss (%
+define <4 x float> @E(float* %ptr) nounwind uwtable readnone ssp {
+entry:
+  %q = load float* %ptr, align 4
+  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
+  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
+  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
+  ret <4 x float> %vecinit6.i
+}
+
+; CHECK: vbroadcastss (%
+define <4 x i32> @F(i32* %ptr) nounwind uwtable readnone ssp {
+entry:
+  %q = load i32* %ptr, align 4
+  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+  %vecinit2.i = insertelement <4 x i32> %vecinit.i, i32 %q, i32 1
+  %vecinit4.i = insertelement <4 x i32> %vecinit2.i, i32 %q, i32 2
+  %vecinit6.i = insertelement <4 x i32> %vecinit4.i, i32 %q, i32 3
+  ret <4 x i32> %vecinit6.i
+}
+
+; Unsupported vbroadcasts
+
+; CHECK: _G
+; CHECK-NOT: vbroadcastsd (%
+define <2 x i64> @G(i64* %ptr) nounwind uwtable readnone ssp {
+entry:
+  %q = load i64* %ptr, align 8
+  %vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
+  %vecinit2.i = insertelement <2 x i64> %vecinit.i, i64 %q, i32 1
+  ret <2 x i64> %vecinit2.i
+}





More information about the llvm-commits mailing list