[llvm] r232773 - [X86, AVX] use blends instead of insert128 with index 0

Sanjay Patel spatel at rotateright.com
Thu Mar 19 15:29:41 PDT 2015


Author: spatel
Date: Thu Mar 19 17:29:40 2015
New Revision: 232773

URL: http://llvm.org/viewvc/llvm-project?rev=232773&view=rev
Log:
[X86, AVX] use blends instead of insert128 with index 0

Another case of x86-specific shuffle strength reduction:
avoid generating insert*128 instructions with index 0 because
they are slower than their non-lane-changing blend equivalents.

Shuffle lowering already catches most of these cases, but
the zero-vector case and some other paths, such as the one
exercised by the modified test in vector-shuffle-256-v32.ll,
were still getting through.
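
For example, in the castA test from avx-cast.ll (a sketch based on the
CHECK lines in the diff below; the exact registers and immediate are
llc's choice), padding a 128-bit value with zeros used to go through
vinsertf128 and is now a single in-lane blend:

  define <8 x float> @castA(<4 x float> %m) {
    %shuffle.i = shufflevector <4 x float> %m, <4 x float> zeroinitializer,
                 <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
    ret <8 x float> %shuffle.i
  }

  ; before:
  ;   vxorps      %ymm1, %ymm1, %ymm1
  ;   vinsertf128 $0, %xmm0, %ymm1, %ymm0
  ; after:
  ;   vxorps   %ymm1, %ymm1, %ymm1
  ;   vblendps $240, %ymm1, %ymm0, %ymm0 ## ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]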

Differential Revision: http://reviews.llvm.org/D8366

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/2012-04-26-sdglue.ll
    llvm/trunk/test/CodeGen/X86/avx-cast.ll
    llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v32.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=232773&r1=232772&r2=232773&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Thu Mar 19 17:29:40 2015
@@ -160,8 +160,51 @@ static SDValue InsertSubVector(SDValue R
 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
 /// lowering INSERT_VECTOR_ELT operations easier.
 static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
-                                  SelectionDAG &DAG,SDLoc dl) {
+                                  SelectionDAG &DAG, SDLoc dl) {
   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
+
+  // For insertion into the zero index (low half) of a 256-bit vector, it is
+  // more efficient to generate a blend with immediate instead of an insert*128.
+  // We are still creating an INSERT_SUBVECTOR below with an undef node to
+  // extend the subvector to the size of the result vector. Make sure that
+  // we are not recursing on that node by checking for undef here.
+  if (IdxVal == 0 && Result.getValueType().is256BitVector() &&
+      Result.getOpcode() != ISD::UNDEF) {
+    EVT ResultVT = Result.getValueType();
+    SDValue ZeroIndex = DAG.getIntPtrConstant(0);
+    SDValue Undef = DAG.getUNDEF(ResultVT);
+    SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef,
+                               Vec, ZeroIndex);
+
+    // The blend instruction, and therefore its mask, depend on the data type.
+    MVT ScalarType = ResultVT.getScalarType().getSimpleVT();
+    if (ScalarType.isFloatingPoint()) {
+      // Choose either vblendps (float) or vblendpd (double).
+      unsigned ScalarSize = ScalarType.getSizeInBits();
+      assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type");
+      unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f;
+      SDValue Mask = DAG.getConstant(MaskVal, MVT::i8);
+      return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask);
+    }
+    
+    const X86Subtarget &Subtarget =
+      static_cast<const X86Subtarget &>(DAG.getSubtarget());
+
+    // AVX2 is needed for 256-bit integer blend support.
+    // Integers must be cast to 32-bit because there is only vpblendd;
+    // vpblendw can't be used for this because it has a handicapped mask.
+
+    // If we don't have AVX2, then cast to float. Using a wrong domain blend
+    // is still more efficient than using the wrong domain vinsertf128 that
+    // will be created by InsertSubVector().
+    MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
+
+    SDValue Mask = DAG.getConstant(0x0f, MVT::i8);
+    Vec256 = DAG.getNode(ISD::BITCAST, dl, CastVT, Vec256);
+    Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
+    return DAG.getNode(ISD::BITCAST, dl, ResultVT, Vec256);
+  }
+
   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
 }
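
For reference, the blend immediates above follow the AVX encoding:
each bit of the 8-bit immediate selects the corresponding element from
the second source operand when set, and from the first source when
clear. Replacing the low 128 bits of a 256-bit vector therefore takes
mask 0x03 for vblendpd (two of four doubles) and 0x0f for
vblendps/vpblendd (four of eight floats/dwords). In AT&T syntax, with
illustrative registers:

  vblendpd $0x03, %ymm2, %ymm1, %ymm0 ## ymm0 = ymm2[0,1],ymm1[2,3]
  vblendps $0x0f, %ymm2, %ymm1, %ymm0 ## ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7]
  vpblendd $0x0f, %ymm2, %ymm1, %ymm0 ## AVX2; same mask layout as vblendps

vpblendw is unsuitable here because its immediate is replicated across
each 128-bit lane, so it cannot blend one ymm half independently.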
 

Modified: llvm/trunk/test/CodeGen/X86/2012-04-26-sdglue.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/2012-04-26-sdglue.ll?rev=232773&r1=232772&r2=232773&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/2012-04-26-sdglue.ll (original)
+++ llvm/trunk/test/CodeGen/X86/2012-04-26-sdglue.ll Thu Mar 19 17:29:40 2015
@@ -5,10 +5,10 @@
 ; It's hard to test for the ISEL condition because CodeGen optimizes
 ; away the bugpointed code. Just ensure the basics are still there.
 ;CHECK-LABEL: func:
-;CHECK: vpxor
-;CHECK: vinserti128
+;CHECK: vxorps
 ;CHECK: vpshufd
 ;CHECK: vpbroadcastd
+;CHECK: vinserti128
 ;CHECK: vmulps
 ;CHECK: vmulps
 ;CHECK: ret

Modified: llvm/trunk/test/CodeGen/X86/avx-cast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx-cast.ll?rev=232773&r1=232772&r2=232773&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx-cast.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx-cast.ll Thu Mar 19 17:29:40 2015
@@ -1,51 +1,100 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx  | FileCheck %s --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+
+; Prefer a blend instruction to a vinsert128 instruction because blends
+; are simpler (no lane changes) and therefore will have equal or better
+; performance.
 
-; CHECK-LABEL: castA:
-; CHECK: vxorps
-; CHECK-NEXT: vinsertf128 $0
 define <8 x float> @castA(<4 x float> %m) nounwind uwtable readnone ssp {
+; AVX1-LABEL: castA:
+; AVX1:         vxorps %ymm1, %ymm1, %ymm1
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: castA:
+; AVX2:         vxorps %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    retq
+
 entry:
   %shuffle.i = shufflevector <4 x float> %m, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
   ret <8 x float> %shuffle.i
 }
 
-; CHECK-LABEL: castB:
-; CHECK: vxorps
-; CHECK-NEXT: vinsertf128 $0
 define <4 x double> @castB(<2 x double> %m) nounwind uwtable readnone ssp {
+; AVX1-LABEL: castB:
+; AVX1:         vxorpd %ymm1, %ymm1, %ymm1
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: castB:
+; AVX2:         vxorpd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX2-NEXT:    retq
+
 entry:
   %shuffle.i = shufflevector <2 x double> %m, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
   ret <4 x double> %shuffle.i
 }
 
-; CHECK-LABEL: castC:
-; CHECK: vxorps
-; CHECK-NEXT: vinsertf128 $0
+; AVX2 is needed for integer types.
+
 define <4 x i64> @castC(<2 x i64> %m) nounwind uwtable readnone ssp {
+; AVX1-LABEL: castC:
+; AVX1:         vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: castC:
+; AVX2:         vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    retq
+
 entry:
   %shuffle.i = shufflevector <2 x i64> %m, <2 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
   ret <4 x i64> %shuffle.i
 }
 
-; CHECK-LABEL: castD:
-; CHECK-NOT: vextractf128 $0
+; The next three tests don't need any shuffling. There may or may not be a
+; vzeroupper before the return, so just check for the absence of shuffles.
+
 define <4 x float> @castD(<8 x float> %m) nounwind uwtable readnone ssp {
+; AVX1-LABEL: castD:
+; AVX1-NOT:    extract
+; AVX1-NOT:    blend
+;
+; AVX2-LABEL: castD:
+; AVX2-NOT:    extract
+; AVX2-NOT:    blend
+
 entry:
   %shuffle.i = shufflevector <8 x float> %m, <8 x float> %m, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x float> %shuffle.i
 }
 
-; CHECK-LABEL: castE:
-; CHECK-NOT: vextractf128 $0
 define <2 x i64> @castE(<4 x i64> %m) nounwind uwtable readnone ssp {
+; AVX1-LABEL: castE:
+; AVX1-NOT:    extract
+; AVX1-NOT:    blend
+;
+; AVX2-LABEL: castE:
+; AVX2-NOT:    extract
+; AVX2-NOT:    blend
+
 entry:
   %shuffle.i = shufflevector <4 x i64> %m, <4 x i64> %m, <2 x i32> <i32 0, i32 1>
   ret <2 x i64> %shuffle.i
 }
 
-; CHECK-LABEL: castF:
-; CHECK-NOT: vextractf128 $0
 define <2 x double> @castF(<4 x double> %m) nounwind uwtable readnone ssp {
+; AVX1-LABEL: castF:
+; AVX1-NOT:    extract
+; AVX1-NOT:    blend
+;
+; AVX2-LABEL: castF:
+; AVX2-NOT:    extract
+; AVX2-NOT:    blend
+
 entry:
   %shuffle.i = shufflevector <4 x double> %m, <4 x double> %m, <2 x i32> <i32 0, i32 1>
   ret <2 x double> %shuffle.i
 }

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v32.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v32.ll?rev=232773&r1=232772&r2=232773&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v32.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v32.ll Thu Mar 19 17:29:40 2015
@@ -652,12 +652,12 @@ define <32 x i8> @shuffle_v32i8_31_00_00
 ;
 ; AVX2-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX2-NEXT:    movl $15, %eax
 ; AVX2-NEXT:    vmovd %eax, %xmm1
 ; AVX2-NEXT:    vpxor %ymm2, %ymm2, %ymm2
-; AVX2-NEXT:    vinserti128 $0, %xmm1, %ymm2, %ymm1
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
+; AVX2-NEXT:    vpblendd $15, %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>




