[PATCH] D38316: [InstCombine] replace bitcast to scalar + insertelement with widening shuffle + vector bitcast

Wed Sep 27 07:04:27 PDT 2017

spatel created this revision.
Herald added a subscriber: mcrosier.

insert undef, (bitcast vType X to scalar), C --> bitcast (shuffle X, undef, Mask)

I think this is a universal improvement for vector IR code because it removes a vector-to-scalar-to-vector transition, but I'm not sure if the pattern is relevant to anything besides x86 AVX. In the motivating example from PR34716 ( https://bugs.llvm.org/show_bug.cgi?id=34716 ), we have:

  define <8 x i64> @test(i32 %x0, i32 %x1) {
    %1 = insertelement <2 x i32> undef, i32 %x0, i32 0
    %2 = insertelement <2 x i32> %1, i32 %x1, i32 1
    %3 = bitcast <2 x i32> %2 to i64
    %4 = insertelement <8 x i64> undef, i64 %3, i32 0
    %5 = shufflevector <8 x i64> %4, <8 x i64> undef, <8 x i32> zeroinitializer
    ret <8 x i64> %5
  }

This leads to inefficient movement between scalar GPRs and vector registers. With this patch, other vector instcombines will fire, reducing the IR to:

  define <8 x i64> @test(i32 %x0, i32 %x1) {
    %1 = insertelement <16 x i32> undef, i32 %x0, i32 0   ; wide vec insert
    %2 = insertelement <16 x i32> %1, i32 %x1, i32 1       ; wide vec insert
    %3 = bitcast <16 x i32> %2 to <8 x i64>                       ; free bitcast
    %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> zeroinitializer   ; splat
    ret <8 x i64> %4
  }

And through backend folds, a 32-bit AVX512 target could manage to load the two 32-bit scalars and splat them in one instruction (although this doesn't quite happen yet; for now we get a separate load and broadcast):

  vmovsd	4(%esp), %xmm0          # xmm0 = mem[0],zero
  vbroadcastsd	%xmm0, %zmm0


https://reviews.llvm.org/D38316

Files:
  lib/Transforms/InstCombine/InstCombineVectorOps.cpp
  test/Transforms/InstCombine/insert-extract-shuffle.ll


Index: test/Transforms/InstCombine/insert-extract-shuffle.ll
===================================================================

--- test/Transforms/InstCombine/insert-extract-shuffle.ll
+++ test/Transforms/InstCombine/insert-extract-shuffle.ll
@@ -283,3 +283,31 @@
   %ret = select i1 %e, <4 x i32> %b, <4 x i32> zeroinitializer
   ret <4 x i32> %ret
 }
+
+; insert undef, (bitcast vType X to scalar), C -->  bitcast (shuffle X, undef, Mask)
+; PR34716 - https://bugs.llvm.org/show_bug.cgi?id=34716
+
+define <2 x i64> @bitcast_vector_and_insert(<2 x float> %x){
+; CHECK-LABEL: @bitcast_vector_and_insert(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> %x, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[VEC:%.*]] = bitcast <4 x float> [[TMP1]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[VEC]]
+;
+  %bc = bitcast <2 x float> %x to i64
+  %vec = insertelement <2 x i64> undef, i64 %bc, i32 0
+  ret <2 x i64> %vec
+}
+
+; Use weird types to show the shuffle mask is still correct for unusual cases.
+
+define <4 x i72> @bitcast_vector_and_insert_odd(<3 x i24> %x){
+; CHECK-LABEL: @bitcast_vector_and_insert_odd(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i24> %x, <3 x i24> undef, <12 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[VEC:%.*]] = bitcast <12 x i24> [[TMP1]] to <4 x i72>
+; CHECK-NEXT:    ret <4 x i72> [[VEC]]
+;
+  %bc = bitcast <3 x i24> %x to i72
+  %vec = insertelement <4 x i72> undef, i72 %bc, i32 2
+  ret <4 x i72> %vec
+}
+
Index: lib/Transforms/InstCombine/InstCombineVectorOps.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -824,6 +824,38 @@
   if (Instruction *Broadcast = foldInsSequenceIntoBroadcast(IE))
     return Broadcast;
 
+  Value *X;
+  if (match(ScalarOp, m_OneUse(m_BitCast(m_Value(X)))) &&
+      isa<UndefValue>(VecOp)) {
+    auto *InsertIndex = dyn_cast<ConstantInt>(IdxOp);
+    auto *SrcVecTy = dyn_cast<VectorType>(X->getType());
+    if (InsertIndex && SrcVecTy) {
+      // We're bitcasting from vector to scalar and then inserting into a larger
+      // undef vector. This is really a subvector extension with undef elements,
+      // so use a size-extending shuffle to avoid the scalar conversion:
+      //   insert undef, (bitcast vType X to scalar), C -->
+      //   bitcast (shuffle X, undef, Mask)
+      unsigned NumSrcElts = SrcVecTy->getNumElements();
+      unsigned ExtRatio = IE.getType()->getBitWidth() / SrcVecTy->getBitWidth();
+      unsigned NumMaskVals = NumSrcElts * ExtRatio;
+      SmallVector<Constant *, 16> MaskValues(NumMaskVals);
+      for (unsigned i = 0; i != NumMaskVals; ++i) {
+        // The bitcast maps all of the elements of the source vector into one
+        // larger element of the resulting vector. All other elements are
+        // undefined. Example:
+        // insert <2 x i16> undef, (bitcast <2 x i8> X to i16), i32 0 -->
+        // bitcast (shuffle <2 x i8> X, undef, <0,1,-1,-1>) to <2 x i16>
+        if (i / NumSrcElts == InsertIndex->getZExtValue())
+          MaskValues[i] = Builder.getInt32(i % NumSrcElts);
+        else
+          MaskValues[i] = UndefValue::get(Builder.getInt32Ty());
+      }
+      Value *SV = Builder.CreateShuffleVector(X, UndefValue::get(SrcVecTy),
+                                              ConstantVector::get(MaskValues));
+      return new BitCastInst(SV, IE.getType());
+    }
+  }
+
   return nullptr;
 }
 


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D38316.116804.patch
Type: text/x-patch
Size: 3660 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20170927/10f0fc32/attachment.bin>