[llvm] 538a8f0 - [InstCombine] convert bitcast-shuffle to vector trunc
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Sun Apr 5 06:48:22 PDT 2020
Author: Sanjay Patel
Date: 2020-04-05T09:48:02-04:00
New Revision: 538a8f02271b6de817a6b65e3b70f9f1fd6e428d
URL: https://github.com/llvm/llvm-project/commit/538a8f02271b6de817a6b65e3b70f9f1fd6e428d
DIFF: https://github.com/llvm/llvm-project/commit/538a8f02271b6de817a6b65e3b70f9f1fd6e428d.diff
LOG: [InstCombine] convert bitcast-shuffle to vector trunc
As discussed in D76983, that patch can turn a chain of insert/extract
with scalar trunc ops into bitcast+extract, and existing instcombine
vector transforms then create a shuffle out of that (see the
PhaseOrdering test for an example). Currently, that process requires
at least this pass sequence: -instcombine -early-cse -instcombine.
Before D76983, the sequence of insert/extract would reach the SLP
vectorizer and become a vector trunc there.
Based on a small sampling of public targets/types, converting the
shuffle to a trunc is better for codegen in most cases (and a
regression of that form is the reason this was noticed). The trunc is
clearly better for IR-level analysis as well.
This means that we can induce "spontaneous vectorization" without
invoking any explicit vectorizer passes (at least a vector cast op
may be created out of scalar casts), but that seems to be the right
choice given that we started with a chain of insert/extract, and the
backend would expand back to that chain if a target does not support
the op.
Differential Revision: https://reviews.llvm.org/D77299
Added:
Modified:
llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
llvm/test/Transforms/InstCombine/shuffle-cast.ll
llvm/test/Transforms/PhaseOrdering/vector-trunc.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 1e95f235b653..da5a910ca9fa 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -1657,6 +1657,47 @@ static Instruction *foldSelectShuffle(ShuffleVectorInst &Shuf,
return NewBO;
}
+/// Convert a narrowing shuffle of a bitcasted vector into a vector truncate.
+/// Example (little endian):
+/// shuf (bitcast <4 x i16> X to <8 x i8>), <0, 2, 4, 6> --> trunc X to <4 x i8>
+static Instruction *foldTruncShuffle(ShuffleVectorInst &Shuf,
+ bool IsBigEndian) {
+ // This must be a bitcasted shuffle of 1 vector integer operand.
+ Type *DestType = Shuf.getType();
+ Value *X;
+ if (!match(Shuf.getOperand(0), m_BitCast(m_Value(X))) ||
+ !match(Shuf.getOperand(1), m_Undef()) || !DestType->isIntOrIntVectorTy())
+ return nullptr;
+
+ // The source type must have the same number of elements as the shuffle,
+ // and the source element type must be larger than the shuffle element type.
+ Type *SrcType = X->getType();
+ if (!SrcType->isVectorTy() || !SrcType->isIntOrIntVectorTy() ||
+ SrcType->getVectorNumElements() != DestType->getVectorNumElements() ||
+ SrcType->getScalarSizeInBits() % DestType->getScalarSizeInBits() != 0)
+ return nullptr;
+
+ assert(Shuf.changesLength() && !Shuf.increasesLength() &&
+ "Expected a shuffle that decreases length");
+
+ // Last, check that the mask chooses the correct low bits for each narrow
+ // element in the result.
+ uint64_t TruncRatio =
+ SrcType->getScalarSizeInBits() / DestType->getScalarSizeInBits();
+ ArrayRef<int> Mask = Shuf.getShuffleMask();
+ for (unsigned i = 0, e = Mask.size(); i != e; ++i) {
+ if (Mask[i] == UndefMaskElem)
+ continue;
+ uint64_t LSBIndex = IsBigEndian ? (i + 1) * TruncRatio - 1 : i * TruncRatio;
+ assert(LSBIndex <= std::numeric_limits<int32_t>::max() &&
+ "Overflowed 32-bits");
+ if (Mask[i] != (int)LSBIndex)
+ return nullptr;
+ }
+
+ return new TruncInst(X, DestType);
+}
+
/// Match a shuffle-select-shuffle pattern where the shuffles are widening and
/// narrowing (concatenating with undef and extracting back to the original
/// length). This allows replacing the wide select with a narrow select.
@@ -1951,6 +1992,9 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
if (Instruction *I = foldSelectShuffle(SVI, Builder, DL))
return I;
+ if (Instruction *I = foldTruncShuffle(SVI, DL.isBigEndian()))
+ return I;
+
if (Instruction *I = narrowVectorSelect(SVI, Builder))
return I;
diff --git a/llvm/test/Transforms/InstCombine/shuffle-cast.ll b/llvm/test/Transforms/InstCombine/shuffle-cast.ll
index e4b21ff9e111..fc3b4c1241a0 100644
--- a/llvm/test/Transforms/InstCombine/shuffle-cast.ll
+++ b/llvm/test/Transforms/InstCombine/shuffle-cast.ll
@@ -3,10 +3,14 @@
; RUN: opt < %s -instcombine -S -data-layout="E" | FileCheck %s --check-prefixes=ANY,BE
define <4 x i16> @trunc_little_endian(<4 x i32> %x) {
-; ANY-LABEL: @trunc_little_endian(
-; ANY-NEXT: [[B:%.*]] = bitcast <4 x i32> [[X:%.*]] to <8 x i16>
-; ANY-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; ANY-NEXT: ret <4 x i16> [[R]]
+; LE-LABEL: @trunc_little_endian(
+; LE-NEXT: [[R:%.*]] = trunc <4 x i32> [[X:%.*]] to <4 x i16>
+; LE-NEXT: ret <4 x i16> [[R]]
+;
+; BE-LABEL: @trunc_little_endian(
+; BE-NEXT: [[B:%.*]] = bitcast <4 x i32> [[X:%.*]] to <8 x i16>
+; BE-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; BE-NEXT: ret <4 x i16> [[R]]
;
%b = bitcast <4 x i32> %x to <8 x i16>
%r = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -14,10 +18,14 @@ define <4 x i16> @trunc_little_endian(<4 x i32> %x) {
}
define <4 x i16> @trunc_big_endian(<4 x i32> %x) {
-; ANY-LABEL: @trunc_big_endian(
-; ANY-NEXT: [[B:%.*]] = bitcast <4 x i32> [[X:%.*]] to <8 x i16>
-; ANY-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; ANY-NEXT: ret <4 x i16> [[R]]
+; LE-LABEL: @trunc_big_endian(
+; LE-NEXT: [[B:%.*]] = bitcast <4 x i32> [[X:%.*]] to <8 x i16>
+; LE-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; LE-NEXT: ret <4 x i16> [[R]]
+;
+; BE-LABEL: @trunc_big_endian(
+; BE-NEXT: [[R:%.*]] = trunc <4 x i32> [[X:%.*]] to <4 x i16>
+; BE-NEXT: ret <4 x i16> [[R]]
;
%b = bitcast <4 x i32> %x to <8 x i16>
%r = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -26,12 +34,20 @@ define <4 x i16> @trunc_big_endian(<4 x i32> %x) {
declare void @use_v8i16(<8 x i16>)
+; Extra use is ok.
+
define <2 x i16> @trunc_little_endian_extra_use(<2 x i64> %x) {
-; ANY-LABEL: @trunc_little_endian_extra_use(
-; ANY-NEXT: [[B:%.*]] = bitcast <2 x i64> [[X:%.*]] to <8 x i16>
-; ANY-NEXT: call void @use_v8i16(<8 x i16> [[B]])
-; ANY-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> undef, <2 x i32> <i32 0, i32 4>
-; ANY-NEXT: ret <2 x i16> [[R]]
+; LE-LABEL: @trunc_little_endian_extra_use(
+; LE-NEXT: [[B:%.*]] = bitcast <2 x i64> [[X:%.*]] to <8 x i16>
+; LE-NEXT: call void @use_v8i16(<8 x i16> [[B]])
+; LE-NEXT: [[R:%.*]] = trunc <2 x i64> [[X]] to <2 x i16>
+; LE-NEXT: ret <2 x i16> [[R]]
+;
+; BE-LABEL: @trunc_little_endian_extra_use(
+; BE-NEXT: [[B:%.*]] = bitcast <2 x i64> [[X:%.*]] to <8 x i16>
+; BE-NEXT: call void @use_v8i16(<8 x i16> [[B]])
+; BE-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> undef, <2 x i32> <i32 0, i32 4>
+; BE-NEXT: ret <2 x i16> [[R]]
;
%b = bitcast <2 x i64> %x to <8 x i16>
call void @use_v8i16(<8 x i16> %b)
@@ -41,12 +57,20 @@ define <2 x i16> @trunc_little_endian_extra_use(<2 x i64> %x) {
declare void @use_v12i11(<12 x i11>)
+; Weird types are ok.
+
define <4 x i11> @trunc_big_endian_extra_use(<4 x i33> %x) {
-; ANY-LABEL: @trunc_big_endian_extra_use(
-; ANY-NEXT: [[B:%.*]] = bitcast <4 x i33> [[X:%.*]] to <12 x i11>
-; ANY-NEXT: call void @use_v12i11(<12 x i11> [[B]])
-; ANY-NEXT: [[R:%.*]] = shufflevector <12 x i11> [[B]], <12 x i11> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
-; ANY-NEXT: ret <4 x i11> [[R]]
+; LE-LABEL: @trunc_big_endian_extra_use(
+; LE-NEXT: [[B:%.*]] = bitcast <4 x i33> [[X:%.*]] to <12 x i11>
+; LE-NEXT: call void @use_v12i11(<12 x i11> [[B]])
+; LE-NEXT: [[R:%.*]] = shufflevector <12 x i11> [[B]], <12 x i11> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; LE-NEXT: ret <4 x i11> [[R]]
+;
+; BE-LABEL: @trunc_big_endian_extra_use(
+; BE-NEXT: [[B:%.*]] = bitcast <4 x i33> [[X:%.*]] to <12 x i11>
+; BE-NEXT: call void @use_v12i11(<12 x i11> [[B]])
+; BE-NEXT: [[R:%.*]] = trunc <4 x i33> [[X]] to <4 x i11>
+; BE-NEXT: ret <4 x i11> [[R]]
;
%b = bitcast <4 x i33> %x to <12 x i11>
call void @use_v12i11(<12 x i11> %b)
@@ -54,3 +78,46 @@ define <4 x i11> @trunc_big_endian_extra_use(<4 x i33> %x) {
ret <4 x i11> %r
}
+define <4 x i16> @wrong_cast1(i128 %x) {
+; ANY-LABEL: @wrong_cast1(
+; ANY-NEXT: [[B:%.*]] = bitcast i128 [[X:%.*]] to <8 x i16>
+; ANY-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; ANY-NEXT: ret <4 x i16> [[R]]
+;
+ %b = bitcast i128 %x to <8 x i16>
+ %r = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ ret <4 x i16> %r
+}
+
+define <4 x i16> @wrong_cast2(<4 x float> %x) {
+; ANY-LABEL: @wrong_cast2(
+; ANY-NEXT: [[B:%.*]] = bitcast <4 x float> [[X:%.*]] to <8 x i16>
+; ANY-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; ANY-NEXT: ret <4 x i16> [[R]]
+;
+ %b = bitcast <4 x float> %x to <8 x i16>
+ %r = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ ret <4 x i16> %r
+}
+
+define <4 x half> @wrong_cast3(<4 x i32> %x) {
+; ANY-LABEL: @wrong_cast3(
+; ANY-NEXT: [[B:%.*]] = bitcast <4 x i32> [[X:%.*]] to <8 x half>
+; ANY-NEXT: [[R:%.*]] = shufflevector <8 x half> [[B]], <8 x half> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; ANY-NEXT: ret <4 x half> [[R]]
+;
+ %b = bitcast <4 x i32> %x to <8 x half>
+ %r = shufflevector <8 x half> %b, <8 x half> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ ret <4 x half> %r
+}
+
+define <2 x i16> @wrong_shuffle(<4 x i32> %x) {
+; ANY-LABEL: @wrong_shuffle(
+; ANY-NEXT: [[B:%.*]] = bitcast <4 x i32> [[X:%.*]] to <8 x i16>
+; ANY-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> undef, <2 x i32> <i32 0, i32 2>
+; ANY-NEXT: ret <2 x i16> [[R]]
+;
+ %b = bitcast <4 x i32> %x to <8 x i16>
+ %r = shufflevector <8 x i16> %b, <8 x i16> undef, <2 x i32> <i32 0, i32 2>
+ ret <2 x i16> %r
+}
diff --git a/llvm/test/Transforms/PhaseOrdering/vector-trunc.ll b/llvm/test/Transforms/PhaseOrdering/vector-trunc.ll
index 494b9a7a6c0b..52a1fe7d897d 100644
--- a/llvm/test/Transforms/PhaseOrdering/vector-trunc.ll
+++ b/llvm/test/Transforms/PhaseOrdering/vector-trunc.ll
@@ -4,8 +4,7 @@
define <4 x i16> @truncate(<4 x i32> %x) {
; ANY-LABEL: @truncate(
-; ANY-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[X:%.*]] to <8 x i16>
-; ANY-NEXT: [[V3:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; ANY-NEXT: [[V3:%.*]] = trunc <4 x i32> [[X:%.*]] to <4 x i16>
; ANY-NEXT: ret <4 x i16> [[V3]]
;
%x0 = extractelement <4 x i32> %x, i32 0
More information about the llvm-commits
mailing list