[llvm] b6050ca - [VectorCombine] transform bitcasted shuffle to narrower elements

Thu Apr 2 10:30:30 PDT 2020

Author: Sanjay Patel
Date: 2020-04-02T13:30:22-04:00
New Revision: b6050ca181684997d0284ca39a78adc6833ddfc4

URL: https://github.com/llvm/llvm-project/commit/b6050ca181684997d0284ca39a78adc6833ddfc4
DIFF: https://github.com/llvm/llvm-project/commit/b6050ca181684997d0284ca39a78adc6833ddfc4.diff

LOG: [VectorCombine] transform bitcasted shuffle to narrower elements

bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC'

We do not attempt this in InstCombine because we do not want to change
types and create new shuffle ops that are potentially not lowered as
well as the original code. Here, we can check the cost model to see if
it is worthwhile.

I've aggressively enabled this transform even if the types are the same
size and/or equal cost because moving the bitcast allows InstCombine to
make further simplifications.

In the motivating cases from PR35454:
https://bugs.llvm.org/show_bug.cgi?id=35454
...this is enough to let instcombine and the backend eliminate the
redundant shuffles, but we probably want to extend VectorCombine to
handle the inverse pattern (shuffle-of-bitcast) to get that
simplification directly in IR.

Differential Revision: https://reviews.llvm.org/D76727

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/VectorCombine.cpp
    llvm/test/Transforms/VectorCombine/X86/shuffle.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 70ee8cc233cc..66095af973aa 100644

--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -17,6 +17,7 @@
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
@@ -244,6 +245,49 @@ static bool foldExtractExtract(Instruction &I, const TargetTransformInfo &TTI) {
   return true;
 }
 
+/// If this is a bitcast to narrow elements from a shuffle of wider elements,
+/// try to bitcast the source vector to the narrow type followed by shuffle.
+/// This can enable further transforms by moving bitcasts or shuffles together.
+static bool foldBitcastShuf(Instruction &I, const TargetTransformInfo &TTI) {
+  Value *V;
+  ArrayRef<int> Mask;
+  if (!match(&I, m_BitCast(m_OneUse(m_ShuffleVector(m_Value(V), m_Undef(),
+                                                    m_Mask(Mask))))))
+    return false;
+
+  Type *DestTy = I.getType();
+  Type *SrcTy = V->getType();
+  if (!DestTy->isVectorTy() || I.getOperand(0)->getType() != SrcTy)
+    return false;
+
+  // TODO: Handle bitcast from narrow element type to wide element type.
+  assert(SrcTy->isVectorTy() && "Shuffle of non-vector type?");
+  unsigned DestNumElts = DestTy->getVectorNumElements();
+  unsigned SrcNumElts = SrcTy->getVectorNumElements();
+  if (SrcNumElts > DestNumElts)
+    return false;
+
+  // The new shuffle must not cost more than the old shuffle. The bitcast is
+  // moved ahead of the shuffle, so assume that it has the same cost as before.
+  if (TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, DestTy) >
+      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy))
+    return false;
+
+  // Bitcast the source vector and expand the shuffle mask to the equivalent for
+  // narrow elements.
+  // bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC'
+  IRBuilder<> Builder(&I);
+  Value *CastV = Builder.CreateBitCast(V, DestTy);
+  SmallVector<int, 16> NewMask;
+  assert(DestNumElts % SrcNumElts == 0 && "Unexpected shuffle mask");
+  unsigned ScaleFactor = DestNumElts / SrcNumElts;
+  scaleShuffleMask(ScaleFactor, Mask, NewMask);
+  Value *Shuf = Builder.CreateShuffleVector(CastV, UndefValue::get(DestTy),
+                                            NewMask);
+  I.replaceAllUsesWith(Shuf);
+  return true;
+}
+
 /// This is the entry point for all transforms. Pass manager 
diff erences are
 /// handled in the callers of this function.
 static bool runImpl(Function &F, const TargetTransformInfo &TTI,
@@ -265,6 +309,7 @@ static bool runImpl(Function &F, const TargetTransformInfo &TTI,
       if (isa<DbgInfoIntrinsic>(I))
         continue;
       MadeChange |= foldExtractExtract(I, TTI);
+      MadeChange |= foldBitcastShuf(I, TTI);
     }
   }
 

diff  --git a/llvm/test/Transforms/VectorCombine/X86/shuffle.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle.ll
index b5516544638b..5a758d78d937 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle.ll
@@ -2,28 +2,39 @@
 ; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=SSE2 | FileCheck %s --check-prefixes=CHECK,SSE
 ; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=AVX2 | FileCheck %s --check-prefixes=CHECK,AVX
 
+; x86 does not have a cheap v16i8 shuffle until SSSE3 (pshufb)
+
 define <16 x i8> @bitcast_shuf_narrow_element(<4 x i32> %v) {
-; CHECK-LABEL: @bitcast_shuf_narrow_element(
-; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8>
-; CHECK-NEXT:    ret <16 x i8> [[R]]
+; SSE-LABEL: @bitcast_shuf_narrow_element(
+; SSE-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SSE-NEXT:    [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8>
+; SSE-NEXT:    ret <16 x i8> [[R]]
+;
+; AVX-LABEL: @bitcast_shuf_narrow_element(
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> undef, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+; AVX-NEXT:    ret <16 x i8> [[TMP2]]
 ;
   %shuf = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   %r = bitcast <4 x i32> %shuf to <16 x i8>
   ret <16 x i8> %r
 }
 
+; v4f32 is the same cost as v4i32, so this always works
+
 define <4 x float> @bitcast_shuf_same_size(<4 x i32> %v) {
 ; CHECK-LABEL: @bitcast_shuf_same_size(
-; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <4 x float>
-; CHECK-NEXT:    ret <4 x float> [[R]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <4 x float>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <4 x float> [[TMP2]]
 ;
   %shuf = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   %r = bitcast <4 x i32> %shuf to <4 x float>
   ret <4 x float> %r
 }
 
+; Negative test - length-changing shuffle
+
 define <16 x i8> @bitcast_shuf_narrow_element_wrong_size(<2 x i32> %v) {
 ; CHECK-LABEL: @bitcast_shuf_narrow_element_wrong_size(
 ; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
@@ -35,6 +46,8 @@ define <16 x i8> @bitcast_shuf_narrow_element_wrong_size(<2 x i32> %v) {
   ret <16 x i8> %r
 }
 
+; Negative test - must cast to vector type
+
 define i128 @bitcast_shuf_narrow_element_wrong_type(<4 x i32> %v) {
 ; CHECK-LABEL: @bitcast_shuf_narrow_element_wrong_type(
 ; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -46,6 +59,8 @@ define i128 @bitcast_shuf_narrow_element_wrong_type(<4 x i32> %v) {
   ret i128 %r
 }
 
+; Negative test - but might want to try this
+
 define <4 x i32> @bitcast_shuf_wide_element(<8 x i16> %v) {
 ; CHECK-LABEL: @bitcast_shuf_wide_element(
 ; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3>
@@ -59,6 +74,8 @@ define <4 x i32> @bitcast_shuf_wide_element(<8 x i16> %v) {
 
 declare void @use(<4 x i32>)
 
+; Negative test - don't create an extra shuffle
+
 define <16 x i8> @bitcast_shuf_uses(<4 x i32> %v) {
 ; CHECK-LABEL: @bitcast_shuf_uses(
 ; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -73,15 +90,25 @@ define <16 x i8> @bitcast_shuf_uses(<4 x i32> %v) {
 }
 
 define <2 x i64> @PR35454_1(<2 x i64> %v) {
-; CHECK-LABEL: @PR35454_1(
-; CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32>
-; CHECK-NEXT:    [[PERMIL:%.*]] = shufflevector <4 x i32> [[BC]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[BC1:%.*]] = bitcast <4 x i32> [[PERMIL]] to <16 x i8>
-; CHECK-NEXT:    [[ADD:%.*]] = shl <16 x i8> [[BC1]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT:    [[BC2:%.*]] = bitcast <16 x i8> [[ADD]] to <4 x i32>
-; CHECK-NEXT:    [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64>
-; CHECK-NEXT:    ret <2 x i64> [[BC3]]
+; SSE-LABEL: @PR35454_1(
+; SSE-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32>
+; SSE-NEXT:    [[PERMIL:%.*]] = shufflevector <4 x i32> [[BC]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SSE-NEXT:    [[BC1:%.*]] = bitcast <4 x i32> [[PERMIL]] to <16 x i8>
+; SSE-NEXT:    [[ADD:%.*]] = shl <16 x i8> [[BC1]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; SSE-NEXT:    [[BC2:%.*]] = bitcast <16 x i8> [[ADD]] to <4 x i32>
+; SSE-NEXT:    [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SSE-NEXT:    [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64>
+; SSE-NEXT:    ret <2 x i64> [[BC3]]
+;
+; AVX-LABEL: @PR35454_1(
+; AVX-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32>
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[BC]] to <16 x i8>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> undef, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+; AVX-NEXT:    [[ADD:%.*]] = shl <16 x i8> [[TMP2]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; AVX-NEXT:    [[BC2:%.*]] = bitcast <16 x i8> [[ADD]] to <4 x i32>
+; AVX-NEXT:    [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX-NEXT:    [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64>
+; AVX-NEXT:    ret <2 x i64> [[BC3]]
 ;
   %bc = bitcast <2 x i64> %v to <4 x i32>
   %permil = shufflevector <4 x i32> %bc, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -94,15 +121,25 @@ define <2 x i64> @PR35454_1(<2 x i64> %v) {
 }
 
 define <2 x i64> @PR35454_2(<2 x i64> %v) {
-; CHECK-LABEL: @PR35454_2(
-; CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32>
-; CHECK-NEXT:    [[PERMIL:%.*]] = shufflevector <4 x i32> [[BC]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[BC1:%.*]] = bitcast <4 x i32> [[PERMIL]] to <8 x i16>
-; CHECK-NEXT:    [[ADD:%.*]] = shl <8 x i16> [[BC1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
-; CHECK-NEXT:    [[BC2:%.*]] = bitcast <8 x i16> [[ADD]] to <4 x i32>
-; CHECK-NEXT:    [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64>
-; CHECK-NEXT:    ret <2 x i64> [[BC3]]
+; SSE-LABEL: @PR35454_2(
+; SSE-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32>
+; SSE-NEXT:    [[PERMIL:%.*]] = shufflevector <4 x i32> [[BC]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SSE-NEXT:    [[BC1:%.*]] = bitcast <4 x i32> [[PERMIL]] to <8 x i16>
+; SSE-NEXT:    [[ADD:%.*]] = shl <8 x i16> [[BC1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+; SSE-NEXT:    [[BC2:%.*]] = bitcast <8 x i16> [[ADD]] to <4 x i32>
+; SSE-NEXT:    [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SSE-NEXT:    [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64>
+; SSE-NEXT:    ret <2 x i64> [[BC3]]
+;
+; AVX-LABEL: @PR35454_2(
+; AVX-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32>
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[BC]] to <8 x i16>
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1>
+; AVX-NEXT:    [[ADD:%.*]] = shl <8 x i16> [[TMP2]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+; AVX-NEXT:    [[BC2:%.*]] = bitcast <8 x i16> [[ADD]] to <4 x i32>
+; AVX-NEXT:    [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX-NEXT:    [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64>
+; AVX-NEXT:    ret <2 x i64> [[BC3]]
 ;
   %bc = bitcast <2 x i64> %v to <4 x i32>
   %permil = shufflevector <4 x i32> %bc, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>