[llvm] 94795a3 - [VectorCombine] foldBitcastShuf - add support for length changing shuffles

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 6 04:06:04 PDT 2023


Author: Simon Pilgrim
Date: 2023-10-06T11:59:51+01:00
New Revision: 94795a37e892cfedb570c70a5101ea88348e60c7

URL: https://github.com/llvm/llvm-project/commit/94795a37e892cfedb570c70a5101ea88348e60c7
DIFF: https://github.com/llvm/llvm-project/commit/94795a37e892cfedb570c70a5101ea88348e60c7.diff

LOG: [VectorCombine] foldBitcastShuf - add support for length changing shuffles

Allow length-changing shuffle masks in the "bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC'" fold.

This change also exposes some poor shuffle mask detection for the extract/insert subvector cases inside improveShuffleKindFromMask.

This is the first stage towards addressing Issue #67803.
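
For readers skimming the log, the key mechanical step is rewriting the shuffle mask so it indexes elements of the destination scalar type rather than the source scalar type. Below is a minimal, self-contained C++ sketch of that mask rewriting (illustrative only: narrowMask, widenMask and the driver are hypothetical names, not the helpers the pass actually calls).

  // Sketch of the mask rewriting behind
  // "bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC'".
  // -1 stands for a poison mask element, as in LLVM shuffle masks.
  #include <cstdio>
  #include <vector>

  // Narrowing case (SrcEltSize % DestEltSize == 0): each wide index expands
  // into Scale consecutive narrow indices.
  std::vector<int> narrowMask(const std::vector<int> &Mask, int Scale) {
    std::vector<int> Out;
    for (int M : Mask)
      for (int J = 0; J != Scale; ++J)
        Out.push_back(M < 0 ? -1 : M * Scale + J);
    return Out;
  }

  // Widening case (DestEltSize % SrcEltSize == 0): each group of Scale narrow
  // indices must be an aligned, contiguous run (or all poison) to map to one
  // wide index; otherwise the fold must give up. Assumes Mask.size() is a
  // multiple of Scale.
  bool widenMask(const std::vector<int> &Mask, int Scale,
                 std::vector<int> &Out) {
    for (size_t I = 0; I < Mask.size(); I += Scale) {
      bool AllPoison = true;
      for (int J = 0; J != Scale; ++J)
        AllPoison &= Mask[I + J] < 0;
      if (AllPoison) {
        Out.push_back(-1);
        continue;
      }
      if (Mask[I] < 0 || Mask[I] % Scale != 0)
        return false;
      for (int J = 1; J != Scale; ++J)
        if (Mask[I + J] != Mask[I] + J)
          return false;
      Out.push_back(Mask[I] / Scale);
    }
    return true;
  }

  int main() {
    // bitcast_shuf_extract_subvector below: the <4 x i32> mask <4,5,6,7> over
    // an <8 x i32> source, narrowed by 4x for an i8 destination type.
    for (int M : narrowMask({4, 5, 6, 7}, /*Scale=*/4))
      std::printf("%d ", M); // prints 16 17 18 ... 31
    std::printf("\n");
    return 0;
  }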

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/VectorCombine.cpp
    llvm/test/Transforms/VectorCombine/X86/shuffle-inseltpoison.ll
    llvm/test/Transforms/VectorCombine/X86/shuffle.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 286120ed534b776..ca7bb69b17f7c18 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -689,15 +689,18 @@ bool VectorCombine::foldBitcastShuf(Instruction &I) {
   // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
   // scalable type is unknown; Second, we cannot reason if the narrowed shuffle
   // mask for scalable type is a splat or not.
-  // 2) Disallow non-vector casts and length-changing shuffles.
+  // 2) Disallow non-vector casts.
   // TODO: We could allow any shuffle.
+  auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
   auto *SrcTy = dyn_cast<FixedVectorType>(V->getType());
-  if (!SrcTy || I.getOperand(0)->getType() != SrcTy)
+  if (!DestTy || !SrcTy)
     return false;
 
-  auto *DestTy = cast<FixedVectorType>(I.getType());
   unsigned DestEltSize = DestTy->getScalarSizeInBits();
   unsigned SrcEltSize = SrcTy->getScalarSizeInBits();
+  if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != 0)
+    return false;
+
   SmallVector<int, 16> NewMask;
   if (DestEltSize <= SrcEltSize) {
     // The bitcast is from wide to narrow/equal elements. The shuffle mask can
@@ -714,10 +717,15 @@ bool VectorCombine::foldBitcastShuf(Instruction &I) {
       return false;
   }
 
+  // Bitcast the shuffle src - keep its original width but use the destination
+  // scalar type.
+  unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize;
+  auto *ShuffleTy = FixedVectorType::get(DestTy->getScalarType(), NumSrcElts);
+
   // The new shuffle must not cost more than the old shuffle. The bitcast is
   // moved ahead of the shuffle, so assume that it has the same cost as before.
   InstructionCost DestCost = TTI.getShuffleCost(
-      TargetTransformInfo::SK_PermuteSingleSrc, DestTy, NewMask);
+      TargetTransformInfo::SK_PermuteSingleSrc, ShuffleTy, NewMask);
   InstructionCost SrcCost =
       TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy, Mask);
   if (DestCost > SrcCost || !DestCost.isValid())
@@ -725,7 +733,7 @@ bool VectorCombine::foldBitcastShuf(Instruction &I) {
 
   // bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC'
   ++NumShufOfBitcast;
-  Value *CastV = Builder.CreateBitCast(V, DestTy);
+  Value *CastV = Builder.CreateBitCast(V, ShuffleTy);
   Value *Shuf = Builder.CreateShuffleVector(CastV, NewMask);
   replaceValue(I, *Shuf);
   return true;
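
To make the new width computation concrete, here is the arithmetic for the bitcast_shuf_extract_subvector test updated below (a sketch of the numbers only, not LLVM code): the shuffle source keeps its 256-bit width but is reinterpreted with the destination's i8 scalar type, and the <4 x i32> mask <4, 5, 6, 7> is scaled by 4 to byte indices 16..31, which is what the new SSE CHECK lines show.

  // bitcast_shuf_extract_subvector: shuffle of <8 x i32>, result bitcast to <16 x i8>.
  static_assert(8 * 32 == 256, "SrcTy = <8 x i32> is 256 bits wide");
  static_assert(256 % 8 == 0, "divisible by DestEltSize (i8), so no early return");
  constexpr unsigned NumSrcElts = 256 / 8;   // 32
  static_assert(NumSrcElts == 32, "ShuffleTy = <32 x i8>");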

diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-inseltpoison.ll
index 318aa33d6b11c38..471424dfaca2d55 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-inseltpoison.ll
@@ -33,13 +33,18 @@ define <4 x float> @bitcast_shuf_same_size(<4 x i32> %v) {
   ret <4 x float> %r
 }
 
-; TODO - length-changing shuffle
+; Length-changing shuffles
 
 define <16 x i8> @bitcast_shuf_narrow_element_subvector(<2 x i32> %v) {
-; CHECK-LABEL: @bitcast_shuf_narrow_element_subvector(
-; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
-; CHECK-NEXT:    [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8>
-; CHECK-NEXT:    ret <16 x i8> [[R]]
+; SSE-LABEL: @bitcast_shuf_narrow_element_subvector(
+; SSE-NEXT:    [[SHUF:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+; SSE-NEXT:    [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8>
+; SSE-NEXT:    ret <16 x i8> [[R]]
+;
+; AVX-LABEL: @bitcast_shuf_narrow_element_subvector(
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+; AVX-NEXT:    [[R:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+; AVX-NEXT:    ret <16 x i8> [[R]]
 ;
   %shuf = shufflevector <2 x i32> %v, <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
   %r = bitcast <4 x i32> %shuf to <16 x i8>
@@ -47,10 +52,15 @@ define <16 x i8> @bitcast_shuf_narrow_element_subvector(<2 x i32> %v) {
 }
 
 define <16 x i16> @bitcast_shuf_narrow_element_concat_subvectors(<2 x i64> %v) {
-; CHECK-LABEL: @bitcast_shuf_narrow_element_concat_subvectors(
-; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <2 x i64> [[V:%.*]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT:    [[R:%.*]] = bitcast <4 x i64> [[SHUF]] to <16 x i16>
-; CHECK-NEXT:    ret <16 x i16> [[R]]
+; SSE-LABEL: @bitcast_shuf_narrow_element_concat_subvectors(
+; SSE-NEXT:    [[SHUF:%.*]] = shufflevector <2 x i64> [[V:%.*]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; SSE-NEXT:    [[R:%.*]] = bitcast <4 x i64> [[SHUF]] to <16 x i16>
+; SSE-NEXT:    ret <16 x i16> [[R]]
+;
+; AVX-LABEL: @bitcast_shuf_narrow_element_concat_subvectors(
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[V:%.*]] to <8 x i16>
+; AVX-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:    ret <16 x i16> [[R]]
 ;
   %shuf = shufflevector <2 x i64> %v, <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
   %r = bitcast <4 x i64> %shuf to <16 x i16>
@@ -58,10 +68,15 @@ define <16 x i16> @bitcast_shuf_narrow_element_concat_subvectors(<2 x i64> %v) {
 }
 
 define <16 x i8> @bitcast_shuf_extract_subvector(<8 x i32> %v) {
-; CHECK-LABEL: @bitcast_shuf_extract_subvector(
-; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <8 x i32> [[V:%.*]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8>
-; CHECK-NEXT:    ret <16 x i8> [[R]]
+; SSE-LABEL: @bitcast_shuf_extract_subvector(
+; SSE-NEXT:    [[TMP1:%.*]] = bitcast <8 x i32> [[V:%.*]] to <32 x i8>
+; SSE-NEXT:    [[R:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE-NEXT:    ret <16 x i8> [[R]]
+;
+; AVX-LABEL: @bitcast_shuf_extract_subvector(
+; AVX-NEXT:    [[SHUF:%.*]] = shufflevector <8 x i32> [[V:%.*]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:    [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8>
+; AVX-NEXT:    ret <16 x i8> [[R]]
 ;
   %shuf = shufflevector <8 x i32> %v, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %r = bitcast <4 x i32> %shuf to <16 x i8>

diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle.ll
index e094d62c80a5ed2..1e0a5ec187e5520 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle.ll
@@ -33,13 +33,18 @@ define <4 x float> @bitcast_shuf_same_size(<4 x i32> %v) {
   ret <4 x float> %r
 }
 
-; TODO - Length-changing shuffle
+; Length-changing shuffles
 
 define <16 x i8> @bitcast_shuf_narrow_element_subvector(<2 x i32> %v) {
-; CHECK-LABEL: @bitcast_shuf_narrow_element_subvector(
-; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
-; CHECK-NEXT:    [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8>
-; CHECK-NEXT:    ret <16 x i8> [[R]]
+; SSE-LABEL: @bitcast_shuf_narrow_element_subvector(
+; SSE-NEXT:    [[SHUF:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+; SSE-NEXT:    [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8>
+; SSE-NEXT:    ret <16 x i8> [[R]]
+;
+; AVX-LABEL: @bitcast_shuf_narrow_element_subvector(
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
+; AVX-NEXT:    [[R:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+; AVX-NEXT:    ret <16 x i8> [[R]]
 ;
   %shuf = shufflevector <2 x i32> %v, <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
   %r = bitcast <4 x i32> %shuf to <16 x i8>
@@ -47,10 +52,15 @@ define <16 x i8> @bitcast_shuf_narrow_element_subvector(<2 x i32> %v) {
 }
 
 define <16 x i16> @bitcast_shuf_narrow_element_concat_subvectors(<2 x i64> %v) {
-; CHECK-LABEL: @bitcast_shuf_narrow_element_concat_subvectors(
-; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <2 x i64> [[V:%.*]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT:    [[R:%.*]] = bitcast <4 x i64> [[SHUF]] to <16 x i16>
-; CHECK-NEXT:    ret <16 x i16> [[R]]
+; SSE-LABEL: @bitcast_shuf_narrow_element_concat_subvectors(
+; SSE-NEXT:    [[SHUF:%.*]] = shufflevector <2 x i64> [[V:%.*]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; SSE-NEXT:    [[R:%.*]] = bitcast <4 x i64> [[SHUF]] to <16 x i16>
+; SSE-NEXT:    ret <16 x i16> [[R]]
+;
+; AVX-LABEL: @bitcast_shuf_narrow_element_concat_subvectors(
+; AVX-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[V:%.*]] to <8 x i16>
+; AVX-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:    ret <16 x i16> [[R]]
 ;
   %shuf = shufflevector <2 x i64> %v, <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
   %r = bitcast <4 x i64> %shuf to <16 x i16>
@@ -58,10 +68,15 @@ define <16 x i16> @bitcast_shuf_narrow_element_concat_subvectors(<2 x i64> %v) {
 }
 
 define <16 x i8> @bitcast_shuf_extract_subvector(<8 x i32> %v) {
-; CHECK-LABEL: @bitcast_shuf_extract_subvector(
-; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <8 x i32> [[V:%.*]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8>
-; CHECK-NEXT:    ret <16 x i8> [[R]]
+; SSE-LABEL: @bitcast_shuf_extract_subvector(
+; SSE-NEXT:    [[TMP1:%.*]] = bitcast <8 x i32> [[V:%.*]] to <32 x i8>
+; SSE-NEXT:    [[R:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE-NEXT:    ret <16 x i8> [[R]]
+;
+; AVX-LABEL: @bitcast_shuf_extract_subvector(
+; AVX-NEXT:    [[SHUF:%.*]] = shufflevector <8 x i32> [[V:%.*]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:    [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8>
+; AVX-NEXT:    ret <16 x i8> [[R]]
 ;
   %shuf = shufflevector <8 x i32> %v, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %r = bitcast <4 x i32> %shuf to <16 x i8>


        

