[llvm] 8e31acf - [VectorCombine] Add special handling for truncating shuffles (#70013)

Tue Oct 24 07:03:47 PDT 2023

Author: Nabeel Omer
Date: 2023-10-24T15:03:43+01:00
New Revision: 8e31acf8ca0db5e6160d85e266432f78148d38f5

URL: https://github.com/llvm/llvm-project/commit/8e31acf8ca0db5e6160d85e266432f78148d38f5
DIFF: https://github.com/llvm/llvm-project/commit/8e31acf8ca0db5e6160d85e266432f78148d38f5.diff

LOG: [VectorCombine] Add special handling for truncating shuffles (#70013)

When dealing with a truncating shuffle, we can end up in a situation
where the type passed to getShuffleCost is the type of the result of the
shuffle, and the mask references an element which is out of bounds of
the result vector.

If dealing with truncating shuffles, pass the type of the input vectors
to `getShuffleCost()` in order to avoid an out-of-bounds assertion.

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/VectorCombine.cpp
    llvm/test/Transforms/VectorCombine/X86/reduction-two-vecs-combine.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 16efc3b2336f2a5..51163352b1e6876 100644

--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1472,21 +1472,28 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
       dyn_cast<FixedVectorType>(Shuffle->getOperand(0)->getType());
   if (!ShuffleInputType)
     return false;
-  int NumInputElts = ShuffleInputType->getNumElements();
+  unsigned NumInputElts = ShuffleInputType->getNumElements();
 
   // Find the mask from sorting the lanes into order. This is most likely to
   // become a identity or concat mask. Undef elements are pushed to the end.
   SmallVector<int> ConcatMask;
   Shuffle->getShuffleMask(ConcatMask);
   sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
+  // In the case of a truncating shuffle it's possible for the mask
+  // to have an index greater than the size of the resulting vector.
+  // This requires special handling.
+  bool IsTruncatingShuffle = VecType->getNumElements() < NumInputElts;
   bool UsesSecondVec =
-      any_of(ConcatMask, [&](int M) { return M >= NumInputElts; });
+      any_of(ConcatMask, [&](int M) { return M >= (int)NumInputElts; });
+
+  FixedVectorType *VecTyForCost =
+      (UsesSecondVec && !IsTruncatingShuffle) ? VecType : ShuffleInputType;
   InstructionCost OldCost = TTI.getShuffleCost(
       UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc,
-      UsesSecondVec ? VecType : ShuffleInputType, Shuffle->getShuffleMask());
+      VecTyForCost, Shuffle->getShuffleMask());
   InstructionCost NewCost = TTI.getShuffleCost(
       UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc,
-      UsesSecondVec ? VecType : ShuffleInputType, ConcatMask);
+      VecTyForCost, ConcatMask);
 
   LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
                     << "\n");

diff  --git a/llvm/test/Transforms/VectorCombine/X86/reduction-two-vecs-combine.ll b/llvm/test/Transforms/VectorCombine/X86/reduction-two-vecs-combine.ll
index 45ec3f511ebc84a..a0945ab81b0f72d 100644
--- a/llvm/test/Transforms/VectorCombine/X86/reduction-two-vecs-combine.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/reduction-two-vecs-combine.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt -S --passes=vector-combine -mtriple=x86_64-unknown-linux < %s | FileCheck %s
+; RUN: opt -S --passes=vector-combine -mtriple=x86_64-sie-ps5 < %s | FileCheck %s
 
 define i16 @test_spill_mixed() {
 ; CHECK-LABEL: define i16 @test_spill_mixed() {
@@ -14,4 +15,17 @@ entry:
   ret i16 0
 }
 
+define i16 @crash() {
+; CHECK-LABEL: define i16 @crash() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i32> zeroinitializer, <8 x i32> zeroinitializer, <4 x i32> <i32 1, i32 2, i32 3, i32 9>
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP0]])
+; CHECK-NEXT:    ret i16 0
+;
+entry:
+  %0 = shufflevector <8 x i32> zeroinitializer, <8 x i32> zeroinitializer, <4 x i32> <i32 1, i32 2, i32 3, i32 9>
+  %1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %0)
+  ret i16 0
+}
+
 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)