[llvm] 7e054c3 - [VectorCombine] foldShuffleOfCastops - don't restrict to oneuse but compare total costs instead

Mon Jul 8 07:00:15 PDT 2024

Author: Simon Pilgrim
Date: 2024-07-08T14:57:51+01:00
New Revision: 7e054c33d42b0a6bc8f6b168dab688f8e7762ef0

URL: https://github.com/llvm/llvm-project/commit/7e054c33d42b0a6bc8f6b168dab688f8e7762ef0
DIFF: https://github.com/llvm/llvm-project/commit/7e054c33d42b0a6bc8f6b168dab688f8e7762ef0.diff

LOG: [VectorCombine] foldShuffleOfCastops - don't restrict to oneuse but compare total costs instead

Some casts (especially bitcasts but others as well) are incredibly cheap (or free), so don't limit the shuffle(cast(x),cast(y)) -> cast(shuffle(x,y)) to oneuse cases, but instead compare the total before/after costs of possibly repeating some casts.

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/VectorCombine.cpp
    llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index a748e7c4ef7b3..ebef29e8be206 100644

--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1491,8 +1491,7 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
 bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
   Value *V0, *V1;
   ArrayRef<int> OldMask;
-  if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_OneUse(m_Value(V1)),
-                           m_Mask(OldMask))))
+  if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
     return false;
 
   auto *C0 = dyn_cast<CastInst>(V0);
@@ -1551,11 +1550,13 @@ bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
   // Try to replace a castop with a shuffle if the shuffle is not costly.
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
 
-  InstructionCost OldCost =
+  InstructionCost CostC0 =
       TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy,
-                           TTI::CastContextHint::None, CostKind) +
+                           TTI::CastContextHint::None, CostKind);
+  InstructionCost CostC1 =
       TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
                            TTI::CastContextHint::None, CostKind);
+  InstructionCost OldCost = CostC0 + CostC1;
   OldCost +=
       TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, CastDstTy,
                          OldMask, CostKind, 0, nullptr, std::nullopt, &I);
@@ -1564,6 +1565,10 @@ bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
       TargetTransformInfo::SK_PermuteTwoSrc, CastSrcTy, NewMask, CostKind);
   NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy,
                                   TTI::CastContextHint::None, CostKind);
+  if (!C0->hasOneUse())
+    NewCost += CostC0;
+  if (!C1->hasOneUse())
+    NewCost += CostC1;
 
   LLVM_DEBUG(dbgs() << "Found a shuffle feeding two casts: " << I
                     << "\n  OldCost: " << OldCost << " vs NewCost: " << NewCost

diff  --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll
index 3a5d2095e2b93..bc28259e6939b 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll
@@ -224,14 +224,14 @@ define <16 x i16> @concat_bitcast_v4i32_v16i16(<4 x i32> %a0, <4 x i32> %a1) {
   ret <16 x i16> %r
 }
 
-; negative - multiuse
+; multiuse - ensure cost of any duplicated casts are worth it
 
 define <8 x i16> @concat_trunc_v4i32_v8i16_multiuse(<4 x i32> %a0, <4 x i32> %a1, ptr %a2) {
 ; CHECK-LABEL: define <8 x i16> @concat_trunc_v4i32_v8i16_multiuse(
 ; CHECK-SAME: <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]], ptr [[A2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[X0:%.*]] = trunc <4 x i32> [[A0]] to <4 x i16>
-; CHECK-NEXT:    [[X1:%.*]] = trunc <4 x i32> [[A1]] to <4 x i16>
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i16> [[X0]], <4 x i16> [[X1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A0]], <4 x i32> [[A1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[R:%.*]] = trunc <8 x i32> [[TMP1]] to <8 x i16>
 ; CHECK-NEXT:    store <4 x i16> [[X0]], ptr [[A2]], align 8
 ; CHECK-NEXT:    ret <8 x i16> [[R]]
 ;
@@ -242,6 +242,24 @@ define <8 x i16> @concat_trunc_v4i32_v8i16_multiuse(<4 x i32> %a0, <4 x i32> %a1
   ret <8 x i16> %r
 }
 
+; negative - multiuse - ensure cost of any duplicated casts are worth it
+
+define <16 x i8> @concat_trunc_v8i64_v16i8_multiuse(<8 x i64> %a0, <8 x i64> %a1, ptr %a2) {
+; CHECK-LABEL: define <16 x i8> @concat_trunc_v8i64_v16i8_multiuse(
+; CHECK-SAME: <8 x i64> [[A0:%.*]], <8 x i64> [[A1:%.*]], ptr [[A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[X0:%.*]] = trunc <8 x i64> [[A0]] to <8 x i8>
+; CHECK-NEXT:    [[X1:%.*]] = trunc <8 x i64> [[A1]] to <8 x i8>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x i8> [[X0]], <8 x i8> [[X1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 4, i32 15>
+; CHECK-NEXT:    store <8 x i8> [[X0]], ptr [[A2]], align 8
+; CHECK-NEXT:    ret <16 x i8> [[R]]
+;
+  %x0 = trunc <8 x i64> %a0 to <8 x i8>
+  %x1 = trunc <8 x i64> %a1 to <8 x i8>
+  %r = shufflevector <8 x i8> %x0, <8 x i8> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 4, i32 15>
+  store <8 x i8> %x0, ptr %a2
+  ret <16 x i8> %r
+}
+
 ; negative - bitcasts (unscalable higher element count)
 
 define <16 x i16> @revpair_bitcast_v4i32_v16i16(<4 x i32> %a0, <4 x i32> %a1) {