[llvm] 7f4f237 - [VectorCombine] foldShuffleOfShuffles - add missing arguments to getShuffleCost calls.
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 23 03:53:32 PDT 2024
Author: Simon Pilgrim
Date: 2024-04-23T11:53:08+01:00
New Revision: 7f4f237cd8510b74230d6fd4b5c4610a3d88dd3f
URL: https://github.com/llvm/llvm-project/commit/7f4f237cd8510b74230d6fd4b5c4610a3d88dd3f
DIFF: https://github.com/llvm/llvm-project/commit/7f4f237cd8510b74230d6fd4b5c4610a3d88dd3f.diff
LOG: [VectorCombine] foldShuffleOfShuffles - add missing arguments to getShuffleCost calls.
Ensure the getShuffleCost arguments/instruction args are populated - minor extension to #88743 to help improve shuffle costs for certain corner cases (e.g. shuffles of loads)
Added:
Modified:
llvm/lib/Transforms/Vectorize/VectorCombine.cpp
llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index df761f9b711a10..da03a69708ddfc 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1566,6 +1566,8 @@ bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
m_Mask(OuterMask))))
return false;
+ auto *ShufI0 = dyn_cast<Instruction>(I.getOperand(0));
+ auto *ShufI1 = dyn_cast<Instruction>(I.getOperand(1));
auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(V0->getType());
auto *ShuffleImmTy = dyn_cast<FixedVectorType>(I.getOperand(0)->getType());
@@ -1607,14 +1609,15 @@ bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
InstructionCost OldCost =
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleSrcTy,
- InnerMask0, CostKind) +
+ InnerMask0, CostKind, 0, nullptr, {V0, U0}, ShufI0) +
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleSrcTy,
- InnerMask1, CostKind) +
+ InnerMask1, CostKind, 0, nullptr, {V1, U1}, ShufI1) +
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleImmTy,
- OuterMask, CostKind, 0, nullptr, std::nullopt, &I);
+ OuterMask, CostKind, 0, nullptr, {ShufI0, ShufI1}, &I);
- InstructionCost NewCost = TTI.getShuffleCost(
- TargetTransformInfo::SK_PermuteTwoSrc, ShuffleSrcTy, NewMask, CostKind);
+ InstructionCost NewCost =
+ TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleSrcTy,
+ NewMask, CostKind, 0, nullptr, {V0, V1});
LLVM_DEBUG(dbgs() << "Found a shuffle feeding two shuffles: " << I
<< "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
index 21d9d1cee5d1fa..57df36a3874a9c 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s
; fold to identity
@@ -44,22 +44,17 @@ define <8 x i32> @concat_extract_subvectors_poison(<8 x i32> %x) {
ret <8 x i32> %concat
}
+; broadcast loads are free on AVX (and blends are much cheaper than general 2-operand shuffles)
+
define <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1) {
-; SSE-LABEL: define <4 x double> @blend_broadcasts_v4f64(
-; SSE-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
-; SSE-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
-; SSE-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
-; SSE-NEXT: [[BCST0:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> undef, <4 x i32> zeroinitializer
-; SSE-NEXT: [[BCST1:%.*]] = shufflevector <4 x double> [[LD1]], <4 x double> undef, <4 x i32> zeroinitializer
-; SSE-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[BCST0]], <4 x double> [[BCST1]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
-; SSE-NEXT: ret <4 x double> [[BLEND]]
-;
-; AVX-LABEL: define <4 x double> @blend_broadcasts_v4f64(
-; AVX-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
-; AVX-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
-; AVX-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
-; AVX-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> [[LD1]], <4 x i32> <i32 0, i32 4, i32 4, i32 0>
-; AVX-NEXT: ret <4 x double> [[BLEND]]
+; CHECK-LABEL: define <4 x double> @blend_broadcasts_v4f64(
+; CHECK-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
+; CHECK-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
+; CHECK-NEXT: [[BCST0:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BCST1:%.*]] = shufflevector <4 x double> [[LD1]], <4 x double> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[BCST0]], <4 x double> [[BCST1]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; CHECK-NEXT: ret <4 x double> [[BLEND]]
;
%ld0 = load <4 x double>, ptr %p0, align 32
%ld1 = load <4 x double>, ptr %p1, align 32
More information about the llvm-commits mailing list