[llvm] [SLP]Add external uses cost for the gathered loads. (PR #99889)
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Fri Jul 26 08:08:58 PDT 2024
https://github.com/alexey-bataev updated https://github.com/llvm/llvm-project/pull/99889
>From f15636340421846dcd1f32f7a6e7397592521bf7 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Mon, 22 Jul 2024 15:20:26 +0000
Subject: [PATCH] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20initia?=
=?UTF-8?q?l=20version?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Created using spr 1.3.5
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 6 ++++
.../X86/reduced-gathered-vectorized.ll | 22 ++++++-------
.../vec_list_bias_external_insert_shuffled.ll | 32 ++++++++++---------
3 files changed, 32 insertions(+), 28 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 667c4eb311c22..f3194fe268e21 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -8363,6 +8363,12 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
: TTI.getStridedMemoryOpCost(
Instruction::Load, LoadTy, LI->getPointerOperand(),
/*VariableMask=*/false, Alignment, CostKind, LI);
+ // Add external uses costs.
+ for (auto [Idx, V] : enumerate(VL.slice(
+ P.first, std::min<unsigned>(VL.size() - P.first, VF))))
+ if (!R.areAllUsersVectorized(cast<Instruction>(V)))
+ GatherCost += TTI.getVectorInstrCost(Instruction::ExtractElement,
+ LoadTy, CostKind, Idx);
// Estimate GEP cost.
SmallVector<Value *> PointerOps(VF);
for (auto [I, V] : enumerate(VL.slice(P.first, VF)))
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
index 4dea52357e04f..31f0e065cf77d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-gathered-vectorized.ll
@@ -6,23 +6,19 @@ define i16 @test() {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[A:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 5
; CHECK-NEXT: [[A1:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 6
-; CHECK-NEXT: [[A2:%.*]] = getelementptr [1000 x i64], ptr null, i64 0, i64 7
; CHECK-NEXT: br label [[WHILE:%.*]]
; CHECK: while:
-; CHECK-NEXT: [[PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX5:%.*]], [[WHILE]] ]
+; CHECK-NEXT: [[PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX26:%.*]], [[WHILE]] ]
; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr null, align 8
; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[A1]], align 16
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[A2]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr null, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, ptr [[A]], align 8
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i64> [[TMP5]], i64 [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i64> [[TMP6]], i64 [[TMP1]], i32 3
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i64> [[TMP7]], <8 x i64> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> zeroinitializer, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 4, i32 5, i32 8, i32 8>
-; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> [[TMP10]])
-; CHECK-NEXT: [[OP_RDX5]] = xor i64 [[TMP3]], [[TMP11]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr null, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr [[A]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> [[TMP4]])
+; CHECK-NEXT: [[OP_RDX:%.*]] = xor i64 0, [[TMP2]]
+; CHECK-NEXT: [[OP_RDX24:%.*]] = xor i64 [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[OP_RDX25:%.*]] = xor i64 [[OP_RDX]], [[OP_RDX24]]
+; CHECK-NEXT: [[OP_RDX26]] = xor i64 [[OP_RDX25]], [[TMP5]]
; CHECK-NEXT: br label [[WHILE]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll
index 8f1d7a11e1509..69ecf1852aedd 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll
@@ -7,9 +7,11 @@ define void @test(ptr nocapture %t2) {
; CHECK-NEXT: [[T4:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 7
; CHECK-NEXT: [[T5:%.*]] = load i32, ptr [[T4]], align 4
; CHECK-NEXT: [[T8:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 1
+; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4
; CHECK-NEXT: [[T10:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 6
; CHECK-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4
+; CHECK-NEXT: [[T14:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 2
+; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4
; CHECK-NEXT: [[T16:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 5
; CHECK-NEXT: [[T17:%.*]] = load i32, ptr [[T16]], align 4
; CHECK-NEXT: [[T20:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 3
@@ -19,11 +21,10 @@ define void @test(ptr nocapture %t2) {
; CHECK-NEXT: [[T24:%.*]] = add nsw i32 [[T23]], [[T21]]
; CHECK-NEXT: [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]]
; CHECK-NEXT: [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]]
-; CHECK-NEXT: [[T9:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; CHECK-NEXT: [[T15:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]]
; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433
+; CHECK-NEXT: [[T32:%.*]] = mul nsw i32 [[T27]], 6270
; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137
; CHECK-NEXT: [[T37:%.*]] = add nsw i32 [[T25]], [[T11]]
; CHECK-NEXT: [[T38:%.*]] = add nsw i32 [[T17]], [[T5]]
@@ -33,19 +34,20 @@ define void @test(ptr nocapture %t2) {
; CHECK-NEXT: [[T42:%.*]] = mul nsw i32 [[T17]], 16819
; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069
; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[T27]], i32 2
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[T47]], i32 3
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> <i32 poison, i32 poison, i32 6270, i32 poison>, <4 x i32> <i32 1, i32 0, i32 6, i32 poison>
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T40]], i32 3
-; CHECK-NEXT: [[TMP9:%.*]] = add nsw <4 x i32> [[TMP6]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = mul nsw <4 x i32> [[TMP6]], [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
-; CHECK-NEXT: [[T50:%.*]] = add nsw i32 [[T40]], [[T48]]
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 poison, i32 poison, i32 3>
-; CHECK-NEXT: [[T701:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[T50]], i32 5
+; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]]
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T15]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T40]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T9]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[T48]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> <i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[T67:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T32]], i32 2
+; CHECK-NEXT: [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[T701:%.*]] = shufflevector <8 x i32> [[T68]], <8 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T701]], i32 [[T34]], i32 6
-; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T71]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7
+; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T72]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT: store <8 x i32> [[T76]], ptr [[T2]], align 4
; CHECK-NEXT: ret void
;
More information about the llvm-commits
mailing list