[llvm] b9702bb - [LV] Consider insts feeding interleave group pointers free.
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 19 09:07:13 PDT 2024
Author: Florian Hahn
Date: 2024-06-19T17:06:52+01:00
New Revision: b9702bb12f19b7bdb7f4420f94ee603d29d1bf1b
URL: https://github.com/llvm/llvm-project/commit/b9702bb12f19b7bdb7f4420f94ee603d29d1bf1b
DIFF: https://github.com/llvm/llvm-project/commit/b9702bb12f19b7bdb7f4420f94ee603d29d1bf1b.diff
LOG: [LV] Consider insts feeding interleave group pointers free.
For interleave groups, we only generate a pointer for the start of the
interleave group (the instruction at the insert position). The other
addresses for other members are alreayd considered free, but so are
their operands, if they are only used in address computations for
other interleave group members.
Added:
Modified:
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
llvm/test/Transforms/LoopVectorize/X86/pr47437.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 408083765ccb1..9571cfe358bf3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7048,6 +7048,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
// Ignore ephemeral values.
CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
+ SmallSetVector<Value *, 4> DeadInterleavePointerOps;
for (BasicBlock *BB : TheLoop->blocks())
for (Instruction &I : *BB) {
// Find all stores to invariant variables. Since they are going to sink
@@ -7058,25 +7059,32 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
ValuesToIgnore.insert(&I);
// For interleave groups, we only create a pointer for the start of the
- // interleave group. Mark single-use ops feeding interleave group mem ops
- // as free when vectorizing, expect the insert-pos memory op.
+ // interleave group. Queue up addresses of group members except the insert
+ // position for further processing.
if (isAccessInterleaved(&I)) {
auto *Group = getInterleavedAccessGroup(&I);
if (Group->getInsertPos() == &I)
continue;
Value *PointerOp = getLoadStorePointerOperand(&I);
- SmallSetVector<Value *, 4> Worklist;
- Worklist.insert(PointerOp);
- for (unsigned I = 0; I != Worklist.size(); ++I) {
- auto *Op = dyn_cast<Instruction>(Worklist[I]);
- if (!Op || !TheLoop->contains(Op) || !Op->hasOneUse())
- continue;
- VecValuesToIgnore.insert(Op);
- Worklist.insert(Op->op_begin(), Op->op_end());
- }
+ DeadInterleavePointerOps.insert(PointerOp);
}
}
+ // Mark ops feeding interleave group members as free, if they are only used
+ // by other dead computations.
+ for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
+ auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]);
+ if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) {
+ Instruction *UI = cast<Instruction>(U);
+ return !VecValuesToIgnore.contains(U) &&
+ (!isAccessInterleaved(UI) ||
+ getInterleavedAccessGroup(UI)->getInsertPos() == UI);
+ }))
+ continue;
+ VecValuesToIgnore.insert(Op);
+ DeadInterleavePointerOps.insert(Op->op_begin(), Op->op_end());
+ }
+
// Ignore type-promoting instructions we identified during reduction
// detection.
for (const auto &Reduction : Legal->getReductionVars()) {
diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
index fe339cb393f3d..0091ebd1ca773 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
@@ -86,29 +86,29 @@ define void @test_free_instructions_feeding_geps_for_interleave_groups(ptr noali
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP39:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP40:%.*]] = load float, ptr [[P_INVAR]], align 4
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP40]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[TMP40]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP41:%.*]] = shl i64 [[TMP39]], 2
; CHECK-NEXT: [[TMP42:%.*]] = load float, ptr [[P_INVAR]], align 4
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT33:%.*]] = insertelement <4 x float> poison, float [[TMP42]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT34:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT33]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT27:%.*]] = insertelement <2 x float> poison, float [[TMP42]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT28:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT27]], <2 x float> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP43:%.*]] = or disjoint i64 [[TMP41]], 3
; CHECK-NEXT: [[TMP44:%.*]] = getelementptr float, ptr [[DST_1]], i64 [[TMP43]]
; CHECK-NEXT: [[TMP45:%.*]] = getelementptr float, ptr [[TMP44]], i32 -3
-; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLAT]], <4 x float> [[BROADCAST_SPLAT34]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <8 x float> [[TMP46]], <8 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x float> [[TMP47]], <16 x float> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT: store <16 x float> [[INTERLEAVED_VEC]], ptr [[TMP45]], align 4
+; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLAT]], <2 x float> [[BROADCAST_SPLAT28]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <4 x float> [[TMP46]], <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x float> [[TMP47]], <8 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: store <8 x float> [[INTERLEAVED_VEC]], ptr [[TMP45]], align 4
; CHECK-NEXT: [[TMP48:%.*]] = load float, ptr [[P_INVAR]], align 4
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT35:%.*]] = insertelement <4 x float> poison, float [[TMP48]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT36:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT35]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT29:%.*]] = insertelement <2 x float> poison, float [[TMP48]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT30:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT29]], <2 x float> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP49:%.*]] = getelementptr float, ptr [[DST_2]], i64 [[TMP43]]
; CHECK-NEXT: [[TMP50:%.*]] = getelementptr float, ptr [[TMP49]], i32 -3
+; CHECK-NEXT: [[BROADCAST_SPLAT36:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLAT30]], <2 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLAT36]], <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <8 x float> [[TMP51]], <8 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[INTERLEAVED_VEC37:%.*]] = shufflevector <16 x float> [[TMP52]], <16 x float> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT: store <16 x float> [[INTERLEAVED_VEC37]], ptr [[TMP50]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[INTERLEAVED_VEC31:%.*]] = shufflevector <8 x float> [[TMP51]], <8 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: store <8 x float> [[INTERLEAVED_VEC31]], ptr [[TMP50]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP53:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
; CHECK-NEXT: br i1 [[TMP53]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll
index 97e1c1c4362fc..7cbf0ab025206 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll
@@ -3,7 +3,7 @@
; RUN: opt < %s -S -passes=loop-vectorize -mtriple=x86_64-- -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41
; RUN: opt < %s -S -passes=loop-vectorize -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefix=AVX1
; RUN: opt < %s -S -passes=loop-vectorize -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
-; RUN: opt < %s -S -passes=loop-vectorize -mtriple=x86_64-- -mcpu=slm | FileCheck %s --check-prefix=SSE2
+; RUN: opt < %s -S -passes=loop-vectorize -mtriple=x86_64-- -mcpu=slm | FileCheck %s --check-prefix=SSE41
define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readonly %s1, ptr noalias nocapture readonly %s2, i32 %n) {
; SSE2-LABEL: @test_muladd(
More information about the llvm-commits
mailing list