[llvm] 39bab1d - [SLP]Check if the operand for removal is the reduction operand, awaiting for the reduction
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 26 14:18:28 PST 2025
Author: Alexey Bataev
Date: 2025-02-26T14:17:11-08:00
New Revision: 39bab1de33333ee3c62b586c4e8d26f8c443bc60
URL: https://github.com/llvm/llvm-project/commit/39bab1de33333ee3c62b586c4e8d26f8c443bc60
DIFF: https://github.com/llvm/llvm-project/commit/39bab1de33333ee3c62b586c4e8d26f8c443bc60.diff
LOG: [SLP]Check if the operand for removal is the reduction operand, awaiting for the reduction
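If an operand of the instruction to be removed is a reduction value that
has not been reduced yet, it has no users at that point, so it may be
mistakenly treated as dead and removed during operand analysis. Skip such
pending reduction values when collecting trivially dead operands.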
Fixes #128736
Added:
llvm/test/Transforms/SLPVectorizer/X86/reduction-with-removed-extracts.ll
Modified:
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index b25b09306aca8..e8c91ebd508ce 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1389,9 +1389,10 @@ class BoUpSLP {
/// Vectorize the tree but with the list of externally used values \p
/// ExternallyUsedValues. Values in this MapVector can be replaced but the
/// generated extractvalue instructions.
- Value *
- vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
- Instruction *ReductionRoot = nullptr);
+ Value *vectorizeTree(
+ const ExtraValueToDebugLocsMap &ExternallyUsedValues,
+ Instruction *ReductionRoot = nullptr,
+ ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});
/// \returns the cost incurred by unwanted spills and fills, caused by
/// holding live values over call sites.
@@ -2849,11 +2850,13 @@ class BoUpSLP {
/// Remove instructions from the parent function and clear the operands of \p
/// DeadVals instructions, marking for deletion trivially dead operands.
template <typename T>
- void removeInstructionsAndOperands(ArrayRef<T *> DeadVals) {
+ void removeInstructionsAndOperands(
+ ArrayRef<T *> DeadVals,
+ ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
SmallVector<WeakTrackingVH> DeadInsts;
for (T *V : DeadVals) {
auto *I = cast<Instruction>(V);
- DeletedInstructions.insert(I);
+ eraseInstruction(I);
}
DenseSet<Value *> Processed;
for (T *V : DeadVals) {
@@ -2915,12 +2918,17 @@ class BoUpSLP {
// loop iteration.
if (auto *OpI = dyn_cast<Instruction>(OpV))
if (!DeletedInstructions.contains(OpI) &&
+ (!OpI->getType()->isVectorTy() ||
+ none_of(VectorValuesAndScales,
+ [&](const std::tuple<Value *, unsigned, bool> &V) {
+ return std::get<0>(V) == OpI;
+ })) &&
isInstructionTriviallyDead(OpI, TLI))
DeadInsts.push_back(OpI);
}
VI->removeFromParent();
- DeletedInstructions.insert(VI);
+ eraseInstruction(VI);
SE->forgetValue(VI);
}
}
@@ -16466,9 +16474,10 @@ Value *BoUpSLP::vectorizeTree() {
return vectorizeTree(ExternallyUsedValues);
}
-Value *
-BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
- Instruction *ReductionRoot) {
+Value *BoUpSLP::vectorizeTree(
+ const ExtraValueToDebugLocsMap &ExternallyUsedValues,
+ Instruction *ReductionRoot,
+ ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
// All blocks must be scheduled before any instructions are inserted.
for (auto &BSIter : BlocksSchedules) {
scheduleBlock(BSIter.second.get());
@@ -17075,7 +17084,7 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
// cache correctness.
// NOTE: removeInstructionAndOperands only marks the instruction for deletion
// - instructions are not deleted until later.
- removeInstructionsAndOperands(ArrayRef(RemovedInsts));
+ removeInstructionsAndOperands(ArrayRef(RemovedInsts), VectorValuesAndScales);
Builder.ClearInsertionPoint();
InstrElementSize.clear();
@@ -20449,8 +20458,8 @@ class HorizontalReduction {
InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
// Vectorize a tree.
- Value *VectorizedRoot =
- V.vectorizeTree(LocalExternallyUsedValues, InsertPt);
+ Value *VectorizedRoot = V.vectorizeTree(
+ LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
// Update TrackedToOrig mapping, since the tracked values might be
// updated.
for (Value *RdxVal : Candidates) {
@@ -20678,7 +20687,7 @@ class HorizontalReduction {
Ignore->replaceAllUsesWith(P);
}
}
- V.removeInstructionsAndOperands(RdxOps);
+ V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
}
} else if (!CheckForReusedReductionOps) {
for (ReductionOpsType &RdxOps : ReductionOps)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-with-removed-extracts.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-with-removed-extracts.ll
new file mode 100644
index 0000000000000..799533824c5aa
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-with-removed-extracts.ll
@@ -0,0 +1,75 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -mcpu=znver2 < %s | FileCheck %s
+
+define i32 @test(i32 %arg) {
+; CHECK-LABEL: define i32 @test(
+; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[BB:.*]]:
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> <i32 0, i32 poison, i32 0, i32 0>, i32 [[ARG]], i32 1
+; CHECK-NEXT: br label %[[BB1:.*]]
+; CHECK: [[BB1]]:
+; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[OP_RDX:%.*]], %[[BB1]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> <i64 0, i64 0, i64 poison, i64 poison>, <2 x i64> zeroinitializer, i64 2)
+; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i64> zeroinitializer, [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = trunc <4 x i64> [[TMP2]] to <4 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = or <4 x i32> zeroinitializer, [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i32> [[TMP0]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = or <4 x i32> [[TMP5]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP9:%.*]] = mul <2 x i32> zeroinitializer, [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> [[TMP7]], i64 0)
+; CHECK-NEXT: [[RDX_OP:%.*]] = mul <4 x i32> [[TMP11]], [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP7]], <4 x i32> [[RDX_OP]], i64 0)
+; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> [[TMP12]])
+; CHECK-NEXT: [[OP_RDX]] = mul i32 0, [[TMP13]]
+; CHECK-NEXT: br label %[[BB1]]
+;
+bb:
+ br label %bb1
+
+bb1:
+ %phi = phi i32 [ 0, %bb ], [ %mul37, %bb1 ]
+ %mul = mul i64 0, 0
+ %trunc = trunc i64 %mul to i32
+ %or = or i32 0, %trunc
+ %or2 = or i32 0, %or
+ %or3 = or i32 %or2, 0
+ %mul4 = mul i32 0, %or3
+ %mul5 = mul i32 %or3, 0
+ %mul6 = mul i32 %mul5, %mul4
+ %mul7 = mul i32 %mul6, %mul4
+ %mul8 = mul i32 %mul7, %or3
+ %mul9 = mul i64 0, 0
+ %trunc10 = trunc i64 %mul9 to i32
+ %or11 = or i32 0, %trunc10
+ %or12 = or i32 %arg, %or11
+ %or13 = or i32 %or12, 0
+ %mul14 = mul i32 %or13, %mul8
+ %mul15 = mul i32 %mul14, 0
+ %mul16 = mul i32 %mul15, 0
+ %mul17 = mul i32 %mul16, %or13
+ %shl = shl i64 0, 0
+ %mul18 = mul i64 %shl, 0
+ %trunc19 = trunc i64 %mul18 to i32
+ %or20 = or i32 0, %trunc19
+ %or21 = or i32 0, %or20
+ %or22 = or i32 %or21, 0
+ %mul23 = mul i32 %or22, %mul17
+ %mul24 = mul i32 %mul23, 0
+ %mul25 = mul i32 %mul24, 0
+ %mul26 = mul i32 %mul25, %or22
+ %shl27 = shl i64 0, 0
+ %mul28 = mul i64 %shl27, 0
+ %trunc29 = trunc i64 %mul28 to i32
+ %or30 = or i32 0, %trunc29
+ %or31 = or i32 0, %or30
+ %or32 = or i32 %or31, 0
+ %mul33 = mul i32 0, %or32
+ %mul34 = mul i32 %or32, %mul26
+ %mul35 = mul i32 %mul34, %mul33
+ %mul36 = mul i32 %mul35, %mul33
+ %mul37 = mul i32 %mul36, %or32
+ br label %bb1
+}
More information about the llvm-commits mailing list