[llvm] db5f7dc - Revert "[SLP]Support LShr as base for copyable elements"
Alex Bradbury via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 14 14:24:40 PDT 2025
Author: Alex Bradbury
Date: 2025-08-14T22:18:24+01:00
New Revision: db5f7dc374fdd70d39857d3402d42878139cbb4e
URL: https://github.com/llvm/llvm-project/commit/db5f7dc374fdd70d39857d3402d42878139cbb4e
DIFF: https://github.com/llvm/llvm-project/commit/db5f7dc374fdd70d39857d3402d42878139cbb4e.diff
LOG: Revert "[SLP]Support LShr as base for copyable elements"
This reverts commit ca4ebf95172d24f8c47655709b2c9eb85bda5cb2.
Causes compile-time crashes for some inputs with RVV zvl512b/zvl1024b
configurations. See here for a minimal reproducer:
https://github.com/llvm/llvm-project/pull/153393#issuecomment-3189898813
Added:
Modified:
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll
llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7362d5b0b5865..c35a7552b4058 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -10564,12 +10564,6 @@ class InstructionsCompatibilityAnalysis {
unsigned MainOpcode = 0;
Instruction *MainOp = nullptr;
- /// Checks if the opcode is supported as the main opcode for copyable
- /// elements.
- static bool isSupportedOpcode(const unsigned Opcode) {
- return Opcode == Instruction::Add || Opcode == Instruction::LShr;
- }
-
/// Identifies the best candidate value, which represents main opcode
/// operation.
/// Currently the best candidate is the Add instruction with the parent
@@ -10577,28 +10571,28 @@ class InstructionsCompatibilityAnalysis {
void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
BasicBlock *Parent = nullptr;
// Checks if the instruction has supported opcode.
- auto IsSupportedInstruction = [&](Instruction *I) {
- return I && isSupportedOpcode(I->getOpcode()) &&
+ auto IsSupportedOpcode = [&](Instruction *I) {
+ return I && I->getOpcode() == Instruction::Add &&
(!doesNotNeedToBeScheduled(I) || !R.isVectorized(I));
};
// Exclude operands instructions immediately to improve compile time, it
// will be unable to schedule anyway.
SmallDenseSet<Value *, 8> Operands;
- SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
for (Value *V : VL) {
auto *I = dyn_cast<Instruction>(V);
if (!I)
continue;
if (!DT.isReachableFromEntry(I->getParent()))
continue;
- if (Candidates.empty()) {
- Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
+ if (!MainOp) {
+ MainOp = I;
Parent = I->getParent();
Operands.insert(I->op_begin(), I->op_end());
continue;
}
if (Parent == I->getParent()) {
- Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
+ if (!IsSupportedOpcode(MainOp) && !Operands.contains(I))
+ MainOp = I;
Operands.insert(I->op_begin(), I->op_end());
continue;
}
@@ -10610,35 +10604,24 @@ class InstructionsCompatibilityAnalysis {
(NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
"Different nodes should have
diff erent DFS numbers");
if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
- Candidates.clear();
- Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
+ MainOp = I;
Parent = I->getParent();
Operands.clear();
Operands.insert(I->op_begin(), I->op_end());
}
}
- unsigned BestOpcodeNum = 0;
- MainOp = nullptr;
- for (const auto &P : Candidates) {
- if (P.second.size() < BestOpcodeNum)
- continue;
- for (Instruction *I : P.second) {
- if (IsSupportedInstruction(I) && !Operands.contains(I)) {
- MainOp = I;
- BestOpcodeNum = P.second.size();
- break;
- }
- }
+ if (!IsSupportedOpcode(MainOp) || Operands.contains(MainOp)) {
+ MainOp = nullptr;
+ return;
}
- if (MainOp)
- MainOpcode = MainOp->getOpcode();
+ MainOpcode = MainOp->getOpcode();
}
/// Returns the idempotent value for the \p MainOp with the detected \p
/// MainOpcode. For Add, returns 0. For Or, it should choose between false and
/// the operand itself, since V or V == V.
Value *selectBestIdempotentValue() const {
- assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
+ assert(MainOpcode == Instruction::Add && "Unsupported opcode");
return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(),
!MainOp->isCommutative());
}
@@ -10651,8 +10634,13 @@ class InstructionsCompatibilityAnalysis {
return {V, V};
if (!S.isCopyableElement(V))
return convertTo(cast<Instruction>(V), S).second;
- assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
- return {V, selectBestIdempotentValue()};
+ switch (MainOpcode) {
+ case Instruction::Add:
+ return {V, selectBestIdempotentValue()};
+ default:
+ break;
+ }
+ llvm_unreachable("Unsupported opcode");
}
/// Builds operands for the original instructions.
@@ -10865,21 +10853,6 @@ class InstructionsCompatibilityAnalysis {
}
if (!Res)
return InstructionsState::invalid();
- constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
- InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind);
- InstructionCost VectorCost;
- FixedVectorType *VecTy =
- getWidenedType(S.getMainOp()->getType(), VL.size());
- switch (MainOpcode) {
- case Instruction::Add:
- case Instruction::LShr:
- VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
- break;
- default:
- llvm_unreachable("Unexpected instruction.");
- }
- if (VectorCost > ScalarCost)
- return InstructionsState::invalid();
return S;
}
assert(Operands.size() == 2 && "Unexpected number of operands!");
@@ -21117,7 +21090,6 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
const auto *It = find(Op, CD->getInst());
assert(It != Op.end() && "Lane not set");
- SmallPtrSet<Instruction *, 4> Visited;
do {
int Lane = std::distance(Op.begin(), It);
assert(Lane >= 0 && "Lane not set");
@@ -21139,15 +21111,13 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
(InsertInReadyList && UseSD->isReady()))
WorkList.push_back(UseSD);
}
- } else if (Visited.insert(In).second) {
- if (ScheduleData *UseSD = getScheduleData(In)) {
- CD->incDependencies();
- if (!UseSD->isScheduled())
- CD->incrementUnscheduledDeps(1);
- if (!UseSD->hasValidDependencies() ||
- (InsertInReadyList && UseSD->isReady()))
- WorkList.push_back(UseSD);
- }
+ } else if (ScheduleData *UseSD = getScheduleData(In)) {
+ CD->incDependencies();
+ if (!UseSD->isScheduled())
+ CD->incrementUnscheduledDeps(1);
+ if (!UseSD->hasValidDependencies() ||
+ (InsertInReadyList && UseSD->isReady()))
+ WorkList.push_back(UseSD);
}
It = find(make_range(std::next(It), Op.end()), CD->getInst());
} while (It != Op.end());
@@ -21905,11 +21875,9 @@ bool BoUpSLP::collectValuesToDemote(
return all_of(E.Scalars, [&](Value *V) {
if (isa<PoisonValue>(V))
return true;
- APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
- if (E.isCopyableElement(V))
- return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
auto *I = cast<Instruction>(V);
KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
+ APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
return AmtKnownBits.getMaxValue().ult(BitWidth) &&
MaskedValueIsZero(I->getOperand(0), ShiftedBits,
SimplifyQuery(*DL));
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll
index 6d961fc3378b4..8d44d03e0e5cc 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll
@@ -8,8 +8,11 @@ define i32 @test(ptr %c) {
; CHECK-NEXT: [[BITLEN:%.*]] = getelementptr i8, ptr [[C]], i64 136
; CHECK-NEXT: [[INCDEC_PTR_3_1:%.*]] = getelementptr i8, ptr [[C]], i64 115
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[BITLEN]], align 8
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 0, i32 0, i32 0>
-; CHECK-NEXT: [[TMP5:%.*]] = lshr <8 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <6 x i32> <i32 1, i32 1, i32 1, i32 1, i32 0, i32 0>
+; CHECK-NEXT: [[TMP2:%.*]] = lshr <6 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 0, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <6 x i64> [[TMP2]], <6 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 4, i32 5>
; CHECK-NEXT: [[TMP6:%.*]] = trunc <8 x i64> [[TMP5]] to <8 x i8>
; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr [[INCDEC_PTR_3_1]], align 1
; CHECK-NEXT: ret i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
index c02ef8388b066..4f94784a24dd4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
@@ -101,8 +101,10 @@ define <4 x float> @PR16739_byref_alt(ptr nocapture readonly dereferenceable(16)
define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) %x) {
; CHECK-LABEL: @PR16739_byval(
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[X:%.*]], align 16
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
-; CHECK-NEXT: [[TMP3:%.*]] = lshr <4 x i64> [[TMP2]], <i64 0, i64 32, i64 0, i64 0>
+; CHECK-NEXT: [[T1:%.*]] = load i64, ptr [[X]], align 16
+; CHECK-NEXT: [[T8:%.*]] = lshr i64 [[T1]], 32
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[T8]], i32 1
; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32>
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float>
; CHECK-NEXT: ret <4 x float> [[TMP5]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
index 0545e5403f594..700e3ed9effc4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
@@ -101,8 +101,10 @@ define <4 x float> @PR16739_byref_alt(ptr nocapture readonly dereferenceable(16)
define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) %x) {
; CHECK-LABEL: @PR16739_byval(
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[X:%.*]], align 16
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
-; CHECK-NEXT: [[TMP3:%.*]] = lshr <4 x i64> [[TMP2]], <i64 0, i64 32, i64 0, i64 0>
+; CHECK-NEXT: [[T1:%.*]] = load i64, ptr [[X]], align 16
+; CHECK-NEXT: [[T8:%.*]] = lshr i64 [[T1]], 32
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[T8]], i32 1
; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32>
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float>
; CHECK-NEXT: ret <4 x float> [[TMP5]]
More information about the llvm-commits
mailing list