[llvm] 5d2b3de - [SLP] Avoid std::stable_sort(properlyDominates()).
Harald van Dijk via llvm-commits
llvm-commits at lists.llvm.org
Thu Jun 3 09:52:10 PDT 2021
Author: Harald van Dijk
Date: 2021-06-03T17:51:52+01:00
New Revision: 5d2b3de284f46e4c1e60fc5f266b085f54391566
URL: https://github.com/llvm/llvm-project/commit/5d2b3de284f46e4c1e60fc5f266b085f54391566
DIFF: https://github.com/llvm/llvm-project/commit/5d2b3de284f46e4c1e60fc5f266b085f54391566.diff
LOG: [SLP] Avoid std::stable_sort(properlyDominates()).
As noticed by NAKAMURA Takumi back in 2017, we cannot use
properlyDominates for std::stable_sort as properlyDominates only
partially orders blocks. That is, for blocks A, B, C, D, where A
dominates B and C dominates D, we have A == C, B == C, but A < B. This
is not a valid comparison function for std::stable_sort and causes
different results between libstdc++ and libc++. This change uses DFS
numbering to give deterministic results for all reachable blocks.
Unreachable blocks are ignored already, so do not need special
consideration.
Reviewed By: RKSimon
Differential Revision: https://reviews.llvm.org/D103441
Added:
llvm/test/Transforms/SLPVectorizer/X86/ordering-bug.ll
Modified:
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 90f76c98e8aae..58016f03bcc24 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4319,8 +4319,16 @@ InstructionCost BoUpSLP::getSpillCost() const {
continue;
OrderedScalars.push_back(Inst);
}
- llvm::stable_sort(OrderedScalars, [this](Instruction *A, Instruction *B) {
- return DT->dominates(B, A);
+ llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
+ auto *NodeA = DT->getNode(A->getParent());
+ auto *NodeB = DT->getNode(B->getParent());
+ assert(NodeA && "Should only process reachable instructions");
+ assert(NodeB && "Should only process reachable instructions");
+ assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
+            "Different nodes should have different DFS numbers");
+ if (NodeA != NodeB)
+ return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
+ return B->comesBefore(A);
});
for (Instruction *Inst : OrderedScalars) {
@@ -5711,10 +5719,11 @@ void BoUpSLP::optimizeGatherSequence() {
// Sort blocks by domination. This ensures we visit a block after all blocks
// dominating it are visited.
- llvm::stable_sort(CSEWorkList,
- [this](const DomTreeNode *A, const DomTreeNode *B) {
- return DT->properlyDominates(A, B);
- });
+ llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
+ assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
+           "Different nodes should have different DFS numbers");
+ return A->getDFSNumIn() < B->getDFSNumIn();
+ });
// Perform O(N^2) search over the gather sequences and merge identical
// instructions. TODO: We can further optimize this scan if we split the
@@ -6614,6 +6623,9 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
// A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
// delete instructions.
+ // Update DFS numbers now so that we can use them for ordering.
+ DT->updateDFSNumbers();
+
// Scan the blocks in the function in post order.
for (auto BB : post_order(&F.getEntryBlock())) {
collectSeedInstructions(BB);
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/ordering-bug.ll b/llvm/test/Transforms/SLPVectorizer/X86/ordering-bug.ll
new file mode 100644
index 0000000000000..520421a9784b2
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/ordering-bug.ll
@@ -0,0 +1,73 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=i386-linux-gnu -slp-vectorizer -S %s | FileCheck %s
+
+%struct.a = type { [2 x i64] }
+
+@a = external global %struct.a
+@b = external global %struct.a
+@c = external global %struct.a
+
+define void @f(i1 %x) #0 {
+; CHECK-LABEL: @f(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, <2 x i64>* bitcast (%struct.a* @a to <2 x i64>*), align 8
+; CHECK-NEXT: br i1 [[X:%.*]], label [[WHILE_BODY_LR_PH:%.*]], label [[WHILE_END:%.*]]
+; CHECK: while.body.lr.ph:
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
+; CHECK-NEXT: [[ICMP_A1:%.*]] = icmp eq i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (%struct.a* @b to <2 x i64>*), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i1> poison, i1 [[ICMP_A1]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i1> [[TMP3]], i1 [[ICMP_A1]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> [[TMP4]], <2 x i64> [[TMP2]], <2 x i64> [[TMP0]]
+; CHECK-NEXT: br label [[WHILE_END]]
+; CHECK: while.end:
+; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x i64> [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP5]], [[WHILE_BODY_LR_PH]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (%struct.a* @c to <2 x i64>*), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
+; CHECK-NEXT: [[ICMP_D0:%.*]] = icmp eq i64 [[TMP8]], 0
+; CHECK-NEXT: br i1 [[ICMP_D0]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[AND0_TMP:%.*]] = and i64 [[TMP8]], 8
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> poison, i64 [[AND0_TMP]], i32 0
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> [[TMP10]], i64 [[TMP9]], i32 1
+; CHECK-NEXT: [[TMP12:%.*]] = and <2 x i64> [[TMP11]], [[TMP7]]
+; CHECK-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (%struct.a* @a to <2 x i64>*), align 8
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ %a0 = load i64, i64* getelementptr inbounds (%struct.a, %struct.a* @a, i32 0, i32 0, i32 0), align 8
+ %a1 = load i64, i64* getelementptr inbounds (%struct.a, %struct.a* @a, i32 0, i32 0, i32 1), align 8
+ br i1 %x, label %while.body.lr.ph, label %while.end
+
+while.body.lr.ph:
+ %icmp.a1 = icmp eq i64 %a1, 0
+ %b0 = load i64, i64* getelementptr inbounds (%struct.a, %struct.a* @b, i32 0, i32 0, i32 0), align 8
+ %b1 = load i64, i64* getelementptr inbounds (%struct.a, %struct.a* @b, i32 0, i32 0, i32 1), align 8
+ %c0 = select i1 %icmp.a1, i64 %b0, i64 %a0
+ %c1 = select i1 %icmp.a1, i64 %b1, i64 %a1
+ br label %while.end
+
+while.end:
+ %d0 = phi i64 [ %a0, %entry ], [ %c0, %while.body.lr.ph ]
+ %d1 = phi i64 [ %a1, %entry ], [ %c1, %while.body.lr.ph ]
+ %e0 = load i64, i64* getelementptr inbounds (%struct.a, %struct.a* @c, i32 0, i32 0, i32 0), align 8
+ %e1 = load i64, i64* getelementptr inbounds (%struct.a, %struct.a* @c, i32 0, i32 0, i32 1), align 8
+ %icmp.d0 = icmp eq i64 %d0, 0
+ br i1 %icmp.d0, label %if.end, label %if.then
+
+if.then:
+ %and0.tmp = and i64 %d0, 8
+ %and0 = and i64 %and0.tmp, %e0
+ %and1 = and i64 %e1, %d1
+ store i64 %and0, i64* getelementptr inbounds (%struct.a, %struct.a* @a, i32 0, i32 0, i32 0), align 8
+ store i64 %and1, i64* getelementptr inbounds (%struct.a, %struct.a* @a, i32 0, i32 0, i32 1), align 8
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+attributes #0 = { "target-features"="+sse2" }
More information about the llvm-commits
mailing list