[llvm] f171149 - [SimplifyCFG] Speculate a store preceded by a local non-escaping load
Momchil Velikov via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 5 07:54:48 PDT 2021
Author: Momchil Velikov
Date: 2021-08-05T15:54:42+01:00
New Revision: f171149e0d541ca7da7af5fe59bd6d9a77267d24
URL: https://github.com/llvm/llvm-project/commit/f171149e0d541ca7da7af5fe59bd6d9a77267d24
DIFF: https://github.com/llvm/llvm-project/commit/f171149e0d541ca7da7af5fe59bd6d9a77267d24.diff
LOG: [SimplifyCFG] Speculate a store preceded by a local non-escaping load
In SimplifyCFG we may simplify the CFG by speculatively executing
certain stores when they are preceded by a store to the same
location. This patch allows such speculation also when the store is
similarly preceded by a load from that location.
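For illustration, a C++ sketch of the pattern the first test below
(@load_before_store_noescape) exercises; the function name and values
are hypothetical:

    int f(long i, int b) {
      int a[2] = {0, 1};    // local array; its address never escapes
      if (a[i] < b)         // the load of a[i] precedes the store
        a[i] = b;           // store only on the taken path
      return a[0] + a[1];
    }

    // After the patch, SimplifyCFG can flatten the branch to roughly:
    //   int v = a[i];
    //   a[i] = v < b ? b : v;  // speculated, unconditional store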
For this transformation to be correct, we need to ensure that the
memory location is writable and that the store, once moved out of the
condition, does not introduce a data race.
Local objects (created by an `alloca` instruction) are always
writable, so once we are past a read from a location it is valid to
also write to that same location.
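By contrast, seeing only a load does not establish writability for
arbitrary memory. A hypothetical example where speculating the guarded
store could fault:

    extern const int lut[16];  // may be placed in a read-only section

    int g(unsigned i, bool c) {
      int v = lut[i];          // the load itself is fine
      if (c)                   // if c is never true, the program is
        const_cast<int &>(lut[i]) = 0;  // correct as written; hoisting
                                        // this store past the branch
                                        // could trap
      return v;
    }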
Seeing just a load does not guarantee the absence of a data race
(unlike seeing a store): the load may still be part of a race, just
without itself causing undefined behaviour
(cf. https://llvm.org/docs/Atomics.html#optimization-outside-atomic).
In the original program, a data race might have been prevented by the
condition, but once we move the store outside the condition, we must
be sure a data race wasn't possible anyway, no matter what the
condition evaluates to.
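A hypothetical illustration of that point, where another thread reads
`x` only while `flag` is false:

    extern bool flag;
    extern int x;        // shared; read by another thread only while
                         // flag is false

    void original() {
      if (flag)
        x = 42;          // race-free: no write happens when flag is false
    }

    void hoisted() {     // what unrestricted speculation would produce
      int v = x;
      x = flag ? 42 : v; // stores even when flag is false, introducing
                         // a data race with the concurrent reader
    }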
One way to be sure that a local object is never concurrently read or
written is to check that its address never escapes the function.
Hence this transformation is restricted to local, non-escaping
objects.
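The second test below (@load_before_store_escape) covers the escaping
case; in C++ terms, roughly:

    void fork_some_threads(int (*a)[2]);  // unknown callee; may hand
                                          // the address to other threads

    int f(long i, int b) {
      int a[2] = {0, 1};
      fork_some_threads(&a);  // the address of `a` escapes here
      if (a[i] < b)
        a[i] = b;             // must stay guarded: another thread may
                              // be reading a[i] concurrently
      return a[0] + a[1];
    }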
Reviewed By: nikic, lebedev.ri
Differential Revision: https://reviews.llvm.org/D107281
Added:
Modified:
llvm/lib/Transforms/Utils/SimplifyCFG.cpp
llvm/test/Transforms/SimplifyCFG/speculate-store.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 0e392849150e5..a72af2bbe0b88 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -25,6 +25,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/GuardUtils.h"
@@ -2250,6 +2251,23 @@ static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB,
return SI->getValueOperand();
return nullptr; // Unknown store.
}
+
+ if (auto *LI = dyn_cast<LoadInst>(&CurI)) {
+ if (LI->getPointerOperand() == StorePtr && LI->getType() == StoreTy &&
+ LI->isSimple()) {
+ // Local objects (created by an `alloca` instruction) are always
+ // writable, so once we are past a read from a location it is valid to
+ // also write to that same location.
+ // If the address of the local object never escapes the function, that
+ // means it's never concurrently read or written, hence moving the store
+ // from under the condition will not introduce a data race.
+ auto *AI = dyn_cast<AllocaInst>(getUnderlyingObject(StorePtr));
+ if (AI && !PointerMayBeCaptured(AI, /*ReturnCaptures=*/false, /*StoreCaptures=*/true))
+ // Found a previous load, return it.
+ return LI;
+ }
+ // The load didn't work out, but we may still find a store.
+ }
}
return nullptr;
diff --git a/llvm/test/Transforms/SimplifyCFG/speculate-store.ll b/llvm/test/Transforms/SimplifyCFG/speculate-store.ll
index 8ceba7df8cbb5..0e447866eff65 100644
--- a/llvm/test/Transforms/SimplifyCFG/speculate-store.ll
+++ b/llvm/test/Transforms/SimplifyCFG/speculate-store.ll
@@ -175,6 +175,145 @@ ret.end:
ret void
}
+;; Speculate a store, preceded by a local, non-escaping load
+define i32 @load_before_store_noescape(i64 %i, i32 %b) {
+; CHECK-LABEL: @load_before_store_noescape(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A:%.*]] = alloca [2 x i32], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast [2 x i32]* [[A]] to i64*
+; CHECK-NEXT: store i64 4294967296, i64* [[TMP0]], align 8
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[A]], i64 0, i64 [[I:%.*]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], [[B:%.*]]
+; CHECK-NEXT: [[SPEC_STORE_SELECT:%.*]] = select i1 [[CMP]], i32 [[B]], i32 [[TMP1]]
+; CHECK-NEXT: store i32 [[SPEC_STORE_SELECT]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[A]], i64 0, i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[A]], i64 0, i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT: ret i32 [[ADD]]
+;
+entry:
+ %a = alloca [2 x i32], align 8
+ %0 = bitcast [2 x i32]* %a to i64*
+ store i64 4294967296, i64* %0, align 8
+ %arrayidx = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 %i
+ %1 = load i32, i32* %arrayidx, align 4
+ %cmp = icmp slt i32 %1, %b
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ store i32 %b, i32* %arrayidx, align 4
+ br label %if.end
+
+if.end:
+ %arrayidx1 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 0
+ %2 = load i32, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 1
+ %3 = load i32, i32* %arrayidx2, align 4
+ %add = add nsw i32 %2, %3
+ ret i32 %add
+}
+
+;; Don't speculate a store, preceded by a local, escaping load
+define i32 @load_before_store_escape(i64 %i, i32 %b) {
+; CHECK-LABEL: @load_before_store_escape(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A:%.*]] = alloca [2 x i32], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast [2 x i32]* [[A]] to i64*
+; CHECK-NEXT: store i64 4294967296, i64* [[TMP0]], align 8
+; CHECK-NEXT: call void @fork_some_threads([2 x i32]* [[A]])
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[A]], i64 0, i64 [[I:%.*]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], [[B:%.*]]
+; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: store i32 [[B]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[A]], i64 0, i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[A]], i64 0, i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT: call void @join_some_threads()
+; CHECK-NEXT: ret i32 [[ADD]]
+;
+entry:
+ %a = alloca [2 x i32], align 8
+ %0 = bitcast [2 x i32]* %a to i64*
+ store i64 4294967296, i64* %0, align 8
+ call void @fork_some_threads([2 x i32]* %a)
+ %arrayidx = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 %i
+ %1 = load i32, i32* %arrayidx, align 4
+ %cmp = icmp slt i32 %1, %b
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ store i32 %b, i32* %arrayidx, align 4
+ br label %if.end
+
+if.end:
+ %arrayidx1 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 0
+ %2 = load i32, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 1
+ %3 = load i32, i32* %arrayidx2, align 4
+ %add = add nsw i32 %2, %3
+ call void @join_some_threads()
+ ret i32 %add
+}
+
+declare void @fork_some_threads([2 x i32] *);
+declare void @join_some_threads();
+
+; Don't speculate if it's not the only instruction in the block (not counting
+; the terminator)
+define i32 @not_alone_in_block(i64 %i, i32 %b) {
+; CHECK-LABEL: @not_alone_in_block(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A:%.*]] = alloca [2 x i32], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast [2 x i32]* [[A]] to i64*
+; CHECK-NEXT: store i64 4294967296, i64* [[TMP0]], align 8
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[A]], i64 0, i64 [[I:%.*]]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[A]], i64 0, i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], [[B:%.*]]
+; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: store i32 [[B]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT: store i32 [[B]], i32* [[ARRAYIDX1]], align 4
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[A]], i64 0, i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT: ret i32 [[ADD]]
+;
+entry:
+ %a = alloca [2 x i32], align 8
+ %0 = bitcast [2 x i32]* %a to i64*
+ store i64 4294967296, i64* %0, align 8
+ %arrayidx = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 %i
+ %arrayidx1 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 0
+ %1 = load i32, i32* %arrayidx, align 4
+ %cmp = icmp slt i32 %1, %b
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ store i32 %b, i32* %arrayidx, align 4
+ store i32 %b, i32* %arrayidx1, align 4
+ br label %if.end
+
+if.end:
+ %2 = load i32, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 1
+ %3 = load i32, i32* %arrayidx2, align 4
+ %add = add nsw i32 %2, %3
+ ret i32 %add
+}
+
; CHECK: !0 = !{!"branch_weights", i32 3, i32 5}
!0 = !{!"branch_weights", i32 3, i32 5}