[llvm] [NewGVN][3/3] Load coercion for loads that can be replaced by a phi (PR #68669)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 10 00:49:55 PDT 2023
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: Konstantina Mitropoulou (kmitropoulou)
<details>
<summary>Changes</summary>
[NewGVN][3/3] Load coercion for loads that can be replaced by a phi
In the following two examples, there are two cases where the load can be
replaced by a phi:
1. MemoryPhi: In Example 1, load %V is dependent on a MemoryPhi. This
indicates that there are two memory definitions in BB1 and BB2 for %V. As
a result, we replace the load with a phi.
Example 1:
```
Before load coercion
BB1: BB2:
1 = MemoryDef(liveOnEntry) 2 = MemoryDef(liveOnEntry)
store i32 100, ptr %P store i32 500, ptr %P
br label %BB3 br label %BB3
\ /
BB3:
3 = MemoryPhi({BB1,1},{BB2,2})
%V = load i32, ptr %P
After load coercion
BB1: BB2:
store i32 100, ptr %P store i32 500, ptr %P
store i32 100, ptr %P                 store i32 500, ptr %P
br label %BB3                         br label %BB3
\ /
BB3:
%V = phi i32 [ 100, %BB1 ], [ 500, %BB2 ]
```
2. Partial load elimination: In Example 2, %V1 and %V2 have live-on-entry
definitions and their memory locations overlap. By emitting a new load
%V2' in BB2, we can replace %V2 with a phi node.
Example 2:
```
Before load coercion
BB1: BB2:
%V1 = load <2 x i32>, ptr %P br label %BB3
br label %BB3 /
\ /
BB3:
%V2 = load i32, ptr %P
After load coercion
BB1: BB2:
%V1 = load <2 x i32>, ptr %P %V2' = load i32, ptr %P
%0 = bitcast <2 x i32> %V1 to i64 br label %BB3
%1 = trunc i64 %0 to i32 /
br label %BB3 /
\ /
BB3:
%V2 = phi i32 [ %1, %BB1], [ %V2', %BB2 ]
```
The code includes more cases like these. Please refer to the examples in the
code comments for more details.
---
Patch is 196.52 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/68669.diff
6 Files Affected:
- (modified) llvm/lib/Transforms/Scalar/NewGVN.cpp (+994-51)
- (added) llvm/test/Transforms/NewGVN/load_coercion_between_loads.ll (+424)
- (added) llvm/test/Transforms/NewGVN/load_coercion_between_store_and_load.ll (+341)
- (added) llvm/test/Transforms/NewGVN/load_coercion_replace_load_with_phi.ll (+3788)
- (modified) llvm/test/Transforms/NewGVN/pr14166-xfail.ll (-1)
- (modified) llvm/test/Transforms/NewGVN/pr35125.ll (+1-1)
``````````diff
diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp
index 19ac9526b5f88b6..ace3ffceb8fb953 100644
--- a/llvm/lib/Transforms/Scalar/NewGVN.cpp
+++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp
@@ -73,9 +73,11 @@
#include "llvm/Analysis/CFGPrinter.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionPrecedenceTracking.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
@@ -106,6 +108,7 @@
#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/PredicateInfo.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
#include "llvm/Transforms/Utils/VNCoercion.h"
#include <algorithm>
#include <cassert>
@@ -154,6 +157,10 @@ static cl::opt<bool> EnableStoreRefinement("enable-store-refinement",
static cl::opt<bool> EnablePhiOfOps("enable-phi-of-ops", cl::init(true),
cl::Hidden);
+// Enables load coercion for non-constant values.
+static cl::opt<bool> EnableLoadCoercion("enable-load-coercion", cl::init(true),
+ cl::Hidden);
+
//===----------------------------------------------------------------------===//
// GVN Pass
//===----------------------------------------------------------------------===//
@@ -495,6 +502,7 @@ class NewGVN {
AssumptionCache *AC = nullptr;
const DataLayout &DL;
std::unique_ptr<PredicateInfo> PredInfo;
+ ImplicitControlFlowTracking *ICF = nullptr;
// These are the only two things the create* functions should have
// side-effects on due to allocating memory.
@@ -653,6 +661,16 @@ class NewGVN {
// Deletion info.
SmallPtrSet<Instruction *, 8> InstructionsToErase;
+ // Map candidate load to their depending instructions.
+ mutable std::map<Value *, DenseSet<std::pair<Instruction *, BasicBlock *>>>
+ LoadCoercion;
+
+ // Keep newly generated loads.
+ SmallVector<Instruction *, 2> NewLoadsInLoadCoercion;
+
+ // Keep newly generated instructions.
+ SmallVector<Instruction *, 2> NewlyGeneratedInsns;
+
public:
NewGVN(Function &F, DominatorTree *DT, AssumptionCache *AC,
TargetLibraryInfo *TLI, AliasAnalysis *AA, MemorySSA *MSSA,
@@ -776,9 +794,9 @@ class NewGVN {
ExprResult checkExprResults(Expression *, Instruction *, Value *) const;
ExprResult performSymbolicEvaluation(Instruction *,
SmallPtrSetImpl<Value *> &) const;
- const Expression *performSymbolicLoadCoercion(Type *, Value *, LoadInst *,
- Instruction *,
- MemoryAccess *) const;
+ const Expression *createLoadExpAndUpdateMemUses(LoadInst *, Value *,
+ MemoryAccess *,
+ MemoryAccess *) const;
const Expression *performSymbolicLoadEvaluation(Instruction *) const;
const Expression *performSymbolicStoreEvaluation(Instruction *) const;
ExprResult performSymbolicCallEvaluation(Instruction *) const;
@@ -853,6 +871,7 @@ class NewGVN {
// Utilities.
void cleanupTables();
std::pair<unsigned, unsigned> assignDFSNumbers(BasicBlock *, unsigned);
+ unsigned updateDFSNumbers(unsigned);
void updateProcessedCount(const Value *V);
void verifyMemoryCongruency() const;
void verifyIterationSettled(Function &F);
@@ -893,6 +912,54 @@ class NewGVN {
// Debug counter info. When verifying, we have to reset the value numbering
// debug counter to the same state it started in to get the same results.
int64_t StartingVNCounter = 0;
+
+ // The following functions are used in load coercion:
+ // Try to add the load along with the depending instruction(s) in
+ // LoadCoercion map.
+ bool tryAddLoadDepInsnIntoLoadCoercionMap(LoadInst *, Instruction *,
+ BasicBlock *) const;
+ // Check if the candidate load can be optimized by another load which is also
+ // a live of entry definition and add it in LoadCoercion map.
+ bool findLiveOnEntryDependency(LoadInst *, LoadInst *, ArrayRef<BasicBlock *>,
+ bool) const;
+ // Collect the load instructions that can be optimized with load coercion.
+ // The filtering of the load instructions is based the type of their memory
+ // access.
+ bool performSymbolicLoadCoercionForNonConstantMemoryDef(LoadInst *,
+ StoreInst *,
+ MemoryAccess *) const;
+ const Expression *performSymbolicLoadCoercionForConstantMemoryDef(
+ Type *, Value *, LoadInst *, Instruction *, MemoryAccess *) const;
+ bool performSymbolicLoadCoercionForLiveOnEntryDef(LoadInst *,
+ MemoryAccess *) const;
+ bool performSymbolicLoadCoercionForMemoryPhi(LoadInst *,
+ MemoryAccess *) const;
+ // Code generation for load coercion. Replaces the load with the right
+ // instruction or the right sequence of instructions.
+ bool implementLoadCoercion();
+ // Update MemorySSA with the load instructions that are emitted during load
+ // coercion.
+ void updateMemorySSA(Instruction *, Instruction *);
+ // Extract the value that will replace the load from the depending
+ // instruction.
+ Value *getExtractedValue(LoadInst *, Instruction *);
+ // If load coercion is successful, the uses of the optimized load might need
+ // to be added to new congruence classes in order to optimize the code
+ // further. For this reason, we run value numbering for all the uses of the
+ // optimized load. If load coercion has failed, then we need to add the load
+ // (and its uses) to the right congruence class.
+ // Emit the phi that replaces the load and it updates the SSA with the new
+ // phi.
+ Value *emitLoadCoercionPhi(LoadInst *, BasicBlock *,
+ ArrayRef<std::pair<BasicBlock *, Instruction *>>);
+ // Check if the load can be replaced by a phi.
+ Value *tryReplaceLoadWithPhi(
+ LoadInst *, BasicBlock *,
+ SmallVectorImpl<std::pair<BasicBlock *, Instruction *>> &,
+ ArrayRef<BasicBlock *>);
+ void updateUsesAfterLoadCoercionImpl(LoadInst *,
+ SmallVectorImpl<Instruction *> &);
+ void updateUsesAfterLoadCoercion(LoadInst *, Value *);
};
} // end anonymous namespace
@@ -1439,12 +1506,380 @@ const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) const {
return createStoreExpression(SI, StoreAccess);
}
+// A load can have one or more dependencies as the following examples show:
+//
+// Example 1:
+// BB1:
+// ...
+// store i32 %V1, ptr %P
+// ...
+// %V2 = load i32, ptr %P
+// ...
+//
+// Example 2:
+// BB1: BB2:
+// store i32 %V1, ptr %P %V2 = load i32, ptr %P
+// br label %BB3 br label %BB3
+// \ /
+// BB3:
+// %V3 = load i32, ptr %P
+//
+// In the first example, the load (%V2) has only one dependency. In the second
+// example, the load (%V3) has two dependencies. Therefore, we add the load
+// along with its two dependencies in LoadCoercion map. However, this is not
+// always the case as it is shown below:
+//
+// Example 3:
+// BB1:
+// %V1 = load <4 x i32>, ptr %P
+// br i1 %cond, label %BB2, label %BB3
+// / \
+// BB2: BB3:
+// %V2 = load <2 x i32>, ptr %P %V3 = load i32, ptr %P
+// br label %BB4 br label %BB4
+// \ /
+// BB4:
+// %V4 = load i32, ptr %P
+//
+// The %V4 load can be optimized by any of the loads (%V1, %V2, %V3). The loads
+// %V2 and %V3 can also be optimized by %V1. For this reason, we need to do an
+// extra check before we add the load in the map. Hence, we check if the load is
+// already in the map and if the existing depending instruction dominates the
+// current depending instruction. If so, then we do not add the new depending
+// instruction in LoadCoercion map. If the current depending instruction
+// dominates the existing depending instruction, then we remove the existing
+// depending instruction from LoadCoercion map and we add the current depending
+// instruction. In Example 3, the %V4 load has only one dependency (%V1) and we
+// add only this one in LoadCoercion map.
+bool NewGVN::tryAddLoadDepInsnIntoLoadCoercionMap(
+ LoadInst *LI, Instruction *CurrentDepI, BasicBlock *CurrentDepIBB) const {
+ // Can't forward from non-atomic to atomic without violating memory model.
+ if (LI->isAtomic() > CurrentDepI->isAtomic())
+ return false;
+
+ if (auto *DepLI = dyn_cast<LoadInst>(CurrentDepI))
+ if (LI->getAlign() < DepLI->getAlign())
+ return false;
+
+ if (auto *DepSI = dyn_cast<StoreInst>(CurrentDepI))
+ if (LI->getAlign() < DepSI->getAlign())
+ return false;
+
+ // Check if LI already exists in LoadCoercion map.
+ auto It = LoadCoercion.find(LI);
+ if (It != LoadCoercion.end()) {
+ auto &ExistingDepInsns = It->second;
+ // Iterate over all the existing depending instructions of LI.
+ for (auto &P : llvm::make_early_inc_range(ExistingDepInsns)) {
+ Instruction *ExistingDepI = P.first;
+ if (MSSAWalker->getClobberingMemoryAccess(getMemoryAccess(CurrentDepI)) ==
+ MSSAWalker->getClobberingMemoryAccess(
+ getMemoryAccess(ExistingDepI)) &&
+ isa<LoadInst>(ExistingDepI) && isa<LoadInst>(CurrentDepI)) {
+ // If the existing depending instruction dominates the current depending
+ // instruction, then we should not add the current depending instruction
+ // in LoadCoercion map (Example 3).
+ if (DT->dominates(ExistingDepI, CurrentDepI))
+ return true;
+
+ // If the current depending instruction dominates the existing one, then
+ // we remove the existing depending instruction from the LoadCoercion
+ // map. Next, we add the current depending instruction in LoadCoercion
+ // map.
+ if (DT->dominates(CurrentDepI, ExistingDepI))
+ ExistingDepInsns.erase(P);
+ }
+ }
+ }
+ // Add the load and the corresponding depending instruction in LoadCoercion
+ // map.
+ LoadCoercion[LI].insert(std::make_pair(CurrentDepI, CurrentDepIBB));
+ return true;
+}
+
+// Check if it is possible to apply load coercion between CandidateLI and
+// DependingLoad.
+bool NewGVN::findLiveOnEntryDependency(LoadInst *CandidateLI,
+ LoadInst *DependingLoad,
+ ArrayRef<BasicBlock *> DependingBlocks,
+ bool IsMemoryPhiDep) const {
+ int Offset = -1;
+
+ if (!DependingLoad || CandidateLI == DependingLoad ||
+ DependingLoad->getNumUses() == 0)
+ return false;
+
+ BasicBlock *DependingLoadBB = DependingLoad->getParent();
+ if (!ReachableBlocks.count(DependingLoadBB) ||
+ ICF->isDominatedByICFIFromSameBlock(CandidateLI))
+ return false;
+
+ if (InstructionsToErase.count(DependingLoad))
+ return false;
+
+ // We do not look deep in the CFG. We consider either instructions that
+ // dominate CandidateLI or instructions that are in one of the predecessors of
+ // CandidateLI.
+ if (DT->dominates(DependingLoad, CandidateLI))
+ Offset = analyzeLoadFromClobberingLoad(CandidateLI->getType(),
+ CandidateLI->getPointerOperand(),
+ DependingLoad, DL);
+ else {
+ BasicBlock *CandidateLIBB = CandidateLI->getParent();
+ auto It1 = llvm::find(DependingBlocks, CandidateLIBB);
+ auto It2 = llvm::find(DependingBlocks, DependingLoadBB);
+ auto Ite = DependingBlocks.end();
+ if (It1 == Ite && It2 != Ite && !isBackedge(DependingLoadBB, CandidateLIBB))
+ Offset = analyzeLoadFromClobberingLoad(CandidateLI->getType(),
+ CandidateLI->getPointerOperand(),
+ DependingLoad, DL);
+ }
+
+ bool IsLoadCoercionCandidate = false;
+ if (Offset >= 0) {
+ // If the candidate load depends on a MemoryPhi, then we do not consider the
+ // parent block of the depending instruction, but instead it is more
+ // convenient to consider the basic block of the MemoryPhi from which the
+ // value comes e.g.:
+ // BB1:
+ // %V1 = load i32, ptr %P
+ // br i1 %Cond, label %BB2, label %BB3
+ // / \
+ // BB2: BB3:
+ // store i32 100, ptr %P br label %BB4
+ // br label %BB4 /
+ // \ /
+ // BB4:
+ // %V2 = load i32, ptr %P
+ //
+ BasicBlock *BB = IsMemoryPhiDep ? DependingBlocks.back() : DependingLoadBB;
+ IsLoadCoercionCandidate |=
+ tryAddLoadDepInsnIntoLoadCoercionMap(CandidateLI, DependingLoad, BB);
+ }
+ return IsLoadCoercionCandidate;
+}
+
+// Process load instructions that have MemoryPhi dependencies.
+bool NewGVN::performSymbolicLoadCoercionForMemoryPhi(
+ LoadInst *LI, MemoryAccess *DefiningAccess) const {
+ assert((!LI || LI->isSimple()) && "Not a simple load");
+ bool IsLoadCoercionCandidate = false;
+ if (auto *MemPhi = dyn_cast<MemoryPhi>(DefiningAccess)) {
+ // If the candidate load is dominated by a call that never returns, then we
+ // do not replace the load with a phi node.
+ if (ICF->isDominatedByICFIFromSameBlock(LI))
+ return false;
+
+ // The MemoryPhi of Example 1 indicates that the load is dependent on the
+ // store (1) in Basic block T and store (2) in basic block F. Therefore,
+ // both of the store instructions should be added in LoadCoercion map.
+ //
+ // Example 1:
+ // BB1: BB2:
+ // 1 = MemoryDef(liveOnEntry) 2 = MemoryDef(liveOnEntry)
+ // store i32 100, ptr %P store i32 500, ptr %P
+ // br label %BB3 br label %BB3
+ // \ /
+ // BB3:
+ // 3 = MemoryPhi({BB1,1},{BB2,2})
+ // %V = load i32, ptr %P
+ //
+ // In Example 2, the load of BB3 has two dependencies: the store in BB1 as
+ // the MemoryPhi indicates and the load in BB2 which is not included in
+ // MemoryPhi. To find this dependency, we check if it is possible to apply
+ // load coercion to any of the instructions that have live on entry
+ // definition. We restrict our search to the MemoryPhi predecessors and the
+ // instructions that dominate the MemoryPhi.
+ //
+ // Example 2:
+ // BB1: BB2:
+ // 1 = MemoryDef(liveOnEntry) 0 = MemoryDef(liveOnEntry)
+ // store i32 100, ptr %P %V1 = load i32, ptr %P
+ // br label %BB3 br label %BB3
+ // \ /
+ // BB3:
+ // 2 = MemoryPhi({BB1,1},{BB2,liveOnEntry})
+ // %V2 = load i32, ptr %P
+ //
+ // Iterate over all the operands of the memory phi and check if any of its
+ // operands can optimize the current load.
+ SmallVector<std::pair<MemoryAccess *, BasicBlock *>, 1>
+ LiveOnEntryMemAccesses;
+ for (Use &Op : MemPhi->incoming_values()) {
+ // Bail out if one of the operands is not a memory use or definition.
+ // TODO: Add support for MemoryPhi operands.
+ if (!isa<MemoryUseOrDef>(&Op)) {
+ LoadCoercion.erase(LI);
+ return false;
+ }
+
+ MemoryUseOrDef *MemAccess = cast<MemoryUseOrDef>(&Op);
+ int Offset = -1;
+ Instruction *DepI = nullptr;
+ BasicBlock *IncomingBB = MemPhi->getIncomingBlock(Op);
+
+ // We collect the MemoryPhi operands that have live on entry definitions
+ // and we process them later only if it is possible to optimize LI with
+ // the MemoryDef operand. The search for the live on entry definitions is
+ // expensive and we need to do it only if it is necessary.
+ if (MSSA->isLiveOnEntryDef(MemAccess))
+ LiveOnEntryMemAccesses.push_back(std::make_pair(MemAccess, IncomingBB));
+ else if (isa<MemoryDef>(&Op)) {
+ // Process MemoryDef operands.
+ DepI = MemAccess->getMemoryInst();
+ Offset = -1;
+
+ if (!ReachableBlocks.count(DepI->getParent())) {
+ LoadCoercion.erase(LI);
+ return false;
+ }
+
+ if (DT->dominates(LI, DepI)) {
+ // In this case, there is a loop. For now, we bail-out load
+ // coercion.
+ LoadCoercion.erase(LI);
+ return false;
+ }
+
+ if (auto *DepS = dyn_cast<StoreInst>(DepI))
+ Offset = analyzeLoadFromClobberingStore(
+ LI->getType(), LI->getPointerOperand(), DepS, DL);
+ else if (auto *DepL = dyn_cast<LoadInst>(DepI))
+ Offset = analyzeLoadFromClobberingLoad(
+ LI->getType(), LI->getPointerOperand(), DepL, DL);
+ else if (auto *DepCall = dyn_cast<CallInst>(DepI)) {
+ // TODO: Improve call coverage.
+ if (AA->doesNotAccessMemory(DepCall) || AA->onlyReadsMemory(DepCall))
+ continue;
+ LoadCoercion.erase(LI);
+ return false;
+ } else {
+ LoadCoercion.erase(LI);
+ return false;
+ }
+ if (Offset >= 0)
+ IsLoadCoercionCandidate |=
+ tryAddLoadDepInsnIntoLoadCoercionMap(LI, DepI, IncomingBB);
+ else {
+ LoadCoercion.erase(LI);
+ return false;
+ }
+ }
+ }
+
+ if (IsLoadCoercionCandidate) {
+ // Process the operands with live on entry definitions.
+ for (auto P : LiveOnEntryMemAccesses) {
+ MemoryAccess *MemAccess = P.first;
+ int Offset;
+ for (const auto &U : MemAccess->uses()) {
+ Offset = -1;
+ auto *MemUse = dyn_cast<MemoryUse>(U.getUser());
+ if (MemUse == nullptr)
+ continue;
+ LoadInst *DependingLoad = dyn_cast<LoadInst>(MemUse->getMemoryInst());
+ if (!DependingLoad)
+ continue;
+ SmallVector<BasicBlock *, 1> IncomingBB;
+ IncomingBB.push_back(P.second);
+ findLiveOnEntryDependency(LI, DependingLoad, IncomingBB, true);
+ }
+ }
+ }
+ }
+ return IsLoadCoercionCandidate;
+}
+
+// Find load coercion opportunities between instructions with live on entry
+// definitions.
+bool NewGVN::performSymbolicLoadCoercionForLiveOnEntryDef(
+ LoadInst *LI, MemoryAccess *DefiningAccess) const {
+ bool IsLoadCoercionCandidate = false;
+ for (const auto &U : MSSA->getLiveOnEntryDef()->uses()) {
+ if (auto *MemUse = dyn_cast<MemoryUse>(U.getUser())) {
+ // TODO: Add support for calls.
+ LoadInst *DependingLoad = dyn_cast<LoadInst>(MemUse->getMemoryInst());
+ if (!DependingLoad || LI == DependingLoad)
+ continue;
+
+ // If the two instructions have the same type, then there is a load
+ // coercion opportunity only if the LI and the DependingLoad are in
+ // different basic blocks and the basic block of the DependingLoad is one
+ // of the predecessors of the basic block of the LI. For any other case,
+ // the LI will be eliminated by adding the two loads in the same
+ // congruence class.
+ //
+ // Example 1: Here, we do not need to apply load coercion. The two load
+ // will be added in the same congruence class and %V2 will be eliminated.
+ //
+ // BB1:
+ // ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/68669
More information about the llvm-commits
mailing list