[llvm] 2d1e8a0 - [EarlyCSE] Compare GEP instructions based on offset (#65875)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 19 15:14:49 PDT 2023
Author: DianQK
Date: 2023-09-20T06:14:45+08:00
New Revision: 2d1e8a03f5eeff48cd7928d003fc12f728b2c7cf
URL: https://github.com/llvm/llvm-project/commit/2d1e8a03f5eeff48cd7928d003fc12f728b2c7cf
DIFF: https://github.com/llvm/llvm-project/commit/2d1e8a03f5eeff48cd7928d003fc12f728b2c7cf.diff
LOG: [EarlyCSE] Compare GEP instructions based on offset (#65875)
Closes #65763.
This provides more opportunities for constant propagation in
subsequent optimizations.
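For example (a minimal sketch distilled from the gep.ll test added
below), these two GEPs use different source element types but compute
the same byte offset, so the second GEP and its load can now be CSE'd:

  %s1a  = getelementptr i8, ptr %a, i64 8   ; offset = 8 bytes
  %s1av = load i64, ptr %s1a
  %s1e  = getelementptr i64, ptr %a, i64 1  ; offset = 1 * 8 = 8 bytes
  %s1ev = load i64, ptr %s1e                ; folds to %s1av once %s1e is replaced by %s1a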
Added:
llvm/test/Transforms/EarlyCSE/gep.ll
llvm/test/Transforms/PhaseOrdering/X86/unroll-vectorizer.ll
Modified:
llvm/lib/Transforms/Scalar/EarlyCSE.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
index 439235f47471efb..4990fa9f8b5ea36 100644
--- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -67,6 +67,7 @@ STATISTIC(NumCSE, "Number of instructions CSE'd");
STATISTIC(NumCSECVP, "Number of compare instructions CVP'd");
STATISTIC(NumCSELoad, "Number of load instructions CSE'd");
STATISTIC(NumCSECall, "Number of call instructions CSE'd");
+STATISTIC(NumCSEGEP, "Number of GEP instructions CSE'd");
STATISTIC(NumDSE, "Number of trivial dead stores removed");
DEBUG_COUNTER(CSECounter, "early-cse",
@@ -143,11 +144,11 @@ struct SimpleValue {
!CI->getFunction()->isPresplitCoroutine();
}
return isa<CastInst>(Inst) || isa<UnaryOperator>(Inst) ||
- isa<BinaryOperator>(Inst) || isa<GetElementPtrInst>(Inst) ||
- isa<CmpInst>(Inst) || isa<SelectInst>(Inst) ||
- isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) ||
- isa<ShuffleVectorInst>(Inst) || isa<ExtractValueInst>(Inst) ||
- isa<InsertValueInst>(Inst) || isa<FreezeInst>(Inst);
+ isa<BinaryOperator>(Inst) || isa<CmpInst>(Inst) ||
+ isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) ||
+ isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst) ||
+ isa<ExtractValueInst>(Inst) || isa<InsertValueInst>(Inst) ||
+ isa<FreezeInst>(Inst);
}
};
@@ -307,10 +308,9 @@ static unsigned getHashValueImpl(SimpleValue Val) {
IVI->getOperand(1),
hash_combine_range(IVI->idx_begin(), IVI->idx_end()));
- assert((isa<CallInst>(Inst) || isa<GetElementPtrInst>(Inst) ||
- isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) ||
- isa<ShuffleVectorInst>(Inst) || isa<UnaryOperator>(Inst) ||
- isa<FreezeInst>(Inst)) &&
+ assert((isa<CallInst>(Inst) || isa<ExtractElementInst>(Inst) ||
+ isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst) ||
+ isa<UnaryOperator>(Inst) || isa<FreezeInst>(Inst)) &&
"Invalid/unknown instruction");
// Handle intrinsics with commutative operands.
@@ -548,11 +548,81 @@ bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) {
// currently executing, so conservatively return false if they are in
// different basic blocks.
if (LHSI->isConvergent() && LHSI->getParent() != RHSI->getParent())
- return false;
+ return false;
return LHSI->isIdenticalTo(RHSI);
}
+//===----------------------------------------------------------------------===//
+// GEPValue
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+struct GEPValue {
+ Instruction *Inst;
+ std::optional<int64_t> ConstantOffset;
+
+ GEPValue(Instruction *I) : Inst(I) {
+ assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
+ }
+
+ GEPValue(Instruction *I, std::optional<int64_t> ConstantOffset)
+ : Inst(I), ConstantOffset(ConstantOffset) {
+ assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
+ }
+
+ bool isSentinel() const {
+ return Inst == DenseMapInfo<Instruction *>::getEmptyKey() ||
+ Inst == DenseMapInfo<Instruction *>::getTombstoneKey();
+ }
+
+ static bool canHandle(Instruction *Inst) {
+ return isa<GetElementPtrInst>(Inst);
+ }
+};
+
+} // namespace
+
+namespace llvm {
+
+template <> struct DenseMapInfo<GEPValue> {
+ static inline GEPValue getEmptyKey() {
+ return DenseMapInfo<Instruction *>::getEmptyKey();
+ }
+
+ static inline GEPValue getTombstoneKey() {
+ return DenseMapInfo<Instruction *>::getTombstoneKey();
+ }
+
+ static unsigned getHashValue(const GEPValue &Val);
+ static bool isEqual(const GEPValue &LHS, const GEPValue &RHS);
+};
+
+} // end namespace llvm
+
+unsigned DenseMapInfo<GEPValue>::getHashValue(const GEPValue &Val) {
+ auto *GEP = cast<GetElementPtrInst>(Val.Inst);
+ if (Val.ConstantOffset.has_value())
+ return hash_combine(GEP->getOpcode(), GEP->getPointerOperand(),
+ Val.ConstantOffset.value());
+ return hash_combine(
+ GEP->getOpcode(),
+ hash_combine_range(GEP->value_op_begin(), GEP->value_op_end()));
+}
+
+bool DenseMapInfo<GEPValue>::isEqual(const GEPValue &LHS, const GEPValue &RHS) {
+ if (LHS.isSentinel() || RHS.isSentinel())
+ return LHS.Inst == RHS.Inst;
+ auto *LGEP = cast<GetElementPtrInst>(LHS.Inst);
+ auto *RGEP = cast<GetElementPtrInst>(RHS.Inst);
+ if (LGEP->getPointerOperand() != RGEP->getPointerOperand())
+ return false;
+ if (LHS.ConstantOffset.has_value() && RHS.ConstantOffset.has_value())
+ return LHS.ConstantOffset.value() == RHS.ConstantOffset.value();
+ return LGEP->isIdenticalToWhenDefined(RGEP);
+}
+
//===----------------------------------------------------------------------===//
// EarlyCSE implementation
//===----------------------------------------------------------------------===//
@@ -647,6 +717,13 @@ class EarlyCSE {
ScopedHashTable<CallValue, std::pair<Instruction *, unsigned>>;
CallHTType AvailableCalls;
+ using GEPMapAllocatorTy =
+ RecyclingAllocator<BumpPtrAllocator,
+ ScopedHashTableVal<GEPValue, Value *>>;
+ using GEPHTType = ScopedHashTable<GEPValue, Value *, DenseMapInfo<GEPValue>,
+ GEPMapAllocatorTy>;
+ GEPHTType AvailableGEPs;
+
/// This is the current generation of the memory value.
unsigned CurrentGeneration = 0;
@@ -667,9 +744,11 @@ class EarlyCSE {
class NodeScope {
public:
NodeScope(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
- InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls)
- : Scope(AvailableValues), LoadScope(AvailableLoads),
- InvariantScope(AvailableInvariants), CallScope(AvailableCalls) {}
+ InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls,
+ GEPHTType &AvailableGEPs)
+ : Scope(AvailableValues), LoadScope(AvailableLoads),
+ InvariantScope(AvailableInvariants), CallScope(AvailableCalls),
+ GEPScope(AvailableGEPs) {}
NodeScope(const NodeScope &) = delete;
NodeScope &operator=(const NodeScope &) = delete;
@@ -678,6 +757,7 @@ class EarlyCSE {
LoadHTType::ScopeTy LoadScope;
InvariantHTType::ScopeTy InvariantScope;
CallHTType::ScopeTy CallScope;
+ GEPHTType::ScopeTy GEPScope;
};
// Contains all the needed information to create a stack for doing a depth
@@ -688,13 +768,13 @@ class EarlyCSE {
public:
StackNode(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls,
- unsigned cg, DomTreeNode *n, DomTreeNode::const_iterator child,
+ GEPHTType &AvailableGEPs, unsigned cg, DomTreeNode *n,
+ DomTreeNode::const_iterator child,
DomTreeNode::const_iterator end)
: CurrentGeneration(cg), ChildGeneration(cg), Node(n), ChildIter(child),
EndIter(end),
Scopes(AvailableValues, AvailableLoads, AvailableInvariants,
- AvailableCalls)
- {}
+ AvailableCalls, AvailableGEPs) {}
StackNode(const StackNode &) = delete;
StackNode &operator=(const StackNode &) = delete;
@@ -1214,6 +1294,20 @@ Value *EarlyCSE::getMatchingValue(LoadValue &InVal, ParseMemoryInst &MemInst,
return Result;
}
+static void combineIRFlags(Instruction &From, Value *To) {
+ if (auto *I = dyn_cast<Instruction>(To)) {
+ // If I being poison triggers UB, there is no need to drop those
+ // flags. Otherwise, only retain flags present on both I and Inst.
+ // TODO: Currently some fast-math flags are not treated as
+ // poison-generating even though they should. Until this is fixed,
+ // always retain flags present on both I and Inst for floating point
+ // instructions.
+ if (isa<FPMathOperator>(I) ||
+ (I->hasPoisonGeneratingFlags() && !programUndefinedIfPoison(I)))
+ I->andIRFlags(&From);
+ }
+}
+
bool EarlyCSE::overridingStores(const ParseMemoryInst &Earlier,
const ParseMemoryInst &Later) {
// Can we remove Earlier store because of Later store?
@@ -1439,16 +1533,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
continue;
}
- if (auto *I = dyn_cast<Instruction>(V)) {
- // If I being poison triggers UB, there is no need to drop those
- // flags. Otherwise, only retain flags present on both I and Inst.
- // TODO: Currently some fast-math flags are not treated as
- // poison-generating even though they should. Until this is fixed,
- // always retain flags present on both I and Inst for floating point
- // instructions.
- if (isa<FPMathOperator>(I) || (I->hasPoisonGeneratingFlags() && !programUndefinedIfPoison(I)))
- I->andIRFlags(&Inst);
- }
+ combineIRFlags(Inst, V);
Inst.replaceAllUsesWith(V);
salvageKnowledge(&Inst, &AC);
removeMSSA(Inst);
@@ -1561,6 +1646,31 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
continue;
}
+ // Compare GEP instructions based on offset.
+ if (GEPValue::canHandle(&Inst)) {
+ auto *GEP = cast<GetElementPtrInst>(&Inst);
+ APInt Offset = APInt(SQ.DL.getIndexTypeSizeInBits(GEP->getType()), 0);
+ GEPValue GEPVal(GEP, GEP->accumulateConstantOffset(SQ.DL, Offset)
+ ? Offset.trySExtValue()
+ : std::nullopt);
+ if (Value *V = AvailableGEPs.lookup(GEPVal)) {
+ LLVM_DEBUG(dbgs() << "EarlyCSE CSE GEP: " << Inst << " to: " << *V
+ << '\n');
+ combineIRFlags(Inst, V);
+ Inst.replaceAllUsesWith(V);
+ salvageKnowledge(&Inst, &AC);
+ removeMSSA(Inst);
+ Inst.eraseFromParent();
+ Changed = true;
+ ++NumCSEGEP;
+ continue;
+ }
+
+ // Otherwise, just remember that we have this GEP.
+ AvailableGEPs.insert(GEPVal, &Inst);
+ continue;
+ }
+
// A release fence requires that all stores complete before it, but does
// not prevent the reordering of following loads 'before' the fence. As a
// result, we don't need to consider it as writing to memory and don't need
@@ -1675,7 +1785,7 @@ bool EarlyCSE::run() {
// Process the root node.
nodesToProcess.push_back(new StackNode(
AvailableValues, AvailableLoads, AvailableInvariants, AvailableCalls,
- CurrentGeneration, DT.getRootNode(),
+ AvailableGEPs, CurrentGeneration, DT.getRootNode(),
DT.getRootNode()->begin(), DT.getRootNode()->end()));
assert(!CurrentGeneration && "Create a new EarlyCSE instance to rerun it.");
@@ -1698,10 +1808,10 @@ bool EarlyCSE::run() {
} else if (NodeToProcess->childIter() != NodeToProcess->end()) {
// Push the next child onto the stack.
DomTreeNode *child = NodeToProcess->nextChild();
- nodesToProcess.push_back(
- new StackNode(AvailableValues, AvailableLoads, AvailableInvariants,
- AvailableCalls, NodeToProcess->childGeneration(),
- child, child->begin(), child->end()));
+ nodesToProcess.push_back(new StackNode(
+ AvailableValues, AvailableLoads, AvailableInvariants, AvailableCalls,
+ AvailableGEPs, NodeToProcess->childGeneration(), child,
+ child->begin(), child->end()));
} else {
// It has been processed, and there are no more children to process,
// so delete it and pop it off the stack.
diff --git a/llvm/test/Transforms/EarlyCSE/gep.ll b/llvm/test/Transforms/EarlyCSE/gep.ll
new file mode 100644
index 000000000000000..499b5ac8de0af9a
--- /dev/null
+++ b/llvm/test/Transforms/EarlyCSE/gep.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt < %s -S -passes=early-cse -earlycse-debug-hash | FileCheck %s
+; RUN: opt < %s -S -passes='early-cse<memssa>' | FileCheck %s
+
+%T1 = type { i64, i64, i64 }
+
+declare void @use_vec(<4 x ptr>);
+
+define void @foo(ptr %a, <4 x i64> %b, i64 %i) {
+; CHECK-LABEL: define void @foo(
+; CHECK-SAME: ptr [[A:%.*]], <4 x i64> [[B:%.*]], i64 [[I:%.*]]) {
+; CHECK-NEXT: [[S1A:%.*]] = getelementptr i8, ptr [[A]], i64 8
+; CHECK-NEXT: [[N1D:%.*]] = getelementptr i8, ptr [[A]], i64 7
+; CHECK-NEXT: [[N1G:%.*]] = getelementptr i32, ptr [[A]], i64 1
+; CHECK-NEXT: [[N1H:%.*]] = getelementptr i8, ptr [[A]], i64 [[I]]
+; CHECK-NEXT: [[V:%.*]] = getelementptr i64, ptr [[A]], <4 x i64> <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT: call void @use_vec(<4 x ptr> [[V]])
+; CHECK-NEXT: [[V2:%.*]] = getelementptr i64, ptr [[A]], <4 x i64> <i64 0, i64 2, i64 1, i64 1>
+; CHECK-NEXT: call void @use_vec(<4 x ptr> [[V2]])
+; CHECK-NEXT: ret void
+;
+ %s1a = getelementptr i8, ptr %a, i64 8
+ %s1av = load i64, ptr %s1a
+ %s1b = getelementptr inbounds i8, ptr %a, i64 8
+ %s1bv = load i64, ptr %s1b
+ %s1c = getelementptr %T1, ptr %a, i64 0, i32 1
+ %s1cv = load i64, ptr %s1c
+ %n1d = getelementptr i8, ptr %a, i64 7
+ %n1dv = load i64, ptr %n1d
+ %s1e = getelementptr i64, ptr %a, i64 1
+ %s1ev = load i64, ptr %s1e
+ %s1f = getelementptr i32, ptr %a, i64 2
+ %s1fv = load i64, ptr %s1f
+ %n1g = getelementptr i32, ptr %a, i64 1
+ %n1gv = load i64, ptr %n1g
+ %n1h = getelementptr i8, ptr %a, i64 %i
+ %n1hv = load i64, ptr %n1h
+
+ %v = getelementptr i64, ptr %a, <4 x i64> <i64 1, i64 1, i64 1, i64 1>
+ call void @use_vec(<4 x ptr> %v)
+ %v2 = getelementptr i64, ptr %a, <4 x i64> <i64 0, i64 2, i64 1, i64 1>
+ call void @use_vec(<4 x ptr> %v2)
+ ret void
+}
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/unroll-vectorizer.ll b/llvm/test/Transforms/PhaseOrdering/X86/unroll-vectorizer.ll
new file mode 100644
index 000000000000000..1c9e7a771ca19c7
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/X86/unroll-vectorizer.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt < %s -O3 -S | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%Zip = type { { ptr, ptr }, { [32 x i8], { i64, i64 } } }
+
+define void @foo(ptr %a, <32 x i8> %_0) #0 {
+; CHECK-LABEL: define void @foo(
+; CHECK-SAME: ptr nocapture writeonly [[A:%.*]], <32 x i8> [[_0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: start:
+; CHECK-NEXT: store <32 x i8> [[_0]], ptr [[A]], align 1
+; CHECK-NEXT: ret void
+;
+start:
+ %z = alloca %Zip, align 8
+ %sroa_1 = getelementptr i8, ptr %z, i64 16
+ store <32 x i8> %_0, ptr %sroa_1, align 8
+ %len_ = getelementptr i8, ptr %z, i64 56
+ store i64 32, ptr %len_, align 8
+ %_1 = getelementptr %Zip, ptr %z, i64 0, i32 1, i32 1
+ %_2 = getelementptr %Zip, ptr %z, i64 0, i32 1, i32 1, i32 1
+ %len = load i64, ptr %_2, align 8
+ %_10 = getelementptr %Zip, ptr %z, i64 0, i32 1
+ br label %body
+
+body: ; preds = %body, %start
+ %_34 = phi ptr [ %_34i, %body ], [ %a, %start ]
+ %idx = phi i64 [ %idx_, %body ], [ 0, %start ]
+ %_34i = getelementptr i8, ptr %_34, i64 1
+ %idx_ = add i64 %idx, 1
+ store i64 0, ptr %_1, align 8
+ %_24 = getelementptr i8, ptr %_10, i64 %idx
+ %_18 = load i8, ptr %_24, align 1
+ store i8 %_18, ptr %_34, align 1
+ %_6 = icmp eq i64 %len, %idx_
+ br i1 %_6, label %exit, label %body
+
+exit: ; preds = %body
+ ret void
+}
+
+attributes #0 = { "target-cpu"="znver3" }