[llvm] 2d1e8a0 - [EarlyCSE] Compare GEP instructions based on offset (#65875)

via llvm-commits llvm-commits at lists.llvm.org
Tue Sep 19 15:14:49 PDT 2023


Author: DianQK
Date: 2023-09-20T06:14:45+08:00
New Revision: 2d1e8a03f5eeff48cd7928d003fc12f728b2c7cf

URL: https://github.com/llvm/llvm-project/commit/2d1e8a03f5eeff48cd7928d003fc12f728b2c7cf
DIFF: https://github.com/llvm/llvm-project/commit/2d1e8a03f5eeff48cd7928d003fc12f728b2c7cf.diff

LOG: [EarlyCSE] Compare GEP instructions based on offset (#65875)

Closes #65763.
This will provide more opportunities for constant propagation for
subsequent optimizations.

Added: 
    llvm/test/Transforms/EarlyCSE/gep.ll
    llvm/test/Transforms/PhaseOrdering/X86/unroll-vectorizer.ll

Modified: 
    llvm/lib/Transforms/Scalar/EarlyCSE.cpp

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
index 439235f47471efb..4990fa9f8b5ea36 100644
--- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -67,6 +67,7 @@ STATISTIC(NumCSE,      "Number of instructions CSE'd");
 STATISTIC(NumCSECVP,   "Number of compare instructions CVP'd");
 STATISTIC(NumCSELoad,  "Number of load instructions CSE'd");
 STATISTIC(NumCSECall,  "Number of call instructions CSE'd");
+STATISTIC(NumCSEGEP, "Number of GEP instructions CSE'd");
 STATISTIC(NumDSE,      "Number of trivial dead stores removed");
 
 DEBUG_COUNTER(CSECounter, "early-cse",
@@ -143,11 +144,11 @@ struct SimpleValue {
              !CI->getFunction()->isPresplitCoroutine();
     }
     return isa<CastInst>(Inst) || isa<UnaryOperator>(Inst) ||
-           isa<BinaryOperator>(Inst) || isa<GetElementPtrInst>(Inst) ||
-           isa<CmpInst>(Inst) || isa<SelectInst>(Inst) ||
-           isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) ||
-           isa<ShuffleVectorInst>(Inst) || isa<ExtractValueInst>(Inst) ||
-           isa<InsertValueInst>(Inst) || isa<FreezeInst>(Inst);
+           isa<BinaryOperator>(Inst) || isa<CmpInst>(Inst) ||
+           isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) ||
+           isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst) ||
+           isa<ExtractValueInst>(Inst) || isa<InsertValueInst>(Inst) ||
+           isa<FreezeInst>(Inst);
   }
 };
 
@@ -307,10 +308,9 @@ static unsigned getHashValueImpl(SimpleValue Val) {
                         IVI->getOperand(1),
                         hash_combine_range(IVI->idx_begin(), IVI->idx_end()));
 
-  assert((isa<CallInst>(Inst) || isa<GetElementPtrInst>(Inst) ||
-          isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) ||
-          isa<ShuffleVectorInst>(Inst) || isa<UnaryOperator>(Inst) ||
-          isa<FreezeInst>(Inst)) &&
+  assert((isa<CallInst>(Inst) || isa<ExtractElementInst>(Inst) ||
+          isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst) ||
+          isa<UnaryOperator>(Inst) || isa<FreezeInst>(Inst)) &&
          "Invalid/unknown instruction");
 
   // Handle intrinsics with commutative operands.
@@ -548,11 +548,81 @@ bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) {
   // currently executing, so conservatively return false if they are in
  // different basic blocks.
   if (LHSI->isConvergent() && LHSI->getParent() != RHSI->getParent())
-      return false;
+    return false;
 
   return LHSI->isIdenticalTo(RHSI);
 }
 
+//===----------------------------------------------------------------------===//
+// GEPValue
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+struct GEPValue {
+  Instruction *Inst;
+  std::optional<int64_t> ConstantOffset;
+
+  GEPValue(Instruction *I) : Inst(I) {
+    assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
+  }
+
+  GEPValue(Instruction *I, std::optional<int64_t> ConstantOffset)
+      : Inst(I), ConstantOffset(ConstantOffset) {
+    assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
+  }
+
+  bool isSentinel() const {
+    return Inst == DenseMapInfo<Instruction *>::getEmptyKey() ||
+           Inst == DenseMapInfo<Instruction *>::getTombstoneKey();
+  }
+
+  static bool canHandle(Instruction *Inst) {
+    return isa<GetElementPtrInst>(Inst);
+  }
+};
+
+} // namespace
+
+namespace llvm {
+
+template <> struct DenseMapInfo<GEPValue> {
+  static inline GEPValue getEmptyKey() {
+    return DenseMapInfo<Instruction *>::getEmptyKey();
+  }
+
+  static inline GEPValue getTombstoneKey() {
+    return DenseMapInfo<Instruction *>::getTombstoneKey();
+  }
+
+  static unsigned getHashValue(const GEPValue &Val);
+  static bool isEqual(const GEPValue &LHS, const GEPValue &RHS);
+};
+
+} // end namespace llvm
+
+unsigned DenseMapInfo<GEPValue>::getHashValue(const GEPValue &Val) {
+  auto *GEP = cast<GetElementPtrInst>(Val.Inst);
+  if (Val.ConstantOffset.has_value())
+    return hash_combine(GEP->getOpcode(), GEP->getPointerOperand(),
+                        Val.ConstantOffset.value());
+  return hash_combine(
+      GEP->getOpcode(),
+      hash_combine_range(GEP->value_op_begin(), GEP->value_op_end()));
+}
+
+bool DenseMapInfo<GEPValue>::isEqual(const GEPValue &LHS, const GEPValue &RHS) {
+  if (LHS.isSentinel() || RHS.isSentinel())
+    return LHS.Inst == RHS.Inst;
+  auto *LGEP = cast<GetElementPtrInst>(LHS.Inst);
+  auto *RGEP = cast<GetElementPtrInst>(RHS.Inst);
+  if (LGEP->getPointerOperand() != RGEP->getPointerOperand())
+    return false;
+  if (LHS.ConstantOffset.has_value() && RHS.ConstantOffset.has_value())
+    return LHS.ConstantOffset.value() == RHS.ConstantOffset.value();
+  return LGEP->isIdenticalToWhenDefined(RGEP);
+}
+
 //===----------------------------------------------------------------------===//
 // EarlyCSE implementation
 //===----------------------------------------------------------------------===//
@@ -647,6 +717,13 @@ class EarlyCSE {
       ScopedHashTable<CallValue, std::pair<Instruction *, unsigned>>;
   CallHTType AvailableCalls;
 
+  using GEPMapAllocatorTy =
+      RecyclingAllocator<BumpPtrAllocator,
+                         ScopedHashTableVal<GEPValue, Value *>>;
+  using GEPHTType = ScopedHashTable<GEPValue, Value *, DenseMapInfo<GEPValue>,
+                                    GEPMapAllocatorTy>;
+  GEPHTType AvailableGEPs;
+
   /// This is the current generation of the memory value.
   unsigned CurrentGeneration = 0;
 
@@ -667,9 +744,11 @@ class EarlyCSE {
   class NodeScope {
   public:
     NodeScope(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
-              InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls)
-      : Scope(AvailableValues), LoadScope(AvailableLoads),
-        InvariantScope(AvailableInvariants), CallScope(AvailableCalls) {}
+              InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls,
+              GEPHTType &AvailableGEPs)
+        : Scope(AvailableValues), LoadScope(AvailableLoads),
+          InvariantScope(AvailableInvariants), CallScope(AvailableCalls),
+          GEPScope(AvailableGEPs) {}
     NodeScope(const NodeScope &) = delete;
     NodeScope &operator=(const NodeScope &) = delete;
 
@@ -678,6 +757,7 @@ class EarlyCSE {
     LoadHTType::ScopeTy LoadScope;
     InvariantHTType::ScopeTy InvariantScope;
     CallHTType::ScopeTy CallScope;
+    GEPHTType::ScopeTy GEPScope;
   };
 
   // Contains all the needed information to create a stack for doing a depth
@@ -688,13 +768,13 @@ class EarlyCSE {
   public:
     StackNode(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
               InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls,
-              unsigned cg, DomTreeNode *n, DomTreeNode::const_iterator child,
+              GEPHTType &AvailableGEPs, unsigned cg, DomTreeNode *n,
+              DomTreeNode::const_iterator child,
               DomTreeNode::const_iterator end)
         : CurrentGeneration(cg), ChildGeneration(cg), Node(n), ChildIter(child),
           EndIter(end),
           Scopes(AvailableValues, AvailableLoads, AvailableInvariants,
-                 AvailableCalls)
-          {}
+                 AvailableCalls, AvailableGEPs) {}
     StackNode(const StackNode &) = delete;
     StackNode &operator=(const StackNode &) = delete;
 
@@ -1214,6 +1294,20 @@ Value *EarlyCSE::getMatchingValue(LoadValue &InVal, ParseMemoryInst &MemInst,
   return Result;
 }
 
+static void combineIRFlags(Instruction &From, Value *To) {
+  if (auto *I = dyn_cast<Instruction>(To)) {
+    // If I being poison triggers UB, there is no need to drop those
+    // flags. Otherwise, only retain flags present on both I and Inst.
+    // TODO: Currently some fast-math flags are not treated as
+    // poison-generating even though they should. Until this is fixed,
+    // always retain flags present on both I and Inst for floating point
+    // instructions.
+    if (isa<FPMathOperator>(I) ||
+        (I->hasPoisonGeneratingFlags() && !programUndefinedIfPoison(I)))
+      I->andIRFlags(&From);
+  }
+}
+
 bool EarlyCSE::overridingStores(const ParseMemoryInst &Earlier,
                                 const ParseMemoryInst &Later) {
   // Can we remove Earlier store because of Later store?
@@ -1439,16 +1533,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
           LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
           continue;
         }
-        if (auto *I = dyn_cast<Instruction>(V)) {
-          // If I being poison triggers UB, there is no need to drop those
-          // flags. Otherwise, only retain flags present on both I and Inst.
-          // TODO: Currently some fast-math flags are not treated as
-          // poison-generating even though they should. Until this is fixed,
-          // always retain flags present on both I and Inst for floating point
-          // instructions.
-          if (isa<FPMathOperator>(I) || (I->hasPoisonGeneratingFlags() && !programUndefinedIfPoison(I)))
-            I->andIRFlags(&Inst);
-        }
+        combineIRFlags(Inst, V);
         Inst.replaceAllUsesWith(V);
         salvageKnowledge(&Inst, &AC);
         removeMSSA(Inst);
@@ -1561,6 +1646,31 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
       continue;
     }
 
+    // Compare GEP instructions based on offset.
+    if (GEPValue::canHandle(&Inst)) {
+      auto *GEP = cast<GetElementPtrInst>(&Inst);
+      APInt Offset = APInt(SQ.DL.getIndexTypeSizeInBits(GEP->getType()), 0);
+      GEPValue GEPVal(GEP, GEP->accumulateConstantOffset(SQ.DL, Offset)
+                               ? Offset.trySExtValue()
+                               : std::nullopt);
+      if (Value *V = AvailableGEPs.lookup(GEPVal)) {
+        LLVM_DEBUG(dbgs() << "EarlyCSE CSE GEP: " << Inst << "  to: " << *V
+                          << '\n');
+        combineIRFlags(Inst, V);
+        Inst.replaceAllUsesWith(V);
+        salvageKnowledge(&Inst, &AC);
+        removeMSSA(Inst);
+        Inst.eraseFromParent();
+        Changed = true;
+        ++NumCSEGEP;
+        continue;
+      }
+
+      // Otherwise, just remember that we have this GEP.
+      AvailableGEPs.insert(GEPVal, &Inst);
+      continue;
+    }
+
     // A release fence requires that all stores complete before it, but does
     // not prevent the reordering of following loads 'before' the fence.  As a
     // result, we don't need to consider it as writing to memory and don't need
@@ -1675,7 +1785,7 @@ bool EarlyCSE::run() {
   // Process the root node.
   nodesToProcess.push_back(new StackNode(
       AvailableValues, AvailableLoads, AvailableInvariants, AvailableCalls,
-      CurrentGeneration, DT.getRootNode(),
+      AvailableGEPs, CurrentGeneration, DT.getRootNode(),
       DT.getRootNode()->begin(), DT.getRootNode()->end()));
 
   assert(!CurrentGeneration && "Create a new EarlyCSE instance to rerun it.");
@@ -1698,10 +1808,10 @@ bool EarlyCSE::run() {
     } else if (NodeToProcess->childIter() != NodeToProcess->end()) {
       // Push the next child onto the stack.
       DomTreeNode *child = NodeToProcess->nextChild();
-      nodesToProcess.push_back(
-          new StackNode(AvailableValues, AvailableLoads, AvailableInvariants,
-                        AvailableCalls, NodeToProcess->childGeneration(),
-                        child, child->begin(), child->end()));
+      nodesToProcess.push_back(new StackNode(
+          AvailableValues, AvailableLoads, AvailableInvariants, AvailableCalls,
+          AvailableGEPs, NodeToProcess->childGeneration(), child,
+          child->begin(), child->end()));
     } else {
       // It has been processed, and there are no more children to process,
       // so delete it and pop it off the stack.

diff --git a/llvm/test/Transforms/EarlyCSE/gep.ll b/llvm/test/Transforms/EarlyCSE/gep.ll
new file mode 100644
index 000000000000000..499b5ac8de0af9a
--- /dev/null
+++ b/llvm/test/Transforms/EarlyCSE/gep.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt < %s -S -passes=early-cse -earlycse-debug-hash | FileCheck %s
+; RUN: opt < %s -S -passes='early-cse<memssa>' | FileCheck %s
+
+%T1 = type { i64, i64, i64 }
+
+declare void @use_vec(<4 x ptr>);
+
+define void @foo(ptr %a, <4 x i64> %b, i64 %i) {
+; CHECK-LABEL: define void @foo(
+; CHECK-SAME: ptr [[A:%.*]], <4 x i64> [[B:%.*]], i64 [[I:%.*]]) {
+; CHECK-NEXT:    [[S1A:%.*]] = getelementptr i8, ptr [[A]], i64 8
+; CHECK-NEXT:    [[N1D:%.*]] = getelementptr i8, ptr [[A]], i64 7
+; CHECK-NEXT:    [[N1G:%.*]] = getelementptr i32, ptr [[A]], i64 1
+; CHECK-NEXT:    [[N1H:%.*]] = getelementptr i8, ptr [[A]], i64 [[I]]
+; CHECK-NEXT:    [[V:%.*]] = getelementptr i64, ptr [[A]], <4 x i64> <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    call void @use_vec(<4 x ptr> [[V]])
+; CHECK-NEXT:    [[V2:%.*]] = getelementptr i64, ptr [[A]], <4 x i64> <i64 0, i64 2, i64 1, i64 1>
+; CHECK-NEXT:    call void @use_vec(<4 x ptr> [[V2]])
+; CHECK-NEXT:    ret void
+;
+  %s1a = getelementptr i8, ptr %a, i64 8
+  %s1av = load i64, ptr %s1a
+  %s1b = getelementptr inbounds i8, ptr %a, i64 8
+  %s1bv = load i64, ptr %s1b
+  %s1c = getelementptr %T1, ptr %a, i64 0, i32 1
+  %s1cv = load i64, ptr %s1c
+  %n1d = getelementptr i8, ptr %a, i64 7
+  %n1dv = load i64, ptr %n1d
+  %s1e = getelementptr i64, ptr %a, i64 1
+  %s1ev = load i64, ptr %s1e
+  %s1f = getelementptr i32, ptr %a, i64 2
+  %s1fv = load i64, ptr %s1f
+  %n1g = getelementptr i32, ptr %a, i64 1
+  %n1gv = load i64, ptr %n1g
+  %n1h = getelementptr i8, ptr %a, i64 %i
+  %n1hv = load i64, ptr %n1h
+
+  %v = getelementptr i64, ptr %a, <4 x i64> <i64 1, i64 1, i64 1, i64 1>
+  call void @use_vec(<4 x ptr> %v)
+  %v2 = getelementptr i64, ptr %a, <4 x i64> <i64 0, i64 2, i64 1, i64 1>
+  call void @use_vec(<4 x ptr> %v2)
+  ret void
+}

diff --git a/llvm/test/Transforms/PhaseOrdering/X86/unroll-vectorizer.ll b/llvm/test/Transforms/PhaseOrdering/X86/unroll-vectorizer.ll
new file mode 100644
index 000000000000000..1c9e7a771ca19c7
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/X86/unroll-vectorizer.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt < %s -O3 -S | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%Zip = type { { ptr, ptr }, { [32 x i8], { i64, i64 } } }
+
+define void @foo(ptr %a, <32 x i8> %_0) #0 {
+; CHECK-LABEL: define void @foo(
+; CHECK-SAME: ptr nocapture writeonly [[A:%.*]], <32 x i8> [[_0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  start:
+; CHECK-NEXT:    store <32 x i8> [[_0]], ptr [[A]], align 1
+; CHECK-NEXT:    ret void
+;
+start:
+  %z = alloca %Zip, align 8
+  %sroa_1 = getelementptr i8, ptr %z, i64 16
+  store <32 x i8> %_0, ptr %sroa_1, align 8
+  %len_ = getelementptr i8, ptr %z, i64 56
+  store i64 32, ptr %len_, align 8
+  %_1 = getelementptr %Zip, ptr %z, i64 0, i32 1, i32 1
+  %_2 = getelementptr %Zip, ptr %z, i64 0, i32 1, i32 1, i32 1
+  %len = load i64, ptr %_2, align 8
+  %_10 = getelementptr %Zip, ptr %z, i64 0, i32 1
+  br label %body
+
+body:                                             ; preds = %body, %start
+  %_34 = phi ptr [ %_34i, %body ], [ %a, %start ]
+  %idx = phi i64 [ %idx_, %body ], [ 0, %start ]
+  %_34i = getelementptr i8, ptr %_34, i64 1
+  %idx_ = add i64 %idx, 1
+  store i64 0, ptr %_1, align 8
+  %_24 = getelementptr i8, ptr %_10, i64 %idx
+  %_18 = load i8, ptr %_24, align 1
+  store i8 %_18, ptr %_34, align 1
+  %_6 = icmp eq i64 %len, %idx_
+  br i1 %_6, label %exit, label %body
+
+exit:                                             ; preds = %body
+  ret void
+}
+
+attributes #0 = { "target-cpu"="znver3" }


        


More information about the llvm-commits mailing list