[llvm] [Inline][Cloning] Defer constant propagation after phi-nodes resolution (PR #87963)

Antonio Frighetto via llvm-commits llvm-commits at lists.llvm.org
Mon Apr 8 00:06:17 PDT 2024


https://github.com/antoniofrighetto created https://github.com/llvm/llvm-project/pull/87963

A logic issue arose when inlining via `CloneAndPruneFunctionInto`, which, besides cloning, performs basic constant propagation as well. By the time a new cloned instruction is being simplified, phi-nodes are not remapped yet as the whole CFG needs to be processed first. As `VMap` state at this stage is incomplete, `threadCmpOverPHI` and similar could lead to unsound optimizations. This issue has been addressed by postponing simplification once phi-nodes are revisited.

Fixes: https://github.com/llvm/llvm-project/issues/87534.

>From 0379f2f82c7c3cd9ba70b59fdfc6232f9662ada5 Mon Sep 17 00:00:00 2001
From: Antonio Frighetto <me at antoniofrighetto.com>
Date: Mon, 8 Apr 2024 08:56:59 +0200
Subject: [PATCH 1/2] [Inline][Cloning] Introduce test for PR87963 (NFC)

---
 .../Inline/inline-deferred-instsimplify.ll    | 76 +++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 llvm/test/Transforms/Inline/inline-deferred-instsimplify.ll

diff --git a/llvm/test/Transforms/Inline/inline-deferred-instsimplify.ll b/llvm/test/Transforms/Inline/inline-deferred-instsimplify.ll
new file mode 100644
index 00000000000000..4a9c576f02719e
--- /dev/null
+++ b/llvm/test/Transforms/Inline/inline-deferred-instsimplify.ll
@@ -0,0 +1,76 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=inline -S | FileCheck %s
+; RUN: opt < %s -passes='cgscc(inline)' -S | FileCheck %s
+
+%struct.a = type { i32, i32, i32, i32, i32 }
+
+ at g_var = global %struct.a { i32 1, i32 0, i32 0, i32 0, i32 0 }, align 8
+ at other_g_var = global %struct.a zeroinitializer, align 4
+
+define void @callee(ptr noundef byval(%struct.a) align 8 %ptr) {
+; CHECK-LABEL: define void @callee(
+; CHECK-SAME: ptr noundef byval([[STRUCT_A:%.*]]) align 8 [[PTR:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[PTR]], align 8
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i32 [[VAL]], 0
+; CHECK-NEXT:    br i1 [[DOTNOT]], label [[CHECK_POINTERS_ARE_EQUAL:%.*]], label [[STORE_PTR_IN_GVAR:%.*]]
+; CHECK:       store_ptr_in_gvar:
+; CHECK-NEXT:    store ptr [[PTR]], ptr @other_g_var, align 8
+; CHECK-NEXT:    br label [[CHECK_POINTERS_ARE_EQUAL]]
+; CHECK:       check_pointers_are_equal:
+; CHECK-NEXT:    [[PHI:%.*]] = phi ptr [ [[PTR]], [[STORE_PTR_IN_GVAR]] ], [ @other_g_var, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[DOTNOT1:%.*]] = icmp eq ptr [[PHI]], [[PTR]]
+; CHECK-NEXT:    br i1 [[DOTNOT1]], label [[RETURN:%.*]], label [[ABORT:%.*]]
+; CHECK:       abort:
+; CHECK-NEXT:    call void @abort()
+; CHECK-NEXT:    unreachable
+; CHECK:       return:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %val = load i32, ptr %ptr, align 8
+  %.not = icmp eq i32 %val, 0
+  br i1 %.not, label %check_pointers_are_equal, label %store_ptr_in_gvar
+
+store_ptr_in_gvar:                                ; preds = %entry
+  store ptr %ptr, ptr @other_g_var, align 8
+  br label %check_pointers_are_equal
+
+check_pointers_are_equal:                         ; preds = %store_ptr_in_gvar, %entry
+  %phi = phi ptr [ %ptr, %store_ptr_in_gvar ], [ @other_g_var, %entry ]
+; FIXME: While inlining, the following is miscompiled to i1 false,
+; as %ptr in the phi-node is not taken into account.
+  %.not1 = icmp eq ptr %phi, %ptr
+  br i1 %.not1, label %return, label %abort
+
+abort:                                            ; preds = %check_pointers_are_equal
+  call void @abort()
+  unreachable
+
+return:                                           ; preds = %check_pointers_are_equal
+  ret void
+}
+
+define i32 @main() {
+; CHECK-LABEL: define i32 @main() {
+; CHECK-NEXT:    [[G_VAR:%.*]] = alloca [[STRUCT_A:%.*]], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 20, ptr [[G_VAR]])
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[G_VAR]], ptr align 1 @g_var, i64 20, i1 false)
+; CHECK-NEXT:    [[VAL_I:%.*]] = load i32, ptr [[G_VAR]], align 8
+; CHECK-NEXT:    [[DOTNOT_I:%.*]] = icmp eq i32 [[VAL_I]], 0
+; CHECK-NEXT:    br i1 [[DOTNOT_I]], label [[CHECK_POINTERS_ARE_EQUAL_I:%.*]], label [[STORE_PTR_IN_GVAR_I:%.*]]
+; CHECK:       store_ptr_in_gvar.i:
+; CHECK-NEXT:    store ptr [[G_VAR]], ptr @other_g_var, align 8
+; CHECK-NEXT:    br label [[CHECK_POINTERS_ARE_EQUAL_I]]
+; CHECK:       check_pointers_are_equal.i:
+; CHECK-NEXT:    [[PHI_I:%.*]] = phi ptr [ [[G_VAR]], [[STORE_PTR_IN_GVAR_I]] ], [ @other_g_var, [[TMP0:%.*]] ]
+; CHECK-NEXT:    call void @abort()
+; CHECK-NEXT:    unreachable
+; CHECK:       callee.exit:
+; CHECK-NEXT:    ret i32 0
+;
+  call void @callee(ptr noundef byval(%struct.a) align 8 @g_var)
+  ret i32 0
+}
+
+declare void @abort()

>From fc09adbe8e335647dd5a19ecfa4cd5540a3c409c Mon Sep 17 00:00:00 2001
From: Antonio Frighetto <me at antoniofrighetto.com>
Date: Mon, 8 Apr 2024 08:57:38 +0200
Subject: [PATCH 2/2] [Inline][Cloning] Defer constant propagation after
 phi-nodes resolution

A logic issue arose when inlining via `CloneAndPruneFunctionInto`,
which, besides cloning, performs basic constant propagation as well.
By the time a new cloned instruction is being simplified, phi-nodes
are not remapped yet as the whole CFG needs to be processed first.
As `VMap` state at this stage is incomplete, `threadCmpOverPHI` and
similar could lead to unsound optimizations. This issue has been
addressed by postponing simplification once phi-nodes are revisited.

Fixes: https://github.com/llvm/llvm-project/issues/87534.
---
 llvm/lib/Transforms/Utils/CloneFunction.cpp   | 104 +++++++-----------
 .../Inline/dynamic-alloca-simplified-large.ll |   2 +
 .../Inline/inline-deferred-instsimplify.ll    |   6 +-
 .../Transforms/Inline/prof-update-instr.ll    |   2 +-
 .../Inline/prof-update-sample-alwaysinline.ll |   1 -
 .../Transforms/Inline/prof-update-sample.ll   |   1 -
 6 files changed, 46 insertions(+), 70 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp
index 3eac726994ae13..627174d1b354e4 100644
--- a/llvm/lib/Transforms/Utils/CloneFunction.cpp
+++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp
@@ -536,29 +536,10 @@ void PruningFunctionCloner::CloneBlock(
     // Eagerly remap operands to the newly cloned instruction, except for PHI
     // nodes for which we defer processing until we update the CFG. Also defer
     // debug intrinsic processing because they may contain use-before-defs.
-    if (!isa<PHINode>(NewInst) && !isa<DbgVariableIntrinsic>(NewInst)) {
+    if (!isa<PHINode>(NewInst) && !isa<DbgVariableIntrinsic>(NewInst))
       RemapInstruction(NewInst, VMap,
                        ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges);
 
-      // If we can simplify this instruction to some other value, simply add
-      // a mapping to that value rather than inserting a new instruction into
-      // the basic block.
-      if (Value *V =
-              simplifyInstruction(NewInst, BB->getModule()->getDataLayout())) {
-        // On the off-chance that this simplifies to an instruction in the old
-        // function, map it back into the new function.
-        if (NewFunc != OldFunc)
-          if (Value *MappedV = VMap.lookup(V))
-            V = MappedV;
-
-        if (!NewInst->mayHaveSideEffects()) {
-          VMap[&*II] = V;
-          NewInst->eraseFromParent();
-          continue;
-        }
-      }
-    }
-
     if (II->hasName())
       NewInst->setName(II->getName() + NameSuffix);
     VMap[&*II] = NewInst; // Add instruction map to value.
@@ -823,52 +804,45 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
     }
   }
 
-  // Make a second pass over the PHINodes now that all of them have been
-  // remapped into the new function, simplifying the PHINode and performing any
-  // recursive simplifications exposed. This will transparently update the
-  // WeakTrackingVH in the VMap. Notably, we rely on that so that if we coalesce
-  // two PHINodes, the iteration over the old PHIs remains valid, and the
-  // mapping will just map us to the new node (which may not even be a PHI
-  // node).
-  const DataLayout &DL = NewFunc->getParent()->getDataLayout();
-  SmallSetVector<const Value *, 8> Worklist;
-  for (unsigned Idx = 0, Size = PHIToResolve.size(); Idx != Size; ++Idx)
-    if (isa<PHINode>(VMap[PHIToResolve[Idx]]))
-      Worklist.insert(PHIToResolve[Idx]);
-
-  // Note that we must test the size on each iteration, the worklist can grow.
-  for (unsigned Idx = 0; Idx != Worklist.size(); ++Idx) {
-    const Value *OrigV = Worklist[Idx];
-    auto *I = dyn_cast_or_null<Instruction>(VMap.lookup(OrigV));
-    if (!I)
-      continue;
-
-    // Skip over non-intrinsic callsites, we don't want to remove any nodes from
-    // the CGSCC.
-    CallBase *CB = dyn_cast<CallBase>(I);
-    if (CB && CB->getCalledFunction() &&
-        !CB->getCalledFunction()->isIntrinsic())
-      continue;
-
-    // See if this instruction simplifies.
-    Value *SimpleV = simplifyInstruction(I, DL);
-    if (!SimpleV)
-      continue;
-
-    // Stash away all the uses of the old instruction so we can check them for
-    // recursive simplifications after a RAUW. This is cheaper than checking all
-    // uses of To on the recursive step in most cases.
-    for (const User *U : OrigV->users())
-      Worklist.insert(cast<Instruction>(U));
-
-    // Replace the instruction with its simplified value.
-    I->replaceAllUsesWith(SimpleV);
+  auto GetOriginalValueInVMap = [&](const auto &I) -> Value * {
+    for (const auto &[OrigI, NewI] : VMap) {
+      if (NewI == I)
+        return const_cast<Value *>(OrigI);
+    }
+    return nullptr;
+  };
 
-    // If the original instruction had no side effects, remove it.
-    if (isInstructionTriviallyDead(I))
-      I->eraseFromParent();
-    else
-      VMap[OrigV] = I;
+  // As phi-nodes have been now remapped, allow incremental simplification of
+  // newly-cloned instructions.
+  const DataLayout &DL = NewFunc->getParent()->getDataLayout();
+  auto *NewEntryBlock = cast<BasicBlock>(VMap[StartingBB]);
+  ReversePostOrderTraversal<BasicBlock *> RPOT(NewEntryBlock);
+  for (auto &BB : RPOT) {
+    for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;) {
+      auto *NewI = &*II++;
+      if (isa<DbgInfoIntrinsic>(NewI))
+        continue;
+
+      CallBase *CB = dyn_cast<CallBase>(NewI);
+      if (CB && CB->getCalledFunction() &&
+          !CB->getCalledFunction()->isIntrinsic())
+        continue;
+
+      if (Value *V = simplifyInstruction(NewI, DL)) {
+        if (!NewI->use_empty())
+          NewI->replaceAllUsesWith(V);
+
+        if (isInstructionTriviallyDead(NewI)) {
+          NewI->eraseFromParent();
+        } else {
+          // Did not erase it? Restore the new instruction into VMap.
+          auto *OriginalI = GetOriginalValueInVMap(NewI);
+          assert(OriginalI != nullptr &&
+                 "Previously remapped value was not found?");
+          VMap[OriginalI] = NewI;
+        }
+      }
+    }
   }
 
   // Remap debug intrinsic operands now that all values have been mapped.
diff --git a/llvm/test/Transforms/Inline/dynamic-alloca-simplified-large.ll b/llvm/test/Transforms/Inline/dynamic-alloca-simplified-large.ll
index 9b293d39c85f95..54e2010f0b0e9f 100644
--- a/llvm/test/Transforms/Inline/dynamic-alloca-simplified-large.ll
+++ b/llvm/test/Transforms/Inline/dynamic-alloca-simplified-large.ll
@@ -107,6 +107,8 @@ define void @caller3_alloca_not_in_entry(ptr %p1, i1 %b) {
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i1 [[B]], true
 ; CHECK-NEXT:    br i1 [[COND]], label [[EXIT:%.*]], label [[SPLIT:%.*]]
 ; CHECK:       split:
+; CHECK-NEXT:    [[SAVEDSTACK:%.*]] = call ptr @llvm.stacksave.p0()
+; CHECK-NEXT:    call void @llvm.stackrestore.p0(ptr [[SAVEDSTACK]])
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/Inline/inline-deferred-instsimplify.ll b/llvm/test/Transforms/Inline/inline-deferred-instsimplify.ll
index 4a9c576f02719e..f02d03688f0397 100644
--- a/llvm/test/Transforms/Inline/inline-deferred-instsimplify.ll
+++ b/llvm/test/Transforms/Inline/inline-deferred-instsimplify.ll
@@ -38,8 +38,6 @@ store_ptr_in_gvar:                                ; preds = %entry
 
 check_pointers_are_equal:                         ; preds = %store_ptr_in_gvar, %entry
   %phi = phi ptr [ %ptr, %store_ptr_in_gvar ], [ @other_g_var, %entry ]
-; FIXME: While inlining, the following is miscompiled to i1 false,
-; as %ptr in the phi-node is not taken into account.
   %.not1 = icmp eq ptr %phi, %ptr
   br i1 %.not1, label %return, label %abort
 
@@ -64,9 +62,13 @@ define i32 @main() {
 ; CHECK-NEXT:    br label [[CHECK_POINTERS_ARE_EQUAL_I]]
 ; CHECK:       check_pointers_are_equal.i:
 ; CHECK-NEXT:    [[PHI_I:%.*]] = phi ptr [ [[G_VAR]], [[STORE_PTR_IN_GVAR_I]] ], [ @other_g_var, [[TMP0:%.*]] ]
+; CHECK-NEXT:    [[DOTNOT1_I:%.*]] = icmp eq ptr [[PHI_I]], [[G_VAR]]
+; CHECK-NEXT:    br i1 [[DOTNOT1_I]], label [[CALLEE_EXIT:%.*]], label [[ABORT_I:%.*]]
+; CHECK:       abort.i:
 ; CHECK-NEXT:    call void @abort()
 ; CHECK-NEXT:    unreachable
 ; CHECK:       callee.exit:
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 20, ptr [[G_VAR]])
 ; CHECK-NEXT:    ret i32 0
 ;
   call void @callee(ptr noundef byval(%struct.a) align 8 @g_var)
diff --git a/llvm/test/Transforms/Inline/prof-update-instr.ll b/llvm/test/Transforms/Inline/prof-update-instr.ll
index 38dfa67dcacdc5..01a515bc8e293e 100644
--- a/llvm/test/Transforms/Inline/prof-update-instr.ll
+++ b/llvm/test/Transforms/Inline/prof-update-instr.ll
@@ -52,6 +52,6 @@ define void @caller() !prof !21 {
 !21 = !{!"function_entry_count", i64 400}
 attributes #0 = { alwaysinline }
 ; CHECK: ![[ENTRY_COUNT]] = !{!"function_entry_count", i64 600}
-; CHECK: ![[COUNT_IND_CALLEE1]] = !{!"VP", i32 0, i64 200, i64 111, i64 100, i64 222, i64 60, i64 333, i64 40}
+; CHECK: ![[COUNT_IND_CALLEE1]] = !{!"VP", i32 0, i64 120, i64 111, i64 60, i64 222, i64 36, i64 333, i64 24}
 ; CHECK: ![[COUNT_IND_CALLEE]] = !{!"VP", i32 0, i64 84, i64 111, i64 48, i64 222, i64 24, i64 333, i64 12}
 ; CHECK: ![[COUNT_IND_CALLER]] = !{!"VP", i32 0, i64 56, i64 111, i64 32, i64 222, i64 16, i64 333, i64 8}
diff --git a/llvm/test/Transforms/Inline/prof-update-sample-alwaysinline.ll b/llvm/test/Transforms/Inline/prof-update-sample-alwaysinline.ll
index d6b771e2629d2e..8af4d89663a4da 100644
--- a/llvm/test/Transforms/Inline/prof-update-sample-alwaysinline.ll
+++ b/llvm/test/Transforms/Inline/prof-update-sample-alwaysinline.ll
@@ -53,7 +53,6 @@ define void @caller() {
 !18 = !{!"VP", i32 0, i64 140, i64 111, i64 80, i64 222, i64 40, i64 333, i64 20}
 attributes #0 = { alwaysinline }
 ; CHECK: ![[ENTRY_COUNT]] = !{!"function_entry_count", i64 600}
-; CHECK: ![[COUNT_CALLEE1]] = !{!"branch_weights", i32 2000}
 ; CHECK: ![[COUNT_CALLEE]] = !{!"branch_weights", i32 1200}
 ; CHECK: ![[COUNT_IND_CALLEE]] = !{!"VP", i32 0, i64 84, i64 111, i64 48, i64 222, i64 24, i64 333, i64 12}
 ; CHECK: ![[COUNT_CALLER]] = !{!"branch_weights", i32 800}
diff --git a/llvm/test/Transforms/Inline/prof-update-sample.ll b/llvm/test/Transforms/Inline/prof-update-sample.ll
index 6cdd70e84e0c6d..e09b859b698120 100644
--- a/llvm/test/Transforms/Inline/prof-update-sample.ll
+++ b/llvm/test/Transforms/Inline/prof-update-sample.ll
@@ -52,7 +52,6 @@ define void @caller() {
 !17 = !{!"branch_weights", i32 400}
 !18 = !{!"VP", i32 0, i64 140, i64 111, i64 80, i64 222, i64 40, i64 333, i64 20}
 ; CHECK: ![[ENTRY_COUNT]] = !{!"function_entry_count", i64 600}
-; CHECK: ![[COUNT_CALLEE1]] = !{!"branch_weights", i32 2000}
 ; CHECK: ![[COUNT_CALLEE]] = !{!"branch_weights", i32 1200}
 ; CHECK: ![[COUNT_IND_CALLEE]] = !{!"VP", i32 0, i64 84, i64 111, i64 48, i64 222, i64 24, i64 333, i64 12}
 ; CHECK: ![[COUNT_CALLER]] = !{!"branch_weights", i32 800}



More information about the llvm-commits mailing list