[llvm] b827e7c - [SelectOpti] Restrict load sinking

Sotiris Apostolakis via llvm-commits llvm-commits at lists.llvm.org
Fri Sep 16 13:55:10 PDT 2022


Author: Sotiris Apostolakis
Date: 2022-09-16T20:50:46Z
New Revision: b827e7c6007476504af67290cda2935810dfcacd

URL: https://github.com/llvm/llvm-project/commit/b827e7c6007476504af67290cda2935810dfcacd
DIFF: https://github.com/llvm/llvm-project/commit/b827e7c6007476504af67290cda2935810dfcacd.diff

LOG: [SelectOpti] Restrict load sinking

This is a follow-up to D133777, which resolved a use-after-free case but
did not cover all possible memory bugs due to misplacement of loads.

In short, the overall problem was that sunk loads could be moved past
state-modifying instructions, leading to memory bugs.

The solution is to restrict load sinking unless it is found to be sound.
i) Within a basic block (the to-be-sunk load and its select user are in the
same BB), a load can be sunk only if there is no intervening state-modifying
instruction. This is a conservative approach that avoids resorting to alias
analysis to detect potential memory overlap (a small IR sketch of this hazard
follows below).
ii) Across basic blocks, load sinking is avoided. Scanning multiple basic
blocks for memory conflicts could be computationally expensive and is also
unlikely to allow the loads to sink. Further, experiments showed that not
sinking these loads has a slight positive performance effect.
Perhaps for some of these loads, the extra separation gives the load enough
time to complete before its user needs the value. This is not the case for
floating-point operations, which benefit more from sinking.
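
For illustration only (not part of this commit), here is a minimal IR sketch
of the within-BB hazard, analogous to the new @expensive_val_operand4 test
added below; @free is assumed to be declared by the user:

  define i32 @sink_hazard(ptr %a, i32 %b, i32 %y, i1 %cmp) {
    %load = load i32, ptr %a, align 4
    ; State-modifying call between the load and its select user: if %load
    ; were sunk into the select's true side, it would execute after the
    ; free and read freed memory.
    call void @free(ptr %a)
    %x = add i32 %load, %b
    %sel = select i1 %cmp, i32 %x, i32 %y
    ret i32 %sel
  }
  declare void @free(ptr)

With the isSafeToSinkLoad() check added here, such a load stays in place and
only the add is considered for sinking (partial slice sinking).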

This patch essentially undoes the solution in D133777, since this patch is a
complete solution to the observed problem.

Overall, the performance impact of this patch is minimal.
It was tested on two internal Google workloads with instrumentation PGO.
A search application showed a <0.05% performance difference, while a
database workload showed a slight but not statistically significant
improvement.

Reviewed By: davidxl

Differential Revision: https://reviews.llvm.org/D133999

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectOptimize.cpp
    llvm/test/CodeGen/X86/select-optimize.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp
index ff1df22a48f7f..3fdf1fcd0e93d 100644
--- a/llvm/lib/CodeGen/SelectOptimize.cpp
+++ b/llvm/lib/CodeGen/SelectOptimize.cpp
@@ -29,7 +29,6 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
-#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/ProfDataUtils.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
@@ -182,7 +181,7 @@ class SelectOptimize : public FunctionPass {
   // consisting of instructions exclusively computed for producing the operands
   // of the source instruction.
   void getExclBackwardsSlice(Instruction *I, std::stack<Instruction *> &Slice,
-                             bool ForSinking = false);
+                             Instruction *SI, bool ForSinking = false);
 
   // Returns true if the condition of the select is highly predictable.
   bool isSelectHighlyPredictable(const SelectInst *SI);
@@ -377,13 +376,13 @@ void SelectOptimize::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
       // false operands.
       if (auto *TI = dyn_cast<Instruction>(SI->getTrueValue())) {
         std::stack<Instruction *> TrueSlice;
-        getExclBackwardsSlice(TI, TrueSlice, true);
+        getExclBackwardsSlice(TI, TrueSlice, SI, true);
         maxTrueSliceLen = std::max(maxTrueSliceLen, TrueSlice.size());
         TrueSlices.push_back(TrueSlice);
       }
       if (auto *FI = dyn_cast<Instruction>(SI->getFalseValue())) {
         std::stack<Instruction *> FalseSlice;
-        getExclBackwardsSlice(FI, FalseSlice, true);
+        getExclBackwardsSlice(FI, FalseSlice, SI, true);
         maxFalseSliceLen = std::max(maxFalseSliceLen, FalseSlice.size());
         FalseSlices.push_back(FalseSlice);
       }
@@ -470,22 +469,6 @@ void SelectOptimize::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
       auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
       FalseBranch->setDebugLoc(SI->getDebugLoc());
     }
-    // If there was sinking, move any lifetime-end intrinsic calls found in the
-    // StartBlock to the newly-created end block to ensure sound lifetime info
-    // after sinking (in case any use is sinked).
-    else {
-      SmallVector<Instruction *, 2> EndLifetimeCalls;
-      for (Instruction &I : *StartBlock) {
-        if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
-          if (II->getIntrinsicID() == Intrinsic::lifetime_end) {
-            EndLifetimeCalls.push_back(&I);
-          }
-        }
-      }
-      for (auto *LC : EndLifetimeCalls) {
-        LC->moveBefore(&*EndBlock->getFirstInsertionPt());
-      }
-    }
 
     // Insert the real conditional branch based on the original condition.
     // If we did not create a new block for one of the 'true' or 'false' paths
@@ -700,7 +683,7 @@ bool SelectOptimize::hasExpensiveColdOperand(
     }
     if (ColdI) {
       std::stack<Instruction *> ColdSlice;
-      getExclBackwardsSlice(ColdI, ColdSlice);
+      getExclBackwardsSlice(ColdI, ColdSlice, SI);
       InstructionCost SliceCost = 0;
       while (!ColdSlice.empty()) {
         SliceCost += TTI->getInstructionCost(ColdSlice.top(),
@@ -721,6 +704,22 @@ bool SelectOptimize::hasExpensiveColdOperand(
   return false;
 }
 
+// Check if it is safe to move LoadI next to the SI.
+// Conservatively assume it is safe only if there is no instruction
+// modifying memory in-between the load and the select instruction.
+static bool isSafeToSinkLoad(Instruction *LoadI, Instruction *SI) {
+  // Assume loads from different basic blocks are unsafe to move.
+  if (LoadI->getParent() != SI->getParent())
+    return false;
+  auto It = LoadI->getIterator();
+  while (&*It != SI) {
+    if (It->mayWriteToMemory())
+      return false;
+    It++;
+  }
+  return true;
+}
+
 // For a given source instruction, collect its backwards dependence slice
 // consisting of instructions exclusively computed for the purpose of producing
 // the operands of the source instruction. As an approximation
@@ -729,7 +728,7 @@ bool SelectOptimize::hasExpensiveColdOperand(
 // form an one-use chain that leads to the source instruction.
 void SelectOptimize::getExclBackwardsSlice(Instruction *I,
                                            std::stack<Instruction *> &Slice,
-                                           bool ForSinking) {
+                                           Instruction *SI, bool ForSinking) {
   SmallPtrSet<Instruction *, 2> Visited;
   std::queue<Instruction *> Worklist;
   Worklist.push(I);
@@ -751,6 +750,13 @@ void SelectOptimize::getExclBackwardsSlice(Instruction *I,
                        isa<SelectInst>(II) || isa<PHINode>(II)))
       continue;
 
+    // Avoid sinking loads in order not to skip state-modifying instructions,
+    // that may alias with the loaded address.
+    // Only allow sinking of loads within the same basic block that are
+    // conservatively proven to be safe.
+    if (ForSinking && II->mayReadFromMemory() && !isSafeToSinkLoad(II, SI))
+      continue;
+
     // Avoid considering instructions with less frequency than the source
     // instruction (i.e., avoid colder code regions of the dependence slice).
     if (BFI->getBlockFreq(II->getParent()) < BFI->getBlockFreq(I->getParent()))

diff --git a/llvm/test/CodeGen/X86/select-optimize.ll b/llvm/test/CodeGen/X86/select-optimize.ll
index ebc53bea6e13a..a53dd36d813e3 100644
--- a/llvm/test/CodeGen/X86/select-optimize.ll
+++ b/llvm/test/CodeGen/X86/select-optimize.ll
@@ -185,9 +185,8 @@ define i32 @expensive_val_operand2(ptr nocapture %a, i32 %x, i1 %cmp) {
   ret i32 %sel
 }
 
-; Cold value operand with load in its one-use dependence slice shoud result
+; Cold value operand with load in its one-use dependence slice should result
 ; into a branch with sinked dependence slice.
-; The lifetime-end intrinsics should be soundly preserved.
 define i32 @expensive_val_operand3(ptr nocapture %a, i32 %b, i32 %y, i1 %cmp) {
 ; CHECK-LABEL: @expensive_val_operand3(
 ; CHECK-NEXT:    [[SEL_FROZEN:%.*]] = freeze i1 [[CMP:%.*]]
@@ -198,7 +197,47 @@ define i32 @expensive_val_operand3(ptr nocapture %a, i32 %b, i32 %y, i1 %cmp) {
 ; CHECK-NEXT:    br label [[SELECT_END]]
 ; CHECK:       select.end:
 ; CHECK-NEXT:    [[SEL:%.*]] = phi i32 [ [[X]], [[SELECT_TRUE_SINK]] ], [ [[Y:%.*]], [[TMP0:%.*]] ]
+; CHECK-NEXT:    ret i32 [[SEL]]
+;
+  %load = load i32, ptr %a, align 8
+  %x = add i32 %load, %b
+  %sel = select i1 %cmp, i32 %x, i32 %y, !prof !17
+  ret i32 %sel
+}
+
+; Expensive cold value operand with unsafe-to-sink (due to func call) load (partial slice sinking).
+define i32 @expensive_val_operand4(ptr nocapture %a, i32 %b, i32 %y, i1 %cmp) {
+; CHECK-LABEL: @expensive_val_operand4(
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr [[A:%.*]], align 8
+; CHECK-NEXT:    call void @free(ptr [[A]])
+; CHECK-NEXT:    [[SEL_FROZEN:%.*]] = freeze i1 [[CMP:%.*]]
+; CHECK-NEXT:    br i1 [[SEL_FROZEN]], label [[SELECT_TRUE_SINK:%.*]], label [[SELECT_END:%.*]], !prof [[PROF18]]
+; CHECK:       select.true.sink:
+; CHECK-NEXT:    [[X:%.*]] = add i32 [[LOAD]], [[B:%.*]]
+; CHECK-NEXT:    br label [[SELECT_END]]
+; CHECK:       select.end:
+; CHECK-NEXT:    [[SEL:%.*]] = phi i32 [ [[X]], [[SELECT_TRUE_SINK]] ], [ [[Y:%.*]], [[TMP0:%.*]] ]
+; CHECK-NEXT:    ret i32 [[SEL]]
+;
+  %load = load i32, ptr %a, align 8
+  call void @free(ptr %a)
+  %x = add i32 %load, %b
+  %sel = select i1 %cmp, i32 %x, i32 %y, !prof !17
+  ret i32 %sel
+}
+
+; Expensive cold value operand with unsafe-to-sink (due to lifetime-end marker) load (partial slice sinking).
+define i32 @expensive_val_operand5(ptr nocapture %a, i32 %b, i32 %y, i1 %cmp) {
+; CHECK-LABEL: @expensive_val_operand5(
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr [[A:%.*]], align 8
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[A]])
+; CHECK-NEXT:    [[SEL_FROZEN:%.*]] = freeze i1 [[CMP:%.*]]
+; CHECK-NEXT:    br i1 [[SEL_FROZEN]], label [[SELECT_TRUE_SINK:%.*]], label [[SELECT_END:%.*]], !prof [[PROF18]]
+; CHECK:       select.true.sink:
+; CHECK-NEXT:    [[X:%.*]] = add i32 [[LOAD]], [[B:%.*]]
+; CHECK-NEXT:    br label [[SELECT_END]]
+; CHECK:       select.end:
+; CHECK-NEXT:    [[SEL:%.*]] = phi i32 [ [[X]], [[SELECT_TRUE_SINK]] ], [ [[Y:%.*]], [[TMP0:%.*]] ]
 ; CHECK-NEXT:    ret i32 [[SEL]]
 ;
   %load = load i32, ptr %a, align 8
@@ -208,9 +247,35 @@ define i32 @expensive_val_operand3(ptr nocapture %a, i32 %b, i32 %y, i1 %cmp) {
   ret i32 %sel
 }
 
+; Expensive cold value operand with potentially-unsafe-to-sink load (located
+; in a different basic block and thus unchecked for sinkability).
+define i32 @expensive_val_operand6(ptr nocapture %a, i32 %b, i32 %y, i1 %cmp) {
+; CHECK-LABEL: @expensive_val_operand6(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr [[A:%.*]], align 8
+; CHECK-NEXT:    br label [[BB1:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[SEL_FROZEN:%.*]] = freeze i1 [[CMP:%.*]]
+; CHECK-NEXT:    br i1 [[SEL_FROZEN]], label [[SELECT_TRUE_SINK:%.*]], label [[SELECT_END:%.*]], !prof [[PROF18]]
+; CHECK:       select.true.sink:
+; CHECK-NEXT:    [[X:%.*]] = add i32 [[LOAD]], [[B:%.*]]
+; CHECK-NEXT:    br label [[SELECT_END]]
+; CHECK:       select.end:
+; CHECK-NEXT:    [[SEL:%.*]] = phi i32 [ [[X]], [[SELECT_TRUE_SINK]] ], [ [[Y:%.*]], [[BB1]] ]
+; CHECK-NEXT:    ret i32 [[SEL]]
+;
+entry:
+  %load = load i32, ptr %a, align 8
+  br label %bb1
+bb1:                                 ; preds = %entry
+  %x = add i32 %load, %b
+  %sel = select i1 %cmp, i32 %x, i32 %y, !prof !17
+  ret i32 %sel
+}
+
 ; Multiple uses of the load value operand.
-define i32 @expensive_val_operand4(i32 %a, ptr nocapture %b, i32 %x, i1 %cmp) {
-; CHECK-LABEL: @expensive_val_operand4(
+define i32 @expensive_val_operand7(i32 %a, ptr nocapture %b, i32 %x, i1 %cmp) {
+; CHECK-LABEL: @expensive_val_operand7(
 ; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr [[B:%.*]], align 4
 ; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP:%.*]], i32 [[X:%.*]], i32 [[LOAD]]
 ; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[SEL]], [[LOAD]]
@@ -453,6 +518,8 @@ declare void @llvm.dbg.value(metadata, metadata, metadata)
 ; Function Attrs: argmemonly mustprogress nocallback nofree nosync nounwind willreturn
 declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
 
+declare void @free(ptr nocapture)
+
 !llvm.module.flags = !{!0, !26, !27}
 !0 = !{i32 1, !"ProfileSummary", !1}
 !1 = !{!2, !3, !4, !5, !6, !7, !8, !9}


        

