[clang] WIP Add suppress weak loop assumptions visitor (PR #111258)

Sat Oct 5 08:11:57 PDT 2024

llvmbot wrote:




@llvm/pr-subscribers-clang-static-analyzer-1

Author: Balazs Benics (steakhal)

<details>
<summary>Changes</summary>

This is an alternative implementation of the idea present in #109804.

In this implementation, the suppression is purely implemented by a `BugReportVisitor`, to avoid coupling the suppression with the engine itself.
The idea is to visit the bug-path, and along the way record each time we entered the basic-block that has a comparison in the end that drives the decision of entering or avoiding a loop construct.

We also record expressions that caused a state-split. These are basically the expressions that eagerly assume would force a split on - but this approach works even if eager-assumptions are disabled.

Once we collected all the data, we go over the loops that we saw, and check if the condition of that loop construct was evaluated N times. This then triggers a heuristic to decide if we should suppress this, e.g. because we "assumed" not to enter the loop, or "assumed" to iterate more than 2 times in the loop.


---

This is WIP, because I just scratched the idea, but it seems to work good enough to have some discussion about this, while refining it if I have some time later.

---
Full diff: https://github.com/llvm/llvm-project/pull/111258.diff


2 Files Affected:

- (modified) clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp (+164-1) 
- (modified) clang/test/Analysis/out-of-bounds.c (+149-1) 


``````````diff

diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
index 3f837564cf47c4..aafd44d037273d 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
@@ -11,18 +11,28 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "clang/AST/ASTContext.h"
 #include "clang/AST/CharUnits.h"
+#include "clang/AST/Expr.h"
 #include "clang/AST/ParentMapContext.h"
+#include "clang/AST/Stmt.h"
+#include "clang/ASTMatchers/ASTMatchFinder.h"
+#include "clang/ASTMatchers/ASTMatchers.h"
+#include "clang/Analysis/ProgramPoint.h"
 #include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h"
 #include "clang/StaticAnalyzer/Checkers/Taint.h"
+#include "clang/StaticAnalyzer/Core/BugReporter/BugReporterVisitors.h"
 #include "clang/StaticAnalyzer/Core/BugReporter/BugType.h"
 #include "clang/StaticAnalyzer/Core/Checker.h"
 #include "clang/StaticAnalyzer/Core/CheckerManager.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/APSIntType.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/DynamicExtent.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/ExplodedGraph.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h"
-#include "llvm/ADT/SmallString.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/ProgramState_Fwd.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/raw_ostream.h"
 #include <optional>
@@ -32,6 +42,158 @@ using namespace ento;
 using namespace taint;
 using llvm::formatv;
 
+namespace {
+
+class SuppressReportsHavingWeakLoopAssumption : public BugReporterVisitor {
+public:
+  SuppressReportsHavingWeakLoopAssumption(const PathSensitiveBugReport &R)
+      : OrigCursor(R.getErrorNode()) {}
+
+private:
+  friend class BugReporterVisitor;
+
+  // The exploded node we are currently visiting, but in the original exploded
+  // graph. This graph is not trimmed, unlike the one we usually visit.
+  const ExplodedNode *OrigCursor;
+
+  llvm::DenseSet<const Stmt *> LoopsSeen;
+  llvm::DenseMap<const Expr *, unsigned> NumTimesTakenTrueDecision;
+  llvm::DenseMap<const Expr *, unsigned> NumTimesTakenFalseDecision;
+
+  void Profile(llvm::FoldingSetNodeID &ID) const final {
+    static const bool Tag = 0;
+    ID.AddPointer(&Tag);
+  }
+
+  static const ExplodedNode *selectMatchingPred(const ExplodedNode *OrigSucc,
+                                                unsigned SoughtPredId) {
+    auto MatchesSoughtId = [SoughtPredId](const ExplodedNode *N) {
+      return N->getID() == SoughtPredId;
+    };
+    auto It = llvm::find_if(OrigSucc->preds(), MatchesSoughtId);
+    assert(It != OrigSucc->pred_end());
+    return *It;
+  }
+
+  static bool isLoopAndNonNull(const Stmt *S) {
+    return isa_and_nonnull<ForStmt, WhileStmt, CXXForRangeStmt, DoStmt>(S);
+  }
+
+  void registerLoopStatements(const ExplodedNode *N) {
+    if (auto Entrance = N->getLocation().getAs<BlockEntrance>()) {
+      const Stmt *TermStmt = Entrance->getBlock()->getTerminatorStmt();
+      if (isLoopAndNonNull(TermStmt))
+        LoopsSeen.insert(TermStmt);
+    }
+  }
+
+  void registerOccurrence(llvm::DenseMap<const Expr *, unsigned> &Map,
+                          const Expr *Key) {
+    if (auto [Place, Inserted] = Map.try_emplace(Key, 1); !Inserted)
+      ++Place->second;
+  }
+
+  PathDiagnosticPieceRef VisitNode(const ExplodedNode *Succ,
+                                   BugReporterContext &BRC,
+                                   PathSensitiveBugReport &BR) final {
+    OrigCursor = selectMatchingPred(OrigCursor, Succ->getID());
+    assert(OrigCursor->getID() == Succ->getID());
+
+    registerLoopStatements(Succ);
+
+    auto AsPostStmt = Succ->getLocationAs<PostStmt>();
+    const Expr *CurrExpr =
+        AsPostStmt ? dyn_cast<Expr>(AsPostStmt->getStmt()) : nullptr;
+
+    bool IsSplittingTwoWays = OrigCursor->succ_size() == 2;
+
+    if (!CurrExpr || !IsSplittingTwoWays)
+      return nullptr;
+
+    const ExplodedNode *Next = Succ->getFirstSucc();
+    SValBuilder &SVB = Succ->getState()->getStateManager().getSValBuilder();
+    auto Query = SVB.evalCast(Succ->getSVal(CurrExpr), SVB.getConditionType(),
+                              CurrExpr->getType())
+                     .getAs<DefinedOrUnknownSVal>();
+    if (!Query)
+      return nullptr;
+
+    ProgramStateRef TakenTrueBranch = Next->getState()->assume(*Query, true);
+    registerOccurrence(TakenTrueBranch ? NumTimesTakenTrueDecision
+                                       : NumTimesTakenFalseDecision,
+                       CurrExpr);
+
+    return nullptr;
+  }
+
+  static const Expr *getCond(const Stmt *S) {
+    assert(isLoopAndNonNull(S));
+    switch (S->getStmtClass()) {
+    case Stmt::StmtClass::ForStmtClass:
+      return cast<ForStmt>(S)->getCond();
+    case Stmt::StmtClass::WhileStmtClass:
+      return cast<WhileStmt>(S)->getCond();
+    case Stmt::StmtClass::CXXForRangeStmtClass:
+      return cast<CXXForRangeStmt>(S)->getCond();
+    case Stmt::StmtClass::DoStmtClass:
+      return cast<DoStmt>(S)->getCond();
+    default:
+      return nullptr;
+    }
+  }
+
+  void trySuppressReport(PathSensitiveBugReport &R, const Stmt *Loop,
+                         const Expr *CondSubExpr) const {
+    unsigned NumTimesTakenLoopBody =
+        NumTimesTakenTrueDecision.lookup(CondSubExpr);
+    unsigned NumTimesNotTakenLoopBody =
+        NumTimesTakenFalseDecision.lookup(CondSubExpr);
+
+    // Adjust for do-while loops, where the loop body is always taken.
+    if (isa<DoStmt>(Loop))
+      NumTimesTakenLoopBody += 1;
+
+    // Suppress the report if it avoided a loop with an eager assumption.
+    if (NumTimesTakenLoopBody == 0 && NumTimesNotTakenLoopBody == 1) {
+      // llvm::errs() << "eagerly decided never taking the loop body\n";
+      R.markInvalid("eagerly decided never taking the loop body", CondSubExpr);
+      return;
+    }
+
+    // Suppress the report if it iterates with eager assumptions 3 or more
+    // times.
+    if (NumTimesTakenLoopBody > 2 && NumTimesNotTakenLoopBody == 0) {
+      // llvm::errs() << "eagerly taken the loop body at least 2 times\n";
+      R.markInvalid("eagerly taken the loop body at least 2 times",
+                    CondSubExpr);
+    }
+  }
+
+  void finalizeVisitor(BugReporterContext &BRC, const ExplodedNode *,
+                       PathSensitiveBugReport &R) final {
+    ASTContext &Ctx = BRC.getASTContext();
+
+    // Go over all the loops we have seen (either avoided or entered), and check
+    // if the condition of such loops were eagerly decided.
+    for (const Stmt *Loop : LoopsSeen) {
+      if (const Expr *Cond = getCond(Loop)) {
+        // Let's try all sub-exprs to cover cases that use short-circuiting.
+        // We need to check if any sub-expression of the condition was eagerly
+        // decided, because the '&&' and '||' logical operators could
+        // short-circuit, thus the expression on which we recorded the
+        // "eager decision" was a sub-expression of the Loop condition.
+        using namespace ast_matchers;
+        for (auto Binding : match(findAll(expr().bind("e")), *Cond, Ctx)) {
+          trySuppressReport(R, Loop, Binding.getNodeAs<Expr>("e"));
+          if (!R.isValid())
+            return;
+        }
+      }
+    }
+  }
+};
+} // namespace
+
 namespace {
 /// If `E` is a "clean" array subscript expression, return the type of the
 /// accessed element. If the base of the subscript expression is modified by
@@ -722,6 +884,7 @@ void ArrayBoundCheckerV2::reportOOB(CheckerContext &C,
   if (Extent)
     markPartsInteresting(*BR, ErrorState, *Extent, IsTaintBug);
 
+  BR->addVisitor<SuppressReportsHavingWeakLoopAssumption>(*BR);
   C.emitReport(std::move(BR));
 }
 
diff --git a/clang/test/Analysis/out-of-bounds.c b/clang/test/Analysis/out-of-bounds.c
index 1f771c2b3bd138..1b62b2eed4ee84 100644
--- a/clang/test/Analysis/out-of-bounds.c
+++ b/clang/test/Analysis/out-of-bounds.c
@@ -1,4 +1,9 @@
-// RUN: %clang_analyze_cc1 -Wno-array-bounds -analyzer-checker=core,alpha.security.ArrayBoundV2,debug.ExprInspection -verify %s
+// RUN: %clang_analyze_cc1 -Wno-array-bounds -analyzer-checker=core,alpha.security.ArrayBoundV2,debug.ExprInspection -verify=expected,eagerlyassume %s
+// RUN: %clang_analyze_cc1 -Wno-array-bounds -analyzer-checker=core,alpha.security.ArrayBoundV2,debug.ExprInspection -analyzer-config eagerly-assume=false -verify %s
+
+// Note that eagerly-assume=false is tested separately because the
+// WeakLoopAssumption suppression heuristic uses different code paths to
+// achieve the same result with and without eagerly-assume.
 
 void clang_analyzer_eval(int);
 
@@ -194,3 +199,146 @@ char test_comparison_with_extent_symbol(struct incomplete *p) {
   return ((char *)p)[-1]; // no-warning
 }
 
+// WeakLoopAssumption suppression
+///////////////////////////////////////////////////////////////////////
+
+int GlobalArray[100];
+int loop_suppress_after_zero_iterations(unsigned len) {
+  for (unsigned i = 0; i < len; i++)
+    if (GlobalArray[i] > 0)
+      return GlobalArray[i];
+  // Previously this would have produced an overflow warning because splitting
+  // the state on the loop condition introduced an execution path where the
+  // analyzer thinks that len == 0.
+  // There are many situations where the programmer knows that an argument is
+  // positive, but this is not indicated in the source code, so we must avoid
+  // reporting errors (especially out of bounds errors) on these branches,
+  // because otherwise we'd get prohibitively many false positives.
+  return GlobalArray[len - 1]; // no-warning
+}
+
+void loop_report_in_second_iteration(int len) {
+  int buf[1] = {0};
+  for (int i = 0; i < len; i++) {
+    // When a programmer writes a loop, we may assume that they intended at
+    // least two iterations.
+    buf[i] = 1; // expected-warning{{Out of bound access to memory}}
+  }
+}
+
+void loop_suppress_in_third_iteration(int len) {
+  int buf[2] = {0};
+  for (int i = 0; i < len; i++) {
+    // We should suppress array bounds errors on the third and later iterations
+    // of loops, because sometimes programmers write a loop in sitiuations
+    // where they know that there will be at most two iterations, but the
+    // analyzer cannot deduce this property.
+    buf[i] = 1; // no-warning
+  }
+}
+
+int no_suppression_when_no_assumption_zero_iterations(unsigned len) {
+  if (len != 0) {
+    // This 'if' introduces a state split between len == 0 and len != 0.
+  }
+
+  // On the branch where we assumed that len is zero, this loop will be
+  // skipped. We (intentionally) don't suppress this execution path becasue
+  // here the analyzer doesn't assume anything new when it evaluates the loop
+  // condition.
+  for (unsigned i = 0; i < len; i++)
+    if (GlobalArray[i] > 0)
+      return GlobalArray[i];
+
+  return GlobalArray[len - 1]; // expected-warning{{Out of bound access to memory}}
+}
+
+void no_suppression_when_no_assumption_third_iteration(int len) {
+  if (len < 20) {
+    // This 'if' introduces a state split with len >= 20 on one branch.
+  }
+
+  int buf[2] = {0};
+  for (int i = 0; i < len; i++) {
+    // As in no_suppression_when_no_assumption_zero_iterations, the suppression
+    // only activates when the analyzer assumes something new in the loop
+    // condition. On the branch where `len >= 20` entering the third iteration
+    // doesn't involve a new assumption, so this warning is not suppressed:
+    buf[i] = 1; // expected-warning{{Out of bound access to memory}}
+  }
+}
+
+void loop_suppress_in_third_iteration_logical_and(int len, int flag) {
+  int buf[2] = {0};
+  for (int i = 0; i < len && flag; i++) {
+    // FIXME: In this case the checker should suppress the warning the same way
+    // as it's suppressed in loop_suppress_in_third_iteration, but the
+    // suppression is not activated because the terminator statement associated
+    // with the loop is just the expression 'flag', while 'i < len' is a
+    // separate terminator statement that's associated with the
+    // short-circuiting operator '&&'.
+    // I have seen a real-world FP that looks like this, but it is much rarer
+    // than the basic setup.
+    buf[i] = 1; // no-warning
+  }
+}
+
+void loop_suppress_in_third_iteration_logical_and_2(int len, int flag) {
+  int buf[2] = {0};
+  for (int i = 0; flag && i < len; i++) {
+    // If the two operands of '&&' are flipped, the suppression works, because
+    // then 'flag' is the terminator statement associated with '&&' (which
+    // determines whether the short-circuiting happens or not) and 'i < len' is
+    // the terminator statement of the loop itself.
+    buf[i] = 1; // no-warning
+  }
+}
+
+void loop_suppress_in_third_iteration_cast(int len) {
+  int buf[2] = {0};
+  for (int i = 0; (unsigned)(i < len); i++) {
+    // The behavior of this suppression is slightly different under
+    // eagerly-assume=true (the default) and eagerly-assume=false:
+    // * When eager assumptions are disabled, it's enough to look for cases
+    //   where we get two non-null states from splitting the state over the
+    //   'SVal' that represents the full loop condition. 
+    // * When eager assumptions are enabled, we also accept situations when the
+    //   loop condition expression triggers an eager state split and therefore
+    //   we won't see a state split at the "normal" point because it's executed
+    //   on two already separated execution paths.
+    // However, for the sake of simplicity we don't activate the suppression in
+    // cases when _a subexpression_ of the loop condition triggers an eager
+    // assumption. There are already many differences between analysis with and
+    // without eager assumptions, so it would be pointless to write more
+    // complicated code to eliminate these rare differences.
+    buf[i] = 1; // no-warning
+  }
+}
+
+int coinflip(void);
+int do_while_report_after_one_iteration(void) {
+  int i = 0;
+  do {
+    i++;
+  } while (coinflip());
+  // Unlike `loop_suppress_after_zero_iterations`, running just one iteration
+  // in a do-while is not a corner case that would produce too many false
+  // positives, so don't suppress bounds errors in these situations.
+  return GlobalArray[i-2]; // expected-warning{{Out of bound access to memory}}
+}
+
+void do_while_report_in_second_iteration(int len) {
+  int buf[1] = {0};
+  int i = 0;
+  do {
+    buf[i] = 1; // expected-warning{{Out of bound access to memory}}
+  } while (i++ < len);
+}
+
+void do_while_suppress_in_third_iteration(int len) {
+  int buf[2] = {0};
+  int i = 0;
+  do {
+    buf[i] = 1; // no-warning
+  } while (i++ < len);
+}

``````````

</details>


https://github.com/llvm/llvm-project/pull/111258