[clang] WIP Add suppress weak loop assumptions visitor (PR #111258)
via cfe-commits
cfe-commits at lists.llvm.org
Sat Oct 5 08:11:57 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-clang-static-analyzer-1
Author: Balazs Benics (steakhal)
<details>
<summary>Changes</summary>
This is an alternative implementation of the idea present in #<!-- -->109804.
In this implementation, the suppression is purely implemented by a `BugReportVisitor`, to avoid coupling the suppression with the engine itself.
The idea is to visit the bug-path, and along the way record each time we entered the basic-block that has a comparison in the end that drives the decision of entering or avoiding a loop construct.
We also record expressions that caused a state-split. These are basically the expressions that eagerly assume would force a split on - but this approach works even if eager-assumptions are disabled.
Once we collected all the data, we go over the loops that we saw, and check if the condition of that loop construct was evaluated N times. This then triggers a heuristic to decide if we should suppress this, e.g. because we "assumed" not to enter the loop, or "assumed" to iterate more than 2 times in the loop.
---
This is WIP, because I just scratched the idea, but it seems to work good enough to have some discussion about this, while refining it if I have some time later.
---
Full diff: https://github.com/llvm/llvm-project/pull/111258.diff
2 Files Affected:
- (modified) clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp (+164-1)
- (modified) clang/test/Analysis/out-of-bounds.c (+149-1)
``````````diff
diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
index 3f837564cf47c4..aafd44d037273d 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
@@ -11,18 +11,28 @@
//
//===----------------------------------------------------------------------===//
+#include "clang/AST/ASTContext.h"
#include "clang/AST/CharUnits.h"
+#include "clang/AST/Expr.h"
#include "clang/AST/ParentMapContext.h"
+#include "clang/AST/Stmt.h"
+#include "clang/ASTMatchers/ASTMatchFinder.h"
+#include "clang/ASTMatchers/ASTMatchers.h"
+#include "clang/Analysis/ProgramPoint.h"
#include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h"
#include "clang/StaticAnalyzer/Checkers/Taint.h"
+#include "clang/StaticAnalyzer/Core/BugReporter/BugReporterVisitors.h"
#include "clang/StaticAnalyzer/Core/BugReporter/BugType.h"
#include "clang/StaticAnalyzer/Core/Checker.h"
#include "clang/StaticAnalyzer/Core/CheckerManager.h"
#include "clang/StaticAnalyzer/Core/PathSensitive/APSIntType.h"
#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
#include "clang/StaticAnalyzer/Core/PathSensitive/DynamicExtent.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/ExplodedGraph.h"
#include "clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h"
-#include "llvm/ADT/SmallString.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/ProgramState_Fwd.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/raw_ostream.h"
#include <optional>
@@ -32,6 +42,158 @@ using namespace ento;
using namespace taint;
using llvm::formatv;
+namespace {
+
+class SuppressReportsHavingWeakLoopAssumption : public BugReporterVisitor {
+public:
+ SuppressReportsHavingWeakLoopAssumption(const PathSensitiveBugReport &R)
+ : OrigCursor(R.getErrorNode()) {}
+
+private:
+ friend class BugReporterVisitor;
+
+ // The exploded node we are currently visiting, but in the original exploded
+ // graph. This graph is not trimmed, unlike the one we usually visit.
+ const ExplodedNode *OrigCursor;
+
+ llvm::DenseSet<const Stmt *> LoopsSeen;
+ llvm::DenseMap<const Expr *, unsigned> NumTimesTakenTrueDecision;
+ llvm::DenseMap<const Expr *, unsigned> NumTimesTakenFalseDecision;
+
+ void Profile(llvm::FoldingSetNodeID &ID) const final {
+ static const bool Tag = 0;
+ ID.AddPointer(&Tag);
+ }
+
+ static const ExplodedNode *selectMatchingPred(const ExplodedNode *OrigSucc,
+ unsigned SoughtPredId) {
+ auto MatchesSoughtId = [SoughtPredId](const ExplodedNode *N) {
+ return N->getID() == SoughtPredId;
+ };
+ auto It = llvm::find_if(OrigSucc->preds(), MatchesSoughtId);
+ assert(It != OrigSucc->pred_end());
+ return *It;
+ }
+
+ static bool isLoopAndNonNull(const Stmt *S) {
+ return isa_and_nonnull<ForStmt, WhileStmt, CXXForRangeStmt, DoStmt>(S);
+ }
+
+ void registerLoopStatements(const ExplodedNode *N) {
+ if (auto Entrance = N->getLocation().getAs<BlockEntrance>()) {
+ const Stmt *TermStmt = Entrance->getBlock()->getTerminatorStmt();
+ if (isLoopAndNonNull(TermStmt))
+ LoopsSeen.insert(TermStmt);
+ }
+ }
+
+ void registerOccurrence(llvm::DenseMap<const Expr *, unsigned> &Map,
+ const Expr *Key) {
+ if (auto [Place, Inserted] = Map.try_emplace(Key, 1); !Inserted)
+ ++Place->second;
+ }
+
+ PathDiagnosticPieceRef VisitNode(const ExplodedNode *Succ,
+ BugReporterContext &BRC,
+ PathSensitiveBugReport &BR) final {
+ OrigCursor = selectMatchingPred(OrigCursor, Succ->getID());
+ assert(OrigCursor->getID() == Succ->getID());
+
+ registerLoopStatements(Succ);
+
+ auto AsPostStmt = Succ->getLocationAs<PostStmt>();
+ const Expr *CurrExpr =
+ AsPostStmt ? dyn_cast<Expr>(AsPostStmt->getStmt()) : nullptr;
+
+ bool IsSplittingTwoWays = OrigCursor->succ_size() == 2;
+
+ if (!CurrExpr || !IsSplittingTwoWays)
+ return nullptr;
+
+ const ExplodedNode *Next = Succ->getFirstSucc();
+ SValBuilder &SVB = Succ->getState()->getStateManager().getSValBuilder();
+ auto Query = SVB.evalCast(Succ->getSVal(CurrExpr), SVB.getConditionType(),
+ CurrExpr->getType())
+ .getAs<DefinedOrUnknownSVal>();
+ if (!Query)
+ return nullptr;
+
+ ProgramStateRef TakenTrueBranch = Next->getState()->assume(*Query, true);
+ registerOccurrence(TakenTrueBranch ? NumTimesTakenTrueDecision
+ : NumTimesTakenFalseDecision,
+ CurrExpr);
+
+ return nullptr;
+ }
+
+ static const Expr *getCond(const Stmt *S) {
+ assert(isLoopAndNonNull(S));
+ switch (S->getStmtClass()) {
+ case Stmt::StmtClass::ForStmtClass:
+ return cast<ForStmt>(S)->getCond();
+ case Stmt::StmtClass::WhileStmtClass:
+ return cast<WhileStmt>(S)->getCond();
+ case Stmt::StmtClass::CXXForRangeStmtClass:
+ return cast<CXXForRangeStmt>(S)->getCond();
+ case Stmt::StmtClass::DoStmtClass:
+ return cast<DoStmt>(S)->getCond();
+ default:
+ return nullptr;
+ }
+ }
+
+ void trySuppressReport(PathSensitiveBugReport &R, const Stmt *Loop,
+ const Expr *CondSubExpr) const {
+ unsigned NumTimesTakenLoopBody =
+ NumTimesTakenTrueDecision.lookup(CondSubExpr);
+ unsigned NumTimesNotTakenLoopBody =
+ NumTimesTakenFalseDecision.lookup(CondSubExpr);
+
+ // Adjust for do-while loops, where the loop body is always taken.
+ if (isa<DoStmt>(Loop))
+ NumTimesTakenLoopBody += 1;
+
+ // Suppress the report if it avoided a loop with an eager assumption.
+ if (NumTimesTakenLoopBody == 0 && NumTimesNotTakenLoopBody == 1) {
+ // llvm::errs() << "eagerly decided never taking the loop body\n";
+ R.markInvalid("eagerly decided never taking the loop body", CondSubExpr);
+ return;
+ }
+
+ // Suppress the report if it iterates with eager assumptions 3 or more
+ // times.
+ if (NumTimesTakenLoopBody > 2 && NumTimesNotTakenLoopBody == 0) {
+ // llvm::errs() << "eagerly taken the loop body at least 2 times\n";
+ R.markInvalid("eagerly taken the loop body at least 2 times",
+ CondSubExpr);
+ }
+ }
+
+ void finalizeVisitor(BugReporterContext &BRC, const ExplodedNode *,
+ PathSensitiveBugReport &R) final {
+ ASTContext &Ctx = BRC.getASTContext();
+
+ // Go over all the loops we have seen (either avoided or entered), and check
+ // if the condition of such loops were eagerly decided.
+ for (const Stmt *Loop : LoopsSeen) {
+ if (const Expr *Cond = getCond(Loop)) {
+ // Let's try all sub-exprs to cover cases that use short-circuiting.
+ // We need to check if any sub-expression of the condition was eagerly
+ // decided, because the '&&' and '||' logical operators could
+ // short-circuit, thus the expression on which we recorded the
+ // "eager decision" was a sub-expression of the Loop condition.
+ using namespace ast_matchers;
+ for (auto Binding : match(findAll(expr().bind("e")), *Cond, Ctx)) {
+ trySuppressReport(R, Loop, Binding.getNodeAs<Expr>("e"));
+ if (!R.isValid())
+ return;
+ }
+ }
+ }
+ }
+};
+} // namespace
+
namespace {
/// If `E` is a "clean" array subscript expression, return the type of the
/// accessed element. If the base of the subscript expression is modified by
@@ -722,6 +884,7 @@ void ArrayBoundCheckerV2::reportOOB(CheckerContext &C,
if (Extent)
markPartsInteresting(*BR, ErrorState, *Extent, IsTaintBug);
+ BR->addVisitor<SuppressReportsHavingWeakLoopAssumption>(*BR);
C.emitReport(std::move(BR));
}
diff --git a/clang/test/Analysis/out-of-bounds.c b/clang/test/Analysis/out-of-bounds.c
index 1f771c2b3bd138..1b62b2eed4ee84 100644
--- a/clang/test/Analysis/out-of-bounds.c
+++ b/clang/test/Analysis/out-of-bounds.c
@@ -1,4 +1,9 @@
-// RUN: %clang_analyze_cc1 -Wno-array-bounds -analyzer-checker=core,alpha.security.ArrayBoundV2,debug.ExprInspection -verify %s
+// RUN: %clang_analyze_cc1 -Wno-array-bounds -analyzer-checker=core,alpha.security.ArrayBoundV2,debug.ExprInspection -verify=expected,eagerlyassume %s
+// RUN: %clang_analyze_cc1 -Wno-array-bounds -analyzer-checker=core,alpha.security.ArrayBoundV2,debug.ExprInspection -analyzer-config eagerly-assume=false -verify %s
+
+// Note that eagerly-assume=false is tested separately because the
+// WeakLoopAssumption suppression heuristic uses different code paths to
+// achieve the same result with and without eagerly-assume.
void clang_analyzer_eval(int);
@@ -194,3 +199,146 @@ char test_comparison_with_extent_symbol(struct incomplete *p) {
return ((char *)p)[-1]; // no-warning
}
+// WeakLoopAssumption suppression
+///////////////////////////////////////////////////////////////////////
+
+int GlobalArray[100];
+int loop_suppress_after_zero_iterations(unsigned len) {
+ for (unsigned i = 0; i < len; i++)
+ if (GlobalArray[i] > 0)
+ return GlobalArray[i];
+ // Previously this would have produced an overflow warning because splitting
+ // the state on the loop condition introduced an execution path where the
+ // analyzer thinks that len == 0.
+ // There are many situations where the programmer knows that an argument is
+ // positive, but this is not indicated in the source code, so we must avoid
+ // reporting errors (especially out of bounds errors) on these branches,
+ // because otherwise we'd get prohibitively many false positives.
+ return GlobalArray[len - 1]; // no-warning
+}
+
+void loop_report_in_second_iteration(int len) {
+ int buf[1] = {0};
+ for (int i = 0; i < len; i++) {
+ // When a programmer writes a loop, we may assume that they intended at
+ // least two iterations.
+ buf[i] = 1; // expected-warning{{Out of bound access to memory}}
+ }
+}
+
+void loop_suppress_in_third_iteration(int len) {
+ int buf[2] = {0};
+ for (int i = 0; i < len; i++) {
+ // We should suppress array bounds errors on the third and later iterations
+ // of loops, because sometimes programmers write a loop in sitiuations
+ // where they know that there will be at most two iterations, but the
+ // analyzer cannot deduce this property.
+ buf[i] = 1; // no-warning
+ }
+}
+
+int no_suppression_when_no_assumption_zero_iterations(unsigned len) {
+ if (len != 0) {
+ // This 'if' introduces a state split between len == 0 and len != 0.
+ }
+
+ // On the branch where we assumed that len is zero, this loop will be
+ // skipped. We (intentionally) don't suppress this execution path becasue
+ // here the analyzer doesn't assume anything new when it evaluates the loop
+ // condition.
+ for (unsigned i = 0; i < len; i++)
+ if (GlobalArray[i] > 0)
+ return GlobalArray[i];
+
+ return GlobalArray[len - 1]; // expected-warning{{Out of bound access to memory}}
+}
+
+void no_suppression_when_no_assumption_third_iteration(int len) {
+ if (len < 20) {
+ // This 'if' introduces a state split with len >= 20 on one branch.
+ }
+
+ int buf[2] = {0};
+ for (int i = 0; i < len; i++) {
+ // As in no_suppression_when_no_assumption_zero_iterations, the suppression
+ // only activates when the analyzer assumes something new in the loop
+ // condition. On the branch where `len >= 20` entering the third iteration
+ // doesn't involve a new assumption, so this warning is not suppressed:
+ buf[i] = 1; // expected-warning{{Out of bound access to memory}}
+ }
+}
+
+void loop_suppress_in_third_iteration_logical_and(int len, int flag) {
+ int buf[2] = {0};
+ for (int i = 0; i < len && flag; i++) {
+ // FIXME: In this case the checker should suppress the warning the same way
+ // as it's suppressed in loop_suppress_in_third_iteration, but the
+ // suppression is not activated because the terminator statement associated
+ // with the loop is just the expression 'flag', while 'i < len' is a
+ // separate terminator statement that's associated with the
+ // short-circuiting operator '&&'.
+ // I have seen a real-world FP that looks like this, but it is much rarer
+ // than the basic setup.
+ buf[i] = 1; // no-warning
+ }
+}
+
+void loop_suppress_in_third_iteration_logical_and_2(int len, int flag) {
+ int buf[2] = {0};
+ for (int i = 0; flag && i < len; i++) {
+ // If the two operands of '&&' are flipped, the suppression works, because
+ // then 'flag' is the terminator statement associated with '&&' (which
+ // determines whether the short-circuiting happens or not) and 'i < len' is
+ // the terminator statement of the loop itself.
+ buf[i] = 1; // no-warning
+ }
+}
+
+void loop_suppress_in_third_iteration_cast(int len) {
+ int buf[2] = {0};
+ for (int i = 0; (unsigned)(i < len); i++) {
+ // The behavior of this suppression is slightly different under
+ // eagerly-assume=true (the default) and eagerly-assume=false:
+ // * When eager assumptions are disabled, it's enough to look for cases
+ // where we get two non-null states from splitting the state over the
+ // 'SVal' that represents the full loop condition.
+ // * When eager assumptions are enabled, we also accept situations when the
+ // loop condition expression triggers an eager state split and therefore
+ // we won't see a state split at the "normal" point because it's executed
+ // on two already separated execution paths.
+ // However, for the sake of simplicity we don't activate the suppression in
+ // cases when _a subexpression_ of the loop condition triggers an eager
+ // assumption. There are already many differences between analysis with and
+ // without eager assumptions, so it would be pointless to write more
+ // complicated code to eliminate these rare differences.
+ buf[i] = 1; // no-warning
+ }
+}
+
+int coinflip(void);
+int do_while_report_after_one_iteration(void) {
+ int i = 0;
+ do {
+ i++;
+ } while (coinflip());
+ // Unlike `loop_suppress_after_zero_iterations`, running just one iteration
+ // in a do-while is not a corner case that would produce too many false
+ // positives, so don't suppress bounds errors in these situations.
+ return GlobalArray[i-2]; // expected-warning{{Out of bound access to memory}}
+}
+
+void do_while_report_in_second_iteration(int len) {
+ int buf[1] = {0};
+ int i = 0;
+ do {
+ buf[i] = 1; // expected-warning{{Out of bound access to memory}}
+ } while (i++ < len);
+}
+
+void do_while_suppress_in_third_iteration(int len) {
+ int buf[2] = {0};
+ int i = 0;
+ do {
+ buf[i] = 1; // no-warning
+ } while (i++ < len);
+}
``````````
</details>
https://github.com/llvm/llvm-project/pull/111258
More information about the cfe-commits
mailing list