[clang] WIP Add suppress weak loop assumptions visitor (PR #111258)

Sat Oct 5 08:11:27 PDT 2024

https://github.com/steakhal created https://github.com/llvm/llvm-project/pull/111258

This is an alternative implementation of the idea present in #109804.

In this implementation, the suppression is purely implemented by a `BugReportVisitor`, to avoid coupling the suppression with the engine itself.
The idea is to visit the bug-path, and along the way record each time we entered the basic-block that has a comparison in the end that drives the decision of entering or avoiding a loop construct.

We also record expressions that caused a state-split. These are basically the expressions that eagerly assume would force a split on - but this approach works even if eager-assumptions are disabled.

Once we collected all the data, we go over the loops that we saw, and check if the condition of that loop construct was evaluated N times. This then triggers a heuristic to decide if we should suppress this, e.g. because we "assumed" not to enter the loop, or "assumed" to iterate more than 2 times in the loop.


---

This is WIP, because I just scratched the idea, but it seems to work good enough to have some discussion about this, while refining it if I have some time later.

>From c429df63af6cdeec7e721e8f9c322f1ac6ba3b3d Mon Sep 17 00:00:00 2001
From: Balazs Benics <benicsbalazs at gmail.com>
Date: Sat, 5 Oct 2024 15:14:37 +0200
Subject: [PATCH 1/5] Use the eager assumption tags only if the decision was
 ambiguous

---
 clang/lib/StaticAnalyzer/Core/ExprEngine.cpp  | 15 ++++++++----
 clang/test/Analysis/assuming-unsigned-ge-0.c  |  5 +---
 clang/test/Analysis/cast-value-notes.cpp      | 20 +++++++---------
 clang/test/Analysis/cast-value-state-dump.cpp |  5 ++--
 .../std-c-library-functions-arg-constraints.c | 24 +++++++------------
 5 files changed, 29 insertions(+), 40 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
index b1919d7027cf4d..62066632db87d0 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
@@ -3771,24 +3771,29 @@ void ExprEngine::evalEagerlyAssumeBinOpBifurcation(ExplodedNodeSet &Dst,
     SVal V = state->getSVal(Ex, Pred->getLocationContext());
     std::optional<nonloc::SymbolVal> SEV = V.getAs<nonloc::SymbolVal>();
     if (SEV && SEV->isExpression()) {
-      const std::pair<const ProgramPointTag *, const ProgramPointTag*> &tags =
-        geteagerlyAssumeBinOpBifurcationTags();
-
       ProgramStateRef StateTrue, StateFalse;
       std::tie(StateTrue, StateFalse) = state->assume(*SEV);
 
+      const ProgramPointTag *EagerTrueTag = nullptr;
+      const ProgramPointTag *EagerFalseTag = nullptr;
+
+      // Use the eager assumption tags if the choice was ambiguous.
+      if (StateTrue && StateFalse)
+        std::tie(EagerTrueTag, EagerFalseTag) =
+            geteagerlyAssumeBinOpBifurcationTags();
+
       // First assume that the condition is true.
       if (StateTrue) {
         SVal Val = svalBuilder.makeIntVal(1U, Ex->getType());
         StateTrue = StateTrue->BindExpr(Ex, Pred->getLocationContext(), Val);
-        Bldr.generateNode(Ex, Pred, StateTrue, tags.first);
+        Bldr.generateNode(Ex, Pred, StateTrue, EagerTrueTag);
       }
 
       // Next, assume that the condition is false.
       if (StateFalse) {
         SVal Val = svalBuilder.makeIntVal(0U, Ex->getType());
         StateFalse = StateFalse->BindExpr(Ex, Pred->getLocationContext(), Val);
-        Bldr.generateNode(Ex, Pred, StateFalse, tags.second);
+        Bldr.generateNode(Ex, Pred, StateFalse, EagerFalseTag);
       }
     }
   }
diff --git a/clang/test/Analysis/assuming-unsigned-ge-0.c b/clang/test/Analysis/assuming-unsigned-ge-0.c
index 553e68cb96c6bd..18e637a6e5310e 100644
--- a/clang/test/Analysis/assuming-unsigned-ge-0.c
+++ b/clang/test/Analysis/assuming-unsigned-ge-0.c
@@ -2,10 +2,7 @@
 // RUN:     -analyzer-checker=core -verify %s
 
 int assuming_unsigned_ge_0(unsigned arg) {
-  // TODO This testcase demonstrates the current incorrect behavior of Clang
-  // Static Analyzer: here 'arg' is unsigned, so "arg >= 0" is not a fresh
-  // assumption, but it still appears in the diagnostics as if it's fresh:
-  // expected-note at +2 {{Assuming 'arg' is >= 0}}
+  // expected-note at +2 {{'arg' is >= 0}}
   // expected-note at +1 {{Taking false branch}}
   if (arg < 0)
     return 0;
diff --git a/clang/test/Analysis/cast-value-notes.cpp b/clang/test/Analysis/cast-value-notes.cpp
index 7ee224dc6e5d8f..0235404e73adf6 100644
--- a/clang/test/Analysis/cast-value-notes.cpp
+++ b/clang/test/Analysis/cast-value-notes.cpp
@@ -168,9 +168,8 @@ void evalNonNullParamNonNullReturnReference(const Shape &S) {
     // expected-note at -2 {{Taking true branch}}
 
     (void)(1 / !C);
-    // expected-note at -1 {{'C' is non-null}}
-    // expected-note at -2 {{Division by zero}}
-    // expected-warning at -3 {{Division by zero}}
+    // expected-note at -1 {{Division by zero}}
+    // expected-warning at -2 {{Division by zero}}
   }
 }
 
@@ -226,9 +225,8 @@ void evalNonNullParamNonNullReturn(const Shape *S) {
     // expected-note at -2 {{Taking true branch}}
 
     (void)(1 / !C);
-    // expected-note at -1 {{'C' is non-null}}
-    // expected-note at -2 {{Division by zero}}
-    // expected-warning at -3 {{Division by zero}}
+    // expected-note at -1 {{Division by zero}}
+    // expected-warning at -2 {{Division by zero}}
   }
 }
 
@@ -243,9 +241,8 @@ void evalNonNullParamNullReturn(const Shape *S) {
     // expected-note at -4 {{Taking true branch}}
 
     (void)(1 / !T);
-    // expected-note at -1 {{'T' is non-null}}
-    // expected-note at -2 {{Division by zero}}
-    // expected-warning at -3 {{Division by zero}}
+    // expected-note at -1 {{Division by zero}}
+    // expected-warning at -2 {{Division by zero}}
   }
 }
 
@@ -265,9 +262,8 @@ void evalZeroParamNonNullReturnPointer(const Shape *S) {
   // expected-note at -2 {{'C' initialized here}}
 
   (void)(1 / !C);
-  // expected-note at -1 {{'C' is non-null}}
-  // expected-note at -2 {{Division by zero}}
-  // expected-warning at -3 {{Division by zero}}
+  // expected-note at -1 {{Division by zero}}
+  // expected-warning at -2 {{Division by zero}}
 }
 
 void evalZeroParamNonNullReturn(const Shape &S) {
diff --git a/clang/test/Analysis/cast-value-state-dump.cpp b/clang/test/Analysis/cast-value-state-dump.cpp
index 07fd7abd848ab7..97f3b2236db087 100644
--- a/clang/test/Analysis/cast-value-state-dump.cpp
+++ b/clang/test/Analysis/cast-value-state-dump.cpp
@@ -40,8 +40,7 @@ void evalNonNullParamNonNullReturn(const Shape *S) {
   // CHECK-NEXT:   ] }
 
   (void)(1 / !C);
-  // expected-note at -1 {{'C' is non-null}}
-  // expected-note at -2 {{Division by zero}}
-  // expected-warning at -3 {{Division by zero}}
+  // expected-note at -1 {{Division by zero}}
+  // expected-warning at -2 {{Division by zero}}
 }
 
diff --git a/clang/test/Analysis/std-c-library-functions-arg-constraints.c b/clang/test/Analysis/std-c-library-functions-arg-constraints.c
index 0b817dda98c727..a6e0ad15129f60 100644
--- a/clang/test/Analysis/std-c-library-functions-arg-constraints.c
+++ b/clang/test/Analysis/std-c-library-functions-arg-constraints.c
@@ -42,8 +42,7 @@ void test_alnum_symbolic(int x) {
   // report-warning{{TRUE}} \
   // bugpath-warning{{TRUE}} \
   // bugpath-note{{TRUE}} \
-  // bugpath-note{{Left side of '&&' is true}} \
-  // bugpath-note{{'x' is <= 255}}
+  // bugpath-note{{Left side of '&&' is true}}
 }
 
 void test_alnum_symbolic2(int x) {
@@ -76,8 +75,7 @@ void test_toupper_symbolic(int x) {
   // report-warning{{TRUE}} \
   // bugpath-warning{{TRUE}} \
   // bugpath-note{{TRUE}} \
-  // bugpath-note{{Left side of '&&' is true}} \
-  // bugpath-note{{'x' is <= 255}}
+  // bugpath-note{{Left side of '&&' is true}}
 }
 
 void test_toupper_symbolic2(int x) {
@@ -110,8 +108,7 @@ void test_tolower_symbolic(int x) {
   // report-warning{{TRUE}} \
   // bugpath-warning{{TRUE}} \
   // bugpath-note{{TRUE}} \
-  // bugpath-note{{Left side of '&&' is true}} \
-  // bugpath-note{{'x' is <= 255}}
+  // bugpath-note{{Left side of '&&' is true}}
 }
 
 void test_tolower_symbolic2(int x) {
@@ -144,8 +141,7 @@ void test_toascii_symbolic(int x) {
   // report-warning{{TRUE}} \
   // bugpath-warning{{TRUE}} \
   // bugpath-note{{TRUE}} \
-  // bugpath-note{{Left side of '&&' is true}} \
-  // bugpath-note{{'x' is <= 255}}
+  // bugpath-note{{Left side of '&&' is true}}
 }
 
 void test_toascii_symbolic2(int x) {
@@ -173,8 +169,7 @@ void test_notnull_symbolic(FILE *fp, int *buf) {
   clang_analyzer_eval(buf != 0); // \
   // report-warning{{TRUE}} \
   // bugpath-warning{{TRUE}} \
-  // bugpath-note{{TRUE}} \
-  // bugpath-note{{'buf' is not equal to null}}
+  // bugpath-note{{TRUE}}
 }
 void test_notnull_symbolic2(FILE *fp, int *buf) {
   if (!buf)                          // bugpath-note{{Assuming 'buf' is null}} \
@@ -218,8 +213,7 @@ void test_notnull_buffer_3(void *buf) {
   clang_analyzer_eval(buf != 0); // \
   // report-warning{{TRUE}} \
   // bugpath-warning{{TRUE}} \
-  // bugpath-note{{TRUE}} \
-  // bugpath-note{{'buf' is not equal to null}}
+  // bugpath-note{{TRUE}}
 }
 
 void test_no_node_after_bug(FILE *fp, size_t size, size_t n, void *buf) {
@@ -299,8 +293,7 @@ void test_buf_size_symbolic(int s) {
   clang_analyzer_eval(s <= 3); // \
   // report-warning{{TRUE}} \
   // bugpath-warning{{TRUE}} \
-  // bugpath-note{{TRUE}} \
-  // bugpath-note{{'s' is <= 3}}
+  // bugpath-note{{TRUE}}
 }
 void test_buf_size_symbolic_and_offset(int s) {
   char buf[3];
@@ -308,8 +301,7 @@ void test_buf_size_symbolic_and_offset(int s) {
   clang_analyzer_eval(s <= 2); // \
   // report-warning{{TRUE}} \
   // bugpath-warning{{TRUE}} \
-  // bugpath-note{{TRUE}} \
-  // bugpath-note{{'s' is <= 2}}
+  // bugpath-note{{TRUE}}
 }
 
 int __buf_size_arg_constraint_mul(const void *, size_t, size_t);

>From c240537d15f1275a4c587ace596682edf77ff5d6 Mon Sep 17 00:00:00 2001
From: Balazs Benics <benicsbalazs at gmail.com>
Date: Sat, 5 Oct 2024 15:15:46 +0200
Subject: [PATCH 2/5] Add test from PR #109804

---
 clang/test/Analysis/out-of-bounds.c | 150 +++++++++++++++++++++++++++-
 1 file changed, 149 insertions(+), 1 deletion(-)

diff --git a/clang/test/Analysis/out-of-bounds.c b/clang/test/Analysis/out-of-bounds.c
index 1f771c2b3bd138..fde53d148ba442 100644
--- a/clang/test/Analysis/out-of-bounds.c
+++ b/clang/test/Analysis/out-of-bounds.c
@@ -1,4 +1,9 @@
-// RUN: %clang_analyze_cc1 -Wno-array-bounds -analyzer-checker=core,alpha.security.ArrayBoundV2,debug.ExprInspection -verify %s
+// RUN: %clang_analyze_cc1 -Wno-array-bounds -analyzer-checker=core,alpha.security.ArrayBoundV2,debug.ExprInspection -verify=expected,eagerlyassume %s
+// RUN: %clang_analyze_cc1 -Wno-array-bounds -analyzer-checker=core,alpha.security.ArrayBoundV2,debug.ExprInspection -analyzer-config eagerly-assume=false -verify %s
+
+// Note that eagerly-assume=false is tested separately because the
+// WeakLoopAssumption suppression heuristic uses different code paths to
+// achieve the same result with and without eagerly-assume.
 
 void clang_analyzer_eval(int);
 
@@ -194,3 +199,146 @@ char test_comparison_with_extent_symbol(struct incomplete *p) {
   return ((char *)p)[-1]; // no-warning
 }
 
+// WeakLoopAssumption suppression
+///////////////////////////////////////////////////////////////////////
+
+int GlobalArray[100];
+int loop_suppress_after_zero_iterations(unsigned len) {
+  for (unsigned i = 0; i < len; i++)
+    if (GlobalArray[i] > 0)
+      return GlobalArray[i];
+  // Previously this would have produced an overflow warning because splitting
+  // the state on the loop condition introduced an execution path where the
+  // analyzer thinks that len == 0.
+  // There are many situations where the programmer knows that an argument is
+  // positive, but this is not indicated in the source code, so we must avoid
+  // reporting errors (especially out of bounds errors) on these branches,
+  // because otherwise we'd get prohibitively many false positives.
+  return GlobalArray[len - 1]; // no-warning
+}
+
+void loop_report_in_second_iteration(int len) {
+  int buf[1] = {0};
+  for (int i = 0; i < len; i++) {
+    // When a programmer writes a loop, we may assume that they intended at
+    // least two iterations.
+    buf[i] = 1; // expected-warning{{Out of bound access to memory}}
+  }
+}
+
+void loop_suppress_in_third_iteration(int len) {
+  int buf[2] = {0};
+  for (int i = 0; i < len; i++) {
+    // We should suppress array bounds errors on the third and later iterations
+    // of loops, because sometimes programmers write a loop in sitiuations
+    // where they know that there will be at most two iterations, but the
+    // analyzer cannot deduce this property.
+    buf[i] = 1; // no-warning
+  }
+}
+
+int no_suppression_when_no_assumption_zero_iterations(unsigned len) {
+  if (len != 0) {
+    // This 'if' introduces a state split between len == 0 and len != 0.
+  }
+
+  // On the branch where we assumed that len is zero, this loop will be
+  // skipped. We (intentionally) don't suppress this execution path becasue
+  // here the analyzer doesn't assume anything new when it evaluates the loop
+  // condition.
+  for (unsigned i = 0; i < len; i++)
+    if (GlobalArray[i] > 0)
+      return GlobalArray[i];
+
+  return GlobalArray[len - 1]; // expected-warning{{Out of bound access to memory}}
+}
+
+void no_suppression_when_no_assumption_third_iteration(int len) {
+  if (len < 20) {
+    // This 'if' introduces a state split with len >= 20 on one branch.
+  }
+
+  int buf[2] = {0};
+  for (int i = 0; i < len; i++) {
+    // As in no_suppression_when_no_assumption_zero_iterations, the suppression
+    // only activates when the analyzer assumes something new in the loop
+    // condition. On the branch where `len >= 20` entering the third iteration
+    // doesn't involve a new assumption, so this warning is not suppressed:
+    buf[i] = 1; // expected-warning{{Out of bound access to memory}}
+  }
+}
+
+void loop_suppress_in_third_iteration_logical_and(int len, int flag) {
+  int buf[2] = {0};
+  for (int i = 0; i < len && flag; i++) {
+    // FIXME: In this case the checker should suppress the warning the same way
+    // as it's suppressed in loop_suppress_in_third_iteration, but the
+    // suppression is not activated because the terminator statement associated
+    // with the loop is just the expression 'flag', while 'i < len' is a
+    // separate terminator statement that's associated with the
+    // short-circuiting operator '&&'.
+    // I have seen a real-world FP that looks like this, but it is much rarer
+    // than the basic setup.
+    buf[i] = 1; // expected-warning{{Out of bound access to memory}}
+  }
+}
+
+void loop_suppress_in_third_iteration_logical_and_2(int len, int flag) {
+  int buf[2] = {0};
+  for (int i = 0; flag && i < len; i++) {
+    // If the two operands of '&&' are flipped, the suppression works, because
+    // then 'flag' is the terminator statement associated with '&&' (which
+    // determines whether the short-circuiting happens or not) and 'i < len' is
+    // the terminator statement of the loop itself.
+    buf[i] = 1; // no-warning
+  }
+}
+
+void loop_suppress_in_third_iteration_cast(int len) {
+  int buf[2] = {0};
+  for (int i = 0; (unsigned)(i < len); i++) {
+    // The behavior of this suppression is slightly different under
+    // eagerly-assume=true (the default) and eagerly-assume=false:
+    // * When eager assumptions are disabled, it's enough to look for cases
+    //   where we get two non-null states from splitting the state over the
+    //   'SVal' that represents the full loop condition. 
+    // * When eager assumptions are enabled, we also accept situations when the
+    //   loop condition expression triggers an eager state split and therefore
+    //   we won't see a state split at the "normal" point because it's executed
+    //   on two already separated execution paths.
+    // However, for the sake of simplicity we don't activate the suppression in
+    // cases when _a subexpression_ of the loop condition triggers an eager
+    // assumption. There are already many differences between analysis with and
+    // without eager assumptions, so it would be pointless to write more
+    // complicated code to eliminate these rare differences.
+    buf[i] = 1; // eagerlyassume-warning{{Out of bound access to memory}}
+  }
+}
+
+int coinflip(void);
+int do_while_report_after_one_iteration(void) {
+  int i = 0;
+  do {
+    i++;
+  } while (coinflip());
+  // Unlike `loop_suppress_after_zero_iterations`, running just one iteration
+  // in a do-while is not a corner case that would produce too many false
+  // positives, so don't suppress bounds errors in these situations.
+  return GlobalArray[i-2]; // expected-warning{{Out of bound access to memory}}
+}
+
+void do_while_report_in_second_iteration(int len) {
+  int buf[1] = {0};
+  int i = 0;
+  do {
+    buf[i] = 1; // expected-warning{{Out of bound access to memory}}
+  } while (i++ < len);
+}
+
+void do_while_suppress_in_third_iteration(int len) {
+  int buf[2] = {0};
+  int i = 0;
+  do {
+    buf[i] = 1; // no-warning
+  } while (i++ < len);
+}

>From 310bb3a6aa2894ab7303582ab0bd775fbf8405ed Mon Sep 17 00:00:00 2001
From: Balazs Benics <benicsbalazs at gmail.com>
Date: Sat, 5 Oct 2024 15:16:33 +0200
Subject: [PATCH 3/5] Add SuppressReportsHavingWeakLoopAssumption visitor

---
 .../Checkers/ArrayBoundCheckerV2.cpp          | 135 +++++++++++++++++-
 1 file changed, 134 insertions(+), 1 deletion(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
index 3f837564cf47c4..3fcdbc6de6a068 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
@@ -11,10 +11,17 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "clang/AST/ASTContext.h"
 #include "clang/AST/CharUnits.h"
+#include "clang/AST/Expr.h"
 #include "clang/AST/ParentMapContext.h"
+#include "clang/AST/Stmt.h"
+#include "clang/ASTMatchers/ASTMatchFinder.h"
+#include "clang/ASTMatchers/ASTMatchers.h"
+#include "clang/Analysis/ProgramPoint.h"
 #include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h"
 #include "clang/StaticAnalyzer/Checkers/Taint.h"
+#include "clang/StaticAnalyzer/Core/BugReporter/BugReporterVisitors.h"
 #include "clang/StaticAnalyzer/Core/BugReporter/BugType.h"
 #include "clang/StaticAnalyzer/Core/Checker.h"
 #include "clang/StaticAnalyzer/Core/CheckerManager.h"
@@ -22,7 +29,7 @@
 #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/DynamicExtent.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h"
-#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/raw_ostream.h"
 #include <optional>
@@ -32,6 +39,131 @@ using namespace ento;
 using namespace taint;
 using llvm::formatv;
 
+namespace {
+
+class SuppressReportsHavingWeakLoopAssumption : public BugReporterVisitor {
+private:
+  friend class BugReporterVisitor;
+  llvm::DenseSet<const Stmt *> LoopsSeen;
+  llvm::DenseMap<const Expr *, unsigned> EagerlyAssumedTrue;
+  llvm::DenseMap<const Expr *, unsigned> EagerlyAssumedFalse;
+
+  void Profile(llvm::FoldingSetNodeID &ID) const final {
+    static const bool Tag = 0;
+    ID.AddPointer(&Tag);
+  }
+
+  static bool isLoopAndNonNull(const Stmt *S) {
+    return isa_and_nonnull<ForStmt, WhileStmt, CXXForRangeStmt, DoStmt>(S);
+  }
+
+  void registerLoopStatements(const ExplodedNode *N) {
+    if (auto Entrance = N->getLocation().getAs<BlockEntrance>()) {
+      const Stmt *TermStmt = Entrance->getBlock()->getTerminatorStmt();
+      if (isLoopAndNonNull(TermStmt))
+        LoopsSeen.insert(TermStmt);
+    }
+  }
+
+  void registerOccurrence(llvm::DenseMap<const Expr *, unsigned> &Map,
+                          const Expr *Key) {
+    if (auto [Place, Inserted] = Map.try_emplace(Key, 1); !Inserted)
+      ++Place->second;
+  }
+
+  PathDiagnosticPieceRef VisitNode(const ExplodedNode *Succ,
+                                   BugReporterContext &BRC,
+                                   PathSensitiveBugReport &BR) final {
+    registerLoopStatements(Succ);
+
+    auto AsPostStmt = Succ->getLocationAs<PostStmt>();
+    const Expr *CurrExpr =
+        AsPostStmt ? dyn_cast<Expr>(AsPostStmt->getStmt()) : nullptr;
+
+    const auto *Tag = Succ->getLocation().getTag();
+    if (!Tag)
+      return nullptr;
+
+    StringRef TagDesc = Tag->getTagDescription();
+    if (TagDesc == "ExprEngine : Eagerly Assume True")
+      registerOccurrence(EagerlyAssumedTrue, CurrExpr);
+    if (TagDesc == "ExprEngine : Eagerly Assume False")
+      registerOccurrence(EagerlyAssumedFalse, CurrExpr);
+    return nullptr;
+  }
+
+  static const Expr *getCond(const Stmt *S) {
+    assert(isLoopAndNonNull(S));
+    switch (S->getStmtClass()) {
+    case Stmt::StmtClass::ForStmtClass:
+      return cast<ForStmt>(S)->getCond();
+    case Stmt::StmtClass::WhileStmtClass:
+      return cast<WhileStmt>(S)->getCond();
+    case Stmt::StmtClass::CXXForRangeStmtClass:
+      return cast<CXXForRangeStmt>(S)->getCond();
+    case Stmt::StmtClass::DoStmtClass:
+      return cast<DoStmt>(S)->getCond();
+    default:
+      return nullptr;
+    }
+  }
+
+  void trySuppressReport(PathSensitiveBugReport &R, const Stmt *Loop,
+                         const Expr *CondSubExpr) const {
+    auto NumEagerlyTrue = EagerlyAssumedTrue.lookup(CondSubExpr);
+    auto NumEagerlyFalse = EagerlyAssumedFalse.lookup(CondSubExpr);
+
+    // Suppress the report if it avoided a loop with an eager assumption.
+    if (NumEagerlyTrue == 0 && NumEagerlyFalse == 1) {
+      R.markInvalid("eagerly decided never taking the loop body", CondSubExpr);
+      return;
+    }
+
+    // Account for do-while loops where the body is already "taken"
+    // unconditionally for the first round.
+    if (isa<DoStmt>(Loop)) {
+      // Suppress the report if we have taken the loop body for the second time
+      // with an eager assumption.
+      if (NumEagerlyTrue > 1 && NumEagerlyFalse == 0) {
+        R.markInvalid("eagerly taken the do-while body at least 2 times",
+                      CondSubExpr);
+      }
+      return;
+    }
+
+    // Suppress the report if it iterates with eager assumptions 3 or more
+    // times.
+    if (NumEagerlyTrue > 2 && NumEagerlyFalse == 0) {
+      R.markInvalid("eagerly taken the loop body at least 2 times",
+                    CondSubExpr);
+    }
+  }
+
+  void finalizeVisitor(BugReporterContext &BRC, const ExplodedNode *,
+                       PathSensitiveBugReport &R) final {
+    ASTContext &Ctx = BRC.getASTContext();
+
+    // Go over all the loops we have seen (either avoided or entered), and check
+    // if the condition of such loops were eagerly decided.
+    for (const Stmt *Loop : LoopsSeen) {
+      if (const Expr *Cond = getCond(Loop)) {
+        // Let's try all sub-exprs to cover cases that use short-circuiting.
+        // We need to check if any sub-expression of the condition was eagerly
+        // decided, because the '&&' and '||' logical operators could
+        // short-circuit, thus the expression on which we recorded the
+        // "eager decision" was a sub-expression of the Loop condition.
+        using namespace ast_matchers;
+        for (auto Binding : match(findAll(expr().bind("e")), *Cond, Ctx)) {
+          trySuppressReport(R, Loop, Binding.getNodeAs<Expr>("e"));
+          if (!R.isValid())
+            return;
+        }
+      }
+    }
+  }
+};
+} // namespace
+
 namespace {
 /// If `E` is a "clean" array subscript expression, return the type of the
 /// accessed element. If the base of the subscript expression is modified by
@@ -722,6 +854,7 @@ void ArrayBoundCheckerV2::reportOOB(CheckerContext &C,
   if (Extent)
     markPartsInteresting(*BR, ErrorState, *Extent, IsTaintBug);
 
+  BR->addVisitor<SuppressReportsHavingWeakLoopAssumption>();
   C.emitReport(std::move(BR));
 }
 

>From 6e6ed6fabe7326f2429f84554c2a95e99127d944 Mon Sep 17 00:00:00 2001
From: Balazs Benics <benicsbalazs at gmail.com>
Date: Sat, 5 Oct 2024 16:59:45 +0200
Subject: [PATCH 4/5] Implement without using eagerly-assume tags

---
 .../Checkers/ArrayBoundCheckerV2.cpp          | 82 +++++++++++++------
 clang/test/Analysis/out-of-bounds.c           |  4 +-
 2 files changed, 58 insertions(+), 28 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
index 3fcdbc6de6a068..aafd44d037273d 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
@@ -28,8 +28,11 @@
 #include "clang/StaticAnalyzer/Core/PathSensitive/APSIntType.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/DynamicExtent.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/ExplodedGraph.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/ProgramState_Fwd.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/raw_ostream.h"
 #include <optional>
@@ -42,17 +45,36 @@ using llvm::formatv;
 namespace {
 
 class SuppressReportsHavingWeakLoopAssumption : public BugReporterVisitor {
+public:
+  SuppressReportsHavingWeakLoopAssumption(const PathSensitiveBugReport &R)
+      : OrigCursor(R.getErrorNode()) {}
+
 private:
   friend class BugReporterVisitor;
+
+  // The exploded node we are currently visiting, but in the original exploded
+  // graph. This graph is not trimmed, unlike the one we usually visit.
+  const ExplodedNode *OrigCursor;
+
   llvm::DenseSet<const Stmt *> LoopsSeen;
-  llvm::DenseMap<const Expr *, unsigned> EagerlyAssumedTrue;
-  llvm::DenseMap<const Expr *, unsigned> EagerlyAssumedFalse;
+  llvm::DenseMap<const Expr *, unsigned> NumTimesTakenTrueDecision;
+  llvm::DenseMap<const Expr *, unsigned> NumTimesTakenFalseDecision;
 
   void Profile(llvm::FoldingSetNodeID &ID) const final {
     static const bool Tag = 0;
     ID.AddPointer(&Tag);
   }
 
+  static const ExplodedNode *selectMatchingPred(const ExplodedNode *OrigSucc,
+                                                unsigned SoughtPredId) {
+    auto MatchesSoughtId = [SoughtPredId](const ExplodedNode *N) {
+      return N->getID() == SoughtPredId;
+    };
+    auto It = llvm::find_if(OrigSucc->preds(), MatchesSoughtId);
+    assert(It != OrigSucc->pred_end());
+    return *It;
+  }
+
   static bool isLoopAndNonNull(const Stmt *S) {
     return isa_and_nonnull<ForStmt, WhileStmt, CXXForRangeStmt, DoStmt>(S);
   }
@@ -74,21 +96,33 @@ class SuppressReportsHavingWeakLoopAssumption : public BugReporterVisitor {
   PathDiagnosticPieceRef VisitNode(const ExplodedNode *Succ,
                                    BugReporterContext &BRC,
                                    PathSensitiveBugReport &BR) final {
+    OrigCursor = selectMatchingPred(OrigCursor, Succ->getID());
+    assert(OrigCursor->getID() == Succ->getID());
+
     registerLoopStatements(Succ);
 
     auto AsPostStmt = Succ->getLocationAs<PostStmt>();
     const Expr *CurrExpr =
         AsPostStmt ? dyn_cast<Expr>(AsPostStmt->getStmt()) : nullptr;
 
-    const auto *Tag = Succ->getLocation().getTag();
-    if (!Tag)
+    bool IsSplittingTwoWays = OrigCursor->succ_size() == 2;
+
+    if (!CurrExpr || !IsSplittingTwoWays)
       return nullptr;
 
-    StringRef TagDesc = Tag->getTagDescription();
-    if (TagDesc == "ExprEngine : Eagerly Assume True")
-      registerOccurrence(EagerlyAssumedTrue, CurrExpr);
-    if (TagDesc == "ExprEngine : Eagerly Assume False")
-      registerOccurrence(EagerlyAssumedFalse, CurrExpr);
+    const ExplodedNode *Next = Succ->getFirstSucc();
+    SValBuilder &SVB = Succ->getState()->getStateManager().getSValBuilder();
+    auto Query = SVB.evalCast(Succ->getSVal(CurrExpr), SVB.getConditionType(),
+                              CurrExpr->getType())
+                     .getAs<DefinedOrUnknownSVal>();
+    if (!Query)
+      return nullptr;
+
+    ProgramStateRef TakenTrueBranch = Next->getState()->assume(*Query, true);
+    registerOccurrence(TakenTrueBranch ? NumTimesTakenTrueDecision
+                                       : NumTimesTakenFalseDecision,
+                       CurrExpr);
+
     return nullptr;
   }
 
@@ -110,30 +144,26 @@ class SuppressReportsHavingWeakLoopAssumption : public BugReporterVisitor {
 
   void trySuppressReport(PathSensitiveBugReport &R, const Stmt *Loop,
                          const Expr *CondSubExpr) const {
-    auto NumEagerlyTrue = EagerlyAssumedTrue.lookup(CondSubExpr);
-    auto NumEagerlyFalse = EagerlyAssumedFalse.lookup(CondSubExpr);
+    unsigned NumTimesTakenLoopBody =
+        NumTimesTakenTrueDecision.lookup(CondSubExpr);
+    unsigned NumTimesNotTakenLoopBody =
+        NumTimesTakenFalseDecision.lookup(CondSubExpr);
+
+    // Adjust for do-while loops, where the loop body is always taken.
+    if (isa<DoStmt>(Loop))
+      NumTimesTakenLoopBody += 1;
 
     // Suppress the report if it avoided a loop with an eager assumption.
-    if (NumEagerlyTrue == 0 && NumEagerlyFalse == 1) {
+    if (NumTimesTakenLoopBody == 0 && NumTimesNotTakenLoopBody == 1) {
+      // llvm::errs() << "eagerly decided never taking the loop body\n";
       R.markInvalid("eagerly decided never taking the loop body", CondSubExpr);
       return;
     }
 
-    // Account for do-while loops where the body is already "taken"
-    // unconditionally for the first round.
-    if (isa<DoStmt>(Loop)) {
-      // Suppress the report if we have taken the loop body for the second time
-      // with an eager assumption.
-      if (NumEagerlyTrue > 1 && NumEagerlyFalse == 0) {
-        R.markInvalid("eagerly taken the do-while body at least 2 times",
-                      CondSubExpr);
-      }
-      return;
-    }
-
     // Suppress the report if it iterates with eager assumptions 3 or more
     // times.
-    if (NumEagerlyTrue > 2 && NumEagerlyFalse == 0) {
+    if (NumTimesTakenLoopBody > 2 && NumTimesNotTakenLoopBody == 0) {
+      // llvm::errs() << "eagerly taken the loop body at least 2 times\n";
       R.markInvalid("eagerly taken the loop body at least 2 times",
                     CondSubExpr);
     }
@@ -854,7 +884,7 @@ void ArrayBoundCheckerV2::reportOOB(CheckerContext &C,
   if (Extent)
     markPartsInteresting(*BR, ErrorState, *Extent, IsTaintBug);
 
-  BR->addVisitor<SuppressReportsHavingWeakLoopAssumption>();
+  BR->addVisitor<SuppressReportsHavingWeakLoopAssumption>(*BR);
   C.emitReport(std::move(BR));
 }
 
diff --git a/clang/test/Analysis/out-of-bounds.c b/clang/test/Analysis/out-of-bounds.c
index fde53d148ba442..1b62b2eed4ee84 100644
--- a/clang/test/Analysis/out-of-bounds.c
+++ b/clang/test/Analysis/out-of-bounds.c
@@ -279,7 +279,7 @@ void loop_suppress_in_third_iteration_logical_and(int len, int flag) {
     // short-circuiting operator '&&'.
     // I have seen a real-world FP that looks like this, but it is much rarer
     // than the basic setup.
-    buf[i] = 1; // expected-warning{{Out of bound access to memory}}
+    buf[i] = 1; // no-warning
   }
 }
 
@@ -311,7 +311,7 @@ void loop_suppress_in_third_iteration_cast(int len) {
     // assumption. There are already many differences between analysis with and
     // without eager assumptions, so it would be pointless to write more
     // complicated code to eliminate these rare differences.
-    buf[i] = 1; // eagerlyassume-warning{{Out of bound access to memory}}
+    buf[i] = 1; // no-warning
   }
 }
 

>From 26f02f3f5824d0c78132e46ace2b3d41f8dd68e2 Mon Sep 17 00:00:00 2001
From: Balazs Benics <benicsbalazs at gmail.com>
Date: Sat, 5 Oct 2024 16:59:55 +0200
Subject: [PATCH 5/5] Revert "Use the eager assumption tags only if the
 decision was ambiguous"

This reverts commit c429df63af6cdeec7e721e8f9c322f1ac6ba3b3d.
---
 clang/lib/StaticAnalyzer/Core/ExprEngine.cpp  | 15 ++++--------
 clang/test/Analysis/assuming-unsigned-ge-0.c  |  5 +++-
 clang/test/Analysis/cast-value-notes.cpp      | 20 +++++++++-------
 clang/test/Analysis/cast-value-state-dump.cpp |  5 ++--
 .../std-c-library-functions-arg-constraints.c | 24 ++++++++++++-------
 5 files changed, 40 insertions(+), 29 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
index 62066632db87d0..b1919d7027cf4d 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
@@ -3771,29 +3771,24 @@ void ExprEngine::evalEagerlyAssumeBinOpBifurcation(ExplodedNodeSet &Dst,
     SVal V = state->getSVal(Ex, Pred->getLocationContext());
     std::optional<nonloc::SymbolVal> SEV = V.getAs<nonloc::SymbolVal>();
     if (SEV && SEV->isExpression()) {
+      const std::pair<const ProgramPointTag *, const ProgramPointTag*> &tags =
+        geteagerlyAssumeBinOpBifurcationTags();
+
       ProgramStateRef StateTrue, StateFalse;
       std::tie(StateTrue, StateFalse) = state->assume(*SEV);
 
-      const ProgramPointTag *EagerTrueTag = nullptr;
-      const ProgramPointTag *EagerFalseTag = nullptr;
-
-      // Use the eager assumption tags if the choice was ambiguous.
-      if (StateTrue && StateFalse)
-        std::tie(EagerTrueTag, EagerFalseTag) =
-            geteagerlyAssumeBinOpBifurcationTags();
-
       // First assume that the condition is true.
       if (StateTrue) {
         SVal Val = svalBuilder.makeIntVal(1U, Ex->getType());
         StateTrue = StateTrue->BindExpr(Ex, Pred->getLocationContext(), Val);
-        Bldr.generateNode(Ex, Pred, StateTrue, EagerTrueTag);
+        Bldr.generateNode(Ex, Pred, StateTrue, tags.first);
       }
 
       // Next, assume that the condition is false.
       if (StateFalse) {
         SVal Val = svalBuilder.makeIntVal(0U, Ex->getType());
         StateFalse = StateFalse->BindExpr(Ex, Pred->getLocationContext(), Val);
-        Bldr.generateNode(Ex, Pred, StateFalse, EagerFalseTag);
+        Bldr.generateNode(Ex, Pred, StateFalse, tags.second);
       }
     }
   }
diff --git a/clang/test/Analysis/assuming-unsigned-ge-0.c b/clang/test/Analysis/assuming-unsigned-ge-0.c
index 18e637a6e5310e..553e68cb96c6bd 100644
--- a/clang/test/Analysis/assuming-unsigned-ge-0.c
+++ b/clang/test/Analysis/assuming-unsigned-ge-0.c
@@ -2,7 +2,10 @@
 // RUN:     -analyzer-checker=core -verify %s
 
 int assuming_unsigned_ge_0(unsigned arg) {
-  // expected-note at +2 {{'arg' is >= 0}}
+  // TODO This testcase demonstrates the current incorrect behavior of Clang
+  // Static Analyzer: here 'arg' is unsigned, so "arg >= 0" is not a fresh
+  // assumption, but it still appears in the diagnostics as if it's fresh:
+  // expected-note at +2 {{Assuming 'arg' is >= 0}}
   // expected-note at +1 {{Taking false branch}}
   if (arg < 0)
     return 0;
diff --git a/clang/test/Analysis/cast-value-notes.cpp b/clang/test/Analysis/cast-value-notes.cpp
index 0235404e73adf6..7ee224dc6e5d8f 100644
--- a/clang/test/Analysis/cast-value-notes.cpp
+++ b/clang/test/Analysis/cast-value-notes.cpp
@@ -168,8 +168,9 @@ void evalNonNullParamNonNullReturnReference(const Shape &S) {
     // expected-note at -2 {{Taking true branch}}
 
     (void)(1 / !C);
-    // expected-note at -1 {{Division by zero}}
-    // expected-warning at -2 {{Division by zero}}
+    // expected-note at -1 {{'C' is non-null}}
+    // expected-note at -2 {{Division by zero}}
+    // expected-warning at -3 {{Division by zero}}
   }
 }
 
@@ -225,8 +226,9 @@ void evalNonNullParamNonNullReturn(const Shape *S) {
     // expected-note at -2 {{Taking true branch}}
 
     (void)(1 / !C);
-    // expected-note at -1 {{Division by zero}}
-    // expected-warning at -2 {{Division by zero}}
+    // expected-note at -1 {{'C' is non-null}}
+    // expected-note at -2 {{Division by zero}}
+    // expected-warning at -3 {{Division by zero}}
   }
 }
 
@@ -241,8 +243,9 @@ void evalNonNullParamNullReturn(const Shape *S) {
     // expected-note at -4 {{Taking true branch}}
 
     (void)(1 / !T);
-    // expected-note at -1 {{Division by zero}}
-    // expected-warning at -2 {{Division by zero}}
+    // expected-note at -1 {{'T' is non-null}}
+    // expected-note at -2 {{Division by zero}}
+    // expected-warning at -3 {{Division by zero}}
   }
 }
 
@@ -262,8 +265,9 @@ void evalZeroParamNonNullReturnPointer(const Shape *S) {
   // expected-note at -2 {{'C' initialized here}}
 
   (void)(1 / !C);
-  // expected-note at -1 {{Division by zero}}
-  // expected-warning at -2 {{Division by zero}}
+  // expected-note at -1 {{'C' is non-null}}
+  // expected-note at -2 {{Division by zero}}
+  // expected-warning at -3 {{Division by zero}}
 }
 
 void evalZeroParamNonNullReturn(const Shape &S) {
diff --git a/clang/test/Analysis/cast-value-state-dump.cpp b/clang/test/Analysis/cast-value-state-dump.cpp
index 97f3b2236db087..07fd7abd848ab7 100644
--- a/clang/test/Analysis/cast-value-state-dump.cpp
+++ b/clang/test/Analysis/cast-value-state-dump.cpp
@@ -40,7 +40,8 @@ void evalNonNullParamNonNullReturn(const Shape *S) {
   // CHECK-NEXT:   ] }
 
   (void)(1 / !C);
-  // expected-note at -1 {{Division by zero}}
-  // expected-warning at -2 {{Division by zero}}
+  // expected-note at -1 {{'C' is non-null}}
+  // expected-note at -2 {{Division by zero}}
+  // expected-warning at -3 {{Division by zero}}
 }
 
diff --git a/clang/test/Analysis/std-c-library-functions-arg-constraints.c b/clang/test/Analysis/std-c-library-functions-arg-constraints.c
index a6e0ad15129f60..0b817dda98c727 100644
--- a/clang/test/Analysis/std-c-library-functions-arg-constraints.c
+++ b/clang/test/Analysis/std-c-library-functions-arg-constraints.c
@@ -42,7 +42,8 @@ void test_alnum_symbolic(int x) {
   // report-warning{{TRUE}} \
   // bugpath-warning{{TRUE}} \
   // bugpath-note{{TRUE}} \
-  // bugpath-note{{Left side of '&&' is true}}
+  // bugpath-note{{Left side of '&&' is true}} \
+  // bugpath-note{{'x' is <= 255}}
 }
 
 void test_alnum_symbolic2(int x) {
@@ -75,7 +76,8 @@ void test_toupper_symbolic(int x) {
   // report-warning{{TRUE}} \
   // bugpath-warning{{TRUE}} \
   // bugpath-note{{TRUE}} \
-  // bugpath-note{{Left side of '&&' is true}}
+  // bugpath-note{{Left side of '&&' is true}} \
+  // bugpath-note{{'x' is <= 255}}
 }
 
 void test_toupper_symbolic2(int x) {
@@ -108,7 +110,8 @@ void test_tolower_symbolic(int x) {
   // report-warning{{TRUE}} \
   // bugpath-warning{{TRUE}} \
   // bugpath-note{{TRUE}} \
-  // bugpath-note{{Left side of '&&' is true}}
+  // bugpath-note{{Left side of '&&' is true}} \
+  // bugpath-note{{'x' is <= 255}}
 }
 
 void test_tolower_symbolic2(int x) {
@@ -141,7 +144,8 @@ void test_toascii_symbolic(int x) {
   // report-warning{{TRUE}} \
   // bugpath-warning{{TRUE}} \
   // bugpath-note{{TRUE}} \
-  // bugpath-note{{Left side of '&&' is true}}
+  // bugpath-note{{Left side of '&&' is true}} \
+  // bugpath-note{{'x' is <= 255}}
 }
 
 void test_toascii_symbolic2(int x) {
@@ -169,7 +173,8 @@ void test_notnull_symbolic(FILE *fp, int *buf) {
   clang_analyzer_eval(buf != 0); // \
   // report-warning{{TRUE}} \
   // bugpath-warning{{TRUE}} \
-  // bugpath-note{{TRUE}}
+  // bugpath-note{{TRUE}} \
+  // bugpath-note{{'buf' is not equal to null}}
 }
 void test_notnull_symbolic2(FILE *fp, int *buf) {
   if (!buf)                          // bugpath-note{{Assuming 'buf' is null}} \
@@ -213,7 +218,8 @@ void test_notnull_buffer_3(void *buf) {
   clang_analyzer_eval(buf != 0); // \
   // report-warning{{TRUE}} \
   // bugpath-warning{{TRUE}} \
-  // bugpath-note{{TRUE}}
+  // bugpath-note{{TRUE}} \
+  // bugpath-note{{'buf' is not equal to null}}
 }
 
 void test_no_node_after_bug(FILE *fp, size_t size, size_t n, void *buf) {
@@ -293,7 +299,8 @@ void test_buf_size_symbolic(int s) {
   clang_analyzer_eval(s <= 3); // \
   // report-warning{{TRUE}} \
   // bugpath-warning{{TRUE}} \
-  // bugpath-note{{TRUE}}
+  // bugpath-note{{TRUE}} \
+  // bugpath-note{{'s' is <= 3}}
 }
 void test_buf_size_symbolic_and_offset(int s) {
   char buf[3];
@@ -301,7 +308,8 @@ void test_buf_size_symbolic_and_offset(int s) {
   clang_analyzer_eval(s <= 2); // \
   // report-warning{{TRUE}} \
   // bugpath-warning{{TRUE}} \
-  // bugpath-note{{TRUE}}
+  // bugpath-note{{TRUE}} \
+  // bugpath-note{{'s' is <= 2}}
 }
 
 int __buf_size_arg_constraint_mul(const void *, size_t, size_t);