[clang-tools-extra] [llvm] [clang] [analyzer] Support interestingness in ArrayBoundV2 (PR #78315)

via cfe-commits cfe-commits at lists.llvm.org
Mon Jan 29 13:07:04 PST 2024


=?utf-8?q?Donát?= Nagy <donat.nagy at ericsson.com>,
=?utf-8?q?Donát?= Nagy <donat.nagy at ericsson.com>,
=?utf-8?q?Donát?= Nagy <donat.nagy at ericsson.com>,
=?utf-8?q?Donát?= Nagy <donat.nagy at ericsson.com>,
=?utf-8?q?Donát?= Nagy <donat.nagy at ericsson.com>,
=?utf-8?q?Donát?= Nagy <donat.nagy at ericsson.com>,
=?utf-8?q?Donát?= Nagy <donat.nagy at ericsson.com>,
=?utf-8?q?Donát?= Nagy <donat.nagy at ericsson.com>,
=?utf-8?q?Donát?= Nagy <donat.nagy at ericsson.com>,
=?utf-8?q?Donát?= Nagy <donat.nagy at ericsson.com>,
=?utf-8?q?Donát?= Nagy <donat.nagy at ericsson.com>,
=?utf-8?q?Donát?= Nagy <donat.nagy at ericsson.com>,
=?utf-8?q?Donát?= Nagy <donat.nagy at ericsson.com>,
=?utf-8?q?Donát?= Nagy <donat.nagy at ericsson.com>
Message-ID:
In-Reply-To: <llvm.org/llvm/llvm-project/pull/78315 at github.com>


https://github.com/NagyDonat updated https://github.com/llvm/llvm-project/pull/78315

>From c75c05c6e894a46797913c5bdccb240cbcc01ae9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Don=C3=A1t=20Nagy?= <donat.nagy at ericsson.com>
Date: Tue, 12 Dec 2023 13:07:54 +0100
Subject: [PATCH 01/16] [analyzer] Support interestingness in ArrayBoundV2

This commit improves alpha.security.ArrayBoundV2 in two connected areas:
(1) It calls `markInteresting()` on the symbolic values that are
responsible for the out of bounds access.
(2) Its index-is-in-bounds assumptions are reported in note tags if they
provide information about the value of an interesting symbol.

This commit is limited to "display" changes: it introduces new
diagnostic pieces (potentially to bugs found by other checkers), but the
ArrayBoundV2 will make the same assumptions and detect the same bugs
before and after this change.

As a minor unrelated change, this commit also updates/removes some very
old comments which became obsolate due to my previous changes.
---
 .../Checkers/ArrayBoundCheckerV2.cpp          | 345 ++++++++++++++----
 .../test/Analysis/out-of-bounds-diagnostics.c |  10 +
 clang/test/Analysis/out-of-bounds-notes.c     | 128 +++++++
 3 files changed, 413 insertions(+), 70 deletions(-)
 create mode 100644 clang/test/Analysis/out-of-bounds-notes.c

diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
index 6c7a1601402efac..4241fa3a2ce8af6 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
@@ -33,7 +33,66 @@ using namespace taint;
 using llvm::formatv;
 
 namespace {
-enum OOB_Kind { OOB_Precedes, OOB_Exceeds, OOB_Taint };
+class StateUpdateReporter {
+  const SubRegion *Reg;
+  NonLoc ByteOffsetVal;
+  std::optional<QualType> ElementType = std::nullopt;
+  std::optional<int64_t> ElementSize = std::nullopt;
+  bool AssumedNonNegative = false;
+  std::optional<NonLoc> AssumedUpperBound = std::nullopt;
+
+public:
+  StateUpdateReporter(const SubRegion *R, NonLoc ByteOffsVal, const Expr *E,
+                      CheckerContext &C)
+      : Reg(R), ByteOffsetVal(ByteOffsVal) {
+    initializeElementInfo(E, C);
+  }
+
+  void initializeElementInfo(const Expr *E, CheckerContext &C) {
+    if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(E)) {
+      const MemRegion *SubscriptBaseReg =
+          C.getSVal(ASE->getBase()).getAsRegion();
+      if (!SubscriptBaseReg)
+        return;
+      SubscriptBaseReg = SubscriptBaseReg->StripCasts();
+      if (!isa_and_nonnull<ElementRegion>(SubscriptBaseReg)) {
+        ElementType = ASE->getType();
+        ElementSize =
+            C.getASTContext().getTypeSizeInChars(*ElementType).getQuantity();
+      }
+    }
+  }
+  void recordNonNegativeAssumption() { AssumedNonNegative = true; }
+  void recordUpperBoundAssumption(NonLoc UpperBoundVal) {
+    AssumedUpperBound = UpperBoundVal;
+  }
+
+  const NoteTag *createNoteTag(CheckerContext &C) const;
+
+private:
+  std::string getMessage(PathSensitiveBugReport &BR) const;
+
+  /// Return true if information about the value of `Sym` can put constraints
+  /// on some symbol which is interesting within the bug report `BR`.
+  /// In particular, this returns true when `Sym` is interesting within `BR`;
+  /// but it also returns true if `Sym` is an expression that contains integer
+  /// constants and a single symbolic operand which is interesting (in `BR`).
+  /// We need to use this instead of plain `BR.isInteresting()` because if we
+  /// are analyzing code like
+  ///   int array[10];
+  ///   int f(int arg) {
+  ///     return array[arg] && array[arg + 10];
+  ///   }
+  /// then the byte offsets are `arg * 4` and `(arg + 10) * 4`, which are not
+  /// sub-expressions of each other (but `getSimplifiedOffsets` is smart enough
+  /// to detect this out of bounds access).
+  static bool providesInformationAboutInteresting(SymbolRef Sym,
+                                                  PathSensitiveBugReport &BR);
+  static bool providesInformationAboutInteresting(SVal SV,
+                                                  PathSensitiveBugReport &BR) {
+    return providesInformationAboutInteresting(SV.getAsSymbol(), BR);
+  }
+};
 
 struct Messages {
   std::string Short, Full;
@@ -54,11 +113,14 @@ class ArrayBoundCheckerV2 : public Checker<check::PostStmt<ArraySubscriptExpr>,
 
   void performCheck(const Expr *E, CheckerContext &C) const;
 
-  void reportOOB(CheckerContext &C, ProgramStateRef ErrorState, OOB_Kind Kind,
-                 NonLoc Offset, Messages Msgs) const;
+  void reportOOB(CheckerContext &C, ProgramStateRef ErrorState, Messages Msgs,
+                 NonLoc Offset, bool IsTaintBug = false) const;
 
   static bool isFromCtypeMacro(const Stmt *S, ASTContext &AC);
 
+  static bool isIdiomaticPastTheEndPtr(const Expr *E, ProgramStateRef State,
+                                       NonLoc Offset, NonLoc Limit,
+                                       CheckerContext &C);
   static bool isInAddressOf(const Stmt *S, ASTContext &AC);
 
 public:
@@ -133,12 +195,19 @@ computeOffset(ProgramStateRef State, SValBuilder &SVB, SVal Location) {
   return std::nullopt;
 }
 
-// TODO: once the constraint manager is smart enough to handle non simplified
-// symbolic expressions remove this function. Note that this can not be used in
-// the constraint manager as is, since this does not handle overflows. It is
-// safe to assume, however, that memory offsets will not overflow.
-// NOTE: callers of this function need to be aware of the effects of overflows
-// and signed<->unsigned conversions!
+// NOTE: This function is the "heart" of this checker. It simplifies
+// inequalities with transformations that are valid (and very elementary) in
+// pure mathematics, but become invalid if we use them in C++ number model
+// where the calculations may overflow.
+// Due to the overflow issues I think it's impossible (or at least not
+// practical) to integrate this kind of simplification into the resolution of
+// arbitrary inequalities (i.e. the code of `evalBinOp`); but this function
+// produces valid results if the arguments are memory offsets which are known
+// to be << SIZE_MAX.
+// TODO: This algorithm should be moved to a central location where it's
+// available for other checkers that need to compare memory offsets.
+// NOTE: When using the results of this function, don't forget that `evalBinOp`
+// uses the evaluation rules of C++, so e.g. `(size_t)123 < -1`!
 static std::pair<NonLoc, nonloc::ConcreteInt>
 getSimplifiedOffsets(NonLoc offset, nonloc::ConcreteInt extent,
                      SValBuilder &svalBuilder) {
@@ -239,13 +308,8 @@ static std::optional<int64_t> getConcreteValue(NonLoc SV) {
   return std::nullopt;
 }
 
-static std::string getShortMsg(OOB_Kind Kind, std::string RegName) {
-  static const char *ShortMsgTemplates[] = {
-      "Out of bound access to memory preceding {0}",
-      "Out of bound access to memory after the end of {0}",
-      "Potential out of bound access to {0} with tainted offset"};
-
-  return formatv(ShortMsgTemplates[Kind], RegName);
+static std::optional<int64_t> getConcreteValue(std::optional<NonLoc> SV) {
+  return SV ? getConcreteValue(*SV) : std::nullopt;
 }
 
 static Messages getPrecedesMsgs(const SubRegion *Region, NonLoc Offset) {
@@ -255,7 +319,28 @@ static Messages getPrecedesMsgs(const SubRegion *Region, NonLoc Offset) {
   Out << "Access of " << RegName << " at negative byte offset";
   if (auto ConcreteIdx = Offset.getAs<nonloc::ConcreteInt>())
     Out << ' ' << ConcreteIdx->getValue();
-  return {getShortMsg(OOB_Precedes, RegName), std::string(Buf)};
+  return {formatv("Out of bound access to memory preceding {0}", RegName),
+          std::string(Buf)};
+}
+
+/// Try to divide `Val1` and `Val2` (in place) by `Divisor` and return true if
+/// it can be performed (`Divisor` is nonzero and there is no remainder). The
+/// values `Val1` and `Val2` may be nullopt and in that case the corresponding
+/// division is considered to be successful.
+bool tryDividePair(std::optional<int64_t> &Val1, std::optional<int64_t> &Val2,
+                   int64_t Divisor) {
+  if (!Divisor)
+    return false;
+  const bool Val1HasRemainder = Val1 && *Val1 % Divisor;
+  const bool Val2HasRemainder = Val2 && *Val2 % Divisor;
+  if (!Val1HasRemainder && !Val2HasRemainder) {
+    if (Val1)
+      *Val1 /= Divisor;
+    if (Val2)
+      *Val2 /= Divisor;
+    return true;
+  }
+  return false;
 }
 
 static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region,
@@ -268,18 +353,9 @@ static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region,
   std::optional<int64_t> OffsetN = getConcreteValue(Offset);
   std::optional<int64_t> ExtentN = getConcreteValue(Extent);
 
-  bool UseByteOffsets = true;
-  if (int64_t ElemSize = ACtx.getTypeSizeInChars(ElemType).getQuantity()) {
-    const bool OffsetHasRemainder = OffsetN && *OffsetN % ElemSize;
-    const bool ExtentHasRemainder = ExtentN && *ExtentN % ElemSize;
-    if (!OffsetHasRemainder && !ExtentHasRemainder) {
-      UseByteOffsets = false;
-      if (OffsetN)
-        *OffsetN /= ElemSize;
-      if (ExtentN)
-        *ExtentN /= ElemSize;
-    }
-  }
+  int64_t ElemSize = ACtx.getTypeSizeInChars(ElemType).getQuantity();
+
+  bool UseByteOffsets = !tryDividePair(OffsetN, ExtentN, ElemSize);
 
   SmallString<256> Buf;
   llvm::raw_svector_ostream Out(Buf);
@@ -307,7 +383,9 @@ static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region,
       Out << "s";
   }
 
-  return {getShortMsg(OOB_Exceeds, RegName), std::string(Buf)};
+  return {
+      formatv("Out of bound access to memory after the end of {0}", RegName),
+      std::string(Buf)};
 }
 
 static Messages getTaintMsgs(const SubRegion *Region, const char *OffsetName) {
@@ -318,17 +396,87 @@ static Messages getTaintMsgs(const SubRegion *Region, const char *OffsetName) {
                   RegName, OffsetName)};
 }
 
-void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
-  // NOTE: Instead of using ProgramState::assumeInBound(), we are prototyping
-  // some new logic here that reasons directly about memory region extents.
-  // Once that logic is more mature, we can bring it back to assumeInBound()
-  // for all clients to use.
-  //
-  // The algorithm we are using here for bounds checking is to see if the
-  // memory access is within the extent of the base region.  Since we
-  // have some flexibility in defining the base region, we can achieve
-  // various levels of conservatism in our buffer overflow checking.
+const NoteTag *StateUpdateReporter::createNoteTag(CheckerContext &C) const {
+  // Don't create a note tag if we didn't assume anything:
+  if (!AssumedNonNegative && !AssumedUpperBound)
+    return nullptr;
+
+  return C.getNoteTag(
+      [*this](PathSensitiveBugReport &BR) -> std::string {
+        return getMessage(BR);
+      },
+      /*isPrunable=*/false);
+}
+
+std::string StateUpdateReporter::getMessage(PathSensitiveBugReport &BR) const {
+  bool ShouldReportNonNegative = AssumedNonNegative;
+  if (!providesInformationAboutInteresting(ByteOffsetVal, BR)) {
+    if (AssumedUpperBound &&
+        providesInformationAboutInteresting(*AssumedUpperBound, BR))
+      ShouldReportNonNegative = false;
+    else
+      return "";
+  }
+
+  std::optional<int64_t> OffsetN = getConcreteValue(ByteOffsetVal);
+  std::optional<int64_t> ExtentN = getConcreteValue(AssumedUpperBound);
 
+  const bool UseIndex =
+      ElementSize && tryDividePair(OffsetN, ExtentN, *ElementSize);
+
+  SmallString<256> Buf;
+  llvm::raw_svector_ostream Out(Buf);
+  Out << "Assuming ";
+  if (UseIndex) {
+    Out << "index ";
+    if (OffsetN)
+      Out << "'" << OffsetN << "' ";
+  } else if (AssumedUpperBound) {
+    Out << "byte offset ";
+    if (OffsetN)
+      Out << "'" << OffsetN << "' ";
+  } else {
+    Out << "offset ";
+  }
+
+  Out << "is";
+  if (ShouldReportNonNegative) {
+    Out << " non-negative";
+  }
+  if (AssumedUpperBound) {
+    if (ShouldReportNonNegative)
+      Out << " and";
+    Out << " less than ";
+    if (ExtentN)
+      Out << *ExtentN << ", ";
+    if (UseIndex && ElementType)
+      Out << "the number of '" << ElementType->getAsString()
+          << "' elements in ";
+    else
+      Out << "the extent of ";
+    Out << getRegionName(Reg);
+  }
+  return std::string(Out.str());
+}
+
+bool StateUpdateReporter::providesInformationAboutInteresting(
+    SymbolRef Sym, PathSensitiveBugReport &BR) {
+  if (!Sym)
+    return false;
+  for (SymbolRef PartSym : Sym->symbols()) {
+    // The interestingess mark may appear on any layer as we're stripping off
+    // the SymIntExpr, UnarySymExpr etc. layers...
+    if (BR.isInteresting(PartSym))
+      return true;
+    // ...but if both sides of the expression are symbolic (i.e. unknown), then
+    // the analyzer can't use the combined result to constrain the operands.
+    if (isa<SymSymExpr>(PartSym))
+      return false;
+  }
+  return false;
+}
+
+void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
   const SVal Location = C.getSVal(E);
 
   // The header ctype.h (from e.g. glibc) implements the isXXXXX() macros as
@@ -350,6 +498,10 @@ void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
 
   auto [Reg, ByteOffset] = *RawOffset;
 
+  // The state updates will be reported as a single note tag, which will be
+  // composed by this helper class.
+  StateUpdateReporter SUR(Reg, ByteOffset, E, C);
+
   // CHECK LOWER BOUND
   const MemSpaceRegion *Space = Reg->getMemorySpace();
   if (!(isa<SymbolicRegion>(Reg) && isa<UnknownSpaceRegion>(Space))) {
@@ -363,13 +515,22 @@ void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
     auto [PrecedesLowerBound, WithinLowerBound] = compareValueToThreshold(
         State, ByteOffset, SVB.makeZeroArrayIndex(), SVB);
 
-    if (PrecedesLowerBound && !WithinLowerBound) {
-      // We know that the index definitely precedes the lower bound.
-      Messages Msgs = getPrecedesMsgs(Reg, ByteOffset);
-      reportOOB(C, PrecedesLowerBound, OOB_Precedes, ByteOffset, Msgs);
-      return;
+    if (PrecedesLowerBound) {
+      // The offset may be invalid (negative)...
+      if (!WithinLowerBound) {
+        // ...and it cannot be valid (>= 0), so report an error.
+        Messages Msgs = getPrecedesMsgs(Reg, ByteOffset);
+        reportOOB(C, PrecedesLowerBound, Msgs, ByteOffset);
+        return;
+      }
+      // ...but it can be valid as well, so the checker will (optimistically)
+      // assume that it's valid and mention this in the note tag.
+      SUR.recordNonNegativeAssumption();
     }
 
+    // Actually update the state. The "if" only fails in the extremely unlikely
+    // case when compareValueToThreshold returns {nullptr, nullptr} becasue
+    // evalBinOpNN fails to evaluate the less-than operator.
     if (WithinLowerBound)
       State = WithinLowerBound;
   }
@@ -381,32 +542,30 @@ void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
         compareValueToThreshold(State, ByteOffset, *KnownSize, SVB);
 
     if (ExceedsUpperBound) {
+      // The offset may be invalid (>= Size)...
       if (!WithinUpperBound) {
-        // We know that the index definitely exceeds the upper bound.
-        if (isa<ArraySubscriptExpr>(E) && isInAddressOf(E, C.getASTContext())) {
-          // ...but this is within an addressof expression, so we need to check
-          // for the exceptional case that `&array[size]` is valid.
-          auto [EqualsToThreshold, NotEqualToThreshold] =
-              compareValueToThreshold(ExceedsUpperBound, ByteOffset, *KnownSize,
-                                      SVB, /*CheckEquality=*/true);
-          if (EqualsToThreshold && !NotEqualToThreshold) {
-            // We are definitely in the exceptional case, so return early
-            // instead of reporting a bug.
-            C.addTransition(EqualsToThreshold);
-            return;
-          }
+        // ...and it cannot be within bounds, so report an error, unless we can
+        // definitely determine that this is an idiomatic `&array[size]`
+        // expression that calculates the past-the-end pointer.
+        if (isIdiomaticPastTheEndPtr(E, ExceedsUpperBound, ByteOffset,
+                                     *KnownSize, C)) {
+          // FIXME: this duplicates the `addTransition` at the end of the
+          // function, but `goto` is taboo nowdays.
+          C.addTransition(ExceedsUpperBound, SUR.createNoteTag(C));
+          return;
         }
+
         Messages Msgs = getExceedsMsgs(C.getASTContext(), Reg, ByteOffset,
                                        *KnownSize, Location);
-        reportOOB(C, ExceedsUpperBound, OOB_Exceeds, ByteOffset, Msgs);
+        reportOOB(C, ExceedsUpperBound, Msgs, ByteOffset);
         return;
       }
+      // ...and it can be valid as well...
       if (isTainted(State, ByteOffset)) {
-        // Both cases are possible, but the offset is tainted, so report.
-        std::string RegName = getRegionName(Reg);
+        // ...but it's tainted, so report an error.
 
-        // Diagnostic detail: "tainted offset" is always correct, but the
-        // common case is that 'idx' is tainted in 'arr[idx]' and then it's
+        // Diagnostic detail: saying "tainted offset" is always correct, but
+        // the common case is that 'idx' is tainted in 'arr[idx]' and then it's
         // nicer to say "tainted index".
         const char *OffsetName = "offset";
         if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(E))
@@ -414,33 +573,67 @@ void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
             OffsetName = "index";
 
         Messages Msgs = getTaintMsgs(Reg, OffsetName);
-        reportOOB(C, ExceedsUpperBound, OOB_Taint, ByteOffset, Msgs);
+        reportOOB(C, ExceedsUpperBound, Msgs, ByteOffset, /*IsTaintBug=*/true);
         return;
       }
+      // ...and it isn't tainted, so the checker will (optimistically) assume
+      // that the offset is in bounds and mention this in the note tag.
+      SUR.recordUpperBoundAssumption(*KnownSize);
     }
 
+    // Actually update the state. The "if" only fails in the extremely unlikely
+    // case when compareValueToThreshold returns {nullptr, nullptr} becasue
+    // evalBinOpNN fails to evaluate the less-than operator.
     if (WithinUpperBound)
       State = WithinUpperBound;
   }
 
-  C.addTransition(State);
+  // Add a transition, reporting the state updates that we accumulated.
+  C.addTransition(State, SUR.createNoteTag(C));
 }
 
 void ArrayBoundCheckerV2::reportOOB(CheckerContext &C,
-                                    ProgramStateRef ErrorState, OOB_Kind Kind,
-                                    NonLoc Offset, Messages Msgs) const {
+                                    ProgramStateRef ErrorState, Messages Msgs,
+                                    NonLoc Offset, bool IsTaintBug) const {
 
   ExplodedNode *ErrorNode = C.generateErrorNode(ErrorState);
   if (!ErrorNode)
     return;
 
   auto BR = std::make_unique<PathSensitiveBugReport>(
-      Kind == OOB_Taint ? TaintBT : BT, Msgs.Short, Msgs.Full, ErrorNode);
+      IsTaintBug ? TaintBT : BT, Msgs.Short, Msgs.Full, ErrorNode);
+
+  // FIXME: ideally we would just call trackExpressionValue() and that would
+  // "do the right thing": mark the relevant symbols as interesting, track the
+  // control dependencies and statements storing the relevant values and add
+  // helpful diagnostic pieces. However, right now trackExpressionValue() is
+  // a heap of unreliable heuristics, so it would cause several issues:
+  // - Interestingness is not applied consistently, e.g. if `array[x+10]`
+  //   causes an overflow, then `x` is not marked as interesting.
+  // - We get irrelevant diagnostic pieces, e.g. in the code
+  //   `int *p = (int*)malloc(2*sizeof(int)); p[3] = 0;`
+  //   it places a "Storing uninitialized value" note on the `malloc` call
+  //   (which is technically true, but irrelevant).
+  // If trackExpressionValue() becomes reliable, it should be applied instead
+  // of the manual markInteresting() calls.
+
+  if (SymbolRef OffsetSym = Offset.getAsSymbol()) {
+    // If the offset is a symbolic value, iterate over its "parts" with
+    // `SymExpr::symbols()` and mark each of them as interesting.
+    // For example, if the offset is `x*4 + y` then we put interestingness onto
+    // the SymSymExpr `x*4 + y`, the SymIntExpr `x*4` and the two data symbols
+    // `x` and `y`.
+    for (SymbolRef PartSym : OffsetSym->symbols())
+      BR->markInteresting(PartSym);
+  }
 
-  // Track back the propagation of taintedness.
-  if (Kind == OOB_Taint)
+  if (IsTaintBug) {
+    // If the issue that we're reporting depends on the taintedness of the
+    // offset, then put interestingness onto symbols that could be the origin
+    // of the taint.
     for (SymbolRef Sym : getTaintedSymbols(ErrorState, Offset))
       BR->markInteresting(Sym);
+  }
 
   C.emitReport(std::move(BR));
 }
@@ -476,6 +669,18 @@ bool ArrayBoundCheckerV2::isInAddressOf(const Stmt *S, ASTContext &ACtx) {
   return UnaryOp && UnaryOp->getOpcode() == UO_AddrOf;
 }
 
+bool ArrayBoundCheckerV2::isIdiomaticPastTheEndPtr(const Expr *E,
+                                                   ProgramStateRef State,
+                                                   NonLoc Offset, NonLoc Limit,
+                                                   CheckerContext &C) {
+  if (isa<ArraySubscriptExpr>(E) && isInAddressOf(E, C.getASTContext())) {
+    auto [EqualsToThreshold, NotEqualToThreshold] = compareValueToThreshold(
+        State, Offset, Limit, C.getSValBuilder(), /*CheckEquality=*/true);
+    return EqualsToThreshold && !NotEqualToThreshold;
+  }
+  return false;
+}
+
 void ento::registerArrayBoundCheckerV2(CheckerManager &mgr) {
   mgr.registerChecker<ArrayBoundCheckerV2>();
 }
diff --git a/clang/test/Analysis/out-of-bounds-diagnostics.c b/clang/test/Analysis/out-of-bounds-diagnostics.c
index 769a8954f796691..7a8f59650fef161 100644
--- a/clang/test/Analysis/out-of-bounds-diagnostics.c
+++ b/clang/test/Analysis/out-of-bounds-diagnostics.c
@@ -195,12 +195,22 @@ void *malloc(size_t size);
 
 int *mallocRegion(void) {
   int *mem = (int*)malloc(2*sizeof(int));
+
   mem[3] = -2;
   // expected-warning at -1 {{Out of bound access to memory after the end of the heap area}}
   // expected-note at -2 {{Access of the heap area at index 3, while it holds only 2 'int' elements}}
   return mem;
 }
 
+int *mallocRegionDeref(void) {
+  int *mem = (int*)malloc(2*sizeof(int));
+
+  *(mem + 3) = -2;
+  // expected-warning at -1 {{Out of bound access to memory after the end of the heap area}}
+  // expected-note at -2 {{Access of the heap area at index 3, while it holds only 2 'int' elements}}
+  return mem;
+}
+
 void *alloca(size_t size);
 
 int allocaRegion(void) {
diff --git a/clang/test/Analysis/out-of-bounds-notes.c b/clang/test/Analysis/out-of-bounds-notes.c
new file mode 100644
index 000000000000000..290df79f3e18a41
--- /dev/null
+++ b/clang/test/Analysis/out-of-bounds-notes.c
@@ -0,0 +1,128 @@
+// RUN: %clang_analyze_cc1 -Wno-array-bounds -analyzer-output=text        \
+// RUN:     -analyzer-checker=core,alpha.security.ArrayBoundV2,unix.Malloc,alpha.security.taint,debug.ExprInspection -verify %s
+
+int array[10];
+
+void clang_analyzer_value(unsigned arg);
+
+int irrelevantAssumptions(int arg) {
+  int a = array[arg];
+  // Here the analyzer assumes that `arg` is in bounds, but doesn't report this
+  // because `arg` is not interesting for the bug.
+  int b = array[13];
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
+  // expected-note at -2 {{Access of 'array' at index 13, while it holds only 10 'int' elements}}
+  return a + b;
+}
+
+
+int assumingBoth(int arg) {
+  int a = array[arg];
+  // expected-note at -1 {{Assuming index is non-negative and less than 10, the number of 'int' elements in 'array'}}
+  int b = array[arg]; // no additional note, we already assumed that 'arg' is in bounds
+  int c = array[arg + 10];
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
+  // expected-note at -2 {{Access of 'array' at an overflowing index, while it holds only 10 'int' elements}}
+  return a + b + c;
+}
+
+int assumingLower(int arg) {
+  // expected-note at +2 {{Assuming 'arg' is < 10}}
+  // expected-note at +1 {{Taking false branch}}
+  if (arg >= 10)
+    return 0;
+  int a = array[arg];
+  // expected-note at -1 {{Assuming index is non-negative}}
+  int b = array[arg + 10];
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
+  // expected-note at -2 {{Access of 'array' at an overflowing index, while it holds only 10 'int' elements}}
+  return a + b;
+}
+
+int assumingUpper(int arg) {
+  // expected-note at +2 {{Assuming 'arg' is >= 0}}
+  // expected-note at +1 {{Taking false branch}}
+  if (arg < 0)
+    return 0;
+  int a = array[arg];
+  // expected-note at -1 {{Assuming index is less than 10, the number of 'int' elements in 'array'}}
+  int b = array[arg - 10];
+  // expected-warning at -1 {{Out of bound access to memory preceding 'array'}}
+  // expected-note at -2 {{Access of 'array' at negative byte offset}}
+  return a + b;
+}
+
+int assumingUpperIrrelevant(int arg) {
+  // FIXME: The assumption "assuming index is less than 10" is printed because
+  // it's assuming something about the interesting variable `arg`; however,
+  // it's irrelevant because in this testcase the out of bound access is
+  // deduced from the _lower_ bound on `arg`. Currently the analyzer cannot
+  // filter out assumptions that are logically irrelevant but "touch"
+  // interesting symbols; eventually it would be good to add support for this.
+
+  // expected-note at +2 {{Assuming 'arg' is >= 0}}
+  // expected-note at +1 {{Taking false branch}}
+  if (arg < 0)
+    return 0;
+  int a = array[arg];
+  // expected-note at -1 {{Assuming index is less than 10, the number of 'int' elements in 'array'}}
+  int b = array[arg + 10];
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
+  // expected-note at -2 {{Access of 'array' at an overflowing index, while it holds only 10 'int' elements}}
+  return a + b;
+}
+
+int assumingUpperUnsigned(unsigned arg) {
+  int a = array[arg];
+  // expected-note at -1 {{Assuming index is less than 10, the number of 'int' elements in 'array'}}
+  int b = array[(int)arg - 10];
+  // expected-warning at -1 {{Out of bound access to memory preceding 'array'}}
+  // expected-note at -2 {{Access of 'array' at negative byte offset}}
+  return a + b;
+}
+
+int assumingNothing(unsigned arg) {
+  // expected-note at +2 {{Assuming 'arg' is < 10}}
+  // expected-note at +1 {{Taking false branch}}
+  if (arg >= 10)
+    return 0;
+  int a = array[arg]; // no note here, we already know that 'arg' is in bounds
+  int b = array[(int)arg - 10];
+  // expected-warning at -1 {{Out of bound access to memory preceding 'array'}}
+  // expected-note at -2 {{Access of 'array' at negative byte offset}}
+  return a + b;
+}
+
+short assumingConverted(int arg) {
+  // When indices are reported, the note will use the element type that's the
+  // result type of the subscript operator.
+  char *cp = (char*)array;
+  char a = cp[arg];
+  // expected-note at -1 {{Assuming index is non-negative and less than 40, the number of 'char' elements in 'array'}}
+  char b = cp[arg]; // no additional note, we already assumed that 'arg' is in bounds
+  char c = cp[arg + 40];
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
+  // expected-note at -2 {{Access of 'array' at an overflowing index, while it holds only 40 'char' elements}}
+  return a + b + c;
+}
+
+struct foo {
+  int num;
+  char a[8];
+  char b[5];
+};
+
+int assumingConverted2(struct foo f, int arg) {
+  // When indices are reported, the note will use the element type that's the
+  // result type of the subscript operator.
+  int a = ((int*)(f.a))[arg];
+  // expected-note at -1 {{Assuming index is non-negative and less than 2, the number of 'int' elements in 'f.a'}}
+  // However, if the extent of the memory region is not divisible by the
+  // element size, the checker measures the offset and extent in bytes.
+  int b = ((int*)(f.b))[arg];
+  // expected-note at -1 {{Assuming byte offset is less than 5, the extent of 'f.b'}}
+  int c = array[arg-2];
+  // expected-warning at -1 {{Out of bound access to memory preceding 'array'}}
+  // expected-note at -2 {{Access of 'array' at negative byte offset}}
+  return a + b + c;
+}

>From c0a09db0fbb48626bbb069f85a587d2c94a8998a Mon Sep 17 00:00:00 2001
From: NagyDonat <donat.nagy at ericsson.com>
Date: Fri, 19 Jan 2024 19:10:54 +0100
Subject: [PATCH 02/16] Remove unused reference to debug.ExprInspection

---
 clang/test/Analysis/out-of-bounds-notes.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/clang/test/Analysis/out-of-bounds-notes.c b/clang/test/Analysis/out-of-bounds-notes.c
index 290df79f3e18a41..3070ab643d832ea 100644
--- a/clang/test/Analysis/out-of-bounds-notes.c
+++ b/clang/test/Analysis/out-of-bounds-notes.c
@@ -1,10 +1,8 @@
 // RUN: %clang_analyze_cc1 -Wno-array-bounds -analyzer-output=text        \
-// RUN:     -analyzer-checker=core,alpha.security.ArrayBoundV2,unix.Malloc,alpha.security.taint,debug.ExprInspection -verify %s
+// RUN:     -analyzer-checker=core,alpha.security.ArrayBoundV2,unix.Malloc,alpha.security.taint -verify %s
 
 int array[10];
 
-void clang_analyzer_value(unsigned arg);
-
 int irrelevantAssumptions(int arg) {
   int a = array[arg];
   // Here the analyzer assumes that `arg` is in bounds, but doesn't report this

>From 18c00f79123ed8b22feefdb3b4bcc2f08b59c79b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Don=C3=A1t=20Nagy?= <donat.nagy at ericsson.com>
Date: Mon, 22 Jan 2024 15:26:57 +0100
Subject: [PATCH 03/16] Improve comments (as suggested on code review)

---
 .../Checkers/ArrayBoundCheckerV2.cpp          | 30 +++++++++++--------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
index 4241fa3a2ce8af6..593095a2bebafef 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
@@ -202,12 +202,19 @@ computeOffset(ProgramStateRef State, SValBuilder &SVB, SVal Location) {
 // Due to the overflow issues I think it's impossible (or at least not
 // practical) to integrate this kind of simplification into the resolution of
 // arbitrary inequalities (i.e. the code of `evalBinOp`); but this function
-// produces valid results if the arguments are memory offsets which are known
-// to be << SIZE_MAX.
+// produces valid results when the calculations are handling memory offsets
+// and every value is well below SIZE_MAX.
 // TODO: This algorithm should be moved to a central location where it's
 // available for other checkers that need to compare memory offsets.
-// NOTE: When using the results of this function, don't forget that `evalBinOp`
-// uses the evaluation rules of C++, so e.g. `(size_t)123 < -1`!
+// NOTE: the simplification preserves the order of the two operands in a
+// mathematical sense, but it may change the result produced by a C++
+// comparison operator (and the automatic type conversions).
+// For example, consider a comparison "X+1 < 0", where the LHS is stored as a
+// size_t and the RHS is stored in an int. (As size_t is unsigned, this
+// comparison is false for all values of "X".) However, the simplification may
+// turn it into "X < -1", which is still always false in a mathematical sense,
+// but can produce a true result when evaluated by `evalBinOp` (which follows
+// the rules of C++ and casts -1 to SIZE_MAX).
 static std::pair<NonLoc, nonloc::ConcreteInt>
 getSimplifiedOffsets(NonLoc offset, nonloc::ConcreteInt extent,
                      SValBuilder &svalBuilder) {
@@ -401,11 +408,9 @@ const NoteTag *StateUpdateReporter::createNoteTag(CheckerContext &C) const {
   if (!AssumedNonNegative && !AssumedUpperBound)
     return nullptr;
 
-  return C.getNoteTag(
-      [*this](PathSensitiveBugReport &BR) -> std::string {
-        return getMessage(BR);
-      },
-      /*isPrunable=*/false);
+  return C.getNoteTag([*this](PathSensitiveBugReport &BR) -> std::string {
+    return getMessage(BR);
+  });
 }
 
 std::string StateUpdateReporter::getMessage(PathSensitiveBugReport &BR) const {
@@ -468,8 +473,9 @@ bool StateUpdateReporter::providesInformationAboutInteresting(
     // the SymIntExpr, UnarySymExpr etc. layers...
     if (BR.isInteresting(PartSym))
       return true;
-    // ...but if both sides of the expression are symbolic (i.e. unknown), then
-    // the analyzer can't use the combined result to constrain the operands.
+    // ...but if both sides of the expression are symbolic, then there is no
+    // practical algorithm to produce separate constraints for the two
+    // operands (from the single combined result).
     if (isa<SymSymExpr>(PartSym))
       return false;
   }
@@ -549,8 +555,6 @@ void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
         // expression that calculates the past-the-end pointer.
         if (isIdiomaticPastTheEndPtr(E, ExceedsUpperBound, ByteOffset,
                                      *KnownSize, C)) {
-          // FIXME: this duplicates the `addTransition` at the end of the
-          // function, but `goto` is taboo nowdays.
           C.addTransition(ExceedsUpperBound, SUR.createNoteTag(C));
           return;
         }

>From 6ed41d51078796983983b777af6067056de78299 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Don=C3=A1t=20Nagy?= <donat.nagy at ericsson.com>
Date: Mon, 22 Jan 2024 15:56:30 +0100
Subject: [PATCH 04/16] Make a function 'static'

---
 clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
index 593095a2bebafef..e6f0cc0cbf0dd31 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
@@ -334,8 +334,8 @@ static Messages getPrecedesMsgs(const SubRegion *Region, NonLoc Offset) {
 /// it can be performed (`Divisor` is nonzero and there is no remainder). The
 /// values `Val1` and `Val2` may be nullopt and in that case the corresponding
 /// division is considered to be successful.
-bool tryDividePair(std::optional<int64_t> &Val1, std::optional<int64_t> &Val2,
-                   int64_t Divisor) {
+static bool tryDividePair(std::optional<int64_t> &Val1,
+                          std::optional<int64_t> &Val2, int64_t Divisor) {
   if (!Divisor)
     return false;
   const bool Val1HasRemainder = Val1 && *Val1 % Divisor;

>From 1413b2cce38859e867f23e65d418b97a112093c7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Don=C3=A1t=20Nagy?= <donat.nagy at ericsson.com>
Date: Mon, 22 Jan 2024 17:11:08 +0100
Subject: [PATCH 05/16] Add testcase 'AssumingPlainOffset'

...and give more meaningful names to two testcases before it.

Currently this TC doesn't demonstrate the intended behavior because the
simplification algorithm used by the checker is somewhat limited and it
doesn't perform divisions when the RHS is not divisible by the
multiplier that's present in LHS. This will be fixed in a separate PR
that improves the simplification algorithm.
---
 clang/test/Analysis/out-of-bounds-notes.c | 26 +++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/clang/test/Analysis/out-of-bounds-notes.c b/clang/test/Analysis/out-of-bounds-notes.c
index 3070ab643d832ea..be69526393cd483 100644
--- a/clang/test/Analysis/out-of-bounds-notes.c
+++ b/clang/test/Analysis/out-of-bounds-notes.c
@@ -91,7 +91,7 @@ int assumingNothing(unsigned arg) {
   return a + b;
 }
 
-short assumingConverted(int arg) {
+short assumingConvertedToCharP(int arg) {
   // When indices are reported, the note will use the element type that's the
   // result type of the subscript operator.
   char *cp = (char*)array;
@@ -110,7 +110,7 @@ struct foo {
   char b[5];
 };
 
-int assumingConverted2(struct foo f, int arg) {
+int assumingConvertedToIntP(struct foo f, int arg) {
   // When indices are reported, the note will use the element type that's the
   // result type of the subscript operator.
   int a = ((int*)(f.a))[arg];
@@ -124,3 +124,25 @@ int assumingConverted2(struct foo f, int arg) {
   // expected-note at -2 {{Access of 'array' at negative byte offset}}
   return a + b + c;
 }
+
+int assumingPlainOffset(struct foo f, int arg) {
+  // This TC is intended to check the corner case that the checker prints the
+  // shorter "offset" instead of "byte offset" when it's irrelevant that the
+  // offset is measured in bytes.
+
+  // expected-note at +2 {{Assuming 'arg' is < 2}}
+  // expected-note at +1 {{Taking false branch}}
+  if (arg >= 2)
+    return 0;
+
+  int b = ((int*)(f.b))[arg];
+  // expected-note at -1 {{Assuming byte offset is non-negative and less than 5, the extent of 'f.b'}}
+  // FIXME: this should be {{Assuming offset is non-negative}}
+  // but the current simplification algorithm doesn't realize that arg <= 1
+  // implies that the byte offset arg*4 will be less than 5.
+
+  int c = array[arg+10];
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
+  // expected-note at -2 {{Access of 'array' at an overflowing index, while it holds only 10 'int' elements}}
+  return b + c;
+}

>From 6f6dc734d9b0b163f93189ebf094d03d16dd48c0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Don=C3=A1t=20Nagy?= <donat.nagy at ericsson.com>
Date: Mon, 22 Jan 2024 17:35:57 +0100
Subject: [PATCH 06/16] Explain a corner case that's currently non-functional

---
 .../StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
index e6f0cc0cbf0dd31..002d0315bc65972 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
@@ -417,10 +417,19 @@ std::string StateUpdateReporter::getMessage(PathSensitiveBugReport &BR) const {
   bool ShouldReportNonNegative = AssumedNonNegative;
   if (!providesInformationAboutInteresting(ByteOffsetVal, BR)) {
     if (AssumedUpperBound &&
-        providesInformationAboutInteresting(*AssumedUpperBound, BR))
+        providesInformationAboutInteresting(*AssumedUpperBound, BR)) {
+      // Even if the byte offset isn't interesting (e.g. it's a constant value),
+      // the assumption can still be interesting if it provides information about
+      // an interesting symbolic upper bound.
+      // FIXME: This code path is currently non-functional and untested because
+      // `getSimplifiedOffsets()` only works when the RHS (extent) is constant.
+
+      // In this case don't display the "offset is non-negative" assumption
+      // (because the offset isn't interesting).
       ShouldReportNonNegative = false;
-    else
+    } else {
       return "";
+    }
   }
 
   std::optional<int64_t> OffsetN = getConcreteValue(ByteOffsetVal);

>From a57cbf2834b9ce3041de6435c9e7ec1b244a2832 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Don=C3=A1t=20Nagy?= <donat.nagy at ericsson.com>
Date: Mon, 22 Jan 2024 17:43:40 +0100
Subject: [PATCH 07/16] Satisfy git-clang-format

---
 clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
index 002d0315bc65972..24247f340317be4 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
@@ -419,8 +419,8 @@ std::string StateUpdateReporter::getMessage(PathSensitiveBugReport &BR) const {
     if (AssumedUpperBound &&
         providesInformationAboutInteresting(*AssumedUpperBound, BR)) {
       // Even if the byte offset isn't interesting (e.g. it's a constant value),
-      // the assumption can still be interesting if it provides information about
-      // an interesting symbolic upper bound.
+      // the assumption can still be interesting if it provides information
+      // about an interesting symbolic upper bound.
       // FIXME: This code path is currently non-functional and untested because
       // `getSimplifiedOffsets()` only works when the RHS (extent) is constant.
 

>From 399deca38538d5b682ff1307a42460fa706cc78e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Don=C3=A1t=20Nagy?= <donat.nagy at ericsson.com>
Date: Thu, 25 Jan 2024 17:31:52 +0100
Subject: [PATCH 08/16] Workaround to partially resolve issues with symbolic
 extent

It seems that constraints on symbols used as the dynamic extent of an
allocated area can be garbage collected when there are no other
references. This commit adds a temporary `(void)variable` hack in
`out-of-bounds-diagnostics.c` to keep the relevant symbols alive and
ensure that the testcase can validate the behavior of
ArrayBoundCheckerV2.

In addition, the misleading names of some testcases are fixed.
---
 .../test/Analysis/out-of-bounds-diagnostics.c | 37 ++++++++++++++++---
 1 file changed, 31 insertions(+), 6 deletions(-)

diff --git a/clang/test/Analysis/out-of-bounds-diagnostics.c b/clang/test/Analysis/out-of-bounds-diagnostics.c
index 7a8f59650fef161..786962fc1b708c3 100644
--- a/clang/test/Analysis/out-of-bounds-diagnostics.c
+++ b/clang/test/Analysis/out-of-bounds-diagnostics.c
@@ -221,18 +221,38 @@ int allocaRegion(void) {
   return *mem;
 }
 
-int *unknownExtent(int arg) {
-  if (arg >= 2)
+int *symbolicExtent(int arg) {
+  // expected-note at +2 {{Assuming 'arg' is < 5}}
+  // expected-note at +1 {{Taking false branch}}
+  if (arg >= 5)
     return 0;
   int *mem = (int*)malloc(arg);
+
+  // TODO: without the following reference to 'arg', the analyzer would discard
+  // the range information about (the symbolic value of) 'arg'. This is
+  // incorrect because while the variable itself is inaccessible, it becomes
+  // the symbolic extent of 'mem', so we still want to reason about its
+  // potential values.
+  (void)arg;
+
   mem[8] = -2;
-  // FIXME: this should produce
-  //   {{Out of bound access to memory after the end of the heap area}}
-  //   {{Access of 'int' element in the heap area at index 8}}
+  // expected-warning at -1 {{Out of bound access to memory after the end of the heap area}}
+  // expected-note at -2 {{Access of 'int' element in the heap area at index 8}}
   return mem;
 }
 
-void unknownIndex(int arg) {
+int *symbolicExtentDiscardedRangeInfo(int arg) {
+  // This is a copy of the case 'symbolicExtent' without the '(void)arg' hack.
+  // TODO: if the analyzer can detect the out-of-bounds access within this TC,
+  // then remove this TC and the `(void)arg` hack from `symbolicExtent`.
+  if (arg >= 5)
+    return 0;
+  int *mem = (int*)malloc(arg);
+  mem[8] = -2;
+  return mem;
+}
+
+void symbolicIndex(int arg) {
   // expected-note at +2 {{Assuming 'arg' is >= 12}}
   // expected-note at +1 {{Taking true branch}}
   if (arg >= 12)
@@ -245,10 +265,15 @@ int *nothingIsCertain(int x, int y) {
   if (x >= 2)
     return 0;
   int *mem = (int*)malloc(x);
+
+  // TODO: The next line is a temporary hack; see 'symbolicExtent()' for details.
+  (void)x;
+
   if (y >= 8)
     mem[y] = -2;
   // FIXME: this should produce
   //   {{Out of bound access to memory after the end of the heap area}}
   //   {{Access of 'int' element in the heap area at an overflowing index}}
+  // but apparently the analyzer isn't smart enough to deduce this.
   return mem;
 }

>From 1fb1ce411d1788951b0a97befafc0550880bc374 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Don=C3=A1t=20Nagy?= <donat.nagy at ericsson.com>
Date: Fri, 26 Jan 2024 12:41:37 +0100
Subject: [PATCH 09/16] Move initialization of StateUpdateReporter to static
 function

...`const`ify some data members and add comments to clarify some obscure
goals of this code.
---
 .../Checkers/ArrayBoundCheckerV2.cpp          | 53 ++++++++++++-------
 1 file changed, 33 insertions(+), 20 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
index 24247f340317be4..3299863e1a5ae6a 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
@@ -33,35 +33,48 @@ using namespace taint;
 using llvm::formatv;
 
 namespace {
+/// If `E` is a "clean" array subscript expression, return the type of the
+/// accessed element. If the base of the subscript expression is modified by
+/// pointer arithmetic (and not the beginning of a "full" memory region), this
+/// always returns nullopt because that's the right (or the least bad) thing to
+/// do for the diagnostic output that's relying on this.
+static std::optional<QualType> determineElementType(const Expr *E, const CheckerContext &C) {
+  const auto *ASE = dyn_cast<ArraySubscriptExpr>(E);
+  if (!ASE)
+    return std::nullopt;
+
+  const MemRegion *SubscriptBaseReg = C.getSVal(ASE->getBase()).getAsRegion();
+  if (!SubscriptBaseReg)
+    return std::nullopt;
+
+  // The base of the subscript expression is affected by pointer arithmetics,
+  // so we want to report byte offsets instead of indices.
+  if (isa<ElementRegion>(SubscriptBaseReg->StripCasts()))
+    return std::nullopt;
+
+  return ASE->getType();
+}
+
+
+static std::optional<int64_t> determineElementSize(const std::optional<QualType> T, const CheckerContext &C) {
+  if (!T)
+    return std::nullopt;
+  return C.getASTContext().getTypeSizeInChars(*T).getQuantity();
+}
+
 class StateUpdateReporter {
   const SubRegion *Reg;
-  NonLoc ByteOffsetVal;
-  std::optional<QualType> ElementType = std::nullopt;
-  std::optional<int64_t> ElementSize = std::nullopt;
+  const NonLoc ByteOffsetVal;
+  const std::optional<QualType> ElementType;
+  const std::optional<int64_t> ElementSize;
   bool AssumedNonNegative = false;
   std::optional<NonLoc> AssumedUpperBound = std::nullopt;
 
 public:
   StateUpdateReporter(const SubRegion *R, NonLoc ByteOffsVal, const Expr *E,
                       CheckerContext &C)
-      : Reg(R), ByteOffsetVal(ByteOffsVal) {
-    initializeElementInfo(E, C);
-  }
+      : Reg(R), ByteOffsetVal(ByteOffsVal), ElementType(determineElementType(E, C)), ElementSize(determineElementSize(ElementType, C)) {}
 
-  void initializeElementInfo(const Expr *E, CheckerContext &C) {
-    if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(E)) {
-      const MemRegion *SubscriptBaseReg =
-          C.getSVal(ASE->getBase()).getAsRegion();
-      if (!SubscriptBaseReg)
-        return;
-      SubscriptBaseReg = SubscriptBaseReg->StripCasts();
-      if (!isa_and_nonnull<ElementRegion>(SubscriptBaseReg)) {
-        ElementType = ASE->getType();
-        ElementSize =
-            C.getASTContext().getTypeSizeInChars(*ElementType).getQuantity();
-      }
-    }
-  }
   void recordNonNegativeAssumption() { AssumedNonNegative = true; }
   void recordUpperBoundAssumption(NonLoc UpperBoundVal) {
     AssumedUpperBound = UpperBoundVal;

>From b949ef90fb474a26f4e0f954b7149f5f94498114 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Don=C3=A1t=20Nagy?= <donat.nagy at ericsson.com>
Date: Fri, 26 Jan 2024 12:52:37 +0100
Subject: [PATCH 10/16] Add testcase assumingBothPointerToMiddle

---
 clang/test/Analysis/out-of-bounds-notes.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/clang/test/Analysis/out-of-bounds-notes.c b/clang/test/Analysis/out-of-bounds-notes.c
index be69526393cd483..6dcba98c3be518f 100644
--- a/clang/test/Analysis/out-of-bounds-notes.c
+++ b/clang/test/Analysis/out-of-bounds-notes.c
@@ -24,6 +24,20 @@ int assumingBoth(int arg) {
   return a + b + c;
 }
 
+int assumingBothPointerToMiddle(int arg) {
+  // If we're accessing an array through a pointer pointing to its middle, the checker
+  // will speak about the "byte offset" measured from the beginning of the array.
+  int *p = array + 2;
+  int a = p[arg];
+  // expected-note at -1 {{Assuming byte offset is non-negative and less than 40, the extent of 'array'}}
+  int b = array[arg]; // This is normal access, and only the lower bound is new.
+  // expected-note at -1 {{Assuming index is non-negative}}
+  int c = array[arg + 10];
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
+  // expected-note at -2 {{Access of 'array' at an overflowing index, while it holds only 10 'int' elements}}
+  return a + b + c;
+}
+
 int assumingLower(int arg) {
   // expected-note at +2 {{Assuming 'arg' is < 10}}
   // expected-note at +1 {{Taking false branch}}

>From 24a355d36015e9904bbce6613b30b7387ab194c5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Don=C3=A1t=20Nagy?= <donat.nagy at ericsson.com>
Date: Fri, 26 Jan 2024 19:05:44 +0100
Subject: [PATCH 11/16] Add a FIXME note in assumingBothPointerToMiddle

Unfortunately this testcase does not work because there is YET ANOTHER
arbitrary limitation in the clang SA engine.
---
 clang/test/Analysis/out-of-bounds-notes.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/clang/test/Analysis/out-of-bounds-notes.c b/clang/test/Analysis/out-of-bounds-notes.c
index 6dcba98c3be518f..b409af94d576f9d 100644
--- a/clang/test/Analysis/out-of-bounds-notes.c
+++ b/clang/test/Analysis/out-of-bounds-notes.c
@@ -29,7 +29,12 @@ int assumingBothPointerToMiddle(int arg) {
   // will speak about the "byte offset" measured from the beginning of the array.
   int *p = array + 2;
   int a = p[arg];
-  // expected-note at -1 {{Assuming byte offset is non-negative and less than 40, the extent of 'array'}}
+  // FIXME: The following note does not appear:
+  //  {{Assuming byte offset is non-negative and less than 40, the extent of 'array'}}
+  // It seems that the analyzer "gives up" modeling this pointer arithmetics and says that
+  // `p[arg]` is just an UnknownVal (instead of calculating that it's
+  // equivalent to `array[2+arg]`).
+
   int b = array[arg]; // This is normal access, and only the lower bound is new.
   // expected-note at -1 {{Assuming index is non-negative}}
   int c = array[arg + 10];

>From e2b420cba8841d5149cf96378a968c33e29bf3f3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Don=C3=A1t=20Nagy?= <donat.nagy at ericsson.com>
Date: Fri, 26 Jan 2024 19:19:19 +0100
Subject: [PATCH 12/16] Rename example array `array` to `TenElements`

...for the convenience of a reader who looks at a random testcase in the
middle of the test file.
---
 .../test/Analysis/out-of-bounds-diagnostics.c |  93 +++++++--------
 clang/test/Analysis/out-of-bounds-notes.c     | 110 +++++++++---------
 2 files changed, 102 insertions(+), 101 deletions(-)

diff --git a/clang/test/Analysis/out-of-bounds-diagnostics.c b/clang/test/Analysis/out-of-bounds-diagnostics.c
index 786962fc1b708c3..845aa7afa2377ef 100644
--- a/clang/test/Analysis/out-of-bounds-diagnostics.c
+++ b/clang/test/Analysis/out-of-bounds-diagnostics.c
@@ -1,20 +1,20 @@
 // RUN: %clang_analyze_cc1 -Wno-array-bounds -analyzer-output=text        \
 // RUN:     -analyzer-checker=core,alpha.security.ArrayBoundV2,unix.Malloc,alpha.security.taint -verify %s
 
-int array[10];
+int TenElements[10];
 
 void arrayUnderflow(void) {
-  array[-3] = 5;
-  // expected-warning at -1 {{Out of bound access to memory preceding 'array'}}
-  // expected-note at -2 {{Access of 'array' at negative byte offset -12}}
+  TenElements[-3] = 5;
+  // expected-warning at -1 {{Out of bound access to memory preceding 'TenElements'}}
+  // expected-note at -2 {{Access of 'TenElements' at negative byte offset -12}}
 }
 
 int underflowWithDeref(void) {
-  int *p = array;
+  int *p = TenElements;
   --p;
   return *p;
-  // expected-warning at -1 {{Out of bound access to memory preceding 'array'}}
-  // expected-note at -2 {{Access of 'array' at negative byte offset -4}}
+  // expected-warning at -1 {{Out of bound access to memory preceding 'TenElements'}}
+  // expected-note at -2 {{Access of 'TenElements' at negative byte offset -4}}
 }
 
 int scanf(const char *restrict fmt, ...);
@@ -24,30 +24,31 @@ void taintedIndex(void) {
   scanf("%d", &index);
   // expected-note at -1 {{Taint originated here}}
   // expected-note at -2 {{Taint propagated to the 2nd argument}}
-  array[index] = 5;
-  // expected-warning at -1 {{Potential out of bound access to 'array' with tainted index}}
-  // expected-note at -2 {{Access of 'array' with a tainted index that may be too large}}
+  TenElements[index] = 5;
+  // expected-warning at -1 {{Potential out of bound access to 'TenElements' with tainted index}}
+  // expected-note at -2 {{Access of 'TenElements' with a tainted index that may be too large}}
 }
 
 int *taintedIndexAfterTheEndPtr(void) {
   // NOTE: Technically speaking, this testcase does not trigger any UB because
-  // &array[10] is the after-the-end pointer which is well-defined; but this is
-  // a bug-prone situation and far from the idiomatic use of `&array[size]`, so
-  // it's better to report an error. This report can be easily silenced by
-  // writing array+index instead of &array[index].
+  // &TenElements[10] is the after-the-end pointer which is well-defined; but
+  // this is a bug-prone situation and far from the idiomatic use of
+  // `&TenElements[size]`, so it's better to report an error. This report can
+  // be easily silenced by writing TenElements+index instead of
+  // &TenElements[index].
   int index;
   scanf("%d", &index);
   // expected-note at -1 {{Taint originated here}}
   // expected-note at -2 {{Taint propagated to the 2nd argument}}
   if (index < 0 || index > 10)
-    return array;
+    return TenElements;
   // expected-note at -2 {{Assuming 'index' is >= 0}}
   // expected-note at -3 {{Left side of '||' is false}}
   // expected-note at -4 {{Assuming 'index' is <= 10}}
   // expected-note at -5 {{Taking false branch}}
-  return &array[index];
-  // expected-warning at -1 {{Potential out of bound access to 'array' with tainted index}}
-  // expected-note at -2 {{Access of 'array' with a tainted index that may be too large}}
+  return &TenElements[index];
+  // expected-warning at -1 {{Potential out of bound access to 'TenElements' with tainted index}}
+  // expected-note at -2 {{Access of 'TenElements' with a tainted index that may be too large}}
 }
 
 void taintedOffset(void) {
@@ -55,57 +56,57 @@ void taintedOffset(void) {
   scanf("%d", &index);
   // expected-note at -1 {{Taint originated here}}
   // expected-note at -2 {{Taint propagated to the 2nd argument}}
-  int *p = array + index;
+  int *p = TenElements + index;
   p[0] = 5;
-  // expected-warning at -1 {{Potential out of bound access to 'array' with tainted offset}}
-  // expected-note at -2 {{Access of 'array' with a tainted offset that may be too large}}
+  // expected-warning at -1 {{Potential out of bound access to 'TenElements' with tainted offset}}
+  // expected-note at -2 {{Access of 'TenElements' with a tainted offset that may be too large}}
 }
 
 void arrayOverflow(void) {
-  array[12] = 5;
-  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
-  // expected-note at -2 {{Access of 'array' at index 12, while it holds only 10 'int' elements}}
+  TenElements[12] = 5;
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'TenElements'}}
+  // expected-note at -2 {{Access of 'TenElements' at index 12, while it holds only 10 'int' elements}}
 }
 
 void flippedOverflow(void) {
-  12[array] = 5;
-  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
-  // expected-note at -2 {{Access of 'array' at index 12, while it holds only 10 'int' elements}}
+  12[TenElements] = 5;
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'TenElements'}}
+  // expected-note at -2 {{Access of 'TenElements' at index 12, while it holds only 10 'int' elements}}
 }
 
 int *afterTheEndPtr(void) {
-  // This is an unusual but standard-compliant way of writing (array + 10).
-  return &array[10]; // no-warning
+  // This is an unusual but standard-compliant way of writing (TenElements + 10).
+  return &TenElements[10]; // no-warning
 }
 
 int useAfterTheEndPtr(void) {
   // ... but dereferencing the after-the-end pointer is still invalid.
   return *afterTheEndPtr();
-  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
-  // expected-note at -2 {{Access of 'array' at index 10, while it holds only 10 'int' elements}}
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'TenElements'}}
+  // expected-note at -2 {{Access of 'TenElements' at index 10, while it holds only 10 'int' elements}}
 }
 
 int *afterAfterTheEndPtr(void) {
   // This is UB, it's invalid to form an after-after-the-end pointer.
-  return &array[11];
-  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
-  // expected-note at -2 {{Access of 'array' at index 11, while it holds only 10 'int' elements}}
+  return &TenElements[11];
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'TenElements'}}
+  // expected-note at -2 {{Access of 'TenElements' at index 11, while it holds only 10 'int' elements}}
 }
 
 int *potentialAfterTheEndPtr(int idx) {
   if (idx < 10) { /* ...do something... */ }
   // expected-note at -1 {{Assuming 'idx' is >= 10}}
   // expected-note at -2 {{Taking false branch}}
-  return &array[idx];
-  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
-  // expected-note at -2 {{Access of 'array' at an overflowing index, while it holds only 10 'int' elements}}
+  return &TenElements[idx];
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'TenElements'}}
+  // expected-note at -2 {{Access of 'TenElements' at an overflowing index, while it holds only 10 'int' elements}}
   // NOTE: On the idx >= 10 branch the normal "optimistic" behavior would've
   // been continuing with the assumption that idx == 10 and the return value is
   // a legitimate after-the-end pointer. The checker deviates from this by
   // reporting an error because this situation is very suspicious and far from
-  // the idiomatic `&array[size]` expressions. If the report is FP, the
-  // developer can easily silence it by writing array+idx instead of
-  // &array[idx].
+  // the idiomatic `&TenElements[size]` expressions. If the report is FP, the
+  // developer can easily silence it by writing TenElements+idx instead of
+  // &TenElements[idx].
 }
 
 int scalar;
@@ -156,9 +157,9 @@ int arrayOfStructsArrow(void) {
 }
 
 short convertedArray(void) {
-  return ((short*)array)[47];
-  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
-  // expected-note at -2 {{Access of 'array' at index 47, while it holds only 20 'short' elements}}
+  return ((short*)TenElements)[47];
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'TenElements'}}
+  // expected-note at -2 {{Access of 'TenElements' at index 47, while it holds only 20 'short' elements}}
 }
 
 struct two_bytes {
@@ -256,9 +257,9 @@ void symbolicIndex(int arg) {
   // expected-note at +2 {{Assuming 'arg' is >= 12}}
   // expected-note at +1 {{Taking true branch}}
   if (arg >= 12)
-    array[arg] = -2;
-  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
-  // expected-note at -2 {{Access of 'array' at an overflowing index, while it holds only 10 'int' elements}}
+    TenElements[arg] = -2;
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'TenElements'}}
+  // expected-note at -2 {{Access of 'TenElements' at an overflowing index, while it holds only 10 'int' elements}}
 }
 
 int *nothingIsCertain(int x, int y) {
diff --git a/clang/test/Analysis/out-of-bounds-notes.c b/clang/test/Analysis/out-of-bounds-notes.c
index b409af94d576f9d..1fc6f89b6276e1d 100644
--- a/clang/test/Analysis/out-of-bounds-notes.c
+++ b/clang/test/Analysis/out-of-bounds-notes.c
@@ -1,45 +1,45 @@
 // RUN: %clang_analyze_cc1 -Wno-array-bounds -analyzer-output=text        \
 // RUN:     -analyzer-checker=core,alpha.security.ArrayBoundV2,unix.Malloc,alpha.security.taint -verify %s
 
-int array[10];
+int TenElements[10];
 
 int irrelevantAssumptions(int arg) {
-  int a = array[arg];
+  int a = TenElements[arg];
   // Here the analyzer assumes that `arg` is in bounds, but doesn't report this
   // because `arg` is not interesting for the bug.
-  int b = array[13];
-  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
-  // expected-note at -2 {{Access of 'array' at index 13, while it holds only 10 'int' elements}}
+  int b = TenElements[13];
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'TenElements'}}
+  // expected-note at -2 {{Access of 'TenElements' at index 13, while it holds only 10 'int' elements}}
   return a + b;
 }
 
 
 int assumingBoth(int arg) {
-  int a = array[arg];
-  // expected-note at -1 {{Assuming index is non-negative and less than 10, the number of 'int' elements in 'array'}}
-  int b = array[arg]; // no additional note, we already assumed that 'arg' is in bounds
-  int c = array[arg + 10];
-  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
-  // expected-note at -2 {{Access of 'array' at an overflowing index, while it holds only 10 'int' elements}}
+  int a = TenElements[arg];
+  // expected-note at -1 {{Assuming index is non-negative and less than 10, the number of 'int' elements in 'TenElements'}}
+  int b = TenElements[arg]; // no additional note, we already assumed that 'arg' is in bounds
+  int c = TenElements[arg + 10];
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'TenElements'}}
+  // expected-note at -2 {{Access of 'TenElements' at an overflowing index, while it holds only 10 'int' elements}}
   return a + b + c;
 }
 
 int assumingBothPointerToMiddle(int arg) {
-  // If we're accessing an array through a pointer pointing to its middle, the checker
-  // will speak about the "byte offset" measured from the beginning of the array.
-  int *p = array + 2;
+  // If we're accessing an TenElements through a pointer pointing to its middle, the checker
+  // will speak about the "byte offset" measured from the beginning of the TenElements.
+  int *p = TenElements + 2;
   int a = p[arg];
   // FIXME: The following note does not appear:
-  //  {{Assuming byte offset is non-negative and less than 40, the extent of 'array'}}
-  // It seems that the analyzer "gives up" modeling this pointer arithmetics and says that
-  // `p[arg]` is just an UnknownVal (instead of calculating that it's
-  // equivalent to `array[2+arg]`).
+  //  {{Assuming byte offset is non-negative and less than 40, the extent of 'TenElements'}}
+  // It seems that the analyzer "gives up" modeling this pointer arithmetics
+  // and says that `p[arg]` is just an UnknownVal (instead of calculating that
+  // it's equivalent to `TenElements[2+arg]`).
 
-  int b = array[arg]; // This is normal access, and only the lower bound is new.
+  int b = TenElements[arg]; // This is normal access, and only the lower bound is new.
   // expected-note at -1 {{Assuming index is non-negative}}
-  int c = array[arg + 10];
-  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
-  // expected-note at -2 {{Access of 'array' at an overflowing index, while it holds only 10 'int' elements}}
+  int c = TenElements[arg + 10];
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'TenElements'}}
+  // expected-note at -2 {{Access of 'TenElements' at an overflowing index, while it holds only 10 'int' elements}}
   return a + b + c;
 }
 
@@ -48,11 +48,11 @@ int assumingLower(int arg) {
   // expected-note at +1 {{Taking false branch}}
   if (arg >= 10)
     return 0;
-  int a = array[arg];
+  int a = TenElements[arg];
   // expected-note at -1 {{Assuming index is non-negative}}
-  int b = array[arg + 10];
-  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
-  // expected-note at -2 {{Access of 'array' at an overflowing index, while it holds only 10 'int' elements}}
+  int b = TenElements[arg + 10];
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'TenElements'}}
+  // expected-note at -2 {{Access of 'TenElements' at an overflowing index, while it holds only 10 'int' elements}}
   return a + b;
 }
 
@@ -61,11 +61,11 @@ int assumingUpper(int arg) {
   // expected-note at +1 {{Taking false branch}}
   if (arg < 0)
     return 0;
-  int a = array[arg];
-  // expected-note at -1 {{Assuming index is less than 10, the number of 'int' elements in 'array'}}
-  int b = array[arg - 10];
-  // expected-warning at -1 {{Out of bound access to memory preceding 'array'}}
-  // expected-note at -2 {{Access of 'array' at negative byte offset}}
+  int a = TenElements[arg];
+  // expected-note at -1 {{Assuming index is less than 10, the number of 'int' elements in 'TenElements'}}
+  int b = TenElements[arg - 10];
+  // expected-warning at -1 {{Out of bound access to memory preceding 'TenElements'}}
+  // expected-note at -2 {{Access of 'TenElements' at negative byte offset}}
   return a + b;
 }
 
@@ -81,20 +81,20 @@ int assumingUpperIrrelevant(int arg) {
   // expected-note at +1 {{Taking false branch}}
   if (arg < 0)
     return 0;
-  int a = array[arg];
-  // expected-note at -1 {{Assuming index is less than 10, the number of 'int' elements in 'array'}}
-  int b = array[arg + 10];
-  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
-  // expected-note at -2 {{Access of 'array' at an overflowing index, while it holds only 10 'int' elements}}
+  int a = TenElements[arg];
+  // expected-note at -1 {{Assuming index is less than 10, the number of 'int' elements in 'TenElements'}}
+  int b = TenElements[arg + 10];
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'TenElements'}}
+  // expected-note at -2 {{Access of 'TenElements' at an overflowing index, while it holds only 10 'int' elements}}
   return a + b;
 }
 
 int assumingUpperUnsigned(unsigned arg) {
-  int a = array[arg];
-  // expected-note at -1 {{Assuming index is less than 10, the number of 'int' elements in 'array'}}
-  int b = array[(int)arg - 10];
-  // expected-warning at -1 {{Out of bound access to memory preceding 'array'}}
-  // expected-note at -2 {{Access of 'array' at negative byte offset}}
+  int a = TenElements[arg];
+  // expected-note at -1 {{Assuming index is less than 10, the number of 'int' elements in 'TenElements'}}
+  int b = TenElements[(int)arg - 10];
+  // expected-warning at -1 {{Out of bound access to memory preceding 'TenElements'}}
+  // expected-note at -2 {{Access of 'TenElements' at negative byte offset}}
   return a + b;
 }
 
@@ -103,23 +103,23 @@ int assumingNothing(unsigned arg) {
   // expected-note at +1 {{Taking false branch}}
   if (arg >= 10)
     return 0;
-  int a = array[arg]; // no note here, we already know that 'arg' is in bounds
-  int b = array[(int)arg - 10];
-  // expected-warning at -1 {{Out of bound access to memory preceding 'array'}}
-  // expected-note at -2 {{Access of 'array' at negative byte offset}}
+  int a = TenElements[arg]; // no note here, we already know that 'arg' is in bounds
+  int b = TenElements[(int)arg - 10];
+  // expected-warning at -1 {{Out of bound access to memory preceding 'TenElements'}}
+  // expected-note at -2 {{Access of 'TenElements' at negative byte offset}}
   return a + b;
 }
 
 short assumingConvertedToCharP(int arg) {
   // When indices are reported, the note will use the element type that's the
   // result type of the subscript operator.
-  char *cp = (char*)array;
+  char *cp = (char*)TenElements;
   char a = cp[arg];
-  // expected-note at -1 {{Assuming index is non-negative and less than 40, the number of 'char' elements in 'array'}}
+  // expected-note at -1 {{Assuming index is non-negative and less than 40, the number of 'char' elements in 'TenElements'}}
   char b = cp[arg]; // no additional note, we already assumed that 'arg' is in bounds
   char c = cp[arg + 40];
-  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
-  // expected-note at -2 {{Access of 'array' at an overflowing index, while it holds only 40 'char' elements}}
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'TenElements'}}
+  // expected-note at -2 {{Access of 'TenElements' at an overflowing index, while it holds only 40 'char' elements}}
   return a + b + c;
 }
 
@@ -138,9 +138,9 @@ int assumingConvertedToIntP(struct foo f, int arg) {
   // element size, the checker measures the offset and extent in bytes.
   int b = ((int*)(f.b))[arg];
   // expected-note at -1 {{Assuming byte offset is less than 5, the extent of 'f.b'}}
-  int c = array[arg-2];
-  // expected-warning at -1 {{Out of bound access to memory preceding 'array'}}
-  // expected-note at -2 {{Access of 'array' at negative byte offset}}
+  int c = TenElements[arg-2];
+  // expected-warning at -1 {{Out of bound access to memory preceding 'TenElements'}}
+  // expected-note at -2 {{Access of 'TenElements' at negative byte offset}}
   return a + b + c;
 }
 
@@ -160,8 +160,8 @@ int assumingPlainOffset(struct foo f, int arg) {
   // but the current simplification algorithm doesn't realize that arg <= 1
   // implies that the byte offset arg*4 will be less than 5.
 
-  int c = array[arg+10];
-  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
-  // expected-note at -2 {{Access of 'array' at an overflowing index, while it holds only 10 'int' elements}}
+  int c = TenElements[arg+10];
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'TenElements'}}
+  // expected-note at -2 {{Access of 'TenElements' at an overflowing index, while it holds only 10 'int' elements}}
   return b + c;
 }

>From 1e04ac56dd3072a15c77d47a5c58328b19dfe413 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Don=C3=A1t=20Nagy?= <donat.nagy at ericsson.com>
Date: Fri, 26 Jan 2024 19:27:45 +0100
Subject: [PATCH 13/16] Tweak comments related to the constraint hack

---
 clang/test/Analysis/out-of-bounds-diagnostics.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/clang/test/Analysis/out-of-bounds-diagnostics.c b/clang/test/Analysis/out-of-bounds-diagnostics.c
index 845aa7afa2377ef..0c3c67c6a546ad2 100644
--- a/clang/test/Analysis/out-of-bounds-diagnostics.c
+++ b/clang/test/Analysis/out-of-bounds-diagnostics.c
@@ -244,8 +244,8 @@ int *symbolicExtent(int arg) {
 
 int *symbolicExtentDiscardedRangeInfo(int arg) {
   // This is a copy of the case 'symbolicExtent' without the '(void)arg' hack.
-  // TODO: if the analyzer can detect the out-of-bounds access within this TC,
-  // then remove this TC and the `(void)arg` hack from `symbolicExtent`.
+  // TODO: if the analyzer can detect the out-of-bounds access within this
+  // testcase, then remove this and the `(void)arg` hack from `symbolicExtent`.
   if (arg >= 5)
     return 0;
   int *mem = (int*)malloc(arg);
@@ -267,14 +267,16 @@ int *nothingIsCertain(int x, int y) {
     return 0;
   int *mem = (int*)malloc(x);
 
-  // TODO: The next line is a temporary hack; see 'symbolicExtent()' for details.
-  (void)x;
-
   if (y >= 8)
     mem[y] = -2;
   // FIXME: this should produce
   //   {{Out of bound access to memory after the end of the heap area}}
   //   {{Access of 'int' element in the heap area at an overflowing index}}
   // but apparently the analyzer isn't smart enough to deduce this.
+
+  // Keep constraints alive. (Without this, the overeager garbage collection of
+  // constraints would _also_ prevent the intended behavior in this testcase.)
+  (void)x;
+
   return mem;
 }

>From be9b2e749e4264357d6d8db1dfa3d05bf8974e43 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Don=C3=A1t=20Nagy?= <donat.nagy at ericsson.com>
Date: Fri, 26 Jan 2024 19:37:40 +0100
Subject: [PATCH 14/16] Oops, I forgot git-clang-format again

It's Friday...
---
 .../StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp   | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
index 3299863e1a5ae6a..614276525ff4c46 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
@@ -38,7 +38,8 @@ namespace {
 /// pointer arithmetic (and not the beginning of a "full" memory region), this
 /// always returns nullopt because that's the right (or the least bad) thing to
 /// do for the diagnostic output that's relying on this.
-static std::optional<QualType> determineElementType(const Expr *E, const CheckerContext &C) {
+static std::optional<QualType> determineElementType(const Expr *E,
+                                                    const CheckerContext &C) {
   const auto *ASE = dyn_cast<ArraySubscriptExpr>(E);
   if (!ASE)
     return std::nullopt;
@@ -55,8 +56,8 @@ static std::optional<QualType> determineElementType(const Expr *E, const Checker
   return ASE->getType();
 }
 
-
-static std::optional<int64_t> determineElementSize(const std::optional<QualType> T, const CheckerContext &C) {
+static std::optional<int64_t>
+determineElementSize(const std::optional<QualType> T, const CheckerContext &C) {
   if (!T)
     return std::nullopt;
   return C.getASTContext().getTypeSizeInChars(*T).getQuantity();
@@ -73,7 +74,9 @@ class StateUpdateReporter {
 public:
   StateUpdateReporter(const SubRegion *R, NonLoc ByteOffsVal, const Expr *E,
                       CheckerContext &C)
-      : Reg(R), ByteOffsetVal(ByteOffsVal), ElementType(determineElementType(E, C)), ElementSize(determineElementSize(ElementType, C)) {}
+      : Reg(R), ByteOffsetVal(ByteOffsVal),
+        ElementType(determineElementType(E, C)),
+        ElementSize(determineElementSize(ElementType, C)) {}
 
   void recordNonNegativeAssumption() { AssumedNonNegative = true; }
   void recordUpperBoundAssumption(NonLoc UpperBoundVal) {

>From 001670895b694fe865d51ddd2b8249109a7a5123 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Don=C3=A1t=20Nagy?= <donat.nagy at ericsson.com>
Date: Mon, 29 Jan 2024 10:15:38 +0100
Subject: [PATCH 15/16] Mark the symbolic array extent as interesting (when
 it's relevant)

---
 .../Checkers/ArrayBoundCheckerV2.cpp          | 62 ++++++++++++-------
 1 file changed, 38 insertions(+), 24 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
index 614276525ff4c46..bbe2e3ca5fe05bf 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
@@ -130,7 +130,12 @@ class ArrayBoundCheckerV2 : public Checker<check::PostStmt<ArraySubscriptExpr>,
   void performCheck(const Expr *E, CheckerContext &C) const;
 
   void reportOOB(CheckerContext &C, ProgramStateRef ErrorState, Messages Msgs,
-                 NonLoc Offset, bool IsTaintBug = false) const;
+                 NonLoc Offset, std::optional<NonLoc> Extent,
+                 bool IsTaintBug = false) const;
+
+  static void markPartsInteresting(PathSensitiveBugReport &BR,
+                                   ProgramStateRef ErrorState, NonLoc Val,
+                                   bool MarkTaint);
 
   static bool isFromCtypeMacro(const Stmt *S, ASTContext &AC);
 
@@ -551,7 +556,7 @@ void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
       if (!WithinLowerBound) {
         // ...and it cannot be valid (>= 0), so report an error.
         Messages Msgs = getPrecedesMsgs(Reg, ByteOffset);
-        reportOOB(C, PrecedesLowerBound, Msgs, ByteOffset);
+        reportOOB(C, PrecedesLowerBound, Msgs, ByteOffset, std::nullopt);
         return;
       }
       // ...but it can be valid as well, so the checker will (optimistically)
@@ -586,7 +591,7 @@ void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
 
         Messages Msgs = getExceedsMsgs(C.getASTContext(), Reg, ByteOffset,
                                        *KnownSize, Location);
-        reportOOB(C, ExceedsUpperBound, Msgs, ByteOffset);
+        reportOOB(C, ExceedsUpperBound, Msgs, ByteOffset, KnownSize);
         return;
       }
       // ...and it can be valid as well...
@@ -602,7 +607,8 @@ void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
             OffsetName = "index";
 
         Messages Msgs = getTaintMsgs(Reg, OffsetName);
-        reportOOB(C, ExceedsUpperBound, Msgs, ByteOffset, /*IsTaintBug=*/true);
+        reportOOB(C, ExceedsUpperBound, Msgs, ByteOffset, KnownSize,
+                  /*IsTaintBug=*/true);
         return;
       }
       // ...and it isn't tainted, so the checker will (optimistically) assume
@@ -621,9 +627,32 @@ void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
   C.addTransition(State, SUR.createNoteTag(C));
 }
 
+void ArrayBoundCheckerV2::markPartsInteresting(PathSensitiveBugReport &BR,
+                                               ProgramStateRef ErrorState,
+                                               NonLoc Val, bool MarkTaint) {
+  if (SymbolRef Sym = Val.getAsSymbol()) {
+    // If the offset is a symbolic value, iterate over its "parts" with
+    // `SymExpr::symbols()` and mark each of them as interesting.
+    // For example, if the offset is `x*4 + y` then we put interestingness onto
+    // the SymSymExpr `x*4 + y`, the SymIntExpr `x*4` and the two data symbols
+    // `x` and `y`.
+    for (SymbolRef PartSym : Sym->symbols())
+      BR.markInteresting(PartSym);
+  }
+
+  if (MarkTaint) {
+    // If the issue that we're reporting depends on the taintedness of the
+    // offset, then put interestingness onto symbols that could be the origin
+    // of the taint.
+    for (SymbolRef Sym : getTaintedSymbols(ErrorState, Val))
+      BR.markInteresting(Sym);
+  }
+}
+
 void ArrayBoundCheckerV2::reportOOB(CheckerContext &C,
                                     ProgramStateRef ErrorState, Messages Msgs,
-                                    NonLoc Offset, bool IsTaintBug) const {
+                                    NonLoc Offset, std::optional<NonLoc> Extent,
+                                    bool IsTaintBug) const {
 
   ExplodedNode *ErrorNode = C.generateErrorNode(ErrorState);
   if (!ErrorNode)
@@ -644,25 +673,10 @@ void ArrayBoundCheckerV2::reportOOB(CheckerContext &C,
   //   it places a "Storing uninitialized value" note on the `malloc` call
   //   (which is technically true, but irrelevant).
   // If trackExpressionValue() becomes reliable, it should be applied instead
-  // of the manual markInteresting() calls.
-
-  if (SymbolRef OffsetSym = Offset.getAsSymbol()) {
-    // If the offset is a symbolic value, iterate over its "parts" with
-    // `SymExpr::symbols()` and mark each of them as interesting.
-    // For example, if the offset is `x*4 + y` then we put interestingness onto
-    // the SymSymExpr `x*4 + y`, the SymIntExpr `x*4` and the two data symbols
-    // `x` and `y`.
-    for (SymbolRef PartSym : OffsetSym->symbols())
-      BR->markInteresting(PartSym);
-  }
-
-  if (IsTaintBug) {
-    // If the issue that we're reporting depends on the taintedness of the
-    // offset, then put interestingness onto symbols that could be the origin
-    // of the taint.
-    for (SymbolRef Sym : getTaintedSymbols(ErrorState, Offset))
-      BR->markInteresting(Sym);
-  }
+  // of this custom markPartsInteresting().
+  markPartsInteresting(*BR, ErrorState, Offset, IsTaintBug);
+  if (Extent)
+    markPartsInteresting(*BR, ErrorState, *Extent, IsTaintBug);
 
   C.emitReport(std::move(BR));
 }

>From 183893323baeb41968de01e65d3f06e257854d9f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Don=C3=A1t=20Nagy?= <donat.nagy at ericsson.com>
Date: Mon, 29 Jan 2024 22:05:26 +0100
Subject: [PATCH 16/16] Add tests for the 'interesting extent' case

---
 .../Checkers/ArrayBoundCheckerV2.cpp          |  6 +---
 clang/test/Analysis/out-of-bounds-notes.c     | 32 +++++++++++++++++++
 2 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
index bbe2e3ca5fe05bf..2e465f76c488af8 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
@@ -442,13 +442,9 @@ std::string StateUpdateReporter::getMessage(PathSensitiveBugReport &BR) const {
       // Even if the byte offset isn't interesting (e.g. it's a constant value),
       // the assumption can still be interesting if it provides information
       // about an interesting symbolic upper bound.
-      // FIXME: This code path is currently non-functional and untested because
-      // `getSimplifiedOffsets()` only works when the RHS (extent) is constant.
-
-      // In this case don't display the "offset is non-negative" assumption
-      // (because the offset isn't interesting).
       ShouldReportNonNegative = false;
     } else {
+      // We don't have anything interesting, don't report the assumption.
       return "";
     }
   }
diff --git a/clang/test/Analysis/out-of-bounds-notes.c b/clang/test/Analysis/out-of-bounds-notes.c
index 1fc6f89b6276e1d..c29b6f8ab111b4f 100644
--- a/clang/test/Analysis/out-of-bounds-notes.c
+++ b/clang/test/Analysis/out-of-bounds-notes.c
@@ -165,3 +165,35 @@ int assumingPlainOffset(struct foo f, int arg) {
   // expected-note at -2 {{Access of 'TenElements' at an overflowing index, while it holds only 10 'int' elements}}
   return b + c;
 }
+
+typedef __typeof(sizeof(int)) size_t;
+void *malloc(size_t size);
+void free(void *ptr);
+
+int assumingExtent(int arg) {
+  // Verify that the assumption note is printed when the extent is interesting
+  // (even if the index isn't interesting).
+  int *mem = (int*)malloc(arg);
+
+  mem[12] = 123;
+  // expected-note at -1 {{Assuming index '12' is less than the number of 'int' elements in the heap area}}
+
+  free(mem);
+
+  return TenElements[arg];
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'TenElements'}}
+  // expected-note at -2 {{Access of 'TenElements' at an overflowing index, while it holds only 10 'int' elements}}
+}
+
+int *extentInterestingness(int arg) {
+  // Verify that in an out-of-bounds access issue the extent is marked as
+  // interesting (so assumptions about its value are printed).
+  int *mem = (int*)malloc(arg);
+
+  TenElements[arg] = 123;
+  // expected-note at -1 {{Assuming index is non-negative and less than 10, the number of 'int' elements in 'TenElements'}}
+
+  return &mem[12];
+  // expected-warning at -1 {{Out of bound access to memory after the end of the heap area}}
+  // expected-note at -2 {{Access of 'int' element in the heap area at index 12}}
+}



More information about the cfe-commits mailing list