[llvm] [clang-tools-extra] [clang] [analyzer] Support interestingness in ArrayBoundV2 (PR #78315)

Mon Jan 22 08:14:40 PST 2024

=?utf-8?q?Donát?= Nagy <donat.nagy at ericsson.com>,
=?utf-8?q?Donát?= Nagy <donat.nagy at ericsson.com>,
=?utf-8?q?Donát?= Nagy <donat.nagy at ericsson.com>
Message-ID:
In-Reply-To: <llvm.org/llvm/llvm-project/pull/78315 at github.com>


https://github.com/NagyDonat updated https://github.com/llvm/llvm-project/pull/78315

>From c75c05c6e894a46797913c5bdccb240cbcc01ae9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Don=C3=A1t=20Nagy?= <donat.nagy at ericsson.com>
Date: Tue, 12 Dec 2023 13:07:54 +0100
Subject: [PATCH 1/5] [analyzer] Support interestingness in ArrayBoundV2

This commit improves alpha.security.ArrayBoundV2 in two connected areas:
(1) It calls `markInteresting()` on the symbolic values that are
responsible for the out of bounds access.
(2) Its index-is-in-bounds assumptions are reported in note tags if they
provide information about the value of an interesting symbol.

This commit is limited to "display" changes: it introduces new
diagnostic pieces (potentially to bugs found by other checkers), but the
ArrayBoundV2 will make the same assumptions and detect the same bugs
before and after this change.

As a minor unrelated change, this commit also updates/removes some very
old comments which became obsolate due to my previous changes.
---
 .../Checkers/ArrayBoundCheckerV2.cpp          | 345 ++++++++++++++----
 .../test/Analysis/out-of-bounds-diagnostics.c |  10 +
 clang/test/Analysis/out-of-bounds-notes.c     | 128 +++++++
 3 files changed, 413 insertions(+), 70 deletions(-)
 create mode 100644 clang/test/Analysis/out-of-bounds-notes.c

diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
index 6c7a1601402efac..4241fa3a2ce8af6 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
@@ -33,7 +33,66 @@ using namespace taint;
 using llvm::formatv;
 
 namespace {
-enum OOB_Kind { OOB_Precedes, OOB_Exceeds, OOB_Taint };
+class StateUpdateReporter {
+  const SubRegion *Reg;
+  NonLoc ByteOffsetVal;
+  std::optional<QualType> ElementType = std::nullopt;
+  std::optional<int64_t> ElementSize = std::nullopt;
+  bool AssumedNonNegative = false;
+  std::optional<NonLoc> AssumedUpperBound = std::nullopt;
+
+public:
+  StateUpdateReporter(const SubRegion *R, NonLoc ByteOffsVal, const Expr *E,
+                      CheckerContext &C)
+      : Reg(R), ByteOffsetVal(ByteOffsVal) {
+    initializeElementInfo(E, C);
+  }
+
+  void initializeElementInfo(const Expr *E, CheckerContext &C) {
+    if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(E)) {
+      const MemRegion *SubscriptBaseReg =
+          C.getSVal(ASE->getBase()).getAsRegion();
+      if (!SubscriptBaseReg)
+        return;
+      SubscriptBaseReg = SubscriptBaseReg->StripCasts();
+      if (!isa_and_nonnull<ElementRegion>(SubscriptBaseReg)) {
+        ElementType = ASE->getType();
+        ElementSize =
+            C.getASTContext().getTypeSizeInChars(*ElementType).getQuantity();
+      }
+    }
+  }
+  void recordNonNegativeAssumption() { AssumedNonNegative = true; }
+  void recordUpperBoundAssumption(NonLoc UpperBoundVal) {
+    AssumedUpperBound = UpperBoundVal;
+  }
+
+  const NoteTag *createNoteTag(CheckerContext &C) const;
+
+private:
+  std::string getMessage(PathSensitiveBugReport &BR) const;
+
+  /// Return true if information about the value of `Sym` can put constraints
+  /// on some symbol which is interesting within the bug report `BR`.
+  /// In particular, this returns true when `Sym` is interesting within `BR`;
+  /// but it also returns true if `Sym` is an expression that contains integer
+  /// constants and a single symbolic operand which is interesting (in `BR`).
+  /// We need to use this instead of plain `BR.isInteresting()` because if we
+  /// are analyzing code like
+  ///   int array[10];
+  ///   int f(int arg) {
+  ///     return array[arg] && array[arg + 10];
+  ///   }
+  /// then the byte offsets are `arg * 4` and `(arg + 10) * 4`, which are not
+  /// sub-expressions of each other (but `getSimplifiedOffsets` is smart enough
+  /// to detect this out of bounds access).
+  static bool providesInformationAboutInteresting(SymbolRef Sym,
+                                                  PathSensitiveBugReport &BR);
+  static bool providesInformationAboutInteresting(SVal SV,
+                                                  PathSensitiveBugReport &BR) {
+    return providesInformationAboutInteresting(SV.getAsSymbol(), BR);
+  }
+};
 
 struct Messages {
   std::string Short, Full;
@@ -54,11 +113,14 @@ class ArrayBoundCheckerV2 : public Checker<check::PostStmt<ArraySubscriptExpr>,
 
   void performCheck(const Expr *E, CheckerContext &C) const;
 
-  void reportOOB(CheckerContext &C, ProgramStateRef ErrorState, OOB_Kind Kind,
-                 NonLoc Offset, Messages Msgs) const;
+  void reportOOB(CheckerContext &C, ProgramStateRef ErrorState, Messages Msgs,
+                 NonLoc Offset, bool IsTaintBug = false) const;
 
   static bool isFromCtypeMacro(const Stmt *S, ASTContext &AC);
 
+  static bool isIdiomaticPastTheEndPtr(const Expr *E, ProgramStateRef State,
+                                       NonLoc Offset, NonLoc Limit,
+                                       CheckerContext &C);
   static bool isInAddressOf(const Stmt *S, ASTContext &AC);
 
 public:
@@ -133,12 +195,19 @@ computeOffset(ProgramStateRef State, SValBuilder &SVB, SVal Location) {
   return std::nullopt;
 }
 
-// TODO: once the constraint manager is smart enough to handle non simplified
-// symbolic expressions remove this function. Note that this can not be used in
-// the constraint manager as is, since this does not handle overflows. It is
-// safe to assume, however, that memory offsets will not overflow.
-// NOTE: callers of this function need to be aware of the effects of overflows
-// and signed<->unsigned conversions!
+// NOTE: This function is the "heart" of this checker. It simplifies
+// inequalities with transformations that are valid (and very elementary) in
+// pure mathematics, but become invalid if we use them in C++ number model
+// where the calculations may overflow.
+// Due to the overflow issues I think it's impossible (or at least not
+// practical) to integrate this kind of simplification into the resolution of
+// arbitrary inequalities (i.e. the code of `evalBinOp`); but this function
+// produces valid results if the arguments are memory offsets which are known
+// to be << SIZE_MAX.
+// TODO: This algorithm should be moved to a central location where it's
+// available for other checkers that need to compare memory offsets.
+// NOTE: When using the results of this function, don't forget that `evalBinOp`
+// uses the evaluation rules of C++, so e.g. `(size_t)123 < -1`!
 static std::pair<NonLoc, nonloc::ConcreteInt>
 getSimplifiedOffsets(NonLoc offset, nonloc::ConcreteInt extent,
                      SValBuilder &svalBuilder) {
@@ -239,13 +308,8 @@ static std::optional<int64_t> getConcreteValue(NonLoc SV) {
   return std::nullopt;
 }
 
-static std::string getShortMsg(OOB_Kind Kind, std::string RegName) {
-  static const char *ShortMsgTemplates[] = {
-      "Out of bound access to memory preceding {0}",
-      "Out of bound access to memory after the end of {0}",
-      "Potential out of bound access to {0} with tainted offset"};
-
-  return formatv(ShortMsgTemplates[Kind], RegName);
+static std::optional<int64_t> getConcreteValue(std::optional<NonLoc> SV) {
+  return SV ? getConcreteValue(*SV) : std::nullopt;
 }
 
 static Messages getPrecedesMsgs(const SubRegion *Region, NonLoc Offset) {
@@ -255,7 +319,28 @@ static Messages getPrecedesMsgs(const SubRegion *Region, NonLoc Offset) {
   Out << "Access of " << RegName << " at negative byte offset";
   if (auto ConcreteIdx = Offset.getAs<nonloc::ConcreteInt>())
     Out << ' ' << ConcreteIdx->getValue();
-  return {getShortMsg(OOB_Precedes, RegName), std::string(Buf)};
+  return {formatv("Out of bound access to memory preceding {0}", RegName),
+          std::string(Buf)};
+}
+
+/// Try to divide `Val1` and `Val2` (in place) by `Divisor` and return true if
+/// it can be performed (`Divisor` is nonzero and there is no remainder). The
+/// values `Val1` and `Val2` may be nullopt and in that case the corresponding
+/// division is considered to be successful.
+bool tryDividePair(std::optional<int64_t> &Val1, std::optional<int64_t> &Val2,
+                   int64_t Divisor) {
+  if (!Divisor)
+    return false;
+  const bool Val1HasRemainder = Val1 && *Val1 % Divisor;
+  const bool Val2HasRemainder = Val2 && *Val2 % Divisor;
+  if (!Val1HasRemainder && !Val2HasRemainder) {
+    if (Val1)
+      *Val1 /= Divisor;
+    if (Val2)
+      *Val2 /= Divisor;
+    return true;
+  }
+  return false;
 }
 
 static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region,
@@ -268,18 +353,9 @@ static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region,
   std::optional<int64_t> OffsetN = getConcreteValue(Offset);
   std::optional<int64_t> ExtentN = getConcreteValue(Extent);
 
-  bool UseByteOffsets = true;
-  if (int64_t ElemSize = ACtx.getTypeSizeInChars(ElemType).getQuantity()) {
-    const bool OffsetHasRemainder = OffsetN && *OffsetN % ElemSize;
-    const bool ExtentHasRemainder = ExtentN && *ExtentN % ElemSize;
-    if (!OffsetHasRemainder && !ExtentHasRemainder) {
-      UseByteOffsets = false;
-      if (OffsetN)
-        *OffsetN /= ElemSize;
-      if (ExtentN)
-        *ExtentN /= ElemSize;
-    }
-  }
+  int64_t ElemSize = ACtx.getTypeSizeInChars(ElemType).getQuantity();
+
+  bool UseByteOffsets = !tryDividePair(OffsetN, ExtentN, ElemSize);
 
   SmallString<256> Buf;
   llvm::raw_svector_ostream Out(Buf);
@@ -307,7 +383,9 @@ static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region,
       Out << "s";
   }
 
-  return {getShortMsg(OOB_Exceeds, RegName), std::string(Buf)};
+  return {
+      formatv("Out of bound access to memory after the end of {0}", RegName),
+      std::string(Buf)};
 }
 
 static Messages getTaintMsgs(const SubRegion *Region, const char *OffsetName) {
@@ -318,17 +396,87 @@ static Messages getTaintMsgs(const SubRegion *Region, const char *OffsetName) {
                   RegName, OffsetName)};
 }
 
-void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
-  // NOTE: Instead of using ProgramState::assumeInBound(), we are prototyping
-  // some new logic here that reasons directly about memory region extents.
-  // Once that logic is more mature, we can bring it back to assumeInBound()
-  // for all clients to use.
-  //
-  // The algorithm we are using here for bounds checking is to see if the
-  // memory access is within the extent of the base region.  Since we
-  // have some flexibility in defining the base region, we can achieve
-  // various levels of conservatism in our buffer overflow checking.
+const NoteTag *StateUpdateReporter::createNoteTag(CheckerContext &C) const {
+  // Don't create a note tag if we didn't assume anything:
+  if (!AssumedNonNegative && !AssumedUpperBound)
+    return nullptr;
+
+  return C.getNoteTag(
+      [*this](PathSensitiveBugReport &BR) -> std::string {
+        return getMessage(BR);
+      },
+      /*isPrunable=*/false);
+}
+
+std::string StateUpdateReporter::getMessage(PathSensitiveBugReport &BR) const {
+  bool ShouldReportNonNegative = AssumedNonNegative;
+  if (!providesInformationAboutInteresting(ByteOffsetVal, BR)) {
+    if (AssumedUpperBound &&
+        providesInformationAboutInteresting(*AssumedUpperBound, BR))
+      ShouldReportNonNegative = false;
+    else
+      return "";
+  }
+
+  std::optional<int64_t> OffsetN = getConcreteValue(ByteOffsetVal);
+  std::optional<int64_t> ExtentN = getConcreteValue(AssumedUpperBound);
 
+  const bool UseIndex =
+      ElementSize && tryDividePair(OffsetN, ExtentN, *ElementSize);
+
+  SmallString<256> Buf;
+  llvm::raw_svector_ostream Out(Buf);
+  Out << "Assuming ";
+  if (UseIndex) {
+    Out << "index ";
+    if (OffsetN)
+      Out << "'" << OffsetN << "' ";
+  } else if (AssumedUpperBound) {
+    Out << "byte offset ";
+    if (OffsetN)
+      Out << "'" << OffsetN << "' ";
+  } else {
+    Out << "offset ";
+  }
+
+  Out << "is";
+  if (ShouldReportNonNegative) {
+    Out << " non-negative";
+  }
+  if (AssumedUpperBound) {
+    if (ShouldReportNonNegative)
+      Out << " and";
+    Out << " less than ";
+    if (ExtentN)
+      Out << *ExtentN << ", ";
+    if (UseIndex && ElementType)
+      Out << "the number of '" << ElementType->getAsString()
+          << "' elements in ";
+    else
+      Out << "the extent of ";
+    Out << getRegionName(Reg);
+  }
+  return std::string(Out.str());
+}
+
+bool StateUpdateReporter::providesInformationAboutInteresting(
+    SymbolRef Sym, PathSensitiveBugReport &BR) {
+  if (!Sym)
+    return false;
+  for (SymbolRef PartSym : Sym->symbols()) {
+    // The interestingess mark may appear on any layer as we're stripping off
+    // the SymIntExpr, UnarySymExpr etc. layers...
+    if (BR.isInteresting(PartSym))
+      return true;
+    // ...but if both sides of the expression are symbolic (i.e. unknown), then
+    // the analyzer can't use the combined result to constrain the operands.
+    if (isa<SymSymExpr>(PartSym))
+      return false;
+  }
+  return false;
+}
+
+void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
   const SVal Location = C.getSVal(E);
 
   // The header ctype.h (from e.g. glibc) implements the isXXXXX() macros as
@@ -350,6 +498,10 @@ void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
 
   auto [Reg, ByteOffset] = *RawOffset;
 
+  // The state updates will be reported as a single note tag, which will be
+  // composed by this helper class.
+  StateUpdateReporter SUR(Reg, ByteOffset, E, C);
+
   // CHECK LOWER BOUND
   const MemSpaceRegion *Space = Reg->getMemorySpace();
   if (!(isa<SymbolicRegion>(Reg) && isa<UnknownSpaceRegion>(Space))) {
@@ -363,13 +515,22 @@ void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
     auto [PrecedesLowerBound, WithinLowerBound] = compareValueToThreshold(
         State, ByteOffset, SVB.makeZeroArrayIndex(), SVB);
 
-    if (PrecedesLowerBound && !WithinLowerBound) {
-      // We know that the index definitely precedes the lower bound.
-      Messages Msgs = getPrecedesMsgs(Reg, ByteOffset);
-      reportOOB(C, PrecedesLowerBound, OOB_Precedes, ByteOffset, Msgs);
-      return;
+    if (PrecedesLowerBound) {
+      // The offset may be invalid (negative)...
+      if (!WithinLowerBound) {
+        // ...and it cannot be valid (>= 0), so report an error.
+        Messages Msgs = getPrecedesMsgs(Reg, ByteOffset);
+        reportOOB(C, PrecedesLowerBound, Msgs, ByteOffset);
+        return;
+      }
+      // ...but it can be valid as well, so the checker will (optimistically)
+      // assume that it's valid and mention this in the note tag.
+      SUR.recordNonNegativeAssumption();
     }
 
+    // Actually update the state. The "if" only fails in the extremely unlikely
+    // case when compareValueToThreshold returns {nullptr, nullptr} becasue
+    // evalBinOpNN fails to evaluate the less-than operator.
     if (WithinLowerBound)
       State = WithinLowerBound;
   }
@@ -381,32 +542,30 @@ void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
         compareValueToThreshold(State, ByteOffset, *KnownSize, SVB);
 
     if (ExceedsUpperBound) {
+      // The offset may be invalid (>= Size)...
       if (!WithinUpperBound) {
-        // We know that the index definitely exceeds the upper bound.
-        if (isa<ArraySubscriptExpr>(E) && isInAddressOf(E, C.getASTContext())) {
-          // ...but this is within an addressof expression, so we need to check
-          // for the exceptional case that `&array[size]` is valid.
-          auto [EqualsToThreshold, NotEqualToThreshold] =
-              compareValueToThreshold(ExceedsUpperBound, ByteOffset, *KnownSize,
-                                      SVB, /*CheckEquality=*/true);
-          if (EqualsToThreshold && !NotEqualToThreshold) {
-            // We are definitely in the exceptional case, so return early
-            // instead of reporting a bug.
-            C.addTransition(EqualsToThreshold);
-            return;
-          }
+        // ...and it cannot be within bounds, so report an error, unless we can
+        // definitely determine that this is an idiomatic `&array[size]`
+        // expression that calculates the past-the-end pointer.
+        if (isIdiomaticPastTheEndPtr(E, ExceedsUpperBound, ByteOffset,
+                                     *KnownSize, C)) {
+          // FIXME: this duplicates the `addTransition` at the end of the
+          // function, but `goto` is taboo nowdays.
+          C.addTransition(ExceedsUpperBound, SUR.createNoteTag(C));
+          return;
         }
+
         Messages Msgs = getExceedsMsgs(C.getASTContext(), Reg, ByteOffset,
                                        *KnownSize, Location);
-        reportOOB(C, ExceedsUpperBound, OOB_Exceeds, ByteOffset, Msgs);
+        reportOOB(C, ExceedsUpperBound, Msgs, ByteOffset);
         return;
       }
+      // ...and it can be valid as well...
       if (isTainted(State, ByteOffset)) {
-        // Both cases are possible, but the offset is tainted, so report.
-        std::string RegName = getRegionName(Reg);
+        // ...but it's tainted, so report an error.
 
-        // Diagnostic detail: "tainted offset" is always correct, but the
-        // common case is that 'idx' is tainted in 'arr[idx]' and then it's
+        // Diagnostic detail: saying "tainted offset" is always correct, but
+        // the common case is that 'idx' is tainted in 'arr[idx]' and then it's
         // nicer to say "tainted index".
         const char *OffsetName = "offset";
         if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(E))
@@ -414,33 +573,67 @@ void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
             OffsetName = "index";
 
         Messages Msgs = getTaintMsgs(Reg, OffsetName);
-        reportOOB(C, ExceedsUpperBound, OOB_Taint, ByteOffset, Msgs);
+        reportOOB(C, ExceedsUpperBound, Msgs, ByteOffset, /*IsTaintBug=*/true);
         return;
       }
+      // ...and it isn't tainted, so the checker will (optimistically) assume
+      // that the offset is in bounds and mention this in the note tag.
+      SUR.recordUpperBoundAssumption(*KnownSize);
     }
 
+    // Actually update the state. The "if" only fails in the extremely unlikely
+    // case when compareValueToThreshold returns {nullptr, nullptr} becasue
+    // evalBinOpNN fails to evaluate the less-than operator.
     if (WithinUpperBound)
       State = WithinUpperBound;
   }
 
-  C.addTransition(State);
+  // Add a transition, reporting the state updates that we accumulated.
+  C.addTransition(State, SUR.createNoteTag(C));
 }
 
 void ArrayBoundCheckerV2::reportOOB(CheckerContext &C,
-                                    ProgramStateRef ErrorState, OOB_Kind Kind,
-                                    NonLoc Offset, Messages Msgs) const {
+                                    ProgramStateRef ErrorState, Messages Msgs,
+                                    NonLoc Offset, bool IsTaintBug) const {
 
   ExplodedNode *ErrorNode = C.generateErrorNode(ErrorState);
   if (!ErrorNode)
     return;
 
   auto BR = std::make_unique<PathSensitiveBugReport>(
-      Kind == OOB_Taint ? TaintBT : BT, Msgs.Short, Msgs.Full, ErrorNode);
+      IsTaintBug ? TaintBT : BT, Msgs.Short, Msgs.Full, ErrorNode);
+
+  // FIXME: ideally we would just call trackExpressionValue() and that would
+  // "do the right thing": mark the relevant symbols as interesting, track the
+  // control dependencies and statements storing the relevant values and add
+  // helpful diagnostic pieces. However, right now trackExpressionValue() is
+  // a heap of unreliable heuristics, so it would cause several issues:
+  // - Interestingness is not applied consistently, e.g. if `array[x+10]`
+  //   causes an overflow, then `x` is not marked as interesting.
+  // - We get irrelevant diagnostic pieces, e.g. in the code
+  //   `int *p = (int*)malloc(2*sizeof(int)); p[3] = 0;`
+  //   it places a "Storing uninitialized value" note on the `malloc` call
+  //   (which is technically true, but irrelevant).
+  // If trackExpressionValue() becomes reliable, it should be applied instead
+  // of the manual markInteresting() calls.
+
+  if (SymbolRef OffsetSym = Offset.getAsSymbol()) {
+    // If the offset is a symbolic value, iterate over its "parts" with
+    // `SymExpr::symbols()` and mark each of them as interesting.
+    // For example, if the offset is `x*4 + y` then we put interestingness onto
+    // the SymSymExpr `x*4 + y`, the SymIntExpr `x*4` and the two data symbols
+    // `x` and `y`.
+    for (SymbolRef PartSym : OffsetSym->symbols())
+      BR->markInteresting(PartSym);
+  }
 
-  // Track back the propagation of taintedness.
-  if (Kind == OOB_Taint)
+  if (IsTaintBug) {
+    // If the issue that we're reporting depends on the taintedness of the
+    // offset, then put interestingness onto symbols that could be the origin
+    // of the taint.
     for (SymbolRef Sym : getTaintedSymbols(ErrorState, Offset))
       BR->markInteresting(Sym);
+  }
 
   C.emitReport(std::move(BR));
 }
@@ -476,6 +669,18 @@ bool ArrayBoundCheckerV2::isInAddressOf(const Stmt *S, ASTContext &ACtx) {
   return UnaryOp && UnaryOp->getOpcode() == UO_AddrOf;
 }
 
+bool ArrayBoundCheckerV2::isIdiomaticPastTheEndPtr(const Expr *E,
+                                                   ProgramStateRef State,
+                                                   NonLoc Offset, NonLoc Limit,
+                                                   CheckerContext &C) {
+  if (isa<ArraySubscriptExpr>(E) && isInAddressOf(E, C.getASTContext())) {
+    auto [EqualsToThreshold, NotEqualToThreshold] = compareValueToThreshold(
+        State, Offset, Limit, C.getSValBuilder(), /*CheckEquality=*/true);
+    return EqualsToThreshold && !NotEqualToThreshold;
+  }
+  return false;
+}
+
 void ento::registerArrayBoundCheckerV2(CheckerManager &mgr) {
   mgr.registerChecker<ArrayBoundCheckerV2>();
 }
diff --git a/clang/test/Analysis/out-of-bounds-diagnostics.c b/clang/test/Analysis/out-of-bounds-diagnostics.c
index 769a8954f796691..7a8f59650fef161 100644
--- a/clang/test/Analysis/out-of-bounds-diagnostics.c
+++ b/clang/test/Analysis/out-of-bounds-diagnostics.c
@@ -195,12 +195,22 @@ void *malloc(size_t size);
 
 int *mallocRegion(void) {
   int *mem = (int*)malloc(2*sizeof(int));
+
   mem[3] = -2;
   // expected-warning at -1 {{Out of bound access to memory after the end of the heap area}}
   // expected-note at -2 {{Access of the heap area at index 3, while it holds only 2 'int' elements}}
   return mem;
 }
 
+int *mallocRegionDeref(void) {
+  int *mem = (int*)malloc(2*sizeof(int));
+
+  *(mem + 3) = -2;
+  // expected-warning at -1 {{Out of bound access to memory after the end of the heap area}}
+  // expected-note at -2 {{Access of the heap area at index 3, while it holds only 2 'int' elements}}
+  return mem;
+}
+
 void *alloca(size_t size);
 
 int allocaRegion(void) {
diff --git a/clang/test/Analysis/out-of-bounds-notes.c b/clang/test/Analysis/out-of-bounds-notes.c
new file mode 100644
index 000000000000000..290df79f3e18a41
--- /dev/null
+++ b/clang/test/Analysis/out-of-bounds-notes.c
@@ -0,0 +1,128 @@
+// RUN: %clang_analyze_cc1 -Wno-array-bounds -analyzer-output=text        \
+// RUN:     -analyzer-checker=core,alpha.security.ArrayBoundV2,unix.Malloc,alpha.security.taint,debug.ExprInspection -verify %s
+
+int array[10];
+
+void clang_analyzer_value(unsigned arg);
+
+int irrelevantAssumptions(int arg) {
+  int a = array[arg];
+  // Here the analyzer assumes that `arg` is in bounds, but doesn't report this
+  // because `arg` is not interesting for the bug.
+  int b = array[13];
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
+  // expected-note at -2 {{Access of 'array' at index 13, while it holds only 10 'int' elements}}
+  return a + b;
+}
+
+
+int assumingBoth(int arg) {
+  int a = array[arg];
+  // expected-note at -1 {{Assuming index is non-negative and less than 10, the number of 'int' elements in 'array'}}
+  int b = array[arg]; // no additional note, we already assumed that 'arg' is in bounds
+  int c = array[arg + 10];
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
+  // expected-note at -2 {{Access of 'array' at an overflowing index, while it holds only 10 'int' elements}}
+  return a + b + c;
+}
+
+int assumingLower(int arg) {
+  // expected-note at +2 {{Assuming 'arg' is < 10}}
+  // expected-note at +1 {{Taking false branch}}
+  if (arg >= 10)
+    return 0;
+  int a = array[arg];
+  // expected-note at -1 {{Assuming index is non-negative}}
+  int b = array[arg + 10];
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
+  // expected-note at -2 {{Access of 'array' at an overflowing index, while it holds only 10 'int' elements}}
+  return a + b;
+}
+
+int assumingUpper(int arg) {
+  // expected-note at +2 {{Assuming 'arg' is >= 0}}
+  // expected-note at +1 {{Taking false branch}}
+  if (arg < 0)
+    return 0;
+  int a = array[arg];
+  // expected-note at -1 {{Assuming index is less than 10, the number of 'int' elements in 'array'}}
+  int b = array[arg - 10];
+  // expected-warning at -1 {{Out of bound access to memory preceding 'array'}}
+  // expected-note at -2 {{Access of 'array' at negative byte offset}}
+  return a + b;
+}
+
+int assumingUpperIrrelevant(int arg) {
+  // FIXME: The assumption "assuming index is less than 10" is printed because
+  // it's assuming something about the interesting variable `arg`; however,
+  // it's irrelevant because in this testcase the out of bound access is
+  // deduced from the _lower_ bound on `arg`. Currently the analyzer cannot
+  // filter out assumptions that are logically irrelevant but "touch"
+  // interesting symbols; eventually it would be good to add support for this.
+
+  // expected-note at +2 {{Assuming 'arg' is >= 0}}
+  // expected-note at +1 {{Taking false branch}}
+  if (arg < 0)
+    return 0;
+  int a = array[arg];
+  // expected-note at -1 {{Assuming index is less than 10, the number of 'int' elements in 'array'}}
+  int b = array[arg + 10];
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
+  // expected-note at -2 {{Access of 'array' at an overflowing index, while it holds only 10 'int' elements}}
+  return a + b;
+}
+
+int assumingUpperUnsigned(unsigned arg) {
+  int a = array[arg];
+  // expected-note at -1 {{Assuming index is less than 10, the number of 'int' elements in 'array'}}
+  int b = array[(int)arg - 10];
+  // expected-warning at -1 {{Out of bound access to memory preceding 'array'}}
+  // expected-note at -2 {{Access of 'array' at negative byte offset}}
+  return a + b;
+}
+
+int assumingNothing(unsigned arg) {
+  // expected-note at +2 {{Assuming 'arg' is < 10}}
+  // expected-note at +1 {{Taking false branch}}
+  if (arg >= 10)
+    return 0;
+  int a = array[arg]; // no note here, we already know that 'arg' is in bounds
+  int b = array[(int)arg - 10];
+  // expected-warning at -1 {{Out of bound access to memory preceding 'array'}}
+  // expected-note at -2 {{Access of 'array' at negative byte offset}}
+  return a + b;
+}
+
+short assumingConverted(int arg) {
+  // When indices are reported, the note will use the element type that's the
+  // result type of the subscript operator.
+  char *cp = (char*)array;
+  char a = cp[arg];
+  // expected-note at -1 {{Assuming index is non-negative and less than 40, the number of 'char' elements in 'array'}}
+  char b = cp[arg]; // no additional note, we already assumed that 'arg' is in bounds
+  char c = cp[arg + 40];
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
+  // expected-note at -2 {{Access of 'array' at an overflowing index, while it holds only 40 'char' elements}}
+  return a + b + c;
+}
+
+struct foo {
+  int num;
+  char a[8];
+  char b[5];
+};
+
+int assumingConverted2(struct foo f, int arg) {
+  // When indices are reported, the note will use the element type that's the
+  // result type of the subscript operator.
+  int a = ((int*)(f.a))[arg];
+  // expected-note at -1 {{Assuming index is non-negative and less than 2, the number of 'int' elements in 'f.a'}}
+  // However, if the extent of the memory region is not divisible by the
+  // element size, the checker measures the offset and extent in bytes.
+  int b = ((int*)(f.b))[arg];
+  // expected-note at -1 {{Assuming byte offset is less than 5, the extent of 'f.b'}}
+  int c = array[arg-2];
+  // expected-warning at -1 {{Out of bound access to memory preceding 'array'}}
+  // expected-note at -2 {{Access of 'array' at negative byte offset}}
+  return a + b + c;
+}

>From c0a09db0fbb48626bbb069f85a587d2c94a8998a Mon Sep 17 00:00:00 2001
From: NagyDonat <donat.nagy at ericsson.com>
Date: Fri, 19 Jan 2024 19:10:54 +0100
Subject: [PATCH 2/5] Remove unused reference to debug.ExprInspection

---
 clang/test/Analysis/out-of-bounds-notes.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/clang/test/Analysis/out-of-bounds-notes.c b/clang/test/Analysis/out-of-bounds-notes.c
index 290df79f3e18a41..3070ab643d832ea 100644
--- a/clang/test/Analysis/out-of-bounds-notes.c
+++ b/clang/test/Analysis/out-of-bounds-notes.c
@@ -1,10 +1,8 @@
 // RUN: %clang_analyze_cc1 -Wno-array-bounds -analyzer-output=text        \
-// RUN:     -analyzer-checker=core,alpha.security.ArrayBoundV2,unix.Malloc,alpha.security.taint,debug.ExprInspection -verify %s
+// RUN:     -analyzer-checker=core,alpha.security.ArrayBoundV2,unix.Malloc,alpha.security.taint -verify %s
 
 int array[10];
 
-void clang_analyzer_value(unsigned arg);
-
 int irrelevantAssumptions(int arg) {
   int a = array[arg];
   // Here the analyzer assumes that `arg` is in bounds, but doesn't report this

>From 18c00f79123ed8b22feefdb3b4bcc2f08b59c79b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Don=C3=A1t=20Nagy?= <donat.nagy at ericsson.com>
Date: Mon, 22 Jan 2024 15:26:57 +0100
Subject: [PATCH 3/5] Improve comments (as suggested on code review)

---
 .../Checkers/ArrayBoundCheckerV2.cpp          | 30 +++++++++++--------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
index 4241fa3a2ce8af6..593095a2bebafef 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
@@ -202,12 +202,19 @@ computeOffset(ProgramStateRef State, SValBuilder &SVB, SVal Location) {
 // Due to the overflow issues I think it's impossible (or at least not
 // practical) to integrate this kind of simplification into the resolution of
 // arbitrary inequalities (i.e. the code of `evalBinOp`); but this function
-// produces valid results if the arguments are memory offsets which are known
-// to be << SIZE_MAX.
+// produces valid results when the calculations are handling memory offsets
+// and every value is well below SIZE_MAX.
 // TODO: This algorithm should be moved to a central location where it's
 // available for other checkers that need to compare memory offsets.
-// NOTE: When using the results of this function, don't forget that `evalBinOp`
-// uses the evaluation rules of C++, so e.g. `(size_t)123 < -1`!
+// NOTE: the simplification preserves the order of the two operands in a
+// mathematical sense, but it may change the result produced by a C++
+// comparison operator (and the automatic type conversions).
+// For example, consider a comparison "X+1 < 0", where the LHS is stored as a
+// size_t and the RHS is stored in an int. (As size_t is unsigned, this
+// comparison is false for all values of "X".) However, the simplification may
+// turn it into "X < -1", which is still always false in a mathematical sense,
+// but can produce a true result when evaluated by `evalBinOp` (which follows
+// the rules of C++ and casts -1 to SIZE_MAX).
 static std::pair<NonLoc, nonloc::ConcreteInt>
 getSimplifiedOffsets(NonLoc offset, nonloc::ConcreteInt extent,
                      SValBuilder &svalBuilder) {
@@ -401,11 +408,9 @@ const NoteTag *StateUpdateReporter::createNoteTag(CheckerContext &C) const {
   if (!AssumedNonNegative && !AssumedUpperBound)
     return nullptr;
 
-  return C.getNoteTag(
-      [*this](PathSensitiveBugReport &BR) -> std::string {
-        return getMessage(BR);
-      },
-      /*isPrunable=*/false);
+  return C.getNoteTag([*this](PathSensitiveBugReport &BR) -> std::string {
+    return getMessage(BR);
+  });
 }
 
 std::string StateUpdateReporter::getMessage(PathSensitiveBugReport &BR) const {
@@ -468,8 +473,9 @@ bool StateUpdateReporter::providesInformationAboutInteresting(
     // the SymIntExpr, UnarySymExpr etc. layers...
     if (BR.isInteresting(PartSym))
       return true;
-    // ...but if both sides of the expression are symbolic (i.e. unknown), then
-    // the analyzer can't use the combined result to constrain the operands.
+    // ...but if both sides of the expression are symbolic, then there is no
+    // practical algorithm to produce separate constraints for the two
+    // operands (from the single combined result).
     if (isa<SymSymExpr>(PartSym))
       return false;
   }
@@ -549,8 +555,6 @@ void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
         // expression that calculates the past-the-end pointer.
         if (isIdiomaticPastTheEndPtr(E, ExceedsUpperBound, ByteOffset,
                                      *KnownSize, C)) {
-          // FIXME: this duplicates the `addTransition` at the end of the
-          // function, but `goto` is taboo nowdays.
           C.addTransition(ExceedsUpperBound, SUR.createNoteTag(C));
           return;
         }

>From 6ed41d51078796983983b777af6067056de78299 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Don=C3=A1t=20Nagy?= <donat.nagy at ericsson.com>
Date: Mon, 22 Jan 2024 15:56:30 +0100
Subject: [PATCH 4/5] Make a function 'static'

---
 clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
index 593095a2bebafef..e6f0cc0cbf0dd31 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
@@ -334,8 +334,8 @@ static Messages getPrecedesMsgs(const SubRegion *Region, NonLoc Offset) {
 /// it can be performed (`Divisor` is nonzero and there is no remainder). The
 /// values `Val1` and `Val2` may be nullopt and in that case the corresponding
 /// division is considered to be successful.
-bool tryDividePair(std::optional<int64_t> &Val1, std::optional<int64_t> &Val2,
-                   int64_t Divisor) {
+static bool tryDividePair(std::optional<int64_t> &Val1,
+                          std::optional<int64_t> &Val2, int64_t Divisor) {
   if (!Divisor)
     return false;
   const bool Val1HasRemainder = Val1 && *Val1 % Divisor;

>From 1413b2cce38859e867f23e65d418b97a112093c7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Don=C3=A1t=20Nagy?= <donat.nagy at ericsson.com>
Date: Mon, 22 Jan 2024 17:11:08 +0100
Subject: [PATCH 5/5] Add testcase 'AssumingPlainOffset'

...and give more meaningful names to two testcases before it.

Currently this TC doesn't demonstrate the intended behavior because the
simplification algorithm used by the checker is somewhat limited and it
doesn't perform divisions when the RHS is not divisible by the
multiplier that's present in LHS. This will be fixed in a separate PR
that improves the simplification algorithm.
---
 clang/test/Analysis/out-of-bounds-notes.c | 26 +++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/clang/test/Analysis/out-of-bounds-notes.c b/clang/test/Analysis/out-of-bounds-notes.c
index 3070ab643d832ea..be69526393cd483 100644
--- a/clang/test/Analysis/out-of-bounds-notes.c
+++ b/clang/test/Analysis/out-of-bounds-notes.c
@@ -91,7 +91,7 @@ int assumingNothing(unsigned arg) {
   return a + b;
 }
 
-short assumingConverted(int arg) {
+short assumingConvertedToCharP(int arg) {
   // When indices are reported, the note will use the element type that's the
   // result type of the subscript operator.
   char *cp = (char*)array;
@@ -110,7 +110,7 @@ struct foo {
   char b[5];
 };
 
-int assumingConverted2(struct foo f, int arg) {
+int assumingConvertedToIntP(struct foo f, int arg) {
   // When indices are reported, the note will use the element type that's the
   // result type of the subscript operator.
   int a = ((int*)(f.a))[arg];
@@ -124,3 +124,25 @@ int assumingConverted2(struct foo f, int arg) {
   // expected-note at -2 {{Access of 'array' at negative byte offset}}
   return a + b + c;
 }
+
+int assumingPlainOffset(struct foo f, int arg) {
+  // This TC is intended to check the corner case that the checker prints the
+  // shorter "offset" instead of "byte offset" when it's irrelevant that the
+  // offset is measured in bytes.
+
+  // expected-note at +2 {{Assuming 'arg' is < 2}}
+  // expected-note at +1 {{Taking false branch}}
+  if (arg >= 2)
+    return 0;
+
+  int b = ((int*)(f.b))[arg];
+  // expected-note at -1 {{Assuming byte offset is non-negative and less than 5, the extent of 'f.b'}}
+  // FIXME: this should be {{Assuming offset is non-negative}}
+  // but the current simplification algorithm doesn't realize that arg <= 1
+  // implies that the byte offset arg*4 will be less than 5.
+
+  int c = array[arg+10];
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
+  // expected-note at -2 {{Access of 'array' at an overflowing index, while it holds only 10 'int' elements}}
+  return b + c;
+}