[clang] [analyzer] Switch to PostStmt callbacks in ArrayBoundV2 (PR #72107)

Fri Nov 24 10:02:12 PST 2023

=?utf-8?q?Donát?= Nagy <donat.nagy at ericsson.com>,
=?utf-8?q?Donát?= Nagy <donat.nagy at ericsson.com>,
=?utf-8?q?Donát?= Nagy <donat.nagy at ericsson.com>,
=?utf-8?q?Donát?= Nagy <donat.nagy at ericsson.com>
Message-ID:
In-Reply-To: <llvm.org/llvm/llvm-project/pull/72107 at github.com>


https://github.com/DonatNagyE updated https://github.com/llvm/llvm-project/pull/72107

>From ab102e949994a4462382e4c10c0153d61fb00306 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Don=C3=A1t=20Nagy?= <donat.nagy at ericsson.com>
Date: Mon, 13 Nov 2023 11:16:26 +0100
Subject: [PATCH 1/5] [analyzer] Switch to PostStmt callbacks in ArrayBoundV2

...instead of the currently used, more abstract Location callback. The
main advantage of this change is that after it the checker will check
`array[index].field` while the previous implementation ignored this
situation (because here the ElementRegion is wrapped in a FieldRegion
object). This improvement fixes PR #70187.

Note that after this change `&array[idx]` will be handled as an access
to the `idx`th element of `array`, which is technically incorrect but
matches the programmer intuitions. In my opinion it's more helpful if
the report points to the source location where the indexing happens
(instead of the location where a pointer is finally dereferenced).

This change introduces false positives in the exceptional corner case
when the past-the-end pointer of an array is formed as `&arr[length]`. I
think this is just unimportant pedantery (because it's cleaner to write
the past-the-end pointer as `(arr+length)` and that form isn't affected
by this checker), but if it does appear in real code, then we could
introduce a special case for it.

In addition to this primary improvement, this change tweaks the message
for the tainted index/offset case (using the more concrete information
that's available now) and clarifies/improves a few testcases.

The main change of this commit (replacing `check::Location` with
`check::PostStmt<...>` callbacks) was already proposed in my change
https://reviews.llvm.org/D150446 and https://reviews.llvm.org/D159107 by
@steakhal. Those reviews were both abandoned, but the issues were
unrelated to the change that is introduced in this PR.
---
 .../Checkers/ArrayBoundCheckerV2.cpp          | 96 ++++++++++++-------
 .../test/Analysis/out-of-bounds-diagnostics.c | 74 ++++++++++++--
 .../test/Analysis/taint-diagnostic-visitor.c  |  4 +-
 3 files changed, 129 insertions(+), 45 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
index ffc7236d1e2551a..ef3ae42d57c6e88 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
@@ -34,20 +34,37 @@ using llvm::formatv;
 namespace {
 enum OOB_Kind { OOB_Precedes, OOB_Exceeds, OOB_Taint };
 
-class ArrayBoundCheckerV2 :
-    public Checker<check::Location> {
+struct Messages {
+  std::string Short, Full;
+};
+
+class ArrayBoundCheckerV2 : public Checker<check::PostStmt<ArraySubscriptExpr>,
+                                           check::PostStmt<UnaryOperator>,
+                                           check::PostStmt<MemberExpr>> {
   BugType BT{this, "Out-of-bound access"};
   BugType TaintBT{this, "Out-of-bound access", categories::TaintedData};
 
+  void performCheck(const Expr *E, CheckerContext &C) const;
+
   void reportOOB(CheckerContext &C, ProgramStateRef ErrorState, OOB_Kind Kind,
-                 NonLoc Offset, std::string RegName, std::string Msg) const;
+                 NonLoc Offset, Messages Msgs) const;
 
   static bool isFromCtypeMacro(const Stmt *S, ASTContext &AC);
 
 public:
-  void checkLocation(SVal l, bool isLoad, const Stmt *S,
-                     CheckerContext &C) const;
+  void checkPostStmt(const ArraySubscriptExpr *E, CheckerContext &C) const {
+    performCheck(E, C);
+  }
+  void checkPostStmt(const UnaryOperator *E, CheckerContext &C) const {
+    if (E->getOpcode() == UO_Deref)
+      performCheck(E, C);
+  }
+  void checkPostStmt(const MemberExpr *E, CheckerContext &C) const {
+    if (E->isArrow())
+      performCheck(E->getBase(), C);
+  }
 };
+
 } // anonymous namespace
 
 /// For a given Location that can be represented as a symbolic expression
@@ -217,16 +234,19 @@ static std::string getShortMsg(OOB_Kind Kind, std::string RegName) {
   return formatv(ShortMsgTemplates[Kind], RegName);
 }
 
-static std::string getPrecedesMsg(std::string RegName, NonLoc Offset) {
+static Messages getPrecedesMsgs(const SubRegion *Region, NonLoc Offset) {
+  std::string RegName = getRegionName(Region);
   SmallString<128> Buf;
   llvm::raw_svector_ostream Out(Buf);
   Out << "Access of " << RegName << " at negative byte offset";
   if (auto ConcreteIdx = Offset.getAs<nonloc::ConcreteInt>())
     Out << ' ' << ConcreteIdx->getValue();
-  return std::string(Buf);
+  return {getShortMsg(OOB_Precedes, RegName), std::string(Buf)};
 }
-static std::string getExceedsMsg(ASTContext &ACtx, std::string RegName,
-                                 NonLoc Offset, NonLoc Extent, SVal Location) {
+
+static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region,
+                               NonLoc Offset, NonLoc Extent, SVal Location) {
+  std::string RegName = getRegionName(Region);
   const auto *EReg = Location.getAsRegion()->getAs<ElementRegion>();
   assert(EReg && "this checker only handles element access");
   QualType ElemType = EReg->getElementType();
@@ -273,20 +293,18 @@ static std::string getExceedsMsg(ASTContext &ACtx, std::string RegName,
       Out << "s";
   }
 
-  return std::string(Buf);
-}
-static std::string getTaintMsg(std::string RegName) {
-  SmallString<128> Buf;
-  llvm::raw_svector_ostream Out(Buf);
-  Out << "Access of " << RegName
-      << " with a tainted offset that may be too large";
-  return std::string(Buf);
+  return {getShortMsg(OOB_Exceeds, RegName), std::string(Buf)};
 }
 
-void ArrayBoundCheckerV2::checkLocation(SVal Location, bool IsLoad,
-                                        const Stmt *LoadS,
-                                        CheckerContext &C) const {
+static Messages getTaintMsgs(const SubRegion *Region, const char *OffsetName) {
+  std::string RegName = getRegionName(Region);
+  return {formatv("Potential out of bound access to {0} with tainted {1}",
+                  RegName, OffsetName),
+          formatv("Access of {0} with a tainted {1} that may be too large",
+                  RegName, OffsetName)};
+}
 
+void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
   // NOTE: Instead of using ProgramState::assumeInBound(), we are prototyping
   // some new logic here that reasons directly about memory region extents.
   // Once that logic is more mature, we can bring it back to assumeInBound()
@@ -297,12 +315,14 @@ void ArrayBoundCheckerV2::checkLocation(SVal Location, bool IsLoad,
   // have some flexibility in defining the base region, we can achieve
   // various levels of conservatism in our buffer overflow checking.
 
+  const SVal Location = C.getSVal(E);
+
   // The header ctype.h (from e.g. glibc) implements the isXXXXX() macros as
   //   #define isXXXXX(arg) (LOOKUP_TABLE[arg] & BITMASK_FOR_XXXXX)
   // and incomplete analysis of these leads to false positives. As even
   // accurate reports would be confusing for the users, just disable reports
   // from these macros:
-  if (isFromCtypeMacro(LoadS, C.getASTContext()))
+  if (isFromCtypeMacro(E, C.getASTContext()))
     return;
 
   ProgramStateRef State = C.getState();
@@ -331,9 +351,8 @@ void ArrayBoundCheckerV2::checkLocation(SVal Location, bool IsLoad,
 
     if (PrecedesLowerBound && !WithinLowerBound) {
       // We know that the index definitely precedes the lower bound.
-      std::string RegName = getRegionName(Reg);
-      std::string Msg = getPrecedesMsg(RegName, ByteOffset);
-      reportOOB(C, PrecedesLowerBound, OOB_Precedes, ByteOffset, RegName, Msg);
+      Messages Msgs = getPrecedesMsgs(Reg, ByteOffset);
+      reportOOB(C, PrecedesLowerBound, OOB_Precedes, ByteOffset, Msgs);
       return;
     }
 
@@ -350,17 +369,25 @@ void ArrayBoundCheckerV2::checkLocation(SVal Location, bool IsLoad,
     if (ExceedsUpperBound) {
       if (!WithinUpperBound) {
         // We know that the index definitely exceeds the upper bound.
-        std::string RegName = getRegionName(Reg);
-        std::string Msg = getExceedsMsg(C.getASTContext(), RegName, ByteOffset,
-                                        *KnownSize, Location);
-        reportOOB(C, ExceedsUpperBound, OOB_Exceeds, ByteOffset, RegName, Msg);
+        Messages Msgs = getExceedsMsgs(C.getASTContext(), Reg, ByteOffset,
+                                       *KnownSize, Location);
+        reportOOB(C, ExceedsUpperBound, OOB_Exceeds, ByteOffset, Msgs);
         return;
       }
       if (isTainted(State, ByteOffset)) {
-        // Both cases are possible, but the index is tainted, so report.
+        // Both cases are possible, but the offset is tainted, so report.
         std::string RegName = getRegionName(Reg);
-        std::string Msg = getTaintMsg(RegName);
-        reportOOB(C, ExceedsUpperBound, OOB_Taint, ByteOffset, RegName, Msg);
+
+        // Diagnostic detail: "tainted offset" is always correct, but the
+        // common case is that 'idx' is tainted in 'arr[idx]' and then it's
+        // nicer to say "tainted index".
+        const char *OffsetName = "offset";
+        if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(E))
+          if (isTainted(State, ASE->getIdx(), C.getLocationContext()))
+            OffsetName = "index";
+
+        Messages Msgs = getTaintMsgs(Reg, OffsetName);
+        reportOOB(C, ExceedsUpperBound, OOB_Taint, ByteOffset, Msgs);
         return;
       }
     }
@@ -374,17 +401,14 @@ void ArrayBoundCheckerV2::checkLocation(SVal Location, bool IsLoad,
 
 void ArrayBoundCheckerV2::reportOOB(CheckerContext &C,
                                     ProgramStateRef ErrorState, OOB_Kind Kind,
-                                    NonLoc Offset, std::string RegName,
-                                    std::string Msg) const {
+                                    NonLoc Offset, Messages Msgs) const {
 
   ExplodedNode *ErrorNode = C.generateErrorNode(ErrorState);
   if (!ErrorNode)
     return;
 
-  std::string ShortMsg = getShortMsg(Kind, RegName);
-
   auto BR = std::make_unique<PathSensitiveBugReport>(
-      Kind == OOB_Taint ? TaintBT : BT, ShortMsg, Msg, ErrorNode);
+      Kind == OOB_Taint ? TaintBT : BT, Msgs.Short, Msgs.Full, ErrorNode);
 
   // Track back the propagation of taintedness.
   if (Kind == OOB_Taint)
diff --git a/clang/test/Analysis/out-of-bounds-diagnostics.c b/clang/test/Analysis/out-of-bounds-diagnostics.c
index 4f5d0d0bc91c54a..e549e86b09c2312 100644
--- a/clang/test/Analysis/out-of-bounds-diagnostics.c
+++ b/clang/test/Analysis/out-of-bounds-diagnostics.c
@@ -9,6 +9,14 @@ void arrayUnderflow(void) {
   // expected-note at -2 {{Access of 'array' at negative byte offset -12}}
 }
 
+int underflowWithDeref(void) {
+  int *p = array;
+  --p;
+  return *p;
+  // expected-warning at -1 {{Out of bound access to memory preceding 'array'}}
+  // expected-note at -2 {{Access of 'array' at negative byte offset -4}}
+}
+
 int scanf(const char *restrict fmt, ...);
 
 void taintedIndex(void) {
@@ -17,6 +25,17 @@ void taintedIndex(void) {
   // expected-note at -1 {{Taint originated here}}
   // expected-note at -2 {{Taint propagated to the 2nd argument}}
   array[index] = 5;
+  // expected-warning at -1 {{Potential out of bound access to 'array' with tainted index}}
+  // expected-note at -2 {{Access of 'array' with a tainted index that may be too large}}
+}
+
+void taintedOffset(void) {
+  int index;
+  scanf("%d", &index);
+  // expected-note at -1 {{Taint originated here}}
+  // expected-note at -2 {{Taint propagated to the 2nd argument}}
+  int *p = array + index;
+  p[0] = 5;
   // expected-warning at -1 {{Potential out of bound access to 'array' with tainted offset}}
   // expected-note at -2 {{Access of 'array' with a tainted offset that may be too large}}
 }
@@ -27,6 +46,29 @@ void arrayOverflow(void) {
   // expected-note at -2 {{Access of 'array' at index 12, while it holds only 10 'int' elements}}
 }
 
+void flippedOverflow(void) {
+  12[array] = 5;
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
+  // expected-note at -2 {{Access of 'array' at index 12, while it holds only 10 'int' elements}}
+}
+
+int *afterTheEndPtr(void) {
+  // FIXME: this is an unusual but standard-compliant way of writing (array + 10).
+  // I think people who rely on this confusing corner case deserve a warning,
+  // but if this is widespread, then we could introduce a special case for it
+  // in the checker.
+  return &array[10];
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
+  // expected-note at -2 {{Access of 'array' at index 10, while it holds only 10 'int' elements}}
+}
+
+int *afterAfterTheEndPtr(void) {
+  // This is UB, it's invalid to form an after-after-the-end pointer.
+  return &array[11];
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
+  // expected-note at -2 {{Access of 'array' at index 11, while it holds only 10 'int' elements}}
+}
+
 int scalar;
 int scalarOverflow(void) {
   return (&scalar)[1];
@@ -41,12 +83,6 @@ int oneElementArrayOverflow(void) {
   // expected-note at -2 {{Access of 'oneElementArray' at index 1, while it holds only a single 'int' element}}
 }
 
-short convertedArray(void) {
-  return ((short*)array)[47];
-  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
-  // expected-note at -2 {{Access of 'array' at index 47, while it holds only 20 'short' elements}}
-}
-
 struct vec {
   int len;
   double elems[64];
@@ -64,6 +100,28 @@ double arrayInStructPtr(struct vec *pv) {
   // expected-note at -2 {{Access of the field 'elems' at index 64, while it holds only 64 'double' elements}}
 }
 
+struct item {
+  int a, b;
+} itemArray[20] = {0};
+
+int structOfArrays(void) {
+  return itemArray[35].a;
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'itemArray'}}
+  // expected-note at -2 {{Access of 'itemArray' at index 35, while it holds only 20 'struct item' elements}}
+}
+
+int structOfArraysArrow(void) {
+  return (itemArray + 35)->b;
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'itemArray'}}
+  // expected-note at -2 {{Access of 'itemArray' at index 35, while it holds only 20 'struct item' elements}}
+}
+
+short convertedArray(void) {
+  return ((short*)array)[47];
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
+  // expected-note at -2 {{Access of 'array' at index 47, while it holds only 20 'short' elements}}
+}
+
 struct two_bytes {
   char lo, hi;
 };
@@ -85,7 +143,9 @@ int intFromString(void) {
 }
 
 int intFromStringDivisible(void) {
-  // However, this is reported with indices/elements, because the extent happens to be a multiple of 4.
+  // However, this is reported with indices/elements, because the extent
+  // (of the string that consists of 'a', 'b', 'c' and '\0') happens to be a
+  // multiple of 4 bytes (= sizeof(int)).
   return ((const int*)"abc")[20];
   // expected-warning at -1 {{Out of bound access to memory after the end of the string literal}}
   // expected-note at -2 {{Access of the string literal at index 20, while it holds only a single 'int' element}}
diff --git a/clang/test/Analysis/taint-diagnostic-visitor.c b/clang/test/Analysis/taint-diagnostic-visitor.c
index 67dc67e627b3b9c..a3fa1639bffeee5 100644
--- a/clang/test/Analysis/taint-diagnostic-visitor.c
+++ b/clang/test/Analysis/taint-diagnostic-visitor.c
@@ -29,8 +29,8 @@ int taintDiagnosticOutOfBound(void) {
   int Array[] = {1, 2, 3, 4, 5};
   scanf("%d", &index); // expected-note {{Taint originated here}}
                        // expected-note at -1 {{Taint propagated to the 2nd argument}}
-  return Array[index]; // expected-warning {{Potential out of bound access to 'Array' with tainted offset}}
-                       // expected-note at -1 {{Access of 'Array' with a tainted offset that may be too large}}
+  return Array[index]; // expected-warning {{Potential out of bound access to 'Array' with tainted index}}
+                       // expected-note at -1 {{Access of 'Array' with a tainted index that may be too large}}
 }
 
 int taintDiagnosticDivZero(int operand) {

>From 819a21bb2e59170ccc74c2753992be008c77eea6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Don=C3=A1t=20Nagy?= <donat.nagy at ericsson.com>
Date: Fri, 24 Nov 2023 15:17:46 +0100
Subject: [PATCH 2/5] Explain the use of PostStmt callbacks

---
 clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
index ef3ae42d57c6e88..dd410b34b060938 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
@@ -38,6 +38,11 @@ struct Messages {
   std::string Short, Full;
 };
 
+// NOTE: This is a `PostStmt` checker because the implementation passes the
+// whole subscript or unary operator expression to `CheckerContext::getSVal()`,
+// so those two callback wouldn't work as `PreStmt` (unless this checker
+// duplicated the code evaluating those expressions). The `MemberExpr` callback
+// would work as `PreStmt`, but it's `PostStmt` for the sake of consistency.
 class ArrayBoundCheckerV2 : public Checker<check::PostStmt<ArraySubscriptExpr>,
                                            check::PostStmt<UnaryOperator>,
                                            check::PostStmt<MemberExpr>> {

>From 5ae2f66523e76d54c9b068fc20cee771cc23efd2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Don=C3=A1t=20Nagy?= <donat.nagy at ericsson.com>
Date: Fri, 24 Nov 2023 15:52:26 +0100
Subject: [PATCH 3/5] Fix an inaccurate testcase name

---
 clang/test/Analysis/out-of-bounds-diagnostics.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/test/Analysis/out-of-bounds-diagnostics.c b/clang/test/Analysis/out-of-bounds-diagnostics.c
index e549e86b09c2312..d24f0a98db533fd 100644
--- a/clang/test/Analysis/out-of-bounds-diagnostics.c
+++ b/clang/test/Analysis/out-of-bounds-diagnostics.c
@@ -104,13 +104,13 @@ struct item {
   int a, b;
 } itemArray[20] = {0};
 
-int structOfArrays(void) {
+int arrayOfStructs(void) {
   return itemArray[35].a;
   // expected-warning at -1 {{Out of bound access to memory after the end of 'itemArray'}}
   // expected-note at -2 {{Access of 'itemArray' at index 35, while it holds only 20 'struct item' elements}}
 }
 
-int structOfArraysArrow(void) {
+int arrayOfStructsArrow(void) {
   return (itemArray + 35)->b;
   // expected-warning at -1 {{Out of bound access to memory after the end of 'itemArray'}}
   // expected-note at -2 {{Access of 'itemArray' at index 35, while it holds only 20 'struct item' elements}}

>From 4044f86365a7440ee89e69b1ef389479d9884730 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Don=C3=A1t=20Nagy?= <donat.nagy at ericsson.com>
Date: Fri, 24 Nov 2023 18:55:08 +0100
Subject: [PATCH 4/5] Add special case for &array[size]

---
 .../Checkers/ArrayBoundCheckerV2.cpp          | 37 +++++++++++++-
 .../test/Analysis/out-of-bounds-diagnostics.c | 49 +++++++++++++++++--
 2 files changed, 79 insertions(+), 7 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
index dd410b34b060938..19b075ae39ac72d 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/AST/CharUnits.h"
+#include "clang/AST/ParentMapContext.h"
 #include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h"
 #include "clang/StaticAnalyzer/Checkers/Taint.h"
 #include "clang/StaticAnalyzer/Core/BugReporter/BugType.h"
@@ -56,6 +57,8 @@ class ArrayBoundCheckerV2 : public Checker<check::PostStmt<ArraySubscriptExpr>,
 
   static bool isFromCtypeMacro(const Stmt *S, ASTContext &AC);
 
+  static bool isInAddressOf(const Stmt *S, ASTContext &AC);
+
 public:
   void checkPostStmt(const ArraySubscriptExpr *E, CheckerContext &C) const {
     performCheck(E, C);
@@ -171,9 +174,11 @@ getSimplifiedOffsets(NonLoc offset, nonloc::ConcreteInt extent,
 // where the first one corresponds to "value below threshold" and the second
 // corresponds to "value at or above threshold". Returns {nullptr, nullptr} in
 // the case when the evaluation fails.
+// If the optional argument CheckEquality is true, then use BO_EQ instead of
+// the default BO_LT after consistently applying the same simplification steps.
 static std::pair<ProgramStateRef, ProgramStateRef>
 compareValueToThreshold(ProgramStateRef State, NonLoc Value, NonLoc Threshold,
-                        SValBuilder &SVB) {
+                        SValBuilder &SVB, bool CheckEquality = false) {
   if (auto ConcreteThreshold = Threshold.getAs<nonloc::ConcreteInt>()) {
     std::tie(Value, Threshold) = getSimplifiedOffsets(Value, *ConcreteThreshold, SVB);
   }
@@ -189,8 +194,10 @@ compareValueToThreshold(ProgramStateRef State, NonLoc Value, NonLoc Threshold,
       return {nullptr, State};
     }
   }
+  const BinaryOperatorKind OpKind = CheckEquality ? BO_EQ : BO_LT;
   auto BelowThreshold =
-      SVB.evalBinOpNN(State, BO_LT, Value, Threshold, SVB.getConditionType()).getAs<NonLoc>();
+      SVB.evalBinOpNN(State, OpKind, Value, Threshold, SVB.getConditionType())
+          .getAs<NonLoc>();
 
   if (BelowThreshold)
     return State->assume(*BelowThreshold);
@@ -374,6 +381,19 @@ void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
     if (ExceedsUpperBound) {
       if (!WithinUpperBound) {
         // We know that the index definitely exceeds the upper bound.
+        if (isa<ArraySubscriptExpr>(E) && isInAddressOf(E, C.getASTContext())) {
+          // ...but this is within an addressof expression, so we need to check
+          // for the exceptional case that `&array[size]` is valid.
+          auto [EqualsToThreshold, NotEqualToThreshold] =
+              compareValueToThreshold(ExceedsUpperBound, ByteOffset, *KnownSize,
+                                      SVB, /*CheckEquality=*/true);
+          if (EqualsToThreshold && !NotEqualToThreshold) {
+            // We are definitely in the exceptional case, so return early
+            // instead of reporting a bug.
+            C.addTransition(EqualsToThreshold);
+            return;
+          }
+        }
         Messages Msgs = getExceedsMsgs(C.getASTContext(), Reg, ByteOffset,
                                        *KnownSize, Location);
         reportOOB(C, ExceedsUpperBound, OOB_Exceeds, ByteOffset, Msgs);
@@ -442,6 +462,19 @@ bool ArrayBoundCheckerV2::isFromCtypeMacro(const Stmt *S, ASTContext &ACtx) {
           (MacroName == "isupper") || (MacroName == "isxdigit"));
 }
 
+bool ArrayBoundCheckerV2::isInAddressOf(const Stmt *S, ASTContext &ACtx) {
+  ParentMapContext &ParentCtx = ACtx.getParentMapContext();
+  do {
+    const DynTypedNodeList Parents = ParentCtx.getParents(*S);
+    if (Parents.empty())
+      return false;
+    S = Parents[0].get<Stmt>();
+  } while (isa_and_nonnull<ParenExpr, ImplicitCastExpr>(S));
+  if (const auto *UnaryOp = dyn_cast_or_null<UnaryOperator>(S))
+    return UnaryOp->getOpcode() == UO_AddrOf;
+  return false;
+}
+
 void ento::registerArrayBoundCheckerV2(CheckerManager &mgr) {
   mgr.registerChecker<ArrayBoundCheckerV2>();
 }
diff --git a/clang/test/Analysis/out-of-bounds-diagnostics.c b/clang/test/Analysis/out-of-bounds-diagnostics.c
index d24f0a98db533fd..a6bb04750bb8114 100644
--- a/clang/test/Analysis/out-of-bounds-diagnostics.c
+++ b/clang/test/Analysis/out-of-bounds-diagnostics.c
@@ -29,6 +29,27 @@ void taintedIndex(void) {
   // expected-note at -2 {{Access of 'array' with a tainted index that may be too large}}
 }
 
+int *taintedIndexAfterTheEndPtr(void) {
+  // NOTE: Technically speaking, this testcase does not trigger any UB because
+  // &array[10] is the after-the-end pointer which is well-defined; but this is
+  // a bug-prone situation and far from the idiomatic use of `&array[size]`, so
+  // it's better to report an error. This report can be easily silenced by
+  // writing array+index instead of &array[index].
+  int index;
+  scanf("%d", &index);
+  // expected-note at -1 {{Taint originated here}}
+  // expected-note at -2 {{Taint propagated to the 2nd argument}}
+  if (index < 0 || index > 10)
+    return array;
+  // expected-note at -2 {{Assuming 'index' is >= 0}}
+  // expected-note at -3 {{Left side of '||' is false}}
+  // expected-note at -4 {{Assuming 'index' is <= 10}}
+  // expected-note at -5 {{Taking false branch}}
+  return &array[index];
+  // expected-warning at -1 {{Potential out of bound access to 'array' with tainted index}}
+  // expected-note at -2 {{Access of 'array' with a tainted index that may be too large}}
+}
+
 void taintedOffset(void) {
   int index;
   scanf("%d", &index);
@@ -53,11 +74,13 @@ void flippedOverflow(void) {
 }
 
 int *afterTheEndPtr(void) {
-  // FIXME: this is an unusual but standard-compliant way of writing (array + 10).
-  // I think people who rely on this confusing corner case deserve a warning,
-  // but if this is widespread, then we could introduce a special case for it
-  // in the checker.
-  return &array[10];
+  // This is an unusual but standard-compliant way of writing (array + 10).
+  return &array[10]; // no-warning
+}
+
+int useAfterTheEndPtr(void) {
+  // ... but dereferencing the after-the-end pointer is still invalid.
+  return *afterTheEndPtr();
   // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
   // expected-note at -2 {{Access of 'array' at index 10, while it holds only 10 'int' elements}}
 }
@@ -69,6 +92,22 @@ int *afterAfterTheEndPtr(void) {
   // expected-note at -2 {{Access of 'array' at index 11, while it holds only 10 'int' elements}}
 }
 
+int *potentialAfterTheEndPtr(int idx) {
+  if (idx < 10) { /* ...do something... */ }
+  // expected-note at -1 {{Assuming 'idx' is >= 10}}
+  // expected-note at -2 {{Taking false branch}}
+  return &array[idx];
+  // expected-warning at -1 {{Out of bound access to memory after the end of 'array'}}
+  // expected-note at -2 {{Access of 'array' at an overflowing index, while it holds only 10 'int' elements}}
+  // NOTE: On the idx >= 10 branch the normal "optimistic" behavior would've
+  // been continuing with the assumption that idx == 10 and the return value is
+  // a legitimate after-the-end pointer. The checker deviates from this by
+  // reporting an error because this situation is very suspicious and far from
+  // the idiomatic `&array[size]` expressions. If the report is FP, the
+  // developer can easily silence it by writing array+idx instead of
+  // &array[idx].
+}
+
 int scalar;
 int scalarOverflow(void) {
   return (&scalar)[1];

>From 4fb7355a1db34f3efdfac65d2f847de90f29960b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Don=C3=A1t=20Nagy?= <donat.nagy at ericsson.com>
Date: Fri, 24 Nov 2023 19:01:56 +0100
Subject: [PATCH 5/5] Improve comment that explains the use of PostStmt
 callbacks

---
 .../StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp  | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
index 19b075ae39ac72d..9e9cc5799ec7d8a 100644
--- a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
@@ -39,11 +39,13 @@ struct Messages {
   std::string Short, Full;
 };
 
-// NOTE: This is a `PostStmt` checker because the implementation passes the
-// whole subscript or unary operator expression to `CheckerContext::getSVal()`,
-// so those two callback wouldn't work as `PreStmt` (unless this checker
-// duplicated the code evaluating those expressions). The `MemberExpr` callback
-// would work as `PreStmt`, but it's `PostStmt` for the sake of consistency.
+// NOTE: The `ArraySubscriptExpr` and `UnaryOperator` callbacks are `PostStmt`
+// instead of `PreStmt` because the current implementation passes the whole
+// expression to `CheckerContext::getSVal()` which only works after the
+// symbolic evaluation of the expression. (To turn them into `PreStmt`
+// callbacks, we'd need to duplicate the logic that evaluates these
+// expressions.) The `MemberExpr` callback would work as `PreStmt` but it's
+// defined as `PostStmt` for the sake of consistency with the other callbacks.
 class ArrayBoundCheckerV2 : public Checker<check::PostStmt<ArraySubscriptExpr>,
                                            check::PostStmt<UnaryOperator>,
                                            check::PostStmt<MemberExpr>> {